#!/usr/bin/env python3
"""
Wikipedia Personality Scraper for EuroSwarm Parliament MEPs
This module scrapes Wikipedia data for each MEP to create realistic, personality-driven
AI agents based on their real backgrounds, political history, and personal beliefs.
"""
import json
import os
import time
import re
from typing import Dict, List, Optional, Any
from dataclasses import dataclass, asdict
import requests
from loguru import logger
@dataclass
class MEPPersonalityProfile:
"""
Comprehensive personality profile for an MEP based on Wikipedia data.
Attributes:
full_name: Full name of the MEP
mep_id: Unique MEP identifier
wikipedia_url: URL of the MEP's Wikipedia page
summary: Brief summary of the MEP's background
early_life: Early life and education information
political_career: Political career and positions held
political_views: Key political views and positions
policy_focus: Areas of policy expertise and focus
achievements: Notable achievements and accomplishments
controversies: Any controversies or notable incidents
personal_life: Personal background and family information
education: Educational background
professional_background: Professional experience before politics
party_affiliations: Political party history
committee_experience: Parliamentary committee experience
voting_record: Notable voting patterns or positions
public_statements: Key public statements or quotes
interests: Personal and professional interests
languages: Languages spoken
awards: Awards and recognitions
publications: Publications or written works
social_media: Social media presence
last_updated: When the profile was last updated
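
    Example (illustrative; the values are placeholders, not real MEP data):

        >>> profile = MEPPersonalityProfile(full_name="Jane Doe", mep_id="12345")
        >>> asdict(profile)["full_name"]
        'Jane Doe'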
"""
full_name: str
mep_id: str
wikipedia_url: Optional[str] = None
summary: str = ""
early_life: str = ""
political_career: str = ""
political_views: str = ""
policy_focus: str = ""
achievements: str = ""
controversies: str = ""
personal_life: str = ""
education: str = ""
professional_background: str = ""
party_affiliations: str = ""
committee_experience: str = ""
voting_record: str = ""
public_statements: str = ""
interests: str = ""
languages: str = ""
awards: str = ""
publications: str = ""
social_media: str = ""
last_updated: str = ""
class WikipediaPersonalityScraper:
"""
Scraper for gathering Wikipedia personality data for MEPs.
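
    Example (a minimal sketch; assumes an ``EU.xml`` file in the working
    directory and network access to Wikipedia):

        scraper = WikipediaPersonalityScraper(output_dir="mep_personalities")
        profile_files = scraper.scrape_all_mep_personalities(
            xml_file="EU.xml", delay=1.0
        )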
"""
def __init__(
self,
output_dir: str = "mep_personalities",
verbose: bool = True,
):
"""
Initialize the Wikipedia personality scraper.
Args:
output_dir: Directory to store personality profiles
verbose: Enable verbose logging
"""
self.output_dir = output_dir
self.verbose = verbose
self.session = requests.Session()
self.session.headers.update(
{
"User-Agent": "EuroSwarm Parliament Personality Scraper/1.0 (https://github.com/swarms-democracy)"
}
)
# Create output directory
os.makedirs(output_dir, exist_ok=True)
if verbose:
logger.info(
f"Wikipedia Personality Scraper initialized. Output directory: {output_dir}"
)
def extract_mep_data_from_xml(
self, xml_file: str = "EU.xml"
) -> List[Dict[str, str]]:
"""
Extract MEP data from EU.xml file.
Args:
xml_file: Path to EU.xml file
Returns:
List of MEP data dictionaries
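
        Each record is expected in the shape matched by the regex below
        (a sketch of one element, with placeholder values):

            <mep>
              <fullName>Jane Doe</fullName>
              <country>Germany</country>
              <politicalGroup>Group name</politicalGroup>
              <id>12345</id>
              <nationalPoliticalGroup>Party name</nationalPoliticalGroup>
            </mep>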
"""
meps = []
try:
with open(xml_file, "r", encoding="utf-8") as f:
content = f.read()
            # Parse with a regex rather than a full XML parser; the <mep>
            # records in EU.xml have a fixed, flat structure
mep_pattern = r"<mep>\s*<fullName>(.*?)</fullName>\s*<country>(.*?)</country>\s*<politicalGroup>(.*?)</politicalGroup>\s*<id>(.*?)</id>\s*<nationalPoliticalGroup>(.*?)</nationalPoliticalGroup>\s*</mep>"
mep_matches = re.findall(mep_pattern, content, re.DOTALL)
for (
full_name,
country,
political_group,
mep_id,
national_party,
) in mep_matches:
meps.append(
{
"full_name": full_name.strip(),
"country": country.strip(),
"political_group": political_group.strip(),
"mep_id": mep_id.strip(),
"national_party": national_party.strip(),
}
)
if self.verbose:
logger.info(
f"Extracted {len(meps)} MEPs from {xml_file}"
)
except Exception as e:
logger.error(
f"Error extracting MEP data from {xml_file}: {e}"
)
return meps
def search_wikipedia_page(
self, mep_name: str, country: str
) -> Optional[str]:
"""
Search for a Wikipedia page for an MEP.
Args:
mep_name: Full name of the MEP
country: Country of the MEP
Returns:
Wikipedia page title if found, None otherwise
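
        The first attempt is an exact-phrase query, roughly equivalent to
        (illustrative, with a placeholder name; encoding may differ):

            https://en.wikipedia.org/w/api.php?action=query&format=json&list=search&srsearch=%22Jane+Doe%22+Germany&srlimit=5&srnamespace=0

        If it returns nothing, the search is retried without the quotes.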
"""
try:
# Search for the MEP on Wikipedia
search_url = "https://en.wikipedia.org/w/api.php"
search_params = {
"action": "query",
"format": "json",
"list": "search",
"srsearch": f'"{mep_name}" {country}',
"srlimit": 5,
"srnamespace": 0,
}
            # timeout so a stalled request cannot hang the whole scrape
            response = self.session.get(
                search_url, params=search_params, timeout=30
            )
response.raise_for_status()
data = response.json()
search_results = data.get("query", {}).get("search", [])
if search_results:
# Return the first result
return search_results[0]["title"]
# Try alternative search without quotes
search_params["srsearch"] = f"{mep_name} {country}"
            response = self.session.get(
                search_url, params=search_params, timeout=30
            )
response.raise_for_status()
data = response.json()
search_results = data.get("query", {}).get("search", [])
if search_results:
return search_results[0]["title"]
except Exception as e:
if self.verbose:
logger.warning(
f"Error searching Wikipedia for {mep_name}: {e}"
)
return None
def get_wikipedia_content(
self, page_title: str
) -> Optional[Dict[str, Any]]:
"""
Get Wikipedia content for a specific page.
Args:
page_title: Wikipedia page title
Returns:
Dictionary containing page content and metadata
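
        On success the dictionary has this shape (values abridged):

            {
                "title": "Jane Doe",
                "extract": "Jane Doe is a ...",
                "url": "https://en.wikipedia.org/wiki/Jane_Doe",
                "categories": ["Category:..."],
                "pageid": 123,
                "length": 4567,
            }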
"""
try:
# Get page content
content_url = "https://en.wikipedia.org/w/api.php"
content_params = {
"action": "query",
"format": "json",
"titles": page_title,
"prop": "extracts|info|categories",
"exintro": True,
"explaintext": True,
"inprop": "url",
"cllimit": 50,
}
            response = self.session.get(
                content_url, params=content_params, timeout=30
            )
response.raise_for_status()
data = response.json()
pages = data.get("query", {}).get("pages", {})
            if pages:
                page_data = next(iter(pages.values()))
                # MediaWiki marks a nonexistent title with a "missing" key
                if "missing" in page_data:
                    return None
return {
"title": page_data.get("title", ""),
"extract": page_data.get("extract", ""),
"url": page_data.get("fullurl", ""),
"categories": [
cat["title"]
for cat in page_data.get("categories", [])
],
"pageid": page_data.get("pageid", ""),
"length": page_data.get("length", 0),
}
except Exception as e:
if self.verbose:
logger.warning(
f"Error getting Wikipedia content for {page_title}: {e}"
)
return None
def parse_wikipedia_content(
self, content: str, mep_name: str
) -> Dict[str, str]:
"""
Parse Wikipedia content to extract structured personality information.
Args:
content: Raw Wikipedia content
mep_name: Name of the MEP
Returns:
Dictionary of parsed personality information
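
        Example (a sketch; a one-paragraph extract, so the whole text becomes
        the summary and the "born ..." sentence fills early_life):

            data = scraper.parse_wikipedia_content(
                "Jane Doe is an MEP. She was born in 1970 in Berlin.",
                "Jane Doe",
            )
            # data["summary"]    -> "Jane Doe is an MEP. She was born ..."
            # data["early_life"] -> "born in 1970 in Berlin."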
"""
personality_data = {
"summary": "",
"early_life": "",
"political_career": "",
"political_views": "",
"policy_focus": "",
"achievements": "",
"controversies": "",
"personal_life": "",
"education": "",
"professional_background": "",
"party_affiliations": "",
"committee_experience": "",
"voting_record": "",
"public_statements": "",
"interests": "",
"languages": "",
"awards": "",
"publications": "",
"social_media": "",
}
# Extract summary (first paragraph)
paragraphs = content.split("\n\n")
if paragraphs:
personality_data["summary"] = paragraphs[0][
:1000
] # Limit summary length
        # Look for specific sections; matching is case-insensitive via
        # re.IGNORECASE, so the extracted text keeps its original casing
# Early life and education
early_life_patterns = [
r"early life[^.]*\.",
r"born[^.]*\.",
r"childhood[^.]*\.",
r"grew up[^.]*\.",
r"education[^.]*\.",
]
for pattern in early_life_patterns:
            matches = re.findall(
                pattern, content, re.IGNORECASE
            )
if matches:
personality_data["early_life"] = " ".join(
matches[:3]
) # Take first 3 matches
break
# Political career
political_patterns = [
r"political career[^.]*\.",
r"elected[^.]*\.",
r"parliament[^.]*\.",
r"minister[^.]*\.",
r"party[^.]*\.",
]
for pattern in political_patterns:
            matches = re.findall(
                pattern, content, re.IGNORECASE
            )
if matches:
personality_data["political_career"] = " ".join(
matches[:5]
) # Take first 5 matches
break
# Political views
views_patterns = [
r"political views[^.]*\.",
r"positions[^.]*\.",
r"advocates[^.]*\.",
r"supports[^.]*\.",
r"opposes[^.]*\.",
]
for pattern in views_patterns:
            matches = re.findall(
                pattern, content, re.IGNORECASE
            )
if matches:
personality_data["political_views"] = " ".join(
matches[:3]
)
break
# Policy focus
policy_patterns = [
r"policy[^.]*\.",
r"focus[^.]*\.",
r"issues[^.]*\.",
r"legislation[^.]*\.",
]
for pattern in policy_patterns:
            matches = re.findall(
                pattern, content, re.IGNORECASE
            )
if matches:
personality_data["policy_focus"] = " ".join(
matches[:3]
)
break
# Achievements
achievement_patterns = [
r"achievements[^.]*\.",
r"accomplishments[^.]*\.",
r"success[^.]*\.",
r"won[^.]*\.",
r"received[^.]*\.",
]
for pattern in achievement_patterns:
            matches = re.findall(
                pattern, content, re.IGNORECASE
            )
if matches:
personality_data["achievements"] = " ".join(
matches[:3]
)
break
return personality_data
def create_personality_profile(
self, mep_data: Dict[str, str]
) -> MEPPersonalityProfile:
"""
Create a personality profile for an MEP.
Args:
mep_data: MEP data from XML file
Returns:
MEPPersonalityProfile object
"""
mep_name = mep_data["full_name"]
country = mep_data["country"]
# Search for Wikipedia page
page_title = self.search_wikipedia_page(mep_name, country)
if page_title:
# Get Wikipedia content
wiki_content = self.get_wikipedia_content(page_title)
if wiki_content:
# Parse content
personality_data = self.parse_wikipedia_content(
wiki_content["extract"], mep_name
)
# Create profile
profile = MEPPersonalityProfile(
full_name=mep_name,
mep_id=mep_data["mep_id"],
wikipedia_url=wiki_content["url"],
summary=personality_data["summary"],
early_life=personality_data["early_life"],
political_career=personality_data[
"political_career"
],
political_views=personality_data[
"political_views"
],
policy_focus=personality_data["policy_focus"],
achievements=personality_data["achievements"],
controversies=personality_data["controversies"],
personal_life=personality_data["personal_life"],
education=personality_data["education"],
professional_background=personality_data[
"professional_background"
],
party_affiliations=personality_data[
"party_affiliations"
],
committee_experience=personality_data[
"committee_experience"
],
voting_record=personality_data["voting_record"],
public_statements=personality_data[
"public_statements"
],
interests=personality_data["interests"],
languages=personality_data["languages"],
awards=personality_data["awards"],
publications=personality_data["publications"],
social_media=personality_data["social_media"],
last_updated=time.strftime("%Y-%m-%d %H:%M:%S"),
)
if self.verbose:
logger.info(
f"Created personality profile for {mep_name} from Wikipedia"
)
return profile
# Create minimal profile if no Wikipedia data found
profile = MEPPersonalityProfile(
full_name=mep_name,
mep_id=mep_data["mep_id"],
summary=f"{mep_name} is a Member of the European Parliament representing {country}.",
political_career=f"Currently serving as MEP for {country}.",
political_views=f"Member of {mep_data['political_group']} and {mep_data['national_party']}.",
last_updated=time.strftime("%Y-%m-%d %H:%M:%S"),
)
if self.verbose:
logger.warning(
f"No Wikipedia data found for {mep_name}, created minimal profile"
)
return profile
def save_personality_profile(
self, profile: MEPPersonalityProfile
) -> str:
"""
Save personality profile to JSON file.
Args:
profile: MEPPersonalityProfile object
Returns:
Path to saved file
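
        For example, a profile with full_name "Jane Doe" and mep_id "12345"
        is written to <output_dir>/Jane_Doe_12345.json: punctuation is
        stripped and runs of whitespace or hyphens become underscores.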
"""
# Create safe filename
safe_name = re.sub(r"[^\w\s-]", "", profile.full_name).strip()
safe_name = re.sub(r"[-\s]+", "_", safe_name)
filename = f"{safe_name}_{profile.mep_id}.json"
filepath = os.path.join(self.output_dir, filename)
# Convert to dictionary and save
profile_dict = asdict(profile)
with open(filepath, "w", encoding="utf-8") as f:
json.dump(profile_dict, f, indent=2, ensure_ascii=False)
if self.verbose:
logger.info(f"Saved personality profile: {filepath}")
return filepath
def scrape_all_mep_personalities(
self, xml_file: str = "EU.xml", delay: float = 1.0
) -> Dict[str, str]:
"""
Scrape personality data for all MEPs.
Args:
xml_file: Path to EU.xml file
            delay: Delay in seconds between MEPs, to stay respectful to Wikipedia's servers
Returns:
Dictionary mapping MEP names to their personality profile file paths
"""
meps = self.extract_mep_data_from_xml(xml_file)
profile_files = {}
if self.verbose:
logger.info(
f"Starting personality scraping for {len(meps)} MEPs"
)
for i, mep_data in enumerate(meps, 1):
mep_name = mep_data["full_name"]
if self.verbose:
logger.info(f"Processing {i}/{len(meps)}: {mep_name}")
try:
# Create personality profile
profile = self.create_personality_profile(mep_data)
# Save profile
filepath = self.save_personality_profile(profile)
profile_files[mep_name] = filepath
# Respectful delay
time.sleep(delay)
except Exception as e:
logger.error(f"Error processing {mep_name}: {e}")
continue
if self.verbose:
logger.info(
f"Completed personality scraping. {len(profile_files)} profiles created."
)
return profile_files
def load_personality_profile(
self, filepath: str
) -> MEPPersonalityProfile:
"""
Load personality profile from JSON file.
Args:
filepath: Path to personality profile JSON file
Returns:
MEPPersonalityProfile object
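
        Example (a sketch; the path is a placeholder):

            profile = scraper.load_personality_profile(
                "mep_personalities/Jane_Doe_12345.json"
            )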
"""
with open(filepath, "r", encoding="utf-8") as f:
data = json.load(f)
return MEPPersonalityProfile(**data)
def get_personality_summary(
self, profile: MEPPersonalityProfile
) -> str:
"""
Generate a personality summary for use in AI agent system prompts.
Args:
profile: MEPPersonalityProfile object
Returns:
Formatted personality summary
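
        The output is one labeled line per populated field, e.g. (abridged):

            Background: Jane Doe is a Member of the European Parliament ...
            Political Career: ...
            Policy Focus: ...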
"""
summary_parts = []
if profile.summary:
summary_parts.append(f"Background: {profile.summary}")
if profile.political_career:
summary_parts.append(
f"Political Career: {profile.political_career}"
)
if profile.political_views:
summary_parts.append(
f"Political Views: {profile.political_views}"
)
if profile.policy_focus:
summary_parts.append(
f"Policy Focus: {profile.policy_focus}"
)
if profile.achievements:
summary_parts.append(
f"Notable Achievements: {profile.achievements}"
)
if profile.education:
summary_parts.append(f"Education: {profile.education}")
if profile.professional_background:
summary_parts.append(
f"Professional Background: {profile.professional_background}"
)
return "\n".join(summary_parts)
def main():
"""Main function to run the Wikipedia personality scraper."""
print("🏛️ WIKIPEDIA PERSONALITY SCRAPER FOR EUROSWARM PARLIAMENT")
print("=" * 70)
# Initialize scraper
scraper = WikipediaPersonalityScraper(
output_dir="mep_personalities", verbose=True
)
# Scrape all MEP personalities
profile_files = scraper.scrape_all_mep_personalities(delay=1.0)
print("\n✅ Scraping completed!")
print(f"📁 Profiles saved to: {scraper.output_dir}")
print(f"📊 Total profiles created: {len(profile_files)}")
# Show sample profile
if profile_files:
sample_name = list(profile_files.keys())[0]
sample_file = profile_files[sample_name]
sample_profile = scraper.load_personality_profile(sample_file)
print(f"\n📋 Sample Profile: {sample_name}")
print("-" * 50)
print(scraper.get_personality_summary(sample_profile))
if __name__ == "__main__":
main()