#!/usr/bin/env python3
"""
Wikipedia Personality Scraper for EuroSwarm Parliament MEPs

This module scrapes Wikipedia data for each MEP to create realistic, personality-driven
AI agents based on their real backgrounds, political history, and personal beliefs.
"""

import json
import os
import re
import time
from dataclasses import dataclass, asdict
from typing import Any, Dict, List, Optional

import requests
from loguru import logger


@dataclass
class MEPPersonalityProfile:
    """
    Comprehensive personality profile for an MEP based on Wikipedia data.

    Attributes:
        full_name: Full name of the MEP
        mep_id: Unique MEP identifier
        wikipedia_url: URL of the MEP's Wikipedia page
        summary: Brief summary of the MEP's background
        early_life: Early life and education information
        political_career: Political career and positions held
        political_views: Key political views and positions
        policy_focus: Areas of policy expertise and focus
        achievements: Notable achievements and accomplishments
        controversies: Any controversies or notable incidents
        personal_life: Personal background and family information
        education: Educational background
        professional_background: Professional experience before politics
        party_affiliations: Political party history
        committee_experience: Parliamentary committee experience
        voting_record: Notable voting patterns or positions
        public_statements: Key public statements or quotes
        interests: Personal and professional interests
        languages: Languages spoken
        awards: Awards and recognitions
        publications: Publications or written works
        social_media: Social media presence
        last_updated: When the profile was last updated
    """

    full_name: str
    mep_id: str
    wikipedia_url: Optional[str] = None
    summary: str = ""
    early_life: str = ""
    political_career: str = ""
    political_views: str = ""
    policy_focus: str = ""
    achievements: str = ""
    controversies: str = ""
    personal_life: str = ""
    education: str = ""
    professional_background: str = ""
    party_affiliations: str = ""
    committee_experience: str = ""
    voting_record: str = ""
    public_statements: str = ""
    interests: str = ""
    languages: str = ""
    awards: str = ""
    publications: str = ""
    social_media: str = ""
    last_updated: str = ""


class WikipediaPersonalityScraper:
    """
    Scraper for gathering Wikipedia personality data for MEPs.
    """

    def __init__(self, output_dir: str = "mep_personalities", verbose: bool = True):
        """
        Initialize the Wikipedia personality scraper.

        Args:
            output_dir: Directory to store personality profiles
            verbose: Enable verbose logging
        """
        self.output_dir = output_dir
        self.verbose = verbose
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'EuroSwarm Parliament Personality Scraper/1.0 (https://github.com/swarms-democracy)'
        })

        # Create the output directory if it does not already exist
        os.makedirs(output_dir, exist_ok=True)

        if verbose:
            logger.info(f"Wikipedia Personality Scraper initialized. Output directory: {output_dir}")

    def extract_mep_data_from_xml(self, xml_file: str = "EU.xml") -> List[Dict[str, str]]:
        """
        Extract MEP data from the EU.xml file.

        Args:
            xml_file: Path to EU.xml file

        Returns:
            List of MEP data dictionaries
        """
        meps = []

        try:
            with open(xml_file, 'r', encoding='utf-8') as f:
                content = f.read()

            # Use a regex to extract MEP records; this assumes the fixed
            # child-element order shown in the pattern
            mep_pattern = r'<mep>\s*<fullName>(.*?)</fullName>\s*<country>(.*?)</country>\s*<politicalGroup>(.*?)</politicalGroup>\s*<id>(.*?)</id>\s*<nationalPoliticalGroup>(.*?)</nationalPoliticalGroup>\s*</mep>'
            mep_matches = re.findall(mep_pattern, content, re.DOTALL)

            for full_name, country, political_group, mep_id, national_party in mep_matches:
                meps.append({
                    'full_name': full_name.strip(),
                    'country': country.strip(),
                    'political_group': political_group.strip(),
                    'mep_id': mep_id.strip(),
                    'national_party': national_party.strip()
                })

            if self.verbose:
                logger.info(f"Extracted {len(meps)} MEPs from {xml_file}")

        except Exception as e:
            logger.error(f"Error extracting MEP data from {xml_file}: {e}")

        return meps
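
    # For reference, the regex above expects each record in EU.xml to follow
    # this element order (inferred from the pattern itself; the real file may
    # wrap these records in additional markup):
    #
    #   <mep>
    #     <fullName>...</fullName>
    #     <country>...</country>
    #     <politicalGroup>...</politicalGroup>
    #     <id>...</id>
    #     <nationalPoliticalGroup>...</nationalPoliticalGroup>
    #   </mep>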

    def search_wikipedia_page(self, mep_name: str, country: str) -> Optional[str]:
        """
        Search for a Wikipedia page for an MEP.

        Args:
            mep_name: Full name of the MEP
            country: Country of the MEP

        Returns:
            Wikipedia page title if found, None otherwise
        """
        try:
            # Search for the MEP on Wikipedia, first with the name quoted for
            # an exact-phrase match
            search_url = "https://en.wikipedia.org/w/api.php"
            search_params = {
                'action': 'query',
                'format': 'json',
                'list': 'search',
                'srsearch': f'"{mep_name}" {country}',
                'srlimit': 5,
                'srnamespace': 0
            }

            response = self.session.get(search_url, params=search_params, timeout=30)
            response.raise_for_status()

            data = response.json()
            search_results = data.get('query', {}).get('search', [])

            if search_results:
                # Return the first (most relevant) result
                return search_results[0]['title']

            # Fall back to an unquoted search
            search_params['srsearch'] = f'{mep_name} {country}'
            response = self.session.get(search_url, params=search_params, timeout=30)
            response.raise_for_status()

            data = response.json()
            search_results = data.get('query', {}).get('search', [])

            if search_results:
                return search_results[0]['title']

        except Exception as e:
            if self.verbose:
                logger.warning(f"Error searching Wikipedia for {mep_name}: {e}")

        return None
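
    # The MediaWiki search endpoint responds with JSON shaped roughly like the
    # following (abridged; values are illustrative only), which is why the
    # method simply returns search_results[0]['title']:
    #
    #   {"query": {"search": [{"title": "Some MEP", "pageid": 12345, ...}]}}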

    def get_wikipedia_content(self, page_title: str) -> Optional[Dict[str, Any]]:
        """
        Get Wikipedia content for a specific page.

        Args:
            page_title: Wikipedia page title

        Returns:
            Dictionary containing page content and metadata
        """
        try:
            # Fetch the plain-text introduction, canonical URL, and categories
            content_url = "https://en.wikipedia.org/w/api.php"
            content_params = {
                'action': 'query',
                'format': 'json',
                'titles': page_title,
                'prop': 'extracts|info|categories',
                'exintro': True,
                'explaintext': True,
                'inprop': 'url',
                'cllimit': 50
            }

            response = self.session.get(content_url, params=content_params, timeout=30)
            response.raise_for_status()

            data = response.json()
            pages = data.get('query', {}).get('pages', {})

            if pages:
                page_id = list(pages.keys())[0]
                page_data = pages[page_id]

                return {
                    'title': page_data.get('title', ''),
                    'extract': page_data.get('extract', ''),
                    'url': page_data.get('fullurl', ''),
                    'categories': [cat['title'] for cat in page_data.get('categories', [])],
                    'pageid': page_data.get('pageid', ''),
                    'length': page_data.get('length', 0)
                }

        except Exception as e:
            if self.verbose:
                logger.warning(f"Error getting Wikipedia content for {page_title}: {e}")

        return None
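
    # A successful lookup returns a dictionary like this (values illustrative;
    # 'extract' holds only the article introduction because the request sets
    # exintro and explaintext):
    #
    #   {
    #       'title': 'Some MEP',
    #       'extract': 'Some MEP (born ...) is a politician who ...',
    #       'url': 'https://en.wikipedia.org/wiki/Some_MEP',
    #       'categories': ['Category:Members of the European Parliament ...'],
    #       'pageid': 12345,
    #       'length': 20000,
    #   }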

    def parse_wikipedia_content(self, content: str, mep_name: str) -> Dict[str, str]:
        """
        Parse Wikipedia content to extract structured personality information.

        Args:
            content: Raw Wikipedia content
            mep_name: Name of the MEP

        Returns:
            Dictionary of parsed personality information
        """
        personality_data = {
            'summary': '',
            'early_life': '',
            'political_career': '',
            'political_views': '',
            'policy_focus': '',
            'achievements': '',
            'controversies': '',
            'personal_life': '',
            'education': '',
            'professional_background': '',
            'party_affiliations': '',
            'committee_experience': '',
            'voting_record': '',
            'public_statements': '',
            'interests': '',
            'languages': '',
            'awards': '',
            'publications': '',
            'social_media': ''
        }

        # Extract summary (first paragraph), capped at 1000 characters
        paragraphs = content.split('\n\n')
        if paragraphs:
            personality_data['summary'] = paragraphs[0][:1000]

        # For each field, try a list of sentence patterns in order and keep the
        # first pattern that matches, joining at most max_matches sentences.
        # Matching is case-insensitive against the original text so the
        # extracted sentences keep their capitalization.
        section_patterns = {
            'early_life': ([
                r'early life[^.]*\.',
                r'born[^.]*\.',
                r'childhood[^.]*\.',
                r'grew up[^.]*\.',
                r'education[^.]*\.'
            ], 3),
            'political_career': ([
                r'political career[^.]*\.',
                r'elected[^.]*\.',
                r'parliament[^.]*\.',
                r'minister[^.]*\.',
                r'party[^.]*\.'
            ], 5),
            'political_views': ([
                r'political views[^.]*\.',
                r'positions[^.]*\.',
                r'advocates[^.]*\.',
                r'supports[^.]*\.',
                r'opposes[^.]*\.'
            ], 3),
            'policy_focus': ([
                r'policy[^.]*\.',
                r'focus[^.]*\.',
                r'issues[^.]*\.',
                r'legislation[^.]*\.'
            ], 3),
            'achievements': ([
                r'achievements[^.]*\.',
                r'accomplishments[^.]*\.',
                r'success[^.]*\.',
                r'won[^.]*\.',
                r'received[^.]*\.'
            ], 3),
        }

        for field, (patterns, max_matches) in section_patterns.items():
            for pattern in patterns:
                matches = re.findall(pattern, content, re.IGNORECASE)
                if matches:
                    personality_data[field] = ' '.join(matches[:max_matches])
                    break

        return personality_data
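
    # Note: the extraction above is deliberately heuristic. Only the article
    # introduction is available here (exintro=True upstream), so many fields
    # (controversies, committee_experience, voting_record, ...) will usually
    # stay empty. As an example, a sentence such as "She was elected to the
    # European Parliament in 2019." would be captured by r'elected[^.]*\.'.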

    def create_personality_profile(self, mep_data: Dict[str, str]) -> MEPPersonalityProfile:
        """
        Create a personality profile for an MEP.

        Args:
            mep_data: MEP data from XML file

        Returns:
            MEPPersonalityProfile object
        """
        mep_name = mep_data['full_name']
        country = mep_data['country']

        # Search for a Wikipedia page
        page_title = self.search_wikipedia_page(mep_name, country)

        if page_title:
            # Get Wikipedia content
            wiki_content = self.get_wikipedia_content(page_title)

            if wiki_content:
                # Parse the extract; its keys match the profile field names,
                # so it can be unpacked directly into the dataclass
                personality_data = self.parse_wikipedia_content(wiki_content['extract'], mep_name)

                profile = MEPPersonalityProfile(
                    full_name=mep_name,
                    mep_id=mep_data['mep_id'],
                    wikipedia_url=wiki_content['url'],
                    last_updated=time.strftime("%Y-%m-%d %H:%M:%S"),
                    **personality_data
                )

                if self.verbose:
                    logger.info(f"Created personality profile for {mep_name} from Wikipedia")

                return profile

        # Fall back to a minimal profile if no Wikipedia data was found
        profile = MEPPersonalityProfile(
            full_name=mep_name,
            mep_id=mep_data['mep_id'],
            summary=f"{mep_name} is a Member of the European Parliament representing {country}.",
            political_career=f"Currently serving as MEP for {country}.",
            political_views=f"Member of {mep_data['political_group']} and {mep_data['national_party']}.",
            last_updated=time.strftime("%Y-%m-%d %H:%M:%S")
        )

        if self.verbose:
            logger.warning(f"No Wikipedia data found for {mep_name}, created minimal profile")

        return profile

    def save_personality_profile(self, profile: MEPPersonalityProfile) -> str:
        """
        Save a personality profile to a JSON file.

        Args:
            profile: MEPPersonalityProfile object

        Returns:
            Path to saved file
        """
        # Create a filesystem-safe filename from the MEP's name and id
        safe_name = re.sub(r'[^\w\s-]', '', profile.full_name).strip()
        safe_name = re.sub(r'[-\s]+', '_', safe_name)
        filename = f"{safe_name}_{profile.mep_id}.json"
        filepath = os.path.join(self.output_dir, filename)

        # Convert to a dictionary and save
        profile_dict = asdict(profile)

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(profile_dict, f, indent=2, ensure_ascii=False)

        if self.verbose:
            logger.info(f"Saved personality profile: {filepath}")

        return filepath
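
    # Example: a profile for a (hypothetical) "Jane Doe" with MEP id "12345"
    # would be written to mep_personalities/Jane_Doe_12345.json and can be
    # reloaded later with load_personality_profile:
    #
    #   profile = scraper.load_personality_profile("mep_personalities/Jane_Doe_12345.json")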

    def scrape_all_mep_personalities(self, xml_file: str = "EU.xml", delay: float = 1.0) -> Dict[str, str]:
        """
        Scrape personality data for all MEPs.

        Args:
            xml_file: Path to EU.xml file
            delay: Delay in seconds between MEPs, to be respectful to Wikipedia

        Returns:
            Dictionary mapping MEP names to their personality profile file paths
        """
        meps = self.extract_mep_data_from_xml(xml_file)
        profile_files = {}

        if self.verbose:
            logger.info(f"Starting personality scraping for {len(meps)} MEPs")

        for i, mep_data in enumerate(meps, 1):
            mep_name = mep_data['full_name']

            if self.verbose:
                logger.info(f"Processing {i}/{len(meps)}: {mep_name}")

            try:
                # Create and save the personality profile
                profile = self.create_personality_profile(mep_data)
                filepath = self.save_personality_profile(profile)
                profile_files[mep_name] = filepath

                # Respectful delay between MEPs
                time.sleep(delay)

            except Exception as e:
                logger.error(f"Error processing {mep_name}: {e}")
                continue

        if self.verbose:
            logger.info(f"Completed personality scraping. {len(profile_files)} profiles created.")

        return profile_files
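
    # A full run issues at least two Wikipedia API calls per MEP plus the
    # per-MEP sleep, so the lower bound on wall-clock time is roughly
    # len(meps) * delay seconds before network latency is counted.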

    def load_personality_profile(self, filepath: str) -> MEPPersonalityProfile:
        """
        Load a personality profile from a JSON file.

        Args:
            filepath: Path to personality profile JSON file

        Returns:
            MEPPersonalityProfile object
        """
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)

        return MEPPersonalityProfile(**data)

    def get_personality_summary(self, profile: MEPPersonalityProfile) -> str:
        """
        Generate a personality summary for use in AI agent system prompts.

        Args:
            profile: MEPPersonalityProfile object

        Returns:
            Formatted personality summary
        """
        # One "Label: text" line per populated field, in a fixed order
        sections = [
            ("Background", profile.summary),
            ("Political Career", profile.political_career),
            ("Political Views", profile.political_views),
            ("Policy Focus", profile.policy_focus),
            ("Notable Achievements", profile.achievements),
            ("Education", profile.education),
            ("Professional Background", profile.professional_background),
        ]
        summary_parts = [f"{label}: {text}" for label, text in sections if text]

        return "\n".join(summary_parts)


def main():
    """Main function to run the Wikipedia personality scraper."""

    print("🏛️ WIKIPEDIA PERSONALITY SCRAPER FOR EUROSWARM PARLIAMENT")
    print("=" * 70)

    # Initialize scraper
    scraper = WikipediaPersonalityScraper(output_dir="mep_personalities", verbose=True)

    # Scrape all MEP personalities
    profile_files = scraper.scrape_all_mep_personalities(delay=1.0)

    print("\n✅ Scraping completed!")
    print(f"📁 Profiles saved to: {scraper.output_dir}")
    print(f"📊 Total profiles created: {len(profile_files)}")

    # Show a sample profile
    if profile_files:
        sample_name = list(profile_files.keys())[0]
        sample_file = profile_files[sample_name]
        sample_profile = scraper.load_personality_profile(sample_file)

        print(f"\n📋 Sample Profile: {sample_name}")
        print("-" * 50)
        print(scraper.get_personality_summary(sample_profile))


if __name__ == "__main__":
    main()