You can not select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
					
					
						
							682 lines
						
					
					
						
							22 KiB
						
					
					
				
			
		
		
	
	
							682 lines
						
					
					
						
							22 KiB
						
					
					
				| #!/usr/bin/env python3
 | |
| """
 | |
| Wikipedia Personality Scraper for EuroSwarm Parliament MEPs
 | |
| 
 | |
| This module scrapes Wikipedia data for each MEP to create realistic, personality-driven
 | |
| AI agents based on their real backgrounds, political history, and personal beliefs.
 | |
| """
 | |
| 
 | |
| import json
 | |
| import os
 | |
| import time
 | |
| import re
 | |
| from typing import Dict, List, Optional, Any
 | |
| from dataclasses import dataclass, asdict
 | |
| import requests
 | |
| from loguru import logger
 | |
| 
 | |
| 
 | |
@dataclass
class MEPPersonalityProfile:
    """
    Personality record for a single MEP, populated from Wikipedia data.

    Only ``full_name`` and ``mep_id`` are required. ``wikipedia_url``
    defaults to ``None`` and every other field defaults to an empty string,
    so a profile can be created even when little or nothing is known about
    the MEP. The field order below is part of the positional constructor
    signature.
    """

    # --- Identity -------------------------------------------------------
    full_name: str  # Full name of the MEP
    mep_id: str  # Unique MEP identifier
    wikipedia_url: Optional[str] = None  # URL of the MEP's Wikipedia page

    # --- Biography and politics -----------------------------------------
    summary: str = ""  # Brief summary of the MEP's background
    early_life: str = ""  # Early life and education information
    political_career: str = ""  # Political career and positions held
    political_views: str = ""  # Key political views and positions
    policy_focus: str = ""  # Areas of policy expertise and focus
    achievements: str = ""  # Notable achievements and accomplishments
    controversies: str = ""  # Any controversies or notable incidents
    personal_life: str = ""  # Personal background and family information
    education: str = ""  # Educational background
    professional_background: str = ""  # Professional experience before politics
    party_affiliations: str = ""  # Political party history
    committee_experience: str = ""  # Parliamentary committee experience
    voting_record: str = ""  # Notable voting patterns or positions
    public_statements: str = ""  # Key public statements or quotes

    # --- Miscellaneous ----------------------------------------------------
    interests: str = ""  # Personal and professional interests
    languages: str = ""  # Languages spoken
    awards: str = ""  # Awards and recognitions
    publications: str = ""  # Publications or written works
    social_media: str = ""  # Social media presence
    last_updated: str = ""  # When the profile was last updated
 | |
| 
 | |
| 
 | |
class WikipediaPersonalityScraper:
    """
    Scraper for gathering Wikipedia personality data for MEPs.

    The scraper reads MEP records from an XML file, looks each MEP up through
    the English Wikipedia API, extracts keyword-based snippets from the page
    introduction and stores one JSON profile per MEP in ``output_dir``.
    """

    # MediaWiki API endpoint used for both the search and the content query.
    API_URL = "https://en.wikipedia.org/w/api.php"

    # Default per-request timeout in seconds; without a timeout a stalled
    # connection would block the whole scraping run indefinitely.
    DEFAULT_TIMEOUT = 10.0

    # Format used for the ``last_updated`` field of generated profiles.
    _TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S"

    # One <mep> record with its five child elements in a fixed order.
    _MEP_PATTERN = r"<mep>\s*<fullName>(.*?)</fullName>\s*<country>(.*?)</country>\s*<politicalGroup>(.*?)</politicalGroup>\s*<id>(.*?)</id>\s*<nationalPoliticalGroup>(.*?)</nationalPoliticalGroup>\s*</mep>"

    # Keyword rules for ``parse_wikipedia_content``:
    # (profile field, maximum number of snippets kept, patterns tried in order).
    # Each pattern captures from the keyword up to the next full stop. The
    # leading ``\b`` stops a keyword from matching inside a longer word
    # (e.g. "born" in "stubborn"); "won" also gets a trailing ``\b`` so that
    # it does not match "wonder". Other keywords may still match as prefixes
    # (e.g. "success" in "successful").
    _SECTION_RULES = (
        (
            "early_life",
            3,
            (
                r"\bearly life[^.]*\.",
                r"\bborn[^.]*\.",
                r"\bchildhood[^.]*\.",
                r"\bgrew up[^.]*\.",
                r"\beducation[^.]*\.",
            ),
        ),
        (
            "political_career",
            5,
            (
                r"\bpolitical career[^.]*\.",
                r"\belected[^.]*\.",
                r"\bparliament[^.]*\.",
                r"\bminister[^.]*\.",
                r"\bparty[^.]*\.",
            ),
        ),
        (
            "political_views",
            3,
            (
                r"\bpolitical views[^.]*\.",
                r"\bpositions[^.]*\.",
                r"\badvocates[^.]*\.",
                r"\bsupports[^.]*\.",
                r"\bopposes[^.]*\.",
            ),
        ),
        (
            "policy_focus",
            3,
            (
                r"\bpolicy[^.]*\.",
                r"\bfocus[^.]*\.",
                r"\bissues[^.]*\.",
                r"\blegislation[^.]*\.",
            ),
        ),
        (
            "achievements",
            3,
            (
                r"\bachievements[^.]*\.",
                r"\baccomplishments[^.]*\.",
                r"\bsuccess[^.]*\.",
                r"\bwon\b[^.]*\.",
                r"\breceived[^.]*\.",
            ),
        ),
    )

    # All text fields produced by ``parse_wikipedia_content``; the names match
    # the keyword arguments accepted by ``MEPPersonalityProfile``.
    _PARSED_FIELDS = (
        "summary",
        "early_life",
        "political_career",
        "political_views",
        "policy_focus",
        "achievements",
        "controversies",
        "personal_life",
        "education",
        "professional_background",
        "party_affiliations",
        "committee_experience",
        "voting_record",
        "public_statements",
        "interests",
        "languages",
        "awards",
        "publications",
        "social_media",
    )

    def __init__(
        self,
        output_dir: str = "mep_personalities",
        verbose: bool = True,
        timeout: float = DEFAULT_TIMEOUT,
    ):
        """
        Initialize the Wikipedia personality scraper.

        Creates ``output_dir`` if it does not exist and prepares an HTTP
        session with a descriptive User-Agent header.

        Args:
            output_dir: Directory to store personality profiles
            verbose: Enable verbose logging
            timeout: Timeout in seconds applied to every Wikipedia request
        """
        self.output_dir = output_dir
        self.verbose = verbose
        self.timeout = timeout
        self.session = requests.Session()
        self.session.headers.update(
            {
                "User-Agent": "EuroSwarm Parliament Personality Scraper/1.0 (https://github.com/swarms-democracy)"
            }
        )

        # Create output directory
        os.makedirs(output_dir, exist_ok=True)

        if verbose:
            logger.info(
                f"Wikipedia Personality Scraper initialized. Output directory: {output_dir}"
            )

    def extract_mep_data_from_xml(
        self, xml_file: str = "EU.xml"
    ) -> List[Dict[str, str]]:
        """
        Extract MEP data from EU.xml file.

        The file is scanned with a regular expression rather than an XML
        parser; only ``<mep>`` records whose child elements appear in the
        order fullName, country, politicalGroup, id, nationalPoliticalGroup
        are recognised. Surrounding whitespace of every value is stripped.

        Args:
            xml_file: Path to EU.xml file

        Returns:
            List of MEP data dictionaries with the keys ``full_name``,
            ``country``, ``political_group``, ``mep_id`` and
            ``national_party``. An empty list is returned (and the error is
            logged) when the file cannot be read or decoded.
        """
        try:
            with open(xml_file, "r", encoding="utf-8") as f:
                content = f.read()
        except (OSError, ValueError) as e:
            # ValueError covers UnicodeDecodeError for files that are not UTF-8.
            logger.error(
                f"Error extracting MEP data from {xml_file}: {e}"
            )
            return []

        meps = [
            {
                "full_name": full_name.strip(),
                "country": country.strip(),
                "political_group": political_group.strip(),
                "mep_id": mep_id.strip(),
                "national_party": national_party.strip(),
            }
            for (
                full_name,
                country,
                political_group,
                mep_id,
                national_party,
            ) in re.findall(self._MEP_PATTERN, content, re.DOTALL)
        ]

        if self.verbose:
            logger.info(
                f"Extracted {len(meps)} MEPs from {xml_file}"
            )

        return meps

    def search_wikipedia_page(
        self, mep_name: str, country: str
    ) -> Optional[str]:
        """
        Search for a Wikipedia page for an MEP.

        Two queries are tried in order: the exact (quoted) name plus the
        country, then the unquoted name plus the country. The title of the
        first hit of the first query that yields results is returned.

        Args:
            mep_name: Full name of the MEP
            country: Country of the MEP

        Returns:
            Wikipedia page title if found, None otherwise (also when the
            name is blank or a request fails)
        """
        if not mep_name or not mep_name.strip():
            # A blank name would only produce an arbitrary country match.
            return None

        queries = (
            f'"{mep_name}" {country}',
            f"{mep_name} {country}",
        )

        try:
            for query in queries:
                response = self.session.get(
                    self.API_URL,
                    params={
                        "action": "query",
                        "format": "json",
                        "list": "search",
                        "srsearch": query,
                        "srlimit": 5,
                        "srnamespace": 0,
                    },
                    timeout=self.timeout,
                )
                response.raise_for_status()

                hits = response.json().get("query", {}).get("search", [])
                if hits:
                    # Return the first result
                    return hits[0]["title"]

        except Exception as e:
            # Best-effort lookup: any failure means "no page found".
            if self.verbose:
                logger.warning(
                    f"Error searching Wikipedia for {mep_name}: {e}"
                )

        return None

    def get_wikipedia_content(
        self, page_title: str
    ) -> Optional[Dict[str, Any]]:
        """
        Get Wikipedia content for a specific page.

        Only the plain-text introduction of the page is requested
        (``exintro`` / ``explaintext``), together with the page URL and up
        to 50 categories.

        Args:
            page_title: Wikipedia page title

        Returns:
            Dictionary with the keys ``title``, ``extract``, ``url``,
            ``categories``, ``pageid`` and ``length``, or None when the
            request fails, no page is returned, or the API marks the page
            as missing or invalid.
        """
        try:
            response = self.session.get(
                self.API_URL,
                params={
                    "action": "query",
                    "format": "json",
                    "titles": page_title,
                    "prop": "extracts|info|categories",
                    "exintro": True,
                    "explaintext": True,
                    "inprop": "url",
                    "cllimit": 50,
                },
                timeout=self.timeout,
            )
            response.raise_for_status()

            pages = response.json().get("query", {}).get("pages", {})
            if not pages:
                return None

            page_data = next(iter(pages.values()))

            # The API still returns an entry for titles that do not exist,
            # flagged with a "missing" (or "invalid") key; treating it as
            # content would yield an empty profile instead of the fallback.
            if "missing" in page_data or "invalid" in page_data:
                return None

            return {
                "title": page_data.get("title", ""),
                "extract": page_data.get("extract", ""),
                "url": page_data.get("fullurl", ""),
                "categories": [
                    cat["title"]
                    for cat in page_data.get("categories", [])
                ],
                "pageid": page_data.get("pageid", ""),
                "length": page_data.get("length", 0),
            }

        except Exception as e:
            # Best-effort lookup: any failure means "no content available".
            if self.verbose:
                logger.warning(
                    f"Error getting Wikipedia content for {page_title}: {e}"
                )

        return None

    def parse_wikipedia_content(
        self, content: str, mep_name: str
    ) -> Dict[str, str]:
        """
        Parse Wikipedia content to extract structured personality information.

        The summary is the first blank-line-separated paragraph, truncated to
        1000 characters. For ``early_life``, ``political_career``,
        ``political_views``, ``policy_focus`` and ``achievements`` a list of
        keyword patterns is tried in order; the first pattern with matches
        supplies the field, joined from its first few matches. Matching is
        case-insensitive but the extracted text keeps its original casing.
        All remaining fields are returned as empty strings.

        Args:
            content: Raw Wikipedia content
            mep_name: Name of the MEP (currently unused; kept so existing
                callers keep working)

        Returns:
            Dictionary of parsed personality information
        """
        content = content or ""
        personality_data = {field: "" for field in self._PARSED_FIELDS}

        # Extract summary (first paragraph), limited in length
        personality_data["summary"] = content.split("\n\n")[0][:1000]

        for field, max_snippets, patterns in self._SECTION_RULES:
            for pattern in patterns:
                matches = re.findall(pattern, content, re.IGNORECASE)
                if matches:
                    personality_data[field] = " ".join(
                        matches[:max_snippets]
                    )
                    break

        return personality_data

    def create_personality_profile(
        self, mep_data: Dict[str, str]
    ) -> "MEPPersonalityProfile":
        """
        Create a personality profile for an MEP.

        The MEP is searched on Wikipedia; when a page with content is found
        the profile is built from the parsed introduction. Otherwise a
        minimal profile is generated from the XML data alone.

        Args:
            mep_data: MEP data from XML file (``full_name``, ``country``,
                ``mep_id``, ``political_group``, ``national_party``)

        Returns:
            MEPPersonalityProfile object
        """
        mep_name = mep_data["full_name"]
        country = mep_data["country"]

        # Search for Wikipedia page, then fetch its content
        page_title = self.search_wikipedia_page(mep_name, country)
        wiki_content = (
            self.get_wikipedia_content(page_title) if page_title else None
        )

        if wiki_content:
            # The parsed keys match the profile's field names one to one.
            personality_data = self.parse_wikipedia_content(
                wiki_content["extract"], mep_name
            )

            profile = MEPPersonalityProfile(
                full_name=mep_name,
                mep_id=mep_data["mep_id"],
                wikipedia_url=wiki_content["url"],
                last_updated=time.strftime(self._TIMESTAMP_FORMAT),
                **personality_data,
            )

            if self.verbose:
                logger.info(
                    f"Created personality profile for {mep_name} from Wikipedia"
                )

            return profile

        # Create minimal profile if no Wikipedia data found
        profile = MEPPersonalityProfile(
            full_name=mep_name,
            mep_id=mep_data["mep_id"],
            summary=f"{mep_name} is a Member of the European Parliament representing {country}.",
            political_career=f"Currently serving as MEP for {country}.",
            political_views=f"Member of {mep_data['political_group']} and {mep_data['national_party']}.",
            last_updated=time.strftime(self._TIMESTAMP_FORMAT),
        )

        if self.verbose:
            logger.warning(
                f"No Wikipedia data found for {mep_name}, created minimal profile"
            )

        return profile

    def save_personality_profile(
        self, profile: "MEPPersonalityProfile"
    ) -> str:
        """
        Save personality profile to JSON file.

        The file name is derived from the MEP's name (characters other than
        word characters, whitespace and dashes removed; runs of whitespace
        and dashes replaced by ``_``) followed by the MEP id.

        Args:
            profile: MEPPersonalityProfile object

        Returns:
            Path to saved file
        """
        # Create safe filename
        safe_name = re.sub(r"[^\w\s-]", "", profile.full_name).strip()
        safe_name = re.sub(r"[-\s]+", "_", safe_name)
        filepath = os.path.join(
            self.output_dir, f"{safe_name}_{profile.mep_id}.json"
        )

        # The directory is created on init, but recreate it if it was
        # removed in the meantime so a long run does not fail on save.
        os.makedirs(self.output_dir, exist_ok=True)

        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(asdict(profile), f, indent=2, ensure_ascii=False)

        if self.verbose:
            logger.info(f"Saved personality profile: {filepath}")

        return filepath

    def scrape_all_mep_personalities(
        self, xml_file: str = "EU.xml", delay: float = 1.0
    ) -> Dict[str, str]:
        """
        Scrape personality data for all MEPs.

        A failure for one MEP is logged and does not stop the run. The delay
        is applied between consecutive MEPs whether or not the previous one
        succeeded, and is skipped after the last one.

        Args:
            xml_file: Path to EU.xml file
            delay: Delay between requests to be respectful to Wikipedia

        Returns:
            Dictionary mapping MEP names to their personality profile file paths
        """
        meps = self.extract_mep_data_from_xml(xml_file)
        total = len(meps)
        profile_files = {}

        if self.verbose:
            logger.info(
                f"Starting personality scraping for {total} MEPs"
            )

        for i, mep_data in enumerate(meps, 1):
            mep_name = mep_data["full_name"]

            if self.verbose:
                logger.info(f"Processing {i}/{total}: {mep_name}")

            try:
                profile = self.create_personality_profile(mep_data)
                profile_files[mep_name] = self.save_personality_profile(
                    profile
                )
            except Exception as e:
                logger.error(f"Error processing {mep_name}: {e}")

            # Respectful delay; kept outside the try block so that a run of
            # failures does not turn into back-to-back requests.
            if delay > 0 and i < total:
                time.sleep(delay)

        if self.verbose:
            logger.info(
                f"Completed personality scraping. {len(profile_files)} profiles created."
            )

        return profile_files

    def load_personality_profile(
        self, filepath: str
    ) -> "MEPPersonalityProfile":
        """
        Load personality profile from JSON file.

        The JSON object's keys are passed as keyword arguments to
        ``MEPPersonalityProfile``, so they must match its field names.

        Args:
            filepath: Path to personality profile JSON file

        Returns:
            MEPPersonalityProfile object
        """
        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)

        return MEPPersonalityProfile(**data)

    def get_personality_summary(
        self, profile: "MEPPersonalityProfile"
    ) -> str:
        """
        Generate a personality summary for use in AI agent system prompts.

        Each non-empty field contributes one ``Label: text`` line; empty
        fields are omitted.

        Args:
            profile: MEPPersonalityProfile object

        Returns:
            Formatted personality summary (lines joined with newlines)
        """
        labelled_fields = (
            ("Background", profile.summary),
            ("Political Career", profile.political_career),
            ("Political Views", profile.political_views),
            ("Policy Focus", profile.policy_focus),
            ("Notable Achievements", profile.achievements),
            ("Education", profile.education),
            ("Professional Background", profile.professional_background),
        )

        return "\n".join(
            f"{label}: {text}" for label, text in labelled_fields if text
        )
 | |
| 
 | |
| 
 | |
def main():
    """
    Command-line entry point for the Wikipedia personality scraper.

    Scrapes a profile for every MEP into ``mep_personalities`` (using the
    scraper's default XML file and a one-second delay), prints a short
    report and, if at least one profile was written, prints the summary of
    the first one as a sample.
    """
    banner_rule = "=" * 70
    sample_rule = "-" * 50

    print("🏛️  WIKIPEDIA PERSONALITY SCRAPER FOR EUROSWARM PARLIAMENT")
    print(banner_rule)

    # Set up the scraper and process every MEP.
    scraper = WikipediaPersonalityScraper(
        output_dir="mep_personalities", verbose=True
    )
    profile_files = scraper.scrape_all_mep_personalities(delay=1.0)

    print("\n✅ Scraping completed!")
    print(f"📁 Profiles saved to: {scraper.output_dir}")
    print(f"📊 Total profiles created: {len(profile_files)}")

    # Nothing to showcase when no profile could be created.
    if not profile_files:
        return

    # Reload the first saved profile from disk and display its summary.
    sample_name, sample_file = next(iter(profile_files.items()))
    sample_profile = scraper.load_personality_profile(sample_file)

    print(f"\n📋 Sample Profile: {sample_name}")
    print(sample_rule)
    print(scraper.get_personality_summary(sample_profile))


if __name__ == "__main__":
    main()
 |