#!/usr/bin/env python3
"""
Download Wikipedia articles for LightRAG ingestion testing.

This script fetches plain text from Wikipedia articles across diverse domains
to create a test dataset with intentional entity overlap for testing:
- Entity merging and summarization
- Cross-domain relationships
- Parallel processing optimizations

Usage:
    python lightrag/evaluation/download_wikipedia.py
    python lightrag/evaluation/download_wikipedia.py --output wiki_docs/
    python lightrag/evaluation/download_wikipedia.py --domains medical,climate
"""

import argparse
import asyncio
from pathlib import Path

import httpx

# Wikipedia API endpoint (no auth required)
WIKI_API = "https://en.wikipedia.org/w/api.php"

# User-Agent required by Wikipedia API policy
# See: https://meta.wikimedia.org/wiki/User-Agent_policy
USER_AGENT = "LightRAG-Test-Downloader/1.0 (https://github.com/HKUDS/LightRAG; claude@example.com)"

# Article selection by domain - chosen for entity overlap
# WHO → Medical + Climate
# Carbon/Emissions → Climate + Finance (ESG)
# Germany/Brazil → Sports + general knowledge
ARTICLES = {
    "medical": ["Diabetes", "COVID-19"],
    "finance": ["Stock_market", "Cryptocurrency"],
    "climate": ["Climate_change", "Renewable_energy"],
    "sports": ["FIFA_World_Cup", "Olympic_Games"],
}


async def fetch_article(title: str, client: httpx.AsyncClient) -> dict | None:
    """Fetch Wikipedia article text via API.

    Args:
        title: Wikipedia article title (use underscores for spaces)
        client: Async HTTP client

    Returns:
        Dict with title, content, and source; or None if not found
    """
    params = {
        "action": "query",
        "titles": title,
        "prop": "extracts",
        "explaintext": True,  # Plain text, no HTML
        "format": "json",
    }

    response = await client.get(WIKI_API, params=params)

    # Check for HTTP errors
    if response.status_code != 200:
        print(f" HTTP {response.status_code} for {title}")
        return None

    # Handle empty response
    if not response.content:
        print(f" Empty response for {title}")
        return None

    try:
        data = response.json()
    except Exception as e:
        print(f" JSON parse error for {title}: {e}")
        return None

    pages = data.get("query", {}).get("pages", {})
    for page_id, page in pages.items():
        if page_id != "-1":  # -1 = not found
            return {
                "title": page.get("title", title),
                "content": page.get("extract", ""),
                "source": f"wikipedia_{title}",
            }
    return None


async def download_articles(
    domains: list[str],
    output_dir: Path,
) -> list[dict]:
    """Download all articles for selected domains.

    Args:
        domains: List of domain names (e.g., ["medical", "climate"])
        output_dir: Directory to save downloaded articles

    Returns:
        List of article metadata dicts
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    articles = []
    headers = {"User-Agent": USER_AGENT}

    async with httpx.AsyncClient(timeout=30.0, headers=headers) as client:
        for domain in domains:
            titles = ARTICLES.get(domain, [])
            if not titles:
                print(f"[{domain.upper()}] Unknown domain, skipping")
                continue

            print(f"[{domain.upper()}] Downloading {len(titles)} articles...")
            for title in titles:
                article = await fetch_article(title, client)
                if article:
                    # Save to file (explicit UTF-8 so non-ASCII article text
                    # is written correctly regardless of platform locale)
                    filename = f"{domain}_{title.lower().replace(' ', '_')}.txt"
                    filepath = output_dir / filename
                    filepath.write_text(article["content"], encoding="utf-8")

                    word_count = len(article["content"].split())
                    print(f" ✓ {title}: {word_count:,} words")

                    articles.append(
                        {
                            "domain": domain,
                            "title": article["title"],
                            "file": str(filepath),
                            "words": word_count,
                            "source": article["source"],
                        }
                    )
                else:
                    print(f" ✗ {title}: Not found")

    return articles


async def main():
    parser = argparse.ArgumentParser(description="Download Wikipedia test articles")
    parser.add_argument(
        "--output",
        "-o",
        type=str,
        default="lightrag/evaluation/wiki_documents",
        help="Output directory for downloaded articles",
    )
    parser.add_argument(
        "--domains",
        "-d",
        type=str,
        default="medical,finance,climate,sports",
        help="Comma-separated domains to download",
    )
    args = parser.parse_args()

    domains = [d.strip() for d in args.domains.split(",")]
    output_dir = Path(args.output)

    print("=== Wikipedia Article Downloader ===")
    print(f"Domains: {', '.join(domains)}")
    print(f"Output: {output_dir}/")
    print()

    articles = await download_articles(domains, output_dir)

    total_words = sum(a["words"] for a in articles)
    print()
    print(f"✓ Downloaded {len(articles)} articles ({total_words:,} words total)")
    print(f" Output: {output_dir}/")

    # Print summary by domain
    print("\nBy domain:")
    for domain in domains:
        domain_articles = [a for a in articles if a["domain"] == domain]
        domain_words = sum(a["words"] for a in domain_articles)
        print(f" {domain}: {len(domain_articles)} articles, {domain_words:,} words")


if __name__ == "__main__":
    asyncio.run(main())