#!/usr/bin/env python3
"""Download Wikipedia articles for LightRAG ingestion testing.

This script fetches plain text from Wikipedia articles across diverse
domains to create a test dataset with intentional entity overlap for testing:
- Entity merging and summarization
- Cross-domain relationships
- Parallel processing optimizations

Usage:
    python lightrag/evaluation/download_wikipedia.py
    python lightrag/evaluation/download_wikipedia.py --output wiki_docs/
    python lightrag/evaluation/download_wikipedia.py --domains medical,climate
"""

import argparse
import asyncio
from pathlib import Path

import httpx

# Wikipedia API endpoint (no auth required)
WIKI_API = 'https://en.wikipedia.org/w/api.php'

# User-Agent required by Wikipedia API policy
# See: https://meta.wikimedia.org/wiki/User-Agent_policy
USER_AGENT = 'LightRAG-Test-Downloader/1.0 (https://github.com/HKUDS/LightRAG; claude@example.com)'

# Article selection by domain - chosen for entity overlap
#   WHO → Medical + Climate
#   Carbon/Emissions → Climate + Finance (ESG)
#   Germany/Brazil → Sports + general knowledge
ARTICLES = {
    'medical': ['Diabetes', 'COVID-19'],
    'finance': ['Stock_market', 'Cryptocurrency'],
    'climate': ['Climate_change', 'Renewable_energy'],
    'sports': ['FIFA_World_Cup', 'Olympic_Games'],
}


async def fetch_article(title: str, client: httpx.AsyncClient) -> dict | None:
    """Fetch Wikipedia article text via API.

    Args:
        title: Wikipedia article title (use underscores for spaces)
        client: Async HTTP client

    Returns:
        Dict with title, content, and source; or None if not found
    """
    params = {
        'action': 'query',
        'titles': title,
        'prop': 'extracts',
        'explaintext': True,  # Plain text, no HTML
        'format': 'json',
    }
    response = await client.get(WIKI_API, params=params)

    # Check for HTTP errors
    if response.status_code != 200:
        print(f'  HTTP {response.status_code} for {title}')
        return None

    # Handle empty response
    if not response.content:
        print(f'  Empty response for {title}')
        return None

    try:
        data = response.json()
    except Exception as e:
        print(f'  JSON parse error for {title}: {e}')
        return None

    pages = data.get('query', {}).get('pages', {})
    for page_id, page in pages.items():
        if page_id != '-1':  # -1 = not found
            return {
                'title': page.get('title', title),
                'content': page.get('extract', ''),
                'source': f'wikipedia_{title}',
            }
    return None
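
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of this script's flow): a minimal smoke test
# of fetch_article() on its own. The '_demo' name is hypothetical; the client
# settings mirror those used by download_articles() below.
#
#   async def _demo() -> None:
#       async with httpx.AsyncClient(
#           timeout=30.0, headers={'User-Agent': USER_AGENT}
#       ) as client:
#           article = await fetch_article('Diabetes', client)
#           if article:
#               print(article['title'], f"{len(article['content']):,} chars")
#
#   asyncio.run(_demo())
# ---------------------------------------------------------------------------
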
async def download_articles(
    domains: list[str],
    output_dir: Path,
) -> list[dict]:
    """Download all articles for selected domains.

    Args:
        domains: List of domain names (e.g., ["medical", "climate"])
        output_dir: Directory to save downloaded articles

    Returns:
        List of article metadata dicts
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    articles = []
    headers = {'User-Agent': USER_AGENT}

    async with httpx.AsyncClient(timeout=30.0, headers=headers) as client:
        for domain in domains:
            titles = ARTICLES.get(domain, [])
            if not titles:
                print(f'[{domain.upper()}] Unknown domain, skipping')
                continue

            print(f'[{domain.upper()}] Downloading {len(titles)} articles...')
            for title in titles:
                article = await fetch_article(title, client)
                if article:
                    # Save to file (explicit UTF-8: Wikipedia extracts contain
                    # non-ASCII text, which breaks on platforms whose default
                    # encoding is narrower)
                    filename = f'{domain}_{title.lower().replace(" ", "_")}.txt'
                    filepath = output_dir / filename
                    filepath.write_text(article['content'], encoding='utf-8')

                    word_count = len(article['content'].split())
                    print(f'  ✓ {title}: {word_count:,} words')

                    articles.append(
                        {
                            'domain': domain,
                            'title': article['title'],
                            'file': str(filepath),
                            'words': word_count,
                            'source': article['source'],
                        }
                    )
                else:
                    print(f'  ✗ {title}: Not found')

    return articles


async def main():
    parser = argparse.ArgumentParser(description='Download Wikipedia test articles')
    parser.add_argument(
        '--output',
        '-o',
        type=str,
        default='lightrag/evaluation/wiki_documents',
        help='Output directory for downloaded articles',
    )
    parser.add_argument(
        '--domains',
        '-d',
        type=str,
        default='medical,finance,climate,sports',
        help='Comma-separated domains to download',
    )
    args = parser.parse_args()

    domains = [d.strip() for d in args.domains.split(',')]
    output_dir = Path(args.output)

    print('=== Wikipedia Article Downloader ===')
    print(f'Domains: {", ".join(domains)}')
    print(f'Output: {output_dir}/')
    print()

    articles = await download_articles(domains, output_dir)

    total_words = sum(a['words'] for a in articles)
    print()
    print(f'✓ Downloaded {len(articles)} articles ({total_words:,} words total)')
    print(f'  Output: {output_dir}/')

    # Print summary by domain
    print('\nBy domain:')
    for domain in domains:
        domain_articles = [a for a in articles if a['domain'] == domain]
        domain_words = sum(a['words'] for a in domain_articles)
        print(f'  {domain}: {len(domain_articles)} articles, {domain_words:,} words')


if __name__ == '__main__':
    asyncio.run(main())
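
# ---------------------------------------------------------------------------
# Illustrative sketch: reading the downloaded files back for ingestion.
# 'load_downloaded_docs' is a hypothetical helper, not defined above; it uses
# only the standard library, and the ingestion call itself is left as a
# comment since LightRAG's API surface can vary between versions.
#
#   def load_downloaded_docs(output_dir: str) -> list[str]:
#       """Return the plain text of every downloaded article."""
#       return [
#           p.read_text(encoding='utf-8')
#           for p in sorted(Path(output_dir).glob('*.txt'))
#       ]
#
#   docs = load_downloaded_docs('lightrag/evaluation/wiki_documents')
#   # Pass each entry to your ingestion pipeline (e.g., LightRAG's insert).
# ---------------------------------------------------------------------------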