LightRAG/lightrag/evaluation/download_wikipedia.py
clssck ef7327bb3e chore(docker-compose, lightrag): optimize test infrastructure and add evaluation tools
Add comprehensive E2E testing infrastructure with PostgreSQL performance tuning,
Gunicorn multi-worker support, and evaluation scripts for RAGAS-based quality
assessment. Introduce 4 new evaluation utilities: compare_results.py for A/B test
analysis, download_wikipedia.py for reproducible test datasets, e2e_test_harness.py
for automated evaluation pipelines, and ingest_test_docs.py for batch document
ingestion. Update docker-compose.test.yml with aggressive async settings, memory
limits, and optimized chunking parameters. Parallelize entity summarization in
operate.py for improved extraction performance. Fix typos in merge node/edge logs.
2025-11-29 10:39:20 +01:00

#!/usr/bin/env python3
"""
Download Wikipedia articles for LightRAG ingestion testing.
This script fetches plain text from Wikipedia articles across diverse domains
to create a test dataset with intentional entity overlap for testing:
- Entity merging and summarization
- Cross-domain relationships
- Parallel processing optimizations
Usage:
python lightrag/evaluation/download_wikipedia.py
python lightrag/evaluation/download_wikipedia.py --output wiki_docs/
python lightrag/evaluation/download_wikipedia.py --domains medical,climate
"""
import argparse
import asyncio
from pathlib import Path

import httpx

# Wikipedia API endpoint (no auth required)
WIKI_API = "https://en.wikipedia.org/w/api.php"

# User-Agent required by Wikipedia API policy
# See: https://meta.wikimedia.org/wiki/User-Agent_policy
USER_AGENT = "LightRAG-Test-Downloader/1.0 (https://github.com/HKUDS/LightRAG; claude@example.com)"

# Article selection by domain - chosen for entity overlap
# WHO → Medical + Climate
# Carbon/Emissions → Climate + Finance (ESG)
# Germany/Brazil → Sports + general knowledge
ARTICLES = {
    "medical": ["Diabetes", "COVID-19"],
    "finance": ["Stock_market", "Cryptocurrency"],
    "climate": ["Climate_change", "Renewable_energy"],
    "sports": ["FIFA_World_Cup", "Olympic_Games"],
}
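
# To add another test domain, extend ARTICLES with Wikipedia titles (use
# underscores for spaces). Illustrative example only; "technology" is not
# one of the default domains:
#
#   ARTICLES["technology"] = ["Artificial_intelligence", "Quantum_computing"]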


async def fetch_article(title: str, client: httpx.AsyncClient) -> dict | None:
    """Fetch Wikipedia article text via API.

    Args:
        title: Wikipedia article title (use underscores for spaces)
        client: Async HTTP client

    Returns:
        Dict with title, content, and source; or None if not found
    """
    params = {
        "action": "query",
        "titles": title,
        "prop": "extracts",
        "explaintext": True,  # Plain text, no HTML
        "format": "json",
    }
    response = await client.get(WIKI_API, params=params)

    # Check for HTTP errors
    if response.status_code != 200:
        print(f" HTTP {response.status_code} for {title}")
        return None

    # Handle empty response
    if not response.content:
        print(f" Empty response for {title}")
        return None

    try:
        data = response.json()
    except Exception as e:
        print(f" JSON parse error for {title}: {e}")
        return None

    pages = data.get("query", {}).get("pages", {})
    for page_id, page in pages.items():
        if page_id != "-1":  # -1 = not found
            return {
                "title": page.get("title", title),
                "content": page.get("extract", ""),
                "source": f"wikipedia_{title}",
            }
    return None
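
# Example (illustrative only, not used by the pipeline): fetching a single
# article with fetch_article outside of download_articles. Assumes network
# access and reuses the module-level USER_AGENT defined above.
#
#   async def _demo() -> None:
#       headers = {"User-Agent": USER_AGENT}
#       async with httpx.AsyncClient(timeout=30.0, headers=headers) as client:
#           article = await fetch_article("Diabetes", client)
#           print(article["title"] if article else "not found")
#
#   asyncio.run(_demo())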


async def download_articles(
    domains: list[str],
    output_dir: Path,
) -> list[dict]:
    """Download all articles for selected domains.

    Args:
        domains: List of domain names (e.g., ["medical", "climate"])
        output_dir: Directory to save downloaded articles

    Returns:
        List of article metadata dicts
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    articles = []
    headers = {"User-Agent": USER_AGENT}

    async with httpx.AsyncClient(timeout=30.0, headers=headers) as client:
        for domain in domains:
            titles = ARTICLES.get(domain, [])
            if not titles:
                print(f"[{domain.upper()}] Unknown domain, skipping")
                continue

            print(f"[{domain.upper()}] Downloading {len(titles)} articles...")
            for title in titles:
                article = await fetch_article(title, client)
                if article:
                    # Save to file; explicit UTF-8 avoids locale-dependent encoding errors
                    filename = f"{domain}_{title.lower().replace(' ', '_')}.txt"
                    filepath = output_dir / filename
                    filepath.write_text(article["content"], encoding="utf-8")

                    word_count = len(article["content"].split())
                    print(f"{title}: {word_count:,} words")

                    articles.append(
                        {
                            "domain": domain,
                            "title": article["title"],
                            "file": str(filepath),
                            "words": word_count,
                            "source": article["source"],
                        }
                    )
                else:
                    print(f"{title}: Not found")

    return articles
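
# Programmatic use from another evaluation script (illustrative sketch; the
# import path assumes lightrag.evaluation is importable as a package and the
# calling script handles its own asyncio/Path imports):
#
#   from lightrag.evaluation.download_wikipedia import download_articles
#   docs = asyncio.run(download_articles(["medical"], Path("wiki_documents")))
#   # -> one .txt file per article plus a list of metadata dicts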


async def main():
    parser = argparse.ArgumentParser(description="Download Wikipedia test articles")
    parser.add_argument(
        "--output",
        "-o",
        type=str,
        default="lightrag/evaluation/wiki_documents",
        help="Output directory for downloaded articles",
    )
    parser.add_argument(
        "--domains",
        "-d",
        type=str,
        default="medical,finance,climate,sports",
        help="Comma-separated domains to download",
    )
    args = parser.parse_args()

    domains = [d.strip() for d in args.domains.split(",")]
    output_dir = Path(args.output)

    print("=== Wikipedia Article Downloader ===")
    print(f"Domains: {', '.join(domains)}")
    print(f"Output: {output_dir}/")
    print()

    articles = await download_articles(domains, output_dir)

    total_words = sum(a["words"] for a in articles)
    print()
    print(f"✓ Downloaded {len(articles)} articles ({total_words:,} words total)")
    print(f" Output: {output_dir}/")

    # Print summary by domain
    print("\nBy domain:")
    for domain in domains:
        domain_articles = [a for a in articles if a["domain"] == domain]
        domain_words = sum(a["words"] for a in domain_articles)
        print(f" {domain}: {len(domain_articles)} articles, {domain_words:,} words")


if __name__ == "__main__":
    asyncio.run(main())