Add comprehensive E2E testing infrastructure with PostgreSQL performance tuning, Gunicorn multi-worker support, and evaluation scripts for RAGAS-based quality assessment. Introduce four new evaluation utilities: compare_results.py for A/B test analysis, download_wikipedia.py for reproducible test datasets, e2e_test_harness.py for automated evaluation pipelines, and ingest_test_docs.py for batch document ingestion. Update docker-compose.test.yml with aggressive async settings, memory limits, and optimized chunking parameters. Parallelize entity summarization in operate.py for improved extraction performance. Fix typos in merge node/edge logs.
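For context on the operate.py change: entity summarization can be parallelized by replacing a sequential per-entity loop with concurrent tasks bounded by a semaphore. The sketch below is illustrative only; summarize_all, summarize_entity, entities, and max_concurrency are hypothetical names, not the actual operate.py code.

import asyncio

async def summarize_all(entities, summarize_entity, max_concurrency: int = 8):
    # Cap concurrent LLM calls so parallelism does not overwhelm the backend.
    sem = asyncio.Semaphore(max_concurrency)

    async def _one(entity):
        async with sem:
            return await summarize_entity(entity)

    # Run all per-entity summarizations concurrently instead of one at a time.
    return await asyncio.gather(*(_one(e) for e in entities))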
lightrag/evaluation/download_wikipedia.py · 183 lines · 5.6 KiB · Python
#!/usr/bin/env python3
"""
Download Wikipedia articles for LightRAG ingestion testing.

This script fetches plain text from Wikipedia articles across diverse domains
to create a test dataset with intentional entity overlap for testing:
- Entity merging and summarization
- Cross-domain relationships
- Parallel processing optimizations

Usage:
    python lightrag/evaluation/download_wikipedia.py
    python lightrag/evaluation/download_wikipedia.py --output wiki_docs/
    python lightrag/evaluation/download_wikipedia.py --domains medical,climate
"""

import argparse
import asyncio
from pathlib import Path

import httpx

# Wikipedia API endpoint (no auth required)
WIKI_API = "https://en.wikipedia.org/w/api.php"

# User-Agent required by Wikipedia API policy
# See: https://meta.wikimedia.org/wiki/User-Agent_policy
USER_AGENT = "LightRAG-Test-Downloader/1.0 (https://github.com/HKUDS/LightRAG; claude@example.com)"

# Article selection by domain - chosen for entity overlap
# WHO → Medical + Climate
# Carbon/Emissions → Climate + Finance (ESG)
# Germany/Brazil → Sports + general knowledge
ARTICLES = {
    "medical": ["Diabetes", "COVID-19"],
    "finance": ["Stock_market", "Cryptocurrency"],
    "climate": ["Climate_change", "Renewable_energy"],
    "sports": ["FIFA_World_Cup", "Olympic_Games"],
}


async def fetch_article(title: str, client: httpx.AsyncClient) -> dict | None:
    """Fetch Wikipedia article text via API.

    Args:
        title: Wikipedia article title (use underscores for spaces)
        client: Async HTTP client

    Returns:
        Dict with title, content, and source; or None if not found
    """
    params = {
        "action": "query",
        "titles": title,
        "prop": "extracts",
        "explaintext": True,  # Plain text, no HTML
        "format": "json",
    }
    response = await client.get(WIKI_API, params=params)

    # Check for HTTP errors
    if response.status_code != 200:
        print(f"  HTTP {response.status_code} for {title}")
        return None

    # Handle empty response
    if not response.content:
        print(f"  Empty response for {title}")
        return None

    try:
        data = response.json()
    except Exception as e:
        print(f"  JSON parse error for {title}: {e}")
        return None

    pages = data.get("query", {}).get("pages", {})

    for page_id, page in pages.items():
        if page_id != "-1":  # -1 = not found
            return {
                "title": page.get("title", title),
                "content": page.get("extract", ""),
                "source": f"wikipedia_{title}",
            }
    return None
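# For reference, the MediaWiki query API used above responds with a structure
# roughly of the form {"query": {"pages": {"<page_id>": {"title": ..., "extract": ...}}}},
# where a page_id of "-1" marks a missing article; the loop in fetch_article
# returns the first real page and otherwise falls through to None.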

async def download_articles(
    domains: list[str],
    output_dir: Path,
) -> list[dict]:
    """Download all articles for selected domains.

    Args:
        domains: List of domain names (e.g., ["medical", "climate"])
        output_dir: Directory to save downloaded articles

    Returns:
        List of article metadata dicts
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    articles = []

    headers = {"User-Agent": USER_AGENT}
    async with httpx.AsyncClient(timeout=30.0, headers=headers) as client:
        for domain in domains:
            titles = ARTICLES.get(domain, [])
            if not titles:
                print(f"[{domain.upper()}] Unknown domain, skipping")
                continue

            print(f"[{domain.upper()}] Downloading {len(titles)} articles...")

            for title in titles:
                article = await fetch_article(title, client)
                if article:
                    # Save to file
                    filename = f"{domain}_{title.lower().replace(' ', '_')}.txt"
                    filepath = output_dir / filename
                    filepath.write_text(article["content"])

                    word_count = len(article["content"].split())
                    print(f"  ✓ {title}: {word_count:,} words")

                    articles.append(
                        {
                            "domain": domain,
                            "title": article["title"],
                            "file": str(filepath),
                            "words": word_count,
                            "source": article["source"],
                        }
                    )
                else:
                    print(f"  ✗ {title}: Not found")

    return articles
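# Note: articles are fetched sequentially in download_articles, which is enough
# for the small fixed article set and keeps request volume to the Wikipedia API
# low; if the dataset grows, the per-title loop could be run concurrently with
# asyncio.gather.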

async def main():
    parser = argparse.ArgumentParser(description="Download Wikipedia test articles")
    parser.add_argument(
        "--output",
        "-o",
        type=str,
        default="lightrag/evaluation/wiki_documents",
        help="Output directory for downloaded articles",
    )
    parser.add_argument(
        "--domains",
        "-d",
        type=str,
        default="medical,finance,climate,sports",
        help="Comma-separated domains to download",
    )
    args = parser.parse_args()

    domains = [d.strip() for d in args.domains.split(",")]
    output_dir = Path(args.output)

    print("=== Wikipedia Article Downloader ===")
    print(f"Domains: {', '.join(domains)}")
    print(f"Output: {output_dir}/")
    print()

    articles = await download_articles(domains, output_dir)

    total_words = sum(a["words"] for a in articles)
    print()
    print(f"✓ Downloaded {len(articles)} articles ({total_words:,} words total)")
    print(f"  Output: {output_dir}/")

    # Print summary by domain
    print("\nBy domain:")
    for domain in domains:
        domain_articles = [a for a in articles if a["domain"] == domain]
        domain_words = sum(a["words"] for a in domain_articles)
        print(f"  {domain}: {len(domain_articles)} articles, {domain_words:,} words")


if __name__ == "__main__":
    asyncio.run(main())
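The downloaded .txt files are meant to feed the ingestion and evaluation utilities described above (ingest_test_docs.py, e2e_test_harness.py). As a rough sketch only, assuming an already configured LightRAG instance rag exposing its async ainsert method (model, embedding, and storage setup omitted, and ingest_directory is a hypothetical helper), batch ingestion of the output directory could look like:

import asyncio
from pathlib import Path

async def ingest_directory(rag, docs_dir: str = "lightrag/evaluation/wiki_documents") -> None:
    """Insert every downloaded article into a pre-configured LightRAG instance."""
    for path in sorted(Path(docs_dir).glob("*.txt")):
        text = path.read_text()
        await rag.ainsert(text)  # assumed async insertion API on the configured instance
        print(f"Ingested {path.name} ({len(text.split()):,} words)")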