LightRAG/lightrag/evaluation/download_wikipedia.py
clssck 59e89772de refactor: consolidate to PostgreSQL-only backend and modernize stack
Remove legacy storage implementations and deprecated examples:
- Delete FAISS, JSON, Memgraph, Milvus, MongoDB, Nano Vector DB, Neo4j, NetworkX, Qdrant, Redis storage backends
- Remove Kubernetes deployment manifests and installation scripts
- Delete unofficial examples for deprecated backends and offline deployment docs
Streamline core infrastructure:
- Consolidate storage layer to PostgreSQL-only implementation
- Add full-text search caching with FTS cache module
- Implement metrics collection and monitoring pipeline
- Add explain and metrics API routes
Modernize frontend and tooling:
- Switch web UI to Bun with bun.lock, remove npm and pnpm lockfiles
- Update Dockerfile for PostgreSQL-only deployment
- Add Makefile for common development tasks
- Update environment and configuration examples
Enhance evaluation and testing capabilities:
- Add prompt optimization with DSPy and auto-tuning
- Implement ground truth regeneration and variant testing
- Add prompt debugging and response comparison utilities
- Expand test coverage with new integration scenarios
Simplify dependencies and configuration:
- Remove offline-specific requirement files
- Update pyproject.toml with streamlined dependencies
- Add Python version pinning with .python-version
- Create project guidelines in CLAUDE.md and AGENTS.md
2025-12-12 16:28:49 +01:00


#!/usr/bin/env python3
"""
Download Wikipedia articles for LightRAG ingestion testing.
This script fetches plain text from Wikipedia articles across diverse domains
to create a test dataset with intentional entity overlap for testing:
- Entity merging and summarization
- Cross-domain relationships
- Parallel processing optimizations
Usage:
    python lightrag/evaluation/download_wikipedia.py
    python lightrag/evaluation/download_wikipedia.py --output wiki_docs/
    python lightrag/evaluation/download_wikipedia.py --domains medical,climate
"""
import argparse
import asyncio
from pathlib import Path
import httpx
from lightrag.utils import logger
# Wikipedia API endpoint (no auth required)
WIKI_API = 'https://en.wikipedia.org/w/api.php'
# User-Agent required by Wikipedia API policy
# See: https://meta.wikimedia.org/wiki/User-Agent_policy
USER_AGENT = 'LightRAG-Test-Downloader/1.0 (https://github.com/HKUDS/LightRAG; claude@example.com)'
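# Illustrative query built against this endpoint (assumption: mirrors the params
# constructed in fetch_article below, here for the 'Diabetes' title):
#   https://en.wikipedia.org/w/api.php?action=query&titles=Diabetes&prop=extracts&explaintext=true&format=json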
# Article selection by domain - chosen for entity overlap
# WHO → Medical + Climate
# Carbon/Emissions → Climate + Finance (ESG)
# Germany/Brazil → Sports + general knowledge
ARTICLES = {
    'medical': ['Diabetes', 'COVID-19'],
    'finance': ['Stock_market', 'Cryptocurrency'],
    'climate': ['Climate_change', 'Renewable_energy'],
    'sports': ['FIFA_World_Cup', 'Olympic_Games'],
}


async def fetch_article(title: str, client: httpx.AsyncClient) -> dict | None:
    """Fetch Wikipedia article text via API.

    Args:
        title: Wikipedia article title (use underscores for spaces)
        client: Async HTTP client

    Returns:
        Dict with title, content, and source; or None if not found
    """
    params = {
        'action': 'query',
        'titles': title,
        'prop': 'extracts',
        'explaintext': True,  # Plain text, no HTML
        'format': 'json',
    }
    response = await client.get(WIKI_API, params=params)

    # Check for HTTP errors
    if response.status_code != 200:
        logger.error('HTTP %s for %s', response.status_code, title)
        return None

    # Handle empty response
    if not response.content:
        logger.warning('Empty response for %s', title)
        return None

    try:
        data = response.json()
    except Exception as e:
        logger.error('JSON parse error for %s: %s', title, e)
        return None
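    # Illustrative response shape parsed below (assumption, based on the standard
    # MediaWiki 'query' result format):
    #   {"query": {"pages": {"<pageid>": {"title": "...", "extract": "..."}}}}
    # A title that does not exist comes back under the special page id "-1".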
    pages = data.get('query', {}).get('pages', {})
    for page_id, page in pages.items():
        if page_id != '-1':  # -1 = not found
            return {
                'title': page.get('title', title),
                'content': page.get('extract', ''),
                'source': f'wikipedia_{title}',
            }
    return None


async def download_articles(
    domains: list[str],
    output_dir: Path,
) -> list[dict]:
    """Download all articles for selected domains.

    Args:
        domains: List of domain names (e.g., ["medical", "climate"])
        output_dir: Directory to save downloaded articles

    Returns:
        List of article metadata dicts
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    articles = []
    headers = {'User-Agent': USER_AGENT}

    async with httpx.AsyncClient(timeout=30.0, headers=headers) as client:
        for domain in domains:
            titles = ARTICLES.get(domain, [])
            if not titles:
                logger.warning('[%s] Unknown domain, skipping', domain.upper())
                continue

            logger.info('[%s] Downloading %d articles...', domain.upper(), len(titles))
            for title in titles:
                article = await fetch_article(title, client)
                if article:
                    # Save to file
                    filename = f'{domain}_{title.lower().replace(" ", "_")}.txt'
                    filepath = output_dir / filename
                    filepath.write_text(article['content'])

                    word_count = len(article['content'].split())
                    logger.info('%s: %s words', title, f'{word_count:,}')
                    articles.append(
                        {
                            'domain': domain,
                            'title': article['title'],
                            'file': str(filepath),
                            'words': word_count,
                            'source': article['source'],
                        }
                    )
                else:
                    logger.warning('%s: Not found', title)

    return articles


async def main():
    parser = argparse.ArgumentParser(description='Download Wikipedia test articles')
    parser.add_argument(
        '--output',
        '-o',
        type=str,
        default='lightrag/evaluation/wiki_documents',
        help='Output directory for downloaded articles',
    )
    parser.add_argument(
        '--domains',
        '-d',
        type=str,
        default='medical,finance,climate,sports',
        help='Comma-separated domains to download',
    )
    args = parser.parse_args()

    domains = [d.strip() for d in args.domains.split(',')]
    output_dir = Path(args.output)

    print('=== Wikipedia Article Downloader ===')
    print(f'Domains: {", ".join(domains)}')
    print(f'Output: {output_dir}/')
    print()

    articles = await download_articles(domains, output_dir)

    total_words = sum(a['words'] for a in articles)
    print()
    print(f'✓ Downloaded {len(articles)} articles ({total_words:,} words total)')
    print(f' Output: {output_dir}/')

    # Print summary by domain
    print('\nBy domain:')
    for domain in domains:
        domain_articles = [a for a in articles if a['domain'] == domain]
        domain_words = sum(a['words'] for a in domain_articles)
        print(f' {domain}: {len(domain_articles)} articles, {domain_words:,} words')


if __name__ == '__main__':
    asyncio.run(main())