Add extensive test suites for API routes and utilities:
- Implement test_search_routes.py (406 lines) for search endpoint validation
- Implement test_upload_routes.py (724 lines) for document upload workflows
- Implement test_s3_client.py (618 lines) for S3 storage operations
- Implement test_citation_utils.py (352 lines) for citation extraction
- Implement test_chunking.py (216 lines) for text chunking validation

Add S3 storage client implementation:
- Create lightrag/storage/s3_client.py with S3 operations (see the sketch below)
- Add storage module initialization with exports
- Integrate S3 client with document upload handling

Enhance API routes and core functionality:
- Add search_routes.py with full-text and graph search endpoints
- Add upload_routes.py with multipart document upload support
- Update operate.py with bulk operations and health checks
- Enhance postgres_impl.py with bulk upsert and parameterized queries (see the sketch below)
- Update lightrag_server.py to register new API routes
- Improve utils.py with citation and formatting utilities

Update dependencies and configuration:
- Add S3 and test dependencies to pyproject.toml
- Update docker-compose.test.yml for the testing environment
- Sync uv.lock with new dependencies

Apply code quality improvements across all modified files:
- Add type hints to function signatures
- Update imports and router initialization
- Fix logging and error handling
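The diff for postgres_impl.py is not reproduced on this page, so the bulk upsert it describes is not visible here. As a rough illustration of the parameterized pattern the commit message names, here is a minimal sketch assuming asyncpg; the `chunks` table and `bulk_upsert_chunks` name are hypothetical, not the actual postgres_impl.py schema or API:

```python
# Sketch only: assumes asyncpg; "chunks" and bulk_upsert_chunks are
# hypothetical names, not the real postgres_impl.py code.
import asyncpg


async def bulk_upsert_chunks(pool: asyncpg.Pool, rows: list[tuple[str, str]]) -> None:
    """Upsert (id, content) pairs with a single prepared statement."""
    query = """
        INSERT INTO chunks (id, content)
        VALUES ($1, $2)
        ON CONFLICT (id) DO UPDATE SET content = EXCLUDED.content
    """
    async with pool.acquire() as conn:
        # executemany reuses the parameterized statement for every row,
        # so values are never interpolated into the SQL string.
        await conn.executemany(query, rows)
```

The `$1, $2` placeholders are what "parameterized queries" buys in the bulk path, and `ON CONFLICT ... DO UPDATE` makes re-ingestion idempotent.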
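The new lightrag/storage/s3_client.py is likewise not shown on this page. A minimal sketch of the kind of wrapper test_s3_client.py would exercise, assuming boto3 (the class shape and method name are guesses, not the real module):

```python
# Sketch only: assumes boto3; the real s3_client.py interface may differ.
import logging

import boto3
from botocore.exceptions import ClientError

logger = logging.getLogger(__name__)


class S3Client:
    def __init__(self, bucket: str, region: str | None = None):
        self.bucket = bucket
        self.client = boto3.client('s3', region_name=region)

    def upload_document(self, key: str, data: bytes,
                        content_type: str = 'application/pdf') -> bool:
        """Store one document; return False on S3 errors instead of raising."""
        try:
            self.client.put_object(
                Bucket=self.bucket, Key=key, Body=data, ContentType=content_type
            )
            return True
        except ClientError as exc:
            # Log and report failure so callers can retry or surface it.
            logger.error('S3 upload failed: %s', exc)
            return False
```

The file that follows is the E2E test harness included in this change set.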
#!/usr/bin/env python3
"""
E2E RAGAS Test Harness for LightRAG

Complete end-to-end testing pipeline:

1. Download arXiv papers (reproducible test data)
2. Clear existing data (optional)
3. Ingest papers into LightRAG
4. Wait for processing
5. Generate Q&A dataset
6. Run RAGAS evaluation
7. Optional: A/B comparison

Usage:

    # Full E2E test
    python lightrag/evaluation/e2e_test_harness.py

    # A/B comparison (with/without orphan connections)
    python lightrag/evaluation/e2e_test_harness.py --ab-test

    # Skip download if papers exist
    python lightrag/evaluation/e2e_test_harness.py --skip-download

    # Use existing dataset
    python lightrag/evaluation/e2e_test_harness.py --dataset existing_dataset.json
"""

import argparse
import asyncio
import json
import os
import sys
import time
from datetime import datetime
from pathlib import Path

import httpx
from dotenv import load_dotenv

# Add parent directory to path so the local lightrag package resolves
# before it is imported below
sys.path.insert(0, str(Path(__file__).parent.parent.parent))

from lightrag.utils import logger

# Load environment variables
load_dotenv(dotenv_path='.env', override=False)

# Configuration
DEFAULT_RAG_URL = 'http://localhost:9622'
DEFAULT_PAPERS = ['2312.10997', '2404.10981', '2005.11401']
POLL_INTERVAL_SECONDS = 10
MAX_WAIT_SECONDS = 600  # 10 minutes max wait for processing

class E2ETestHarness:
    """End-to-end test harness for LightRAG RAGAS evaluation."""

    def __init__(
        self,
        rag_url: str | None = None,
        paper_ids: list[str] | None = None,
        questions_per_paper: int = 5,
        skip_download: bool = False,
        skip_ingest: bool = False,
        dataset_path: str | None = None,
        output_dir: str | None = None,
    ):
        self.rag_url = (rag_url or os.getenv('LIGHTRAG_API_URL', DEFAULT_RAG_URL)).rstrip('/')
        self.paper_ids = paper_ids or DEFAULT_PAPERS
        self.questions_per_paper = questions_per_paper
        self.skip_download = skip_download
        self.skip_ingest = skip_ingest
        self.dataset_path = Path(dataset_path) if dataset_path else None

        # Determine directories
        self.eval_dir = Path(__file__).parent
        self.papers_dir = self.eval_dir / 'papers'
        self.results_dir = Path(output_dir) if output_dir else self.eval_dir / 'results'
        self.results_dir.mkdir(parents=True, exist_ok=True)

        # API key for LightRAG
        self.api_key = os.getenv('LIGHTRAG_API_KEY')

    async def check_lightrag_health(self) -> bool:
        """Check if LightRAG API is accessible."""
        try:
            async with httpx.AsyncClient(timeout=30.0) as client:
                response = await client.get(f'{self.rag_url}/health')
                response.raise_for_status()
                logger.info(f'LightRAG API accessible at {self.rag_url}')
                return True
        except Exception as e:
            logger.error(f'Cannot connect to LightRAG API: {e}')
            return False

    async def download_papers(self) -> list[str]:
        """Download arXiv papers."""
        if self.skip_download:
            logger.info('Paper download skipped (--skip-download)')
            # Check existing papers
            existing = [
                str(self.papers_dir / f'{pid}.pdf')
                for pid in self.paper_ids
                if (self.papers_dir / f'{pid}.pdf').exists()
            ]
            logger.info(f'Found {len(existing)} existing papers')
            return existing

        logger.info('STEP 1: Download arXiv Papers')

        from lightrag.evaluation.download_arxiv import download_papers

        results = await download_papers(self.paper_ids, self.papers_dir)
        return [r['path'] for r in results if r['status'] in ('downloaded', 'exists')]

    async def clear_existing_data(self) -> bool:
        """Clear existing documents in LightRAG (optional)."""
        logger.info('Clearing existing data...')
        try:
            headers = {'X-API-Key': self.api_key} if self.api_key else {}
            async with httpx.AsyncClient(timeout=60.0) as client:
                # Get current documents
                response = await client.get(
                    f'{self.rag_url}/documents',
                    headers=headers,
                )
                response.raise_for_status()
                docs = response.json()

                # Collect documents across all status buckets
                statuses = docs.get('statuses', {})
                all_docs = []
                for status_docs in statuses.values():
                    all_docs.extend(status_docs)

                if all_docs:
                    logger.info(f'Clearing {len(all_docs)} existing documents...')
                    for doc in all_docs:
                        doc_id = doc.get('id')
                        if doc_id:
                            await client.delete(
                                f'{self.rag_url}/documents/{doc_id}',
                                headers=headers,
                            )
                    logger.info('Cleared existing documents')
                else:
                    logger.info('No existing documents to clear')

                return True
        except Exception as e:
            logger.warning(f'Could not clear data: {e}')
            return False

    async def ingest_papers(self, paper_paths: list[str]) -> bool:
        """Ingest papers into LightRAG."""
        if self.skip_ingest:
            logger.info('Paper ingestion skipped (--skip-ingest)')
            return True

        logger.info('STEP 2: Ingest Papers into LightRAG')

        headers = {'X-API-Key': self.api_key} if self.api_key else {}

        async with httpx.AsyncClient(timeout=300.0) as client:
            for paper_path in paper_paths:
                path = Path(paper_path)
                if not path.exists():
                    logger.warning(f'Paper not found: {paper_path}')
                    continue

                logger.info(f'Uploading {path.name}')

                try:
                    with open(path, 'rb') as f:
                        files = {'file': (path.name, f, 'application/pdf')}
                        response = await client.post(
                            f'{self.rag_url}/documents/upload',
                            files=files,
                            headers=headers,
                        )
                        response.raise_for_status()
                        result = response.json()
                        logger.info(f'Uploaded: {result}')
                except Exception as e:
                    logger.error(f'Upload failed: {e}')

        # Individual upload failures are logged above but do not abort the run.
        return True

    async def wait_for_processing(self) -> bool:
        """Wait for all documents to finish processing."""
        logger.info('STEP 3: Wait for Document Processing')

        headers = {'X-API-Key': self.api_key} if self.api_key else {}
        start_time = time.time()

        async with httpx.AsyncClient(timeout=30.0) as client:
            while time.time() - start_time < MAX_WAIT_SECONDS:
                try:
                    response = await client.get(
                        f'{self.rag_url}/documents',
                        headers=headers,
                    )
                    response.raise_for_status()
                    docs = response.json()

                    statuses = docs.get('statuses', {})
                    # API returns lowercase status keys
                    processing = len(statuses.get('processing', []))
                    pending = len(statuses.get('pending', []))
                    completed = len(statuses.get('processed', []))  # Note: "processed" not "completed"
                    failed = len(statuses.get('failed', []))

                    elapsed = int(time.time() - start_time)
                    logger.info(
                        f'[{elapsed}s] Processing: {processing}, Pending: {pending}, Completed: {completed}, Failed: {failed}'
                    )

                    if processing == 0 and pending == 0:
                        logger.info('All documents processed')
                        return True

                except Exception as e:
                    logger.warning(f'Status check failed: {e}')

                await asyncio.sleep(POLL_INTERVAL_SECONDS)

        logger.error('Timeout waiting for document processing')
        return False

    async def generate_dataset(self) -> Path:
        """Generate Q&A dataset from ingested papers."""
        if self.dataset_path and self.dataset_path.exists():
            logger.info(f'Using existing dataset: {self.dataset_path}')
            return self.dataset_path

        logger.info('STEP 4: Generate Q&A Dataset')

        from lightrag.evaluation.generate_arxiv_dataset import generate_dataset

        output_path = self.eval_dir / f'arxiv_dataset_{datetime.now().strftime("%Y%m%d_%H%M%S")}.json'

        await generate_dataset(
            paper_ids=self.paper_ids,
            questions_per_paper=self.questions_per_paper,
            rag_url=self.rag_url,
            output_path=output_path,
        )

        return output_path

    async def run_ragas_evaluation(self, dataset_path: Path) -> dict:
        """Run RAGAS evaluation."""
        logger.info('STEP 5: Run RAGAS Evaluation')

        from lightrag.evaluation.eval_rag_quality import RAGEvaluator

        evaluator = RAGEvaluator(
            test_dataset_path=str(dataset_path),
            rag_api_url=self.rag_url,
        )

        results = await evaluator.run()
        return results

    async def run_full_pipeline(self) -> dict:
        """Run the complete E2E test pipeline."""
        logger.info('E2E RAGAS TEST HARNESS FOR LIGHTRAG')
        logger.info(f'RAG URL: {self.rag_url}')
        logger.info(f'Papers: {", ".join(self.paper_ids)}')
        logger.info(f'Questions: {self.questions_per_paper} per paper')
        logger.info(f'Results: {self.results_dir}')

        start_time = time.time()

        # Check LightRAG is accessible
        if not await self.check_lightrag_health():
            return {'error': 'LightRAG API not accessible'}

        # Step 1: Download papers
        paper_paths = await self.download_papers()
        if not paper_paths:
            return {'error': 'No papers to process'}

        # Step 2: Ingest papers
        if not await self.ingest_papers(paper_paths):
            return {'error': 'Paper ingestion failed'}

        # Step 3: Wait for processing
        if not self.skip_ingest and not await self.wait_for_processing():
            return {'error': 'Document processing timeout'}

        # Step 4: Generate dataset
        dataset_path = await self.generate_dataset()

        # Step 5: Run RAGAS evaluation
        results = await self.run_ragas_evaluation(dataset_path)

        elapsed_time = time.time() - start_time

        # Save summary
        summary = {
            'pipeline_completed_at': datetime.now().isoformat(),
            'total_elapsed_seconds': round(elapsed_time, 2),
            'papers': self.paper_ids,
            'dataset_path': str(dataset_path),
            'ragas_results': results.get('benchmark_stats', {}),
        }

        summary_path = self.results_dir / f'e2e_summary_{datetime.now().strftime("%Y%m%d_%H%M%S")}.json'
        with open(summary_path, 'w') as f:
            json.dump(summary, f, indent=2)

        logger.info('E2E PIPELINE COMPLETE')
        logger.info(f'Total time: {elapsed_time:.1f} seconds')
        logger.info(f'Summary saved: {summary_path}')

        return summary


async def run_ab_test(
    harness_config: dict,
    clear_between_runs: bool = True,
) -> dict:
    """
    Run A/B test comparing with/without orphan connections.

    Args:
        harness_config: Configuration for E2ETestHarness
        clear_between_runs: Clear data between A and B runs

    Returns:
        A/B comparison results
    """
    logger.info('A/B TEST: WITH vs WITHOUT ORPHAN CONNECTIONS')

    results = {}

    # Test A: WITHOUT orphan connections
    logger.info('[A] Running WITHOUT orphan connections...')
    # NOTE: setting this env var only affects the current process and its
    # children. If the LightRAG server runs separately, it must be restarted
    # with AUTO_CONNECT_ORPHANS set for the toggle to take effect.
    os.environ['AUTO_CONNECT_ORPHANS'] = 'false'

    harness_a = E2ETestHarness(**harness_config)
    results['without_orphans'] = await harness_a.run_full_pipeline()

    # Clear for next run
    if clear_between_runs:
        await harness_a.clear_existing_data()

    # Test B: WITH orphan connections
    logger.info('[B] Running WITH orphan connections...')
    os.environ['AUTO_CONNECT_ORPHANS'] = 'true'

    # Force re-ingest for test B
    harness_config_b = harness_config.copy()
    harness_config_b['skip_download'] = True  # Papers already downloaded
    harness_config_b['skip_ingest'] = False  # Need to re-ingest

    harness_b = E2ETestHarness(**harness_config_b)
    results['with_orphans'] = await harness_b.run_full_pipeline()

    # Compare results
    logger.info('A/B COMPARISON')

    a_stats = results['without_orphans'].get('ragas_results', {}).get('average_metrics', {})
    b_stats = results['with_orphans'].get('ragas_results', {}).get('average_metrics', {})

    comparison = {
        'timestamp': datetime.now().isoformat(),
        'without_orphans': a_stats,
        'with_orphans': b_stats,
        'improvement': {},
    }

    for metric in ['faithfulness', 'answer_relevance', 'context_recall', 'context_precision', 'ragas_score']:
        a_val = a_stats.get(metric, 0)
        b_val = b_stats.get(metric, 0)
        diff = b_val - a_val
        pct = (diff / a_val * 100) if a_val > 0 else 0

        comparison['improvement'][metric] = {
            'absolute': round(diff, 4),
            'percent': round(pct, 2),
        }

        status = 'UP' if diff > 0 else ('DOWN' if diff < 0 else '~')
        logger.info(f'{metric:<20} A: {a_val:.4f} B: {b_val:.4f} [{status}] {pct:+.1f}%')

    # Verdict
    ragas_improvement = comparison['improvement'].get('ragas_score', {}).get('percent', 0)
    if ragas_improvement > 5:
        verdict = 'ORPHAN CONNECTIONS IMPROVE QUALITY'
    elif ragas_improvement < -5:
        verdict = 'ORPHAN CONNECTIONS DEGRADE QUALITY'
    else:
        verdict = 'NO SIGNIFICANT DIFFERENCE'

    comparison['verdict'] = verdict
    logger.info(f'VERDICT: {verdict}')

    # Save comparison
    comp_path = harness_a.results_dir / f'ab_comparison_{datetime.now().strftime("%Y%m%d_%H%M%S")}.json'
    with open(comp_path, 'w') as f:
        json.dump(comparison, f, indent=2)
    logger.info(f'Comparison saved: {comp_path}')

    return comparison


async def main():
    parser = argparse.ArgumentParser(
        description='E2E RAGAS Test Harness for LightRAG',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Full E2E test
  python lightrag/evaluation/e2e_test_harness.py

  # A/B test (with/without orphan connections)
  python lightrag/evaluation/e2e_test_harness.py --ab-test

  # Skip paper download
  python lightrag/evaluation/e2e_test_harness.py --skip-download

  # Use existing dataset
  python lightrag/evaluation/e2e_test_harness.py --dataset arxiv_dataset.json
""",
    )

    parser.add_argument(
        '--rag-url',
        '-r',
        type=str,
        default=None,
        help=f'LightRAG API URL (default: {DEFAULT_RAG_URL})',
    )

    parser.add_argument(
        '--papers',
        '-p',
        type=str,
        default=None,
        help='Comma-separated arXiv paper IDs',
    )

    parser.add_argument(
        '--questions',
        '-q',
        type=int,
        default=5,
        help='Questions per paper (default: 5)',
    )

    parser.add_argument(
        '--skip-download',
        action='store_true',
        help='Skip paper download (use existing)',
    )

    parser.add_argument(
        '--skip-ingest',
        action='store_true',
        help='Skip paper ingestion (use existing data)',
    )

    parser.add_argument(
        '--dataset',
        '-d',
        type=str,
        default=None,
        help='Path to existing Q&A dataset (skip generation)',
    )

    parser.add_argument(
        '--output-dir',
        '-o',
        type=str,
        default=None,
        help='Output directory for results',
    )

    parser.add_argument(
        '--ab-test',
        action='store_true',
        help='Run A/B test comparing with/without orphan connections',
    )

    args = parser.parse_args()

    # Parse paper IDs
    paper_ids = None
    if args.papers:
        paper_ids = [p.strip() for p in args.papers.split(',')]

    harness_config = {
        'rag_url': args.rag_url,
        'paper_ids': paper_ids,
        'questions_per_paper': args.questions,
        'skip_download': args.skip_download,
        'skip_ingest': args.skip_ingest,
        'dataset_path': args.dataset,
        'output_dir': args.output_dir,
    }

    if args.ab_test:
        await run_ab_test(harness_config)
    else:
        harness = E2ETestHarness(**harness_config)
        await harness.run_full_pipeline()


if __name__ == '__main__':
    asyncio.run(main())
|