Add comprehensive E2E testing infrastructure with PostgreSQL performance tuning, Gunicorn multi-worker support, and evaluation scripts for RAGAS-based quality assessment. Introduces 4 new evaluation utilities: compare_results.py for A/B test analysis, download_wikipedia.py for reproducible test datasets, e2e_test_harness.py for automated evaluation pipelines, and ingest_test_docs.py for batch document ingestion. Updates docker-compose.test.yml with aggressive async settings, memory limits, and optimized chunking parameters. Parallelize entity summarization in operate.py for improved extraction performance. Fix typos in merge node/edge logs.
170 lines
5.3 KiB
Python
170 lines
5.3 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Ingest test documents into LightRAG for testing.
|
|
|
|
This script reads text files from a directory and batch-uploads them to
|
|
LightRAG via the /documents/texts API endpoint, then polls for completion.
|
|
|
|
Usage:
|
|
python lightrag/evaluation/ingest_test_docs.py
|
|
python lightrag/evaluation/ingest_test_docs.py --input wiki_documents/ --rag-url http://localhost:9622
|
|
"""
|
|
|
|
import argparse
|
|
import asyncio
|
|
import os
|
|
import time
|
|
from pathlib import Path
|
|
|
|
import httpx
|
|
|
|
# Fallback LightRAG API base URL, used when neither the --rag-url option nor
# the LIGHTRAG_API_URL environment variable supplies one.
DEFAULT_RAG_URL = "http://localhost:9622"
|
|
|
|
|
|
def _count_statuses(payload: dict) -> tuple:
    """Return (pending, processing, processed) counts from a /documents payload."""
    statuses = payload.get("statuses", {})
    return (
        len(statuses.get("pending", [])),
        len(statuses.get("processing", [])),
        len(statuses.get("processed", [])),
    )


async def ingest_documents(
    input_dir: Path,
    rag_url: str,
    poll_interval: float = 5.0,
    start_grace_seconds: float = 60.0,
) -> dict:
    """Ingest all text files from directory into LightRAG.

    Reads every ``*.txt`` and ``*.md`` file directly inside ``input_dir``,
    uploads them in a single request to ``/documents/texts`` and then polls
    ``/documents`` until no document is pending or processing.

    If the ``LIGHTRAG_API_KEY`` environment variable is set, its value is sent
    as the ``X-API-Key`` header on the upload and on every status request.

    Args:
        input_dir: Directory containing .txt or .md files
        rag_url: LightRAG API base URL
        poll_interval: Seconds to sleep between two status polls
        start_grace_seconds: How long to keep waiting for the batch to show up
            as pending/processing before assuming there is nothing left to
            wait for (e.g. the server finished before the first poll)

    Returns:
        Dict with ingestion statistics. When no input file is found only the
        keys ``documents`` and ``elapsed_seconds`` are present; otherwise
        ``documents``, ``processed``, ``elapsed_seconds`` and ``track_id``.

    Raises:
        ConnectionError: If the health endpoint is unreachable or does not
            answer with HTTP 200.
        httpx.HTTPStatusError: If the upload or a status request returns an
            error status.
    """
    timeout = httpx.Timeout(120.0, connect=30.0)
    api_key = os.getenv("LIGHTRAG_API_KEY")
    headers = {"X-API-Key": api_key} if api_key else {}
    # Tolerate a trailing slash so we never request "//health" etc.
    base_url = rag_url.rstrip("/")

    async with httpx.AsyncClient(timeout=timeout) as client:
        # Check health
        try:
            health = await client.get(f"{base_url}/health")
        except httpx.ConnectError as exc:
            raise ConnectionError(f"Cannot connect to LightRAG at {rag_url}") from exc
        if health.status_code != 200:
            raise ConnectionError(f"LightRAG not healthy: {health.status_code}")

        print(f"✓ Connected to LightRAG at {rag_url}")

        # Collect all text files
        files = list(input_dir.glob("*.txt")) + list(input_dir.glob("*.md"))
        if not files:
            print(f"✗ No .txt or .md files found in {input_dir}")
            return {"documents": 0, "elapsed_seconds": 0}

        print(f" Found {len(files)} documents to ingest")

        # Read all texts
        texts = []
        sources = []
        for file in sorted(files):
            # Explicit encoding: the locale default is not UTF-8 everywhere.
            content = file.read_text(encoding="utf-8")
            texts.append(content)
            sources.append(file.name)
            word_count = len(content.split())
            print(f" {file.name}: {word_count:,} words")

        # Remember how many documents were already processed before this
        # batch, so a batch that completes before the first poll can still be
        # recognised as done.
        baseline_response = await client.get(f"{base_url}/documents", headers=headers)
        baseline_response.raise_for_status()
        _, _, baseline_processed = _count_statuses(baseline_response.json())

        # Batch ingest via /documents/texts
        print(f"\n Uploading {len(texts)} documents...")
        start = time.monotonic()

        response = await client.post(
            f"{base_url}/documents/texts",
            json={"texts": texts, "file_sources": sources},
            headers=headers,
        )
        response.raise_for_status()
        result = response.json()

        track_id = result.get("track_id", "")
        print(f" Track ID: {track_id}")

        # Poll for completion - wait for processing to start first
        print(" Waiting for processing to start...")
        await asyncio.sleep(2)  # Give server time to queue documents

        last_status = ""
        processed_count = 0
        expected_total = len(texts)
        started = False
        wait_began = time.monotonic()

        while True:
            # The status endpoint needs the same credentials as the upload;
            # fail loudly on an error response instead of reading it as
            # "nothing queued".
            status_response = await client.get(
                f"{base_url}/documents", headers=headers
            )
            status_response.raise_for_status()
            pending, processing, processed = _count_statuses(status_response.json())
            processed_count = processed

            current_status = f"Pending: {pending}, Processing: {processing}, Processed: {processed}"
            if current_status != last_status:
                print(f" {current_status}")
                last_status = current_status

            queue_idle = pending == 0 and processing == 0

            # Wait until we see at least some of our docs in the queue
            if not started and not queue_idle:
                started = True
                print(" Processing started!")

            if queue_idle:
                if started:
                    break
                if processed - baseline_processed >= expected_total:
                    # The whole batch finished before we ever saw it queued.
                    break
                if time.monotonic() - wait_began >= start_grace_seconds:
                    # Nothing ever showed up as queued; stop instead of
                    # polling forever.
                    print(
                        f" No queued documents seen after {start_grace_seconds:.0f}s; "
                        "assuming nothing is left to process"
                    )
                    break

            await asyncio.sleep(poll_interval)

        elapsed = time.monotonic() - start
        print(f"\n✓ Ingestion complete in {elapsed:.1f}s")
        print(f" Documents processed: {processed_count}")
        print(f" Average: {elapsed / len(texts):.1f}s per document")

        return {
            "documents": len(texts),
            "processed": processed_count,
            "elapsed_seconds": elapsed,
            "track_id": track_id,
        }
|
|
|
|
|
|
async def main() -> None:
    """Parse command-line options and ingest the selected directory.

    The LightRAG URL is taken from ``--rag-url`` if given, otherwise from the
    ``LIGHTRAG_API_URL`` environment variable, otherwise from
    ``DEFAULT_RAG_URL``.

    Raises:
        SystemExit: With status 1 when the input path is not an existing
            directory, so calling scripts can detect the failure.
    """
    parser = argparse.ArgumentParser(description="Ingest test documents into LightRAG")
    parser.add_argument(
        "--input",
        "-i",
        type=str,
        default="lightrag/evaluation/wiki_documents",
        help="Input directory with text files",
    )
    parser.add_argument(
        "--rag-url",
        "-r",
        type=str,
        default=None,
        help=f"LightRAG API URL (default: {DEFAULT_RAG_URL})",
    )
    args = parser.parse_args()

    input_dir = Path(args.input)
    # Chain with "or" so an empty LIGHTRAG_API_URL also falls back to the default.
    rag_url = args.rag_url or os.getenv("LIGHTRAG_API_URL") or DEFAULT_RAG_URL

    print("=== LightRAG Document Ingestion ===")
    print(f"Input: {input_dir}/")
    print(f"RAG URL: {rag_url}")
    print()

    # is_dir() rather than exists(): a regular file at this path is just as
    # unusable as a missing directory.
    if not input_dir.is_dir():
        print(f"✗ Input directory not found: {input_dir}")
        print(" Run download_wikipedia.py first:")
        print(" python lightrag/evaluation/download_wikipedia.py")
        # Exit non-zero so an automated pipeline does not treat this as success.
        raise SystemExit(1)

    await ingest_documents(input_dir, rag_url)
|
|
|
|
|
|
# Run the async command-line entry point only when this file is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    asyncio.run(main())
|