LightRAG/lightrag/evaluation/ingest_test_docs.py
clssck ef7327bb3e chore(docker-compose, lightrag): optimize test infrastructure and add evaluation tools
Add comprehensive E2E testing infrastructure with PostgreSQL performance tuning,
Gunicorn multi-worker support, and evaluation scripts for RAGAS-based quality
assessment. Introduces 4 new evaluation utilities: compare_results.py for A/B test
analysis, download_wikipedia.py for reproducible test datasets, e2e_test_harness.py
for automated evaluation pipelines, and ingest_test_docs.py for batch document
ingestion. Updates docker-compose.test.yml with aggressive async settings, memory
limits, and optimized chunking parameters. Parallelize entity summarization in
operate.py for improved extraction performance. Fix typos in merge node/edge logs.
2025-11-29 10:39:20 +01:00

170 lines
5.3 KiB
Python

#!/usr/bin/env python3
"""
Ingest test documents into LightRAG for testing.
This script reads text files from a directory and batch-uploads them to
LightRAG via the /documents/texts API endpoint, then polls for completion.
Usage:
python lightrag/evaluation/ingest_test_docs.py
python lightrag/evaluation/ingest_test_docs.py --input wiki_documents/ --rag-url http://localhost:9622
"""
import argparse
import asyncio
import os
import time
from pathlib import Path
import httpx
# Fallback base URL of the LightRAG API; main() uses it only when neither the
# --rag-url option nor the LIGHTRAG_API_URL environment variable is provided.
DEFAULT_RAG_URL = "http://localhost:9622"
async def _fetch_status_counts(client: "httpx.AsyncClient", base_url: str) -> dict:
    """Return how many documents GET /documents reports in each pipeline state."""
    response = await client.get(f"{base_url}/documents")
    # Surface auth/server errors instead of silently parsing an error payload.
    response.raise_for_status()
    statuses = response.json().get("statuses") or {}
    return {
        state: len(statuses.get(state) or [])
        for state in ("pending", "processing", "processed", "failed")
    }


async def ingest_documents(
    input_dir: Path,
    rag_url: str,
    *,
    poll_interval: float = 5.0,
    start_timeout: float = 60.0,
) -> dict:
    """Ingest all text files from directory into LightRAG.

    The files are uploaded in a single request to ``/documents/texts`` and the
    ``/documents`` endpoint is then polled until the server reports no pending
    or processing documents.

    Args:
        input_dir: Directory containing .txt or .md files
        rag_url: LightRAG API base URL
        poll_interval: Seconds to sleep between two status polls
        start_timeout: Maximum seconds to wait for any sign of processing
            before giving up (e.g. when the server enqueued nothing)

    Returns:
        Dict with ingestion statistics: ``documents`` (files uploaded),
        ``processed`` (processed count reported by the server), ``failed``
        (documents that entered the failed state during this run),
        ``elapsed_seconds`` and ``track_id``.

    Raises:
        ConnectionError: If the server is unreachable or its health check
            does not answer with HTTP 200.
        httpx.HTTPStatusError: If the upload or a status poll is rejected.
    """
    timeout = httpx.Timeout(120.0, connect=30.0)
    api_key = os.getenv("LIGHTRAG_API_KEY")
    headers = {"X-API-Key": api_key} if api_key else {}
    # Avoid "//health"-style paths when the caller passes a trailing slash.
    base_url = rag_url.rstrip("/")

    # Headers are set on the client so that EVERY request (health check,
    # upload and status polls) carries the API key, not just the upload.
    async with httpx.AsyncClient(timeout=timeout, headers=headers) as client:
        # Check health
        try:
            health = await client.get(f"{base_url}/health")
        except httpx.ConnectError as exc:
            raise ConnectionError(f"Cannot connect to LightRAG at {rag_url}") from exc
        if health.status_code != 200:
            raise ConnectionError(f"LightRAG not healthy: {health.status_code}")
        print(f"✓ Connected to LightRAG at {rag_url}")

        # Collect all text files
        files = sorted([*input_dir.glob("*.txt"), *input_dir.glob("*.md")])
        if not files:
            print(f"✗ No .txt or .md files found in {input_dir}")
            return {
                "documents": 0,
                "processed": 0,
                "failed": 0,
                "elapsed_seconds": 0,
                "track_id": "",
            }
        print(f" Found {len(files)} documents to ingest")

        # Read all texts
        texts = []
        sources = []
        for file in files:
            # Explicit encoding: the platform default is not UTF-8 everywhere.
            content = file.read_text(encoding="utf-8")
            texts.append(content)
            sources.append(file.name)
            word_count = len(content.split())
            print(f" {file.name}: {word_count:,} words")

        # Snapshot the finished counters before uploading so that work which
        # completes before the first poll can still be detected afterwards.
        baseline = await _fetch_status_counts(client, base_url)
        finished_before = baseline["processed"] + baseline["failed"]

        # Batch ingest via /documents/texts
        print(f"\n Uploading {len(texts)} documents...")
        start = time.monotonic()
        response = await client.post(
            f"{base_url}/documents/texts",
            json={"texts": texts, "file_sources": sources},
        )
        response.raise_for_status()
        track_id = response.json().get("track_id", "")
        print(f" Track ID: {track_id}")

        # Poll for completion - wait for processing to start first
        print(" Waiting for processing to start...")
        await asyncio.sleep(2)  # Give server time to queue documents
        wait_began = time.monotonic()
        last_status = ""
        started = False
        while True:
            counts = await _fetch_status_counts(client, base_url)
            pending = counts["pending"]
            processing = counts["processing"]
            processed = counts["processed"]
            failed = counts["failed"]

            current_status = f"Pending: {pending}, Processing: {processing}, Processed: {processed}"
            if current_status != last_status:
                print(f" {current_status}")
                last_status = current_status

            queue_idle = pending == 0 and processing == 0
            if not started:
                if not queue_idle:
                    started = True
                    print(" Processing started!")
                elif processed + failed > finished_before:
                    # Everything finished before we managed to observe the queue.
                    started = True

            if queue_idle:
                if started:
                    break
                if time.monotonic() - wait_began >= start_timeout:
                    # Nothing was ever queued (e.g. the server skipped the
                    # upload); stop instead of polling forever.
                    print(f" No processing activity seen after {start_timeout:.0f}s, giving up")
                    break
            await asyncio.sleep(poll_interval)

        elapsed = time.monotonic() - start
        processed_count = processed
        failed_count = max(failed - baseline["failed"], 0)

    print(f"\n✓ Ingestion complete in {elapsed:.1f}s")
    print(f" Documents processed: {processed_count}")
    if failed_count:
        print(f"✗ Documents failed: {failed_count}")
    print(f" Average: {elapsed / len(texts):.1f}s per document")
    return {
        "documents": len(texts),
        "processed": processed_count,
        "failed": failed_count,
        "elapsed_seconds": elapsed,
        "track_id": track_id,
    }
async def main():
    """Command-line entry point: parse options, print a banner, run ingestion.

    The API URL is taken from ``--rag-url`` when given, otherwise from the
    ``LIGHTRAG_API_URL`` environment variable, otherwise from
    ``DEFAULT_RAG_URL``. If the input directory does not exist, a hint is
    printed and nothing is ingested.
    """
    cli = argparse.ArgumentParser(description="Ingest test documents into LightRAG")
    cli.add_argument(
        "--input",
        "-i",
        type=str,
        default="lightrag/evaluation/wiki_documents",
        help="Input directory with text files",
    )
    cli.add_argument(
        "--rag-url",
        "-r",
        type=str,
        default=None,
        help=f"LightRAG API URL (default: {DEFAULT_RAG_URL})",
    )
    options = cli.parse_args()

    source_dir = Path(options.input)
    # An explicit (non-empty) command-line value wins over the environment.
    if options.rag_url:
        target_url = options.rag_url
    else:
        target_url = os.getenv("LIGHTRAG_API_URL", DEFAULT_RAG_URL)

    banner = (
        "=== LightRAG Document Ingestion ===",
        f"Input: {source_dir}/",
        f"RAG URL: {target_url}",
        "",
    )
    for line in banner:
        print(line)

    if source_dir.exists():
        await ingest_documents(source_dir, target_url)
        return

    # Missing directory: tell the user how to create the test dataset.
    print(f"✗ Input directory not found: {source_dir}")
    print(" Run download_wikipedia.py first:")
    print(" python lightrag/evaluation/download_wikipedia.py")
# Run the async entry point only when executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(main())