#!/usr/bin/env python3
"""
Ingest test documents into LightRAG for testing.

This script reads text files from a directory and batch-uploads them to
LightRAG via the /documents/texts API endpoint, then polls for completion.

Usage:
    python lightrag/evaluation/ingest_test_docs.py
    python lightrag/evaluation/ingest_test_docs.py --input wiki_documents/ --rag-url http://localhost:9622
"""

import argparse
import asyncio
import os
import time
from pathlib import Path

import httpx

DEFAULT_RAG_URL = "http://localhost:9622"


def _load_documents(files: list) -> tuple:
    """Read every file as UTF-8 and return parallel (texts, sources) lists.

    Files are processed in sorted order; a per-file word count is printed.
    """
    texts = []
    sources = []
    for file in sorted(files):
        # Explicit encoding: the platform default is locale-dependent and can
        # fail (or silently mis-decode) on non-ASCII content.
        content = file.read_text(encoding="utf-8")
        texts.append(content)
        sources.append(file.name)
        word_count = len(content.split())
        print(f" {file.name}: {word_count:,} words")
    return texts, sources


async def _wait_for_processing(
    client: httpx.AsyncClient,
    base_url: str,
    poll_interval: float,
    start_timeout: float,
) -> tuple:
    """Poll GET /documents until the queue drains; return (processed, started).

    ``processed`` is the number of entries under the "processed" status in the
    last response. ``started`` is True when pending/processing documents were
    observed at least once, False when the wait was abandoned after
    ``start_timeout`` seconds without ever seeing any.
    """
    last_status = ""
    processed = 0
    started = False
    poll_started_at = time.monotonic()

    while True:
        response = await client.get(f"{base_url}/documents")
        # Fail loudly on 401/5xx instead of parsing an error body as "no docs".
        response.raise_for_status()
        statuses = response.json().get("statuses", {})

        processing = len(statuses.get("processing", []))
        pending = len(statuses.get("pending", []))
        processed = len(statuses.get("processed", []))

        current_status = f"Pending: {pending}, Processing: {processing}, Processed: {processed}"
        if current_status != last_status:
            print(f" {current_status}")
            last_status = current_status

        # Wait until we see at least some documents in the queue
        if not started and (pending > 0 or processing > 0):
            started = True
            print(" Processing started!")

        # Only finish once work was observed AND the queue is empty again
        if started and processing == 0 and pending == 0:
            return processed, True

        # Without this bound the loop never ends when the documents never show
        # up as pending/processing (e.g. the server finished before the first
        # poll, or did not queue them - presumably duplicates; confirm against
        # server behaviour).
        if not started and time.monotonic() - poll_started_at >= start_timeout:
            print(
                f" No pending or processing documents seen after {start_timeout:.0f}s; "
                "assuming nothing is left to process"
            )
            return processed, False

        await asyncio.sleep(poll_interval)


async def ingest_documents(
    input_dir: Path,
    rag_url: str,
    *,
    poll_interval: float = 5.0,
    start_timeout: float = 60.0,
) -> dict:
    """Ingest all text files from directory into LightRAG.

    Args:
        input_dir: Directory containing .txt or .md files
        rag_url: LightRAG API base URL (a trailing slash is tolerated)
        poll_interval: Seconds to sleep between status polls
        start_timeout: Seconds to wait for documents to appear as pending or
            processing before giving up on the wait

    Returns:
        Dict with ingestion statistics: "documents", "processed",
        "elapsed_seconds", "track_id" and, after an upload,
        "processing_observed".

    Raises:
        ConnectionError: If the server is unreachable or /health is not 200.
        httpx.HTTPStatusError: If the upload or a status poll returns an
            error status.
    """
    base_url = rag_url.rstrip("/")  # avoid "//health" style URLs
    timeout = httpx.Timeout(120.0, connect=30.0)
    api_key = os.getenv("LIGHTRAG_API_KEY")
    headers = {"X-API-Key": api_key} if api_key else {}

    # Headers are set on the client so that every request, including the
    # status polls, carries the API key when one is configured.
    async with httpx.AsyncClient(timeout=timeout, headers=headers) as client:
        # Check health
        try:
            health = await client.get(f"{base_url}/health")
        except (httpx.ConnectError, httpx.ConnectTimeout) as exc:
            raise ConnectionError(f"Cannot connect to LightRAG at {rag_url}") from exc
        if health.status_code != 200:
            raise ConnectionError(f"LightRAG not healthy: {health.status_code}")

        print(f"✓ Connected to LightRAG at {rag_url}")

        # Collect all text files
        files = list(input_dir.glob("*.txt")) + list(input_dir.glob("*.md"))
        if not files:
            print(f"✗ No .txt or .md files found in {input_dir}")
            return {
                "documents": 0,
                "processed": 0,
                "elapsed_seconds": 0,
                "track_id": "",
            }

        print(f" Found {len(files)} documents to ingest")

        # Read all texts
        texts, sources = _load_documents(files)

        # Batch ingest via /documents/texts
        print(f"\n Uploading {len(texts)} documents...")
        start = time.monotonic()  # monotonic: immune to wall-clock adjustments

        response = await client.post(
            f"{base_url}/documents/texts",
            json={"texts": texts, "file_sources": sources},
        )
        response.raise_for_status()
        track_id = response.json().get("track_id", "")
        print(f" Track ID: {track_id}")

        # Poll for completion - wait for processing to start first
        print(" Waiting for processing to start...")
        await asyncio.sleep(2)  # Give server time to queue documents

        processed_count, processing_observed = await _wait_for_processing(
            client, base_url, poll_interval, start_timeout
        )

        elapsed = time.monotonic() - start
        print(f"\n✓ Ingestion complete in {elapsed:.1f}s")
        print(f" Documents processed: {processed_count}")
        print(f" Average: {elapsed / len(texts):.1f}s per document")

        return {
            "documents": len(texts),
            "processed": processed_count,
            "elapsed_seconds": elapsed,
            "track_id": track_id,
            "processing_observed": processing_observed,
        }


async def main():
    """Parse CLI arguments and run the ingestion against the chosen server.

    The server URL is taken from --rag-url, then the LIGHTRAG_API_URL
    environment variable, then DEFAULT_RAG_URL.
    """
    parser = argparse.ArgumentParser(description="Ingest test documents into LightRAG")
    parser.add_argument(
        "--input",
        "-i",
        type=str,
        default="lightrag/evaluation/wiki_documents",
        help="Input directory with text files",
    )
    parser.add_argument(
        "--rag-url",
        "-r",
        type=str,
        default=None,
        help=f"LightRAG API URL (default: {DEFAULT_RAG_URL})",
    )
    args = parser.parse_args()

    input_dir = Path(args.input)
    rag_url = args.rag_url or os.getenv("LIGHTRAG_API_URL", DEFAULT_RAG_URL)

    print("=== LightRAG Document Ingestion ===")
    print(f"Input: {input_dir}/")
    print(f"RAG URL: {rag_url}")
    print()

    # is_dir() rather than exists(): a regular file at this path is not usable.
    if not input_dir.is_dir():
        print(f"✗ Input directory not found: {input_dir}")
        print(" Run download_wikipedia.py first:")
        print(" python lightrag/evaluation/download_wikipedia.py")
        return

    await ingest_documents(input_dir, rag_url)


if __name__ == "__main__":
    asyncio.run(main())