# LightRAG/lightrag/evaluation/ingest_test_docs.py

#!/usr/bin/env python3
"""
Ingest test documents into LightRAG for testing.

This script reads text files from a directory and batch-uploads them to
LightRAG via the /documents/texts API endpoint, then polls for completion.

Usage:
    python lightrag/evaluation/ingest_test_docs.py
    python lightrag/evaluation/ingest_test_docs.py --input wiki_documents/ --rag-url http://localhost:9622
"""

import argparse
import asyncio
import os
import time
from pathlib import Path

import httpx

# Fallback LightRAG API base URL, used by main() when neither --rag-url nor
# the LIGHTRAG_API_URL environment variable supplies one.
DEFAULT_RAG_URL = "http://localhost:9622"


def _count_statuses(statuses: dict) -> dict:
    """Return how many documents are listed under each pipeline status."""
    return {
        name: len(statuses.get(name) or [])
        for name in ("pending", "processing", "processed", "failed")
    }


async def _fetch_status_counts(
    client: "httpx.AsyncClient", base_url: str, headers: dict
) -> dict:
    """Query GET /documents and return per-status document counts."""
    response = await client.get(f"{base_url}/documents", headers=headers)
    # Fail loudly on 401/5xx instead of treating an error body as "no documents".
    response.raise_for_status()
    return _count_statuses(response.json().get("statuses") or {})


async def ingest_documents(
    input_dir: Path,
    rag_url: str,
    *,
    poll_interval: float = 5.0,
    start_timeout: float = 60.0,
) -> dict:
    """Ingest all text files from directory into LightRAG.

    The files are uploaded in a single batch to ``/documents/texts`` and the
    ``/documents`` endpoint is then polled until the server's queue is idle.
    If ``LIGHTRAG_API_KEY`` is set, it is sent as ``X-API-Key`` on every
    request.

    Args:
        input_dir: Directory containing .txt or .md files
        rag_url: LightRAG API base URL
        poll_interval: Seconds to sleep between status polls
        start_timeout: Maximum seconds to wait for the uploaded documents to
            show up as pending/processing before giving up on polling

    Returns:
        Dict with ingestion statistics: ``documents`` (files uploaded),
        ``processed`` (server-wide processed count at the last poll),
        ``newly_processed`` / ``newly_failed`` (growth of those counts since
        just before the upload), ``elapsed_seconds`` and ``track_id``.

    Raises:
        ConnectionError: If the server is unreachable or reports unhealthy.
        httpx.HTTPStatusError: If the upload or a status poll is rejected.
    """
    timeout = httpx.Timeout(120.0, connect=30.0)
    api_key = os.getenv("LIGHTRAG_API_KEY")
    headers = {"X-API-Key": api_key} if api_key else {}
    # Avoid "//" in request paths when the caller passes a trailing slash.
    base_url = rag_url.rstrip("/")

    async with httpx.AsyncClient(timeout=timeout) as client:
        # Check health
        try:
            health = await client.get(f"{base_url}/health", headers=headers)
        except httpx.ConnectError as err:
            raise ConnectionError(f"Cannot connect to LightRAG at {rag_url}") from err
        if health.status_code != 200:
            raise ConnectionError(f"LightRAG not healthy: {health.status_code}")

        print(f"✓ Connected to LightRAG at {rag_url}")

        # Collect all text files
        files = sorted([*input_dir.glob("*.txt"), *input_dir.glob("*.md")])
        if not files:
            print(f"✗ No .txt or .md files found in {input_dir}")
            return {
                "documents": 0,
                "processed": 0,
                "newly_processed": 0,
                "newly_failed": 0,
                "elapsed_seconds": 0,
                "track_id": "",
            }

        print(f"  Found {len(files)} documents to ingest")

        # Read all texts (explicit UTF-8 so results don't depend on the locale)
        texts = []
        sources = []
        for file in files:
            content = file.read_text(encoding="utf-8")
            texts.append(content)
            sources.append(file.name)
            print(f"    {file.name}: {len(content.split()):,} words")

        # Snapshot the counts before uploading so that documents which finish
        # (or fail) before the first poll can still be detected afterwards.
        baseline = await _fetch_status_counts(client, base_url, headers)
        baseline_done = baseline["processed"] + baseline["failed"]

        # Batch ingest via /documents/texts
        print(f"\n  Uploading {len(texts)} documents...")
        start = time.monotonic()

        response = await client.post(
            f"{base_url}/documents/texts",
            json={"texts": texts, "file_sources": sources},
            headers=headers,
        )
        response.raise_for_status()
        result = response.json()

        track_id = result.get("track_id", "")
        print(f"  Track ID: {track_id}")

        # Poll for completion - wait for processing to start first
        print("  Waiting for processing to start...")
        await asyncio.sleep(2)  # Give server time to queue documents

        last_status = ""
        started = False
        wait_began = time.monotonic()

        while True:
            counts = await _fetch_status_counts(client, base_url, headers)
            pending = counts["pending"]
            processing = counts["processing"]
            processed = counts["processed"]
            failed = counts["failed"]

            current_status = (
                f"Pending: {pending}, Processing: {processing}, Processed: {processed}"
            )
            if failed:
                current_status += f", Failed: {failed}"
            if current_status != last_status:
                print(f"    {current_status}")
                last_status = current_status

            queue_idle = pending == 0 and processing == 0

            # Wait until we see at least some docs in the queue
            if not started and not queue_idle:
                started = True
                print("  Processing started!")

            if queue_idle:
                if started:
                    break
                # Everything finished (or failed) before we ever saw it queued.
                if processed + failed > baseline_done:
                    break
                # Nothing ever showed up (e.g. all uploads were rejected as
                # duplicates): stop instead of polling forever.
                if time.monotonic() - wait_began >= start_timeout:
                    print(
                        f"  ⚠ No queued documents observed after {start_timeout:.0f}s; "
                        "assuming nothing is left to process"
                    )
                    break

            await asyncio.sleep(poll_interval)

        elapsed = time.monotonic() - start
        newly_processed = max(processed - baseline["processed"], 0)
        newly_failed = max(failed - baseline["failed"], 0)

        print(f"\n✓ Ingestion complete in {elapsed:.1f}s")
        print(f"  Documents processed: {processed}")
        if newly_failed:
            print(f"  ✗ Documents failed: {newly_failed}")
        print(f"  Average: {elapsed / len(texts):.1f}s per document")

        return {
            "documents": len(texts),
            "processed": processed,
            "newly_processed": newly_processed,
            "newly_failed": newly_failed,
            "elapsed_seconds": elapsed,
            "track_id": track_id,
        }


async def main():
    """Parse command-line options and run ingestion for the chosen directory.

    The API URL comes from --rag-url when given, otherwise from the
    LIGHTRAG_API_URL environment variable, otherwise from DEFAULT_RAG_URL.
    If the input directory does not exist, a hint is printed and nothing is
    uploaded.
    """
    cli = argparse.ArgumentParser(description="Ingest test documents into LightRAG")
    cli.add_argument(
        "--input",
        "-i",
        default="lightrag/evaluation/wiki_documents",
        type=str,
        help="Input directory with text files",
    )
    cli.add_argument(
        "--rag-url",
        "-r",
        default=None,
        type=str,
        help=f"LightRAG API URL (default: {DEFAULT_RAG_URL})",
    )
    options = cli.parse_args()

    source_dir = Path(options.input)
    # Precedence: explicit flag, then environment, then built-in default.
    if options.rag_url:
        endpoint = options.rag_url
    else:
        endpoint = os.getenv("LIGHTRAG_API_URL", DEFAULT_RAG_URL)

    banner = (
        "=== LightRAG Document Ingestion ===",
        f"Input: {source_dir}/",
        f"RAG URL: {endpoint}",
        "",
    )
    for line in banner:
        print(line)

    if source_dir.exists():
        await ingest_documents(source_dir, endpoint)
        return

    # Missing directory: point the user at the script that creates it.
    print(f"✗ Input directory not found: {source_dir}")
    print("  Run download_wikipedia.py first:")
    print("    python lightrag/evaluation/download_wikipedia.py")


# Run the async CLI entry point only when executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(main())