LightRAG/lightrag/evaluation/ingest_test_docs.py
clssck 69358d830d test(lightrag,examples,api): comprehensive ruff formatting and type hints
Format entire codebase with ruff and add type hints across all modules:
- Apply ruff formatting to all Python files (121 files, 17K insertions)
- Add type hints to function signatures throughout lightrag core and API
- Update test suite with improved type annotations and docstrings
- Add pyrightconfig.json for static type checking configuration
- Create prompt_optimized.py and test_extraction_prompt_ab.py test files
- Update ruff.toml and .gitignore for improved linting configuration
- Standardize code style across examples, reproduce scripts, and utilities
2025-12-05 15:17:06 +01:00

170 lines
5.3 KiB
Python

#!/usr/bin/env python3
"""
Ingest test documents into LightRAG for testing.

This script reads text files from a directory and batch-uploads them to
LightRAG via the /documents/texts API endpoint, then polls for completion.

Usage:
    python lightrag/evaluation/ingest_test_docs.py
    python lightrag/evaluation/ingest_test_docs.py --input wiki_documents/ --rag-url http://localhost:9622
"""
import argparse
import asyncio
import os
import time
from pathlib import Path
import httpx
# Fallback LightRAG API base URL; main() uses it when neither --rag-url nor
# the LIGHTRAG_API_URL environment variable is provided.
DEFAULT_RAG_URL = 'http://localhost:9622'
async def ingest_documents(
    input_dir: Path,
    rag_url: str,
) -> dict:
    """Ingest all text files from a directory into LightRAG.

    The function checks ``{rag_url}/health``, uploads every ``*.txt`` and
    ``*.md`` file found directly in ``input_dir`` in a single request to
    ``{rag_url}/documents/texts``, and then polls ``{rag_url}/documents``
    until the server reports no pending or processing documents.

    If the ``LIGHTRAG_API_KEY`` environment variable is set, its value is sent
    as the ``X-API-Key`` header on every request (health check, upload and
    status polling).

    Args:
        input_dir: Directory containing .txt or .md files
        rag_url: LightRAG API base URL

    Returns:
        Dict with ingestion statistics: ``documents`` (number of files
        uploaded), ``processed`` (size of the server's ``processed`` list at
        the last poll), ``elapsed_seconds`` and ``track_id``.

    Raises:
        ConnectionError: If the server cannot be reached or the health
            endpoint does not answer with HTTP 200.
        httpx.HTTPStatusError: If the upload or a status poll returns an
            error status code.
    """
    timeout = httpx.Timeout(120.0, connect=30.0)
    api_key = os.getenv('LIGHTRAG_API_KEY')
    headers = {'X-API-Key': api_key} if api_key else {}

    # Seconds between two status polls.
    poll_interval_seconds = 5
    # If the server never shows queued work for this long, assume the batch
    # was already handled (finished or rejected) before the first poll instead
    # of waiting forever.
    start_grace_seconds = 60.0

    # Headers are set on the client so that *every* request is authenticated,
    # not just the upload.
    async with httpx.AsyncClient(timeout=timeout, headers=headers) as client:
        # Check health
        try:
            health = await client.get(f'{rag_url}/health')
        except httpx.ConnectError as e:
            raise ConnectionError(f'Cannot connect to LightRAG at {rag_url}') from e
        if health.status_code != 200:
            raise ConnectionError(f'LightRAG not healthy: {health.status_code}')
        print(f'✓ Connected to LightRAG at {rag_url}')

        # Collect all text files (sorted for a deterministic upload order)
        files = sorted([*input_dir.glob('*.txt'), *input_dir.glob('*.md')])
        if not files:
            print(f'✗ No .txt or .md files found in {input_dir}')
            # Same keys as the normal return value so callers can rely on them.
            return {
                'documents': 0,
                'processed': 0,
                'elapsed_seconds': 0,
                'track_id': '',
            }
        print(f' Found {len(files)} documents to ingest')

        # Read all texts
        texts = []
        sources = []
        for file in files:
            # Explicit encoding: the platform default may not be UTF-8.
            content = file.read_text(encoding='utf-8')
            texts.append(content)
            sources.append(file.name)
            word_count = len(content.split())
            print(f' {file.name}: {word_count:,} words')

        # Batch ingest via /documents/texts
        print(f'\n Uploading {len(texts)} documents...')
        # Monotonic clock: elapsed time must not be affected by clock changes.
        start = time.monotonic()
        response = await client.post(
            f'{rag_url}/documents/texts',
            json={'texts': texts, 'file_sources': sources},
        )
        response.raise_for_status()
        result = response.json()
        track_id = result.get('track_id', '')
        print(f' Track ID: {track_id}')

        # Poll for completion - wait for processing to start first
        print(' Waiting for processing to start...')
        await asyncio.sleep(2)  # Give server time to queue documents

        last_status = ''
        processed_count = 0
        seen_activity = False
        poll_start = time.monotonic()
        while True:
            status_response = await client.get(f'{rag_url}/documents')
            # Fail loudly on 401/5xx instead of parsing an error payload.
            status_response.raise_for_status()
            statuses = status_response.json().get('statuses', {})
            processing = len(statuses.get('processing', []))
            pending = len(statuses.get('pending', []))
            processed = len(statuses.get('processed', []))
            processed_count = processed

            current_status = f'Pending: {pending}, Processing: {processing}, Processed: {processed}'
            if current_status != last_status:
                print(f' {current_status}')
                last_status = current_status

            idle = processing == 0 and pending == 0

            # Wait until we see at least some docs in the queue
            if not idle and not seen_activity:
                seen_activity = True
                print(' Processing started!')

            if idle:
                if seen_activity:
                    break
                if time.monotonic() - poll_start >= start_grace_seconds:
                    # The queue was never observed as busy; the server most
                    # likely finished before our first poll.
                    print(' No queued documents observed; assuming processing already finished')
                    break

            await asyncio.sleep(poll_interval_seconds)

        elapsed = time.monotonic() - start
        print(f'\n✓ Ingestion complete in {elapsed:.1f}s')
        print(f' Documents processed: {processed_count}')
        print(f' Average: {elapsed / len(texts):.1f}s per document')
        return {
            'documents': len(texts),
            'processed': processed_count,
            'elapsed_seconds': elapsed,
            'track_id': track_id,
        }
async def main():
    """Parse command-line options and run the ingestion.

    The RAG URL is taken from ``--rag-url`` if given, otherwise from the
    ``LIGHTRAG_API_URL`` environment variable, otherwise from
    ``DEFAULT_RAG_URL``. If the input directory does not exist, a hint is
    printed and nothing is ingested.
    """
    cli = argparse.ArgumentParser(description='Ingest test documents into LightRAG')
    cli.add_argument(
        '--input',
        '-i',
        type=str,
        default='lightrag/evaluation/wiki_documents',
        help='Input directory with text files',
    )
    cli.add_argument(
        '--rag-url',
        '-r',
        type=str,
        default=None,
        help=f'LightRAG API URL (default: {DEFAULT_RAG_URL})',
    )
    opts = cli.parse_args()

    input_dir = Path(opts.input)
    # An explicit command-line value wins; the environment is only a fallback.
    rag_url = opts.rag_url
    if not rag_url:
        rag_url = os.getenv('LIGHTRAG_API_URL', DEFAULT_RAG_URL)

    banner = (
        '=== LightRAG Document Ingestion ===',
        f'Input: {input_dir}/',
        f'RAG URL: {rag_url}',
        '',
    )
    for line in banner:
        print(line)

    if input_dir.exists():
        await ingest_documents(input_dir, rag_url)
        return

    # Missing directory: tell the user how to create the test corpus.
    print(f'✗ Input directory not found: {input_dir}')
    print(' Run download_wikipedia.py first:')
    print(' python lightrag/evaluation/download_wikipedia.py')
# Script entry point: run the async main() on a fresh event loop only when the
# file is executed directly, not when it is imported.
if __name__ == '__main__':
    asyncio.run(main())