Format entire codebase with ruff and add type hints across all modules: - Apply ruff formatting to all Python files (121 files, 17K insertions) - Add type hints to function signatures throughout lightrag core and API - Update test suite with improved type annotations and docstrings - Add pyrightconfig.json for static type checking configuration - Create prompt_optimized.py and test_extraction_prompt_ab.py test files - Update ruff.toml and .gitignore for improved linting configuration - Standardize code style across examples, reproduce scripts, and utilities
170 lines
5.3 KiB
Python
170 lines
5.3 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Ingest test documents into LightRAG for testing.
|
|
|
|
This script reads text files from a directory and batch-uploads them to
|
|
LightRAG via the /documents/texts API endpoint, then polls for completion.
|
|
|
|
Usage:
|
|
python lightrag/evaluation/ingest_test_docs.py
|
|
python lightrag/evaluation/ingest_test_docs.py --input wiki_documents/ --rag-url http://localhost:9622
|
|
"""
|
|
|
|
import argparse
|
|
import asyncio
|
|
import os
|
|
import time
|
|
from pathlib import Path
|
|
|
|
import httpx
|
|
|
|
DEFAULT_RAG_URL = 'http://localhost:9622'
|
|
|
|
|
|
async def ingest_documents(
|
|
input_dir: Path,
|
|
rag_url: str,
|
|
) -> dict:
|
|
"""Ingest all text files from directory into LightRAG.
|
|
|
|
Args:
|
|
input_dir: Directory containing .txt or .md files
|
|
rag_url: LightRAG API base URL
|
|
|
|
Returns:
|
|
Dict with ingestion statistics
|
|
"""
|
|
timeout = httpx.Timeout(120.0, connect=30.0)
|
|
api_key = os.getenv('LIGHTRAG_API_KEY')
|
|
headers = {'X-API-Key': api_key} if api_key else {}
|
|
|
|
async with httpx.AsyncClient(timeout=timeout) as client:
|
|
# Check health
|
|
try:
|
|
health = await client.get(f'{rag_url}/health')
|
|
if health.status_code != 200:
|
|
raise ConnectionError(f'LightRAG not healthy: {health.status_code}')
|
|
except httpx.ConnectError as e:
|
|
raise ConnectionError(f'Cannot connect to LightRAG at {rag_url}') from e
|
|
|
|
print(f'✓ Connected to LightRAG at {rag_url}')
|
|
|
|
# Collect all text files
|
|
files = list(input_dir.glob('*.txt')) + list(input_dir.glob('*.md'))
|
|
if not files:
|
|
print(f'✗ No .txt or .md files found in {input_dir}')
|
|
return {'documents': 0, 'elapsed_seconds': 0}
|
|
|
|
print(f' Found {len(files)} documents to ingest')
|
|
|
|
# Read all texts
|
|
texts = []
|
|
sources = []
|
|
for file in sorted(files):
|
|
content = file.read_text()
|
|
texts.append(content)
|
|
sources.append(file.name)
|
|
word_count = len(content.split())
|
|
print(f' {file.name}: {word_count:,} words')
|
|
|
|
# Batch ingest via /documents/texts
|
|
print(f'\n Uploading {len(texts)} documents...')
|
|
start = time.time()
|
|
|
|
response = await client.post(
|
|
f'{rag_url}/documents/texts',
|
|
json={'texts': texts, 'file_sources': sources},
|
|
headers=headers,
|
|
)
|
|
response.raise_for_status()
|
|
result = response.json()
|
|
|
|
track_id = result.get('track_id', '')
|
|
print(f' Track ID: {track_id}')
|
|
|
|
# Poll for completion - wait for processing to start first
|
|
print(' Waiting for processing to start...')
|
|
await asyncio.sleep(2) # Give server time to queue documents
|
|
|
|
last_status = ''
|
|
processed_count = 0
|
|
len(texts)
|
|
initial_check = True
|
|
|
|
while True:
|
|
status_response = await client.get(f'{rag_url}/documents')
|
|
docs = status_response.json()
|
|
statuses = docs.get('statuses', {})
|
|
|
|
processing = len(statuses.get('processing', []))
|
|
pending = len(statuses.get('pending', []))
|
|
processed = len(statuses.get('processed', []))
|
|
processing + pending + processed
|
|
|
|
current_status = f'Pending: {pending}, Processing: {processing}, Processed: {processed}'
|
|
if current_status != last_status:
|
|
print(f' {current_status}')
|
|
last_status = current_status
|
|
processed_count = processed
|
|
|
|
# Wait until we see at least some of our docs in the queue
|
|
if initial_check and (pending > 0 or processing > 0):
|
|
initial_check = False
|
|
print(' Processing started!')
|
|
|
|
# Only exit when processing is done AND we've processed something new
|
|
if processing == 0 and pending == 0 and not initial_check:
|
|
break
|
|
|
|
await asyncio.sleep(5)
|
|
|
|
elapsed = time.time() - start
|
|
print(f'\n✓ Ingestion complete in {elapsed:.1f}s')
|
|
print(f' Documents processed: {processed_count}')
|
|
print(f' Average: {elapsed / len(texts):.1f}s per document')
|
|
|
|
return {
|
|
'documents': len(texts),
|
|
'processed': processed_count,
|
|
'elapsed_seconds': elapsed,
|
|
'track_id': track_id,
|
|
}
|
|
|
|
|
|
async def main():
|
|
parser = argparse.ArgumentParser(description='Ingest test documents into LightRAG')
|
|
parser.add_argument(
|
|
'--input',
|
|
'-i',
|
|
type=str,
|
|
default='lightrag/evaluation/wiki_documents',
|
|
help='Input directory with text files',
|
|
)
|
|
parser.add_argument(
|
|
'--rag-url',
|
|
'-r',
|
|
type=str,
|
|
default=None,
|
|
help=f'LightRAG API URL (default: {DEFAULT_RAG_URL})',
|
|
)
|
|
args = parser.parse_args()
|
|
|
|
input_dir = Path(args.input)
|
|
rag_url = args.rag_url or os.getenv('LIGHTRAG_API_URL', DEFAULT_RAG_URL)
|
|
|
|
print('=== LightRAG Document Ingestion ===')
|
|
print(f'Input: {input_dir}/')
|
|
print(f'RAG URL: {rag_url}')
|
|
print()
|
|
|
|
if not input_dir.exists():
|
|
print(f'✗ Input directory not found: {input_dir}')
|
|
print(' Run download_wikipedia.py first:')
|
|
print(' python lightrag/evaluation/download_wikipedia.py')
|
|
return
|
|
|
|
await ingest_documents(input_dir, rag_url)
|
|
|
|
|
|
if __name__ == '__main__':
|
|
asyncio.run(main())
|