LightRAG/upload_pdfs.py
clssck 59e89772de refactor: consolidate to PostgreSQL-only backend and modernize stack
Remove legacy storage implementations and deprecated examples:
- Delete FAISS, JSON, Memgraph, Milvus, MongoDB, Nano Vector DB, Neo4j, NetworkX, Qdrant, Redis storage backends
- Remove Kubernetes deployment manifests and installation scripts
- Delete unofficial examples for deprecated backends and offline deployment docs
Streamline core infrastructure:
- Consolidate storage layer to PostgreSQL-only implementation
- Add full-text search caching with FTS cache module
- Implement metrics collection and monitoring pipeline
- Add explain and metrics API routes
Modernize frontend and tooling:
- Switch web UI to Bun with bun.lock, remove npm and pnpm lockfiles
- Update Dockerfile for PostgreSQL-only deployment
- Add Makefile for common development tasks
- Update environment and configuration examples
Enhance evaluation and testing capabilities:
- Add prompt optimization with DSPy and auto-tuning
- Implement ground truth regeneration and variant testing
- Add prompt debugging and response comparison utilities
- Expand test coverage with new integration scenarios
Simplify dependencies and configuration:
- Remove offline-specific requirement files
- Update pyproject.toml with streamlined dependencies
- Add Python version pinning with .python-version
- Create project guidelines in CLAUDE.md and AGENTS.md
2025-12-12 16:28:49 +01:00

60 lines
2 KiB
Python

#!/usr/bin/env python3
"""Upload PDFs to LightRAG server."""
import argparse
import os
import sys
from pathlib import Path
from typing import Any
import requests
def upload_pdfs(pdf_dir: Path, api_url: str, timeout: float) -> int:
    """Upload every ``*.pdf`` file found directly in *pdf_dir* to *api_url*.

    Each file is sent as the multipart ``file`` field of its own POST request.
    A failure on one file is reported and does not stop the remaining uploads.

    Args:
        pdf_dir: Directory scanned (non-recursively) for ``*.pdf`` entries.
        api_url: Upload endpoint that receives one POST per file.
        timeout: Per-request timeout in seconds, forwarded to ``requests.post``.

    Returns:
        A process exit code: ``0`` when every upload succeeded (or there was
        nothing to upload), ``1`` when *pdf_dir* is not a directory or at
        least one upload failed.
    """
    # is_dir() is already False for a missing path, so one check covers both.
    if not pdf_dir.is_dir():
        print(f'ERROR: PDF directory does not exist or is not a directory: {pdf_dir}')
        return 1

    # glob() yields entries in arbitrary filesystem order; sort so the upload
    # order and the [i/n] progress output are reproducible between runs.
    pdf_files = sorted(pdf_dir.glob('*.pdf'))
    total = len(pdf_files)
    print(f'Found {total} PDFs to upload in {pdf_dir}')

    failures = 0
    for i, pdf_path in enumerate(pdf_files, 1):
        print(f'[{i}/{total}] Uploading: {pdf_path.name}')
        try:
            with open(pdf_path, 'rb') as f:
                files = {'file': (pdf_path.name, f, 'application/pdf')}
                response = requests.post(api_url, files=files, timeout=timeout)
            response.raise_for_status()
            result: Any = response.json()
        except Exception as e:
            # Deliberate per-file boundary: an unreadable file, a network
            # error, an HTTP error status or a non-JSON body must not abort
            # the rest of the batch, but it must be reflected in the exit code.
            failures += 1
            print(f' -> ERROR: {e}')
            continue

        # The response body is used for display only, so tolerate a non-object
        # JSON body or a null/missing message instead of reporting a
        # successful upload as an error.
        if isinstance(result, dict):
            status = result.get('status', 'unknown')
            message = str(result.get('message') or 'No message')
        else:
            status, message = 'unknown', str(result)
        print(f' -> {status}: {message[:80]}')

    if failures:
        print(f'{failures} of {total} uploads failed')
        return 1
    return 0
def parse_args(argv: 'list[str] | None' = None) -> argparse.Namespace:
    """Parse the command-line options of the PDF upload script.

    Each option falls back to an environment variable (``PDF_DIR``,
    ``API_URL``, ``UPLOAD_TIMEOUT``) and then to a built-in default.

    Args:
        argv: Argument strings to parse. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, which is the original behaviour.

    Returns:
        Namespace with ``pdf_dir`` (str), ``api_url`` (str) and
        ``timeout`` (float).

    Raises:
        SystemExit: On invalid arguments, including a non-numeric
            ``UPLOAD_TIMEOUT`` environment value (reported as a normal
            argparse usage error).
    """
    parser = argparse.ArgumentParser(description='Upload PDFs to LightRAG')
    parser.add_argument(
        '--pdf-dir',
        default=os.getenv('PDF_DIR', 'documents/questions/docs/pdf'),
        help='Directory containing PDF files (default: env PDF_DIR or documents/questions/docs/pdf)',
    )
    parser.add_argument(
        '--api-url',
        default=os.getenv('API_URL', 'http://localhost:9621/documents/upload'),
        help='LightRAG upload endpoint (default: env API_URL or http://localhost:9621/documents/upload)',
    )
    parser.add_argument(
        '--timeout',
        type=float,
        # Keep the default as a string: argparse runs string defaults through
        # ``type`` itself, so a malformed UPLOAD_TIMEOUT becomes a clean usage
        # error instead of a ValueError traceback while building the parser.
        default=os.getenv('UPLOAD_TIMEOUT', '120'),
        help='Request timeout in seconds (default: env UPLOAD_TIMEOUT or 120)',
    )
    return parser.parse_args(argv)
if __name__ == '__main__':
    # Script entry point: resolve the CLI/environment options, run the batch
    # upload, and hand its return value to the shell as the exit status.
    cli_options = parse_args()
    sys.exit(
        upload_pdfs(
            Path(cli_options.pdf_dir),
            cli_options.api_url,
            cli_options.timeout,
        )
    )