LightRAG/docker-compose.test.yml
clssck 082a5a8fad test(lightrag,api): add comprehensive test coverage and S3 support
Add extensive test suites for API routes and utilities:
- Implement test_search_routes.py (406 lines) for search endpoint validation
- Implement test_upload_routes.py (724 lines) for document upload workflows
- Implement test_s3_client.py (618 lines) for S3 storage operations
- Implement test_citation_utils.py (352 lines) for citation extraction
- Implement test_chunking.py (216 lines) for text chunking validation
Add S3 storage client implementation:
- Create lightrag/storage/s3_client.py with S3 operations
- Add storage module initialization with exports
- Integrate S3 client with document upload handling
Enhance API routes and core functionality:
- Add search_routes.py with full-text and graph search endpoints
- Add upload_routes.py with multipart document upload support
- Update operate.py with bulk operations and health checks
- Enhance postgres_impl.py with bulk upsert and parameterized queries
- Update lightrag_server.py to register new API routes
- Improve utils.py with citation and formatting utilities
Update dependencies and configuration:
- Add S3 and test dependencies to pyproject.toml
- Update docker-compose.test.yml for testing environment
- Sync uv.lock with new dependencies
Apply code quality improvements across all modified files:
- Add type hints to function signatures
- Update imports and router initialization
- Fix logging and error handling
2025-12-05 23:13:39 +01:00

174 lines
5.7 KiB
YAML

# Docker Compose stack for LightRAG entity-resolution testing:
# postgres (pgvector + Apache AGE), rustfs (S3-compatible store),
# and the LightRAG API server itself.
name: lightrag-entity-resolution-test

services:
  # PostgreSQL backing store: custom image with pgvector + Apache AGE
  # (built from ./docker/postgres-age-vector), so one database can serve
  # the KV, vector, graph, and doc-status storage backends.
  postgres:
    container_name: lightrag-postgres
    build:
      context: ./docker/postgres-age-vector
      dockerfile: Dockerfile
    environment:
      # Test-only credentials — mirrored by the POSTGRES_* vars in the
      # lightrag service below. Do not reuse outside this test stack.
      POSTGRES_DB: lightrag
      POSTGRES_USER: lightrag
      POSTGRES_PASSWORD: lightrag_pass
    ports:
      - "5433:5432" # Use 5433 to avoid conflict with agent-sdk postgres
    volumes:
      - pgdata_test:/var/lib/postgresql/data
    # Server tuning flags passed verbatim to the postgres binary.
    # NOTE: no comments inside the literal block scalar — they would become
    # part of the command line.
    command: |
      postgres
      -c shared_preload_libraries='vector,age'
      -c max_connections=150
      -c shared_buffers=768MB
      -c work_mem=32MB
      -c checkpoint_completion_target=0.9
      -c effective_cache_size=2GB
      -c maintenance_work_mem=192MB
      -c wal_compression=on
      -c checkpoint_timeout=10min
      -c max_wal_size=1GB
      -c random_page_cost=1.1
      -c effective_io_concurrency=200
      -c max_worker_processes=12
      -c max_parallel_workers_per_gather=4
      -c max_parallel_workers=8
      -c max_parallel_maintenance_workers=4
      -c jit_above_cost=50000
      -c jit_inline_above_cost=250000
      -c jit_optimize_above_cost=250000
      -c default_statistics_target=200
      -c hash_mem_multiplier=4
    healthcheck:
      # Readiness gate for depends_on: condition: service_healthy below.
      test: ["CMD-SHELL", "pg_isready -U lightrag -d lightrag"]
      interval: 5s
      timeout: 5s
      retries: 5
    mem_limit: 2g
  # RustFS: S3-compatible object store used by LightRAG for document
  # staging and archival (see the S3_* vars in the lightrag service).
  rustfs:
    image: rustfs/rustfs:latest
    container_name: rustfs-test
    ports:
      - "9000:9000" # S3 API
      - "9001:9001" # Web console
    environment:
      # Test-only credentials — mirrored by S3_ACCESS_KEY_ID /
      # S3_SECRET_ACCESS_KEY in the lightrag service below.
      RUSTFS_ACCESS_KEY: rustfsadmin
      RUSTFS_SECRET_KEY: rustfsadmin
    # Data directory passed as the sole argument to the image entrypoint.
    command: /data
    volumes:
      - rustfs_data:/data
    healthcheck:
      # RustFS returns AccessDenied for unauth requests, but that means it's alive.
      # Falls back to a plain -f check in case the response differs.
      # NOTE(review): relies on curl being present in the rustfs image — verify.
      test: ["CMD-SHELL", "curl -s http://localhost:9000/ | grep -q 'AccessDenied' || curl -sf http://localhost:9000/"]
      interval: 10s
      timeout: 5s
      retries: 5
    mem_limit: 512m
  # LightRAG API server, built from the repo's Dockerfile. Starts only after
  # both postgres and rustfs report healthy.
  lightrag:
    container_name: lightrag-test
    build:
      context: .
      dockerfile: Dockerfile
    ports:
      - "9622:9621" # Use 9622 to avoid conflict
    volumes:
      - ./data/rag_storage_test:/app/data/rag_storage
      - ./data/inputs_test:/app/data/inputs
      # Live reload: Use absolute host path for Docker-in-Docker compatibility (Coder)
      - /var/lib/docker/volumes/coder-shared-projects-optimized/_data/LightRAG/lightrag:/app/lightrag
    environment:
      # Live reload: PYTHONPATH makes mounted /app/lightrag take precedence over site-packages
      - PYTHONPATH=/app
      # Server
      - HOST=0.0.0.0
      - PORT=9621
      - LOG_LEVEL=DEBUG
      # LLM (OpenAI - gpt-4o-mini for reliable fast extraction)
      - LLM_BINDING=openai
      - LLM_MODEL=gpt-4o-mini
      - LLM_BINDING_HOST=https://api.openai.com/v1
      - LLM_BINDING_API_KEY=${OPENAI_API_KEY}
      # Embedding
      - EMBEDDING_BINDING=openai
      - EMBEDDING_MODEL=text-embedding-3-small
      - EMBEDDING_DIM=1536
      - EMBEDDING_BINDING_HOST=https://api.openai.com/v1
      - EMBEDDING_BINDING_API_KEY=${OPENAI_API_KEY}
      # Storage Configuration - Full PostgreSQL!
      # Custom postgres image has pgvector + Apache AGE
      - LIGHTRAG_KV_STORAGE=PGKVStorage
      - LIGHTRAG_VECTOR_STORAGE=PGVectorStorage
      - LIGHTRAG_GRAPH_STORAGE=PGGraphStorage
      - LIGHTRAG_DOC_STATUS_STORAGE=PGDocStatusStorage
      # Connection details match the postgres service (in-network port 5432,
      # not the 5433 host mapping).
      - POSTGRES_HOST=postgres
      - POSTGRES_PORT=5432
      - POSTGRES_USER=lightrag
      - POSTGRES_PASSWORD=lightrag_pass
      - POSTGRES_DATABASE=lightrag
      # Entity Resolution - DISABLED for faster ingestion (testing Context Precision changes).
      # Threshold values below are inert while ENTITY_RESOLUTION_ENABLED=false.
      - ENTITY_RESOLUTION_ENABLED=false
      - ENTITY_RESOLUTION_FUZZY_THRESHOLD=0.85
      - ENTITY_RESOLUTION_VECTOR_THRESHOLD=0.5
      - ENTITY_RESOLUTION_MAX_CANDIDATES=3
      # Orphan Connection - MANUAL (use UI button instead of auto)
      - AUTO_CONNECT_ORPHANS=false
      # Processing - Matching agent-sdk working settings
      - MAX_ASYNC=96
      - MAX_PARALLEL_INSERT=10
      # NOTE(review): original comment said "match llamacpp parallel slots",
      # but the embedding binding here is openai — confirm this low limit is
      # still intended (it throttles concurrent embedding calls).
      - EMBEDDING_FUNC_MAX_ASYNC=2
      - EMBEDDING_BATCH_NUM=48
      # Gunicorn - 8 workers x 4 threads = 32 concurrent handlers.
      # Keep the worker count in sync with `--workers "8"` in command below.
      - GUNICORN_CMD_ARGS=--workers=8 --worker-class=gthread --threads=4 --worker-connections=1000 --timeout=120 --keep-alive=5 --graceful-timeout=30
      # Extraction - Using agent-sdk defaults for reliable ingestion
      - CHUNK_SIZE=1200 # Default chunk size (agent-sdk default)
      - CHUNK_OVERLAP_SIZE=100 # Default overlap
      # MAX_GLEANING defaults to 1 (removed override of 2)
      # Orphan Connection - Use UI button for manual triggering
      # AUTO_CONNECT_ORPHANS is set to false above (manual mode)
      - ORPHAN_CONNECTION_THRESHOLD=0.3 # Vector similarity pre-filter threshold
      - ORPHAN_CONFIDENCE_THRESHOLD=0.7 # LLM confidence required for connection
      - ORPHAN_CROSS_CONNECT=true # Allow orphan-to-orphan connections
      # S3/RustFS Configuration - Document staging and archival.
      # Endpoint/credentials match the rustfs service above.
      - S3_ENDPOINT_URL=http://rustfs:9000
      - S3_ACCESS_KEY_ID=rustfsadmin
      - S3_SECRET_ACCESS_KEY=rustfsadmin
      - S3_BUCKET_NAME=lightrag
      - S3_REGION=us-east-1
    # Gate startup on both backends passing their healthchecks.
    depends_on:
      postgres:
        condition: service_healthy
      rustfs:
        condition: service_healthy
    # Clear the image entrypoint so `command` below runs verbatim.
    entrypoint: []
    command:
      - python
      - -m
      - lightrag.api.run_with_gunicorn
      - --workers
      - "8"
      - --llm-binding
      - openai
      - --embedding-binding
      - openai
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://localhost:9621/health || exit 1"]
      interval: 10s
      timeout: 5s
      retries: 10
      # Grace period before failed checks count against `retries`.
      start_period: 60s
    mem_limit: 2g
# Named volumes (default local driver) so test data survives container
# recreation; remove with `docker compose down -v` for a clean slate.
volumes:
  pgdata_test:
  rustfs_data: