---
# Docker Compose stack for the LightRAG entity-resolution test environment.
#
# Services defined here:
#   postgres - PostgreSQL built from ./docker/postgres-age-vector
#              (preloads the 'vector' and 'age' libraries)
#   rustfs   - RustFS object store (S3 API on 9000, web console on 9001)
#   lightrag - LightRAG API server built from the repository root
#
# Host ports are shifted (5433, 9622) so this stack can run next to other
# local stacks without port conflicts.
name: lightrag-entity-resolution-test

services:
  postgres:
    container_name: lightrag-postgres
    build:
      context: ./docker/postgres-age-vector
      dockerfile: Dockerfile
    environment:
      POSTGRES_DB: lightrag
      POSTGRES_USER: lightrag
      POSTGRES_PASSWORD: lightrag_pass
    ports:
      # Host port 5433 avoids a conflict with the agent-sdk postgres.
      - "5433:5432"
    volumes:
      - pgdata_test:/var/lib/postgresql/data
    # Server settings are passed as individual -c flags, one per line.
    command: |
      postgres
      -c shared_preload_libraries='vector,age'
      -c max_connections=150
      -c shared_buffers=768MB
      -c work_mem=32MB
      -c checkpoint_completion_target=0.9
      -c effective_cache_size=2GB
      -c maintenance_work_mem=192MB
      -c wal_compression=on
      -c checkpoint_timeout=10min
      -c max_wal_size=1GB
      -c random_page_cost=1.1
      -c effective_io_concurrency=200
      -c max_worker_processes=12
      -c max_parallel_workers_per_gather=4
      -c max_parallel_workers=8
      -c max_parallel_maintenance_workers=4
      -c jit_above_cost=50000
      -c jit_inline_above_cost=250000
      -c jit_optimize_above_cost=250000
      -c default_statistics_target=200
      -c hash_mem_multiplier=4
    # Healthy once pg_isready succeeds for the lightrag user and database.
    healthcheck:
      test:
        - CMD-SHELL
        - "pg_isready -U lightrag -d lightrag"
      interval: 5s
      timeout: 5s
      retries: 5
    mem_limit: 2g

  rustfs:
    # NOTE(review): ':latest' is a floating tag; consider pinning a version
    # for reproducible test runs.
    image: rustfs/rustfs:latest
    container_name: rustfs-test
    ports:
      # S3 API
      - "9000:9000"
      # Web console
      - "9001:9001"
    environment:
      RUSTFS_ACCESS_KEY: rustfsadmin
      RUSTFS_SECRET_KEY: rustfsadmin
    command: /data
    volumes:
      - rustfs_data:/data
    healthcheck:
      # RustFS answers unauthenticated requests with AccessDenied, which still
      # proves the server is alive; otherwise fall back to a plain successful
      # HTTP response.
      test:
        - CMD-SHELL
        - "curl -s http://localhost:9000/ | grep -q 'AccessDenied' || curl -sf http://localhost:9000/"
      interval: 10s
      timeout: 5s
      retries: 5
    mem_limit: 512m

  lightrag:
    container_name: lightrag-test
    build:
      context: .
      dockerfile: Dockerfile
    ports:
      # Host port 9622 avoids a conflict; the server listens on 9621 inside
      # the container (see PORT below).
      - "9622:9621"
    volumes:
      - ./data/rag_storage_test:/app/data/rag_storage
      - ./data/inputs_test:/app/data/inputs
      # Live reload: an absolute host path is used for Docker-in-Docker
      # compatibility (Coder). This path is specific to that host layout.
      - /var/lib/docker/volumes/coder-shared-projects-optimized/_data/LightRAG/lightrag:/app/lightrag
    environment:
      # Live reload: PYTHONPATH makes the mounted /app/lightrag take
      # precedence over site-packages.
      - PYTHONPATH=/app

      # Server
      - HOST=0.0.0.0
      - PORT=9621
      - LOG_LEVEL=DEBUG

      # LLM (OpenAI - gpt-4o-mini for reliable, fast extraction).
      # OPENAI_API_KEY is interpolated from the environment running Compose.
      - LLM_BINDING=openai
      - LLM_MODEL=gpt-4o-mini
      - LLM_BINDING_HOST=https://api.openai.com/v1
      - LLM_BINDING_API_KEY=${OPENAI_API_KEY}

      # Embedding
      - EMBEDDING_BINDING=openai
      - EMBEDDING_MODEL=text-embedding-3-small
      - EMBEDDING_DIM=1536
      - EMBEDDING_BINDING_HOST=https://api.openai.com/v1
      - EMBEDDING_BINDING_API_KEY=${OPENAI_API_KEY}

      # Storage - everything lives in PostgreSQL.
      # The custom postgres image provides pgvector + Apache AGE.
      - LIGHTRAG_KV_STORAGE=PGKVStorage
      - LIGHTRAG_VECTOR_STORAGE=PGVectorStorage
      - LIGHTRAG_GRAPH_STORAGE=PGGraphStorage
      - LIGHTRAG_DOC_STATUS_STORAGE=PGDocStatusStorage
      - POSTGRES_HOST=postgres
      - POSTGRES_PORT=5432
      - POSTGRES_USER=lightrag
      - POSTGRES_PASSWORD=lightrag_pass
      - POSTGRES_DATABASE=lightrag

      # Entity resolution - DISABLED for faster ingestion
      # (testing Context Precision changes).
      - ENTITY_RESOLUTION_ENABLED=false
      - ENTITY_RESOLUTION_FUZZY_THRESHOLD=0.85
      - ENTITY_RESOLUTION_VECTOR_THRESHOLD=0.5
      - ENTITY_RESOLUTION_MAX_CANDIDATES=3

      # Orphan connection - MANUAL mode: trigger it from the UI button
      # instead of running automatically.
      - AUTO_CONNECT_ORPHANS=false
      # Vector similarity pre-filter threshold
      - ORPHAN_CONNECTION_THRESHOLD=0.3
      # LLM confidence required for a connection
      - ORPHAN_CONFIDENCE_THRESHOLD=0.7
      # Allow orphan-to-orphan connections
      - ORPHAN_CROSS_CONNECT=true

      # Processing - matches the agent-sdk working settings.
      - MAX_ASYNC=96
      - MAX_PARALLEL_INSERT=10
      # Originally sized to match llamacpp parallel slots (prevents queue
      # backlog). NOTE(review): the embedding binding above is openai -
      # confirm this cap is still intended.
      - EMBEDDING_FUNC_MAX_ASYNC=2
      - EMBEDDING_BATCH_NUM=48

      # Gunicorn - 8 workers x 4 threads = 32 concurrent handlers.
      # Folded scalar: the lines below join into one space-separated value.
      - >-
        GUNICORN_CMD_ARGS=--workers=8 --worker-class=gthread --threads=4
        --worker-connections=1000 --timeout=120 --keep-alive=5
        --graceful-timeout=30

      # Extraction - agent-sdk defaults for reliable ingestion.
      # MAX_GLEANING defaults to 1 (the earlier override of 2 was removed).
      # Default chunk size (agent-sdk default)
      - CHUNK_SIZE=1200
      # Default overlap
      - CHUNK_OVERLAP_SIZE=100

      # S3/RustFS - document staging and archival.
      # Credentials match RUSTFS_ACCESS_KEY / RUSTFS_SECRET_KEY on the
      # rustfs service.
      - S3_ENDPOINT_URL=http://rustfs:9000
      - S3_ACCESS_KEY_ID=rustfsadmin
      - S3_SECRET_ACCESS_KEY=rustfsadmin
      - S3_BUCKET_NAME=lightrag
      - S3_REGION=us-east-1
    # Start only after both backing services report healthy.
    depends_on:
      postgres:
        condition: service_healthy
      rustfs:
        condition: service_healthy
    # Empty entrypoint so that `command` below is the full process invocation.
    entrypoint: []
    command:
      - python
      - -m
      - lightrag.api.run_with_gunicorn
      - --workers
      - "8"
      - --llm-binding
      - openai
      - --embedding-binding
      - openai
    healthcheck:
      test:
        - CMD-SHELL
        - "curl -f http://localhost:9621/health || exit 1"
      interval: 10s
      timeout: 5s
      retries: 10
      start_period: 60s
    mem_limit: 2g

# Named volumes with default settings.
volumes:
  pgdata_test: {}
  rustfs_data: {}