LightRAG/docker-compose.test.yml
clssck d2c9e6e2ec test(lightrag): add orphan connection feature with quality validation tests
Implement automatic orphan entity connection system that identifies entities with
no relationships and creates meaningful connections via vector similarity + LLM
validation. This improves knowledge graph connectivity and retrieval quality.
Changes:
- Add orphan connection configuration parameters (thresholds, cross-connect settings)
- Implement aconnect_orphan_entities() method with 4-step validation pipeline
- Add SQL templates for efficient orphan and candidate entity queries
- Create POST /graph/orphans/connect API endpoint with configurable parameters
- Add orphan connection validation prompt for LLM-based relationship verification
- Include relationship density requirement in extraction prompts to prevent orphans
- Update docker-compose.test.yml with optimized extraction parameters
- Add quality validation test suite (run_quality_tests.py) for retrieval evaluation
- Add unit test framework (test_orphan_connection_quality.py) with test cases
- Enable auto-run of orphan connection after document processing
2025-11-28 18:23:30 +01:00

95 lines
3.1 KiB
YAML

# Compose project name for this test stack.
name: lightrag-entity-resolution-test
# Test stack: a PostgreSQL backend plus the LightRAG server that keeps its
# KV, vector, graph and doc-status storage in that database.
services:
  # Database built from the local ./docker/postgres-age-vector image
  # definition (pgvector + Apache AGE, per the storage notes below).
  postgres:
    container_name: lightrag-postgres
    build:
      context: ./docker/postgres-age-vector
      dockerfile: Dockerfile
    environment:
      # These credentials must stay in sync with the POSTGRES_* values
      # passed to the lightrag service.
      POSTGRES_DB: lightrag
      POSTGRES_USER: lightrag
      POSTGRES_PASSWORD: lightrag_pass
    ports:
      - "5433:5432"  # Use 5433 to avoid conflict with agent-sdk postgres
    volumes:
      - pgdata_test:/var/lib/postgresql/data
    healthcheck:
      # The lightrag service waits on this check (depends_on below).
      test: ["CMD-SHELL", "pg_isready -U lightrag -d lightrag"]
      interval: 5s
      timeout: 5s
      retries: 5

  # LightRAG API server built from the repository root Dockerfile.
  lightrag:
    container_name: lightrag-test
    build:
      context: .
      dockerfile: Dockerfile
    ports:
      - "9622:9621"  # Use 9622 to avoid conflict
    volumes:
      - ./data/rag_storage_test:/app/data/rag_storage
      - ./data/inputs_test:/app/data/inputs
    environment:
      # Server
      - HOST=0.0.0.0
      - PORT=9621
      - LOG_LEVEL=DEBUG
      # LLM (OpenAI)
      # OPENAI_API_KEY is interpolated by Compose from the host environment
      # or an .env file; when unset it is replaced by an empty string.
      - LLM_BINDING=openai
      - LLM_MODEL=gpt-4o-mini
      - LLM_BINDING_HOST=https://api.openai.com/v1
      - LLM_BINDING_API_KEY=${OPENAI_API_KEY}
      # Embedding
      - EMBEDDING_BINDING=openai
      - EMBEDDING_MODEL=text-embedding-3-small
      - EMBEDDING_DIM=1536
      - EMBEDDING_BINDING_HOST=https://api.openai.com/v1
      - EMBEDDING_BINDING_API_KEY=${OPENAI_API_KEY}
      # Storage Configuration - Full PostgreSQL!
      # Custom postgres image has pgvector + Apache AGE
      - LIGHTRAG_KV_STORAGE=PGKVStorage
      - LIGHTRAG_VECTOR_STORAGE=PGVectorStorage
      - LIGHTRAG_GRAPH_STORAGE=PGGraphStorage
      - LIGHTRAG_DOC_STATUS_STORAGE=PGDocStatusStorage
      # Connection targets the postgres service over the Compose network,
      # hence the container port 5432 rather than the published 5433.
      - POSTGRES_HOST=postgres
      - POSTGRES_PORT=5432
      - POSTGRES_USER=lightrag
      - POSTGRES_PASSWORD=lightrag_pass
      - POSTGRES_DATABASE=lightrag
      # Entity Resolution - ENABLED!
      - ENTITY_RESOLUTION_ENABLED=true
      - ENTITY_RESOLUTION_FUZZY_THRESHOLD=0.85
      - ENTITY_RESOLUTION_VECTOR_THRESHOLD=0.5
      - ENTITY_RESOLUTION_MAX_CANDIDATES=3
      # Processing
      - MAX_ASYNC=4
      # Extraction Optimization - Reduce Orphan Nodes
      - CHUNK_SIZE=800  # Smaller chunks for focused extraction
      - CHUNK_OVERLAP_SIZE=400  # 50% overlap captures cross-boundary relationships
      - MAX_GLEANING=1  # Enable gleaning refinement pass
      - FORCE_LLM_SUMMARY_ON_MERGE=4  # More aggressive entity consolidation
      # Orphan Connection - Self-healing graph
      - AUTO_CONNECT_ORPHANS=true  # Run orphan connection after each doc
      - ORPHAN_CONNECTION_THRESHOLD=0.3  # Vector similarity pre-filter threshold
      - ORPHAN_CONFIDENCE_THRESHOLD=0.7  # LLM confidence required for connection
      - ORPHAN_CROSS_CONNECT=true  # Allow orphan-to-orphan connections
    depends_on:
      # Do not start until the postgres healthcheck reports healthy.
      postgres:
        condition: service_healthy
    healthcheck:
      # Probes the server on its in-container port (PORT=9621 above).
      # NOTE(review): requires curl inside the image - confirm it is installed.
      test: ["CMD-SHELL", "curl -f http://localhost:9621/health || exit 1"]
      interval: 10s
      timeout: 5s
      retries: 10
      start_period: 30s
# Named volume holding the test database's data directory; the postgres
# service mounts it at /var/lib/postgresql/data.
volumes:
  pgdata_test: {}