Implement an automatic orphan entity connection system that identifies entities with no relationships and creates meaningful connections via vector similarity + LLM validation. This improves knowledge graph connectivity and retrieval quality.

Changes:
- Add orphan connection configuration parameters (thresholds, cross-connect settings)
- Implement aconnect_orphan_entities() method with 4-step validation pipeline
- Add SQL templates for efficient orphan and candidate entity queries
- Create POST /graph/orphans/connect API endpoint with configurable parameters
- Add orphan connection validation prompt for LLM-based relationship verification
- Include relationship density requirement in extraction prompts to prevent orphans
- Update docker-compose.test.yml with optimized extraction parameters
- Add quality validation test suite (run_quality_tests.py) for retrieval evaluation
- Add unit test framework (test_orphan_connection_quality.py) with test cases
- Enable auto-run of orphan connection after document processing
95 lines
3.1 KiB
YAML
95 lines
3.1 KiB
YAML
# Compose project for the LightRAG entity-resolution test stack:
# a PostgreSQL backend plus the LightRAG server that stores everything in it.
name: lightrag-entity-resolution-test

services:
  postgres:
    container_name: lightrag-postgres
    build:
      context: ./docker/postgres-age-vector
      dockerfile: Dockerfile
    environment:
      POSTGRES_DB: "lightrag"
      POSTGRES_USER: "lightrag"
      POSTGRES_PASSWORD: "lightrag_pass"
    ports:
      # Published on host port 5433 to avoid a conflict with the agent-sdk postgres.
      - "5433:5432"
    volumes:
      - pgdata_test:/var/lib/postgresql/data
    healthcheck:
      test:
        - "CMD-SHELL"
        - "pg_isready -U lightrag -d lightrag"
      interval: 5s
      timeout: 5s
      retries: 5

  lightrag:
    container_name: lightrag-test
    build:
      context: .
      dockerfile: Dockerfile
    ports:
      # Published on host port 9622 to avoid a port conflict.
      - "9622:9621"
    volumes:
      - ./data/rag_storage_test:/app/data/rag_storage
      - ./data/inputs_test:/app/data/inputs
    # Mapping form, matching the postgres service; every value is quoted so it
    # reaches the container as the exact string written here.
    environment:
      # --- Server ---
      HOST: "0.0.0.0"
      PORT: "9621"
      LOG_LEVEL: "DEBUG"

      # --- LLM (OpenAI) ---
      LLM_BINDING: "openai"
      LLM_MODEL: "gpt-4o-mini"
      LLM_BINDING_HOST: "https://api.openai.com/v1"
      # Interpolated by Compose from the OPENAI_API_KEY variable.
      LLM_BINDING_API_KEY: "${OPENAI_API_KEY}"

      # --- Embedding ---
      EMBEDDING_BINDING: "openai"
      EMBEDDING_MODEL: "text-embedding-3-small"
      EMBEDDING_DIM: "1536"
      EMBEDDING_BINDING_HOST: "https://api.openai.com/v1"
      EMBEDDING_BINDING_API_KEY: "${OPENAI_API_KEY}"

      # --- Storage: everything on PostgreSQL ---
      # The custom postgres image ships pgvector + Apache AGE.
      LIGHTRAG_KV_STORAGE: "PGKVStorage"
      LIGHTRAG_VECTOR_STORAGE: "PGVectorStorage"
      LIGHTRAG_GRAPH_STORAGE: "PGGraphStorage"
      LIGHTRAG_DOC_STATUS_STORAGE: "PGDocStatusStorage"
      # Connection settings mirror the postgres service defined above.
      POSTGRES_HOST: "postgres"
      POSTGRES_PORT: "5432"
      POSTGRES_USER: "lightrag"
      POSTGRES_PASSWORD: "lightrag_pass"
      POSTGRES_DATABASE: "lightrag"

      # --- Entity resolution (enabled) ---
      ENTITY_RESOLUTION_ENABLED: "true"
      ENTITY_RESOLUTION_FUZZY_THRESHOLD: "0.85"
      ENTITY_RESOLUTION_VECTOR_THRESHOLD: "0.5"
      ENTITY_RESOLUTION_MAX_CANDIDATES: "3"

      # --- Processing ---
      MAX_ASYNC: "4"

      # --- Extraction tuning, aimed at reducing orphan nodes ---
      CHUNK_SIZE: "800"  # smaller chunks for focused extraction
      CHUNK_OVERLAP_SIZE: "400"  # 50% overlap captures cross-boundary relationships
      MAX_GLEANING: "1"  # enable the gleaning refinement pass
      FORCE_LLM_SUMMARY_ON_MERGE: "4"  # more aggressive entity consolidation

      # --- Orphan connection (self-healing graph) ---
      AUTO_CONNECT_ORPHANS: "true"  # run orphan connection after each document
      ORPHAN_CONNECTION_THRESHOLD: "0.3"  # vector-similarity pre-filter threshold
      ORPHAN_CONFIDENCE_THRESHOLD: "0.7"  # LLM confidence required for a connection
      ORPHAN_CROSS_CONNECT: "true"  # allow orphan-to-orphan connections
    depends_on:
      # Wait until the postgres healthcheck passes before starting.
      postgres:
        condition: service_healthy
    healthcheck:
      test:
        - "CMD-SHELL"
        - "curl -f http://localhost:9621/health || exit 1"
      interval: 10s
      timeout: 5s
      retries: 10
      start_period: 30s

volumes:
  # Named volume that holds the test database's data directory.
  pgdata_test: {}