Implement a comprehensive configuration management system with:

**Core Components:**
- config/config.schema.yaml: Configuration metadata (single source of truth)
- scripts/lib/generate_from_schema.py: Schema → local.yaml generator
- scripts/lib/generate_env.py: local.yaml → .env converter
- scripts/setup.sh: One-click configuration initialization

**Key Features:**
- Deep merge logic preserves existing values
- Auto-generation of secrets (32-char random strings)
- Type inference for configuration values
- Nested YAML → flat environment variables
- Git-safe: local.yaml and .env excluded from version control

**Configuration Coverage:**
- Trilingual entity extractor (Chinese/English/Swedish)
- LightRAG API, database, vector DB settings
- LLM provider configuration
- Entity/relation extraction settings
- Security and performance tuning

**Documentation:**
- docs/ConfigurationGuide-zh.md: Complete usage guide with examples

**Usage:**
```bash
./scripts/setup.sh  # Generate config/local.yaml and .env
```

This enables centralized configuration management with automatic secret generation and safe handling of sensitive data.
199 lines
5.6 KiB
YAML
199 lines
5.6 KiB
YAML
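# LightRAG trilingual entity extractor configuration schema
#
# This file defines the metadata of each configuration field, including:
# - Field path (section)
# - Default value (default)
# - Type (type: leave empty for automatic inference; `secret` marks a secret)
# - Auto-generation (auto_generate: secrets are generated automatically)
# - Description (description)
#
# Run ./scripts/setup.sh to generate config/local.yaml and .env automatically
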
# ============================================================
# Trilingual entity extractor configuration
# ============================================================

# General settings for the trilingual entity extractor.
# None of these entries declares `type`, so the type is inferred from `default`.
- section: trilingual.enabled
  default: true
  description: "Enable trilingual entity extractor (Chinese/English/Swedish)"

- section: trilingual.default_language
  # Quoted so the language code always stays a string.
  default: "en"
  description: "Default language if not specified (zh/en/sv)"

- section: trilingual.lazy_loading
  default: true
  description: "Enable lazy loading (load models on-demand to save memory)"

# Chinese settings (HanLP).
- section: trilingual.chinese.enabled
  default: true
  description: "Enable Chinese entity extraction (HanLP)"

- section: trilingual.chinese.model
  default: "CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH"
  description: "HanLP model name for Chinese"

- section: trilingual.chinese.cache_dir
  # An explicit empty string (not a bare `default:`, which would be null);
  # per the description, empty selects the default ~/.hanlp directory.
  default: ""
  description: "HanLP model cache directory (empty = default ~/.hanlp)"

# English settings (spaCy).
- section: trilingual.english.enabled
  default: true
  description: "Enable English entity extraction (spaCy)"

- section: trilingual.english.model
  # The description lists the alternative model names that may be used here.
  default: "en_core_web_trf"
  description: "spaCy model name for English (en_core_web_trf/en_core_web_lg/en_core_web_sm)"

- section: trilingual.english.batch_size
  default: 32
  description: "spaCy batch size for English processing"

# Swedish settings (spaCy).
- section: trilingual.swedish.enabled
  default: true
  description: "Enable Swedish entity extraction (spaCy)"

- section: trilingual.swedish.model
  # The description lists the alternative model names that may be used here.
  default: "sv_core_news_lg"
  description: "spaCy model name for Swedish (sv_core_news_lg/sv_core_news_md/sv_core_news_sm)"

- section: trilingual.swedish.batch_size
  default: 32
  description: "spaCy batch size for Swedish processing"

# Performance settings.
- section: trilingual.performance.max_text_length
  # Measured in characters (see description).
  default: 1000000
  description: "Maximum text length to process (characters)"

- section: trilingual.performance.enable_gpu
  default: false
  description: "Enable GPU acceleration if available"

- section: trilingual.performance.num_threads
  default: 4
  description: "Number of threads for parallel processing"

# Cache settings.
- section: trilingual.cache.enabled
  default: true
  description: "Enable result caching"

- section: trilingual.cache.ttl
  # Seconds; per the description, 0 means cached results never expire.
  default: 3600
  description: "Cache TTL in seconds (0 = no expiry)"

- section: trilingual.cache.max_size
  default: 1000
  description: "Maximum number of cached results"

# Logging settings.
- section: trilingual.logging.level
  default: "INFO"
  description: "Logging level (DEBUG/INFO/WARNING/ERROR)"

- section: trilingual.logging.format
  # Must stay quoted: a plain YAML scalar may not begin with `%`.
  # NOTE(review): the placeholders look like Python `logging` %-style record
  # attributes — confirm against the code that consumes this value.
  default: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
  description: "Logging format string"

# ============================================================
# LightRAG general configuration examples
# ============================================================

# API secret key (auto-generated).
# `type: secret` marks the field as a secret and `auto_generate: true` requests
# automatic generation; no `default` is declared for this entry.
- section: lightrag.api.secret_key
  type: secret
  auto_generate: true
  description: "API secret key (auto-generated, 32 characters)"

# API server settings.
- section: lightrag.api.host
  # NOTE(review): 0.0.0.0 conventionally means "listen on every interface";
  # confirm that this is the intended out-of-the-box default.
  default: "0.0.0.0"
  description: "API server host"

- section: lightrag.api.port
  default: 9621
  description: "API server port"

- section: lightrag.api.debug
  default: false
  description: "Enable debug mode"

# Database settings.
- section: lightrag.database.type
  default: "sqlite"
  description: "Database type (sqlite/postgres/mysql)"

- section: lightrag.database.path
  # Per the description this path applies to the SQLite backend.
  default: "./data/lightrag.db"
  description: "Database file path (for SQLite)"

# Vector database settings.
- section: lightrag.vector_db.type
  default: "nano"
  description: "Vector database type (nano/milvus/qdrant/chroma)"

- section: lightrag.vector_db.dimension
  # NOTE(review): presumably has to match the embedding model's output size —
  # confirm against the embedding configuration.
  default: 1536
  description: "Vector dimension"

# LLM settings.
- section: lightrag.llm.provider
  default: "openai"
  description: "LLM provider (openai/anthropic/ollama/custom)"

- section: lightrag.llm.model
  default: "gpt-4o-mini"
  description: "LLM model name"

# Secret that is NOT auto-generated: per the description the user supplies it.
# No `default` is declared for this entry.
- section: lightrag.llm.api_key
  type: secret
  auto_generate: false
  description: "LLM API key (user-provided)"

- section: lightrag.llm.base_url
  # Explicit empty string (not a bare `default:`, which would be null).
  default: ""
  description: "Custom LLM base URL (optional)"

- section: lightrag.llm.max_tokens
  default: 4096
  description: "Maximum tokens per request"

- section: lightrag.llm.temperature
  # Written as 0.0 (not 0) so the inferred type is a float.
  default: 0.0
  description: "LLM temperature (0.0-1.0)"

# Entity extraction settings.
- section: lightrag.entity_extraction.max_gleaning
  # NOTE(review): the description speaks of "rounds" but only explains 0 and 1;
  # confirm whether values greater than 1 are accepted.
  default: 1
  description: "Entity extraction gleaning rounds (0=disabled, 1=enabled)"

- section: lightrag.entity_extraction.use_trilingual
  # Off by default; the description notes that additional setup is required.
  default: false
  description: "Use trilingual extractor instead of LLM (requires setup)"

# Relation extraction settings.
- section: lightrag.relation_extraction.enabled
  default: true
  description: "Enable relation extraction"

- section: lightrag.relation_extraction.method
  # One of the methods listed in the description.
  default: "llm"
  description: "Relation extraction method (llm/pattern/hybrid)"

# Security settings.
- section: lightrag.security.enable_api_key
  # NOTE(review): API-key enforcement is off by default — confirm this is
  # intended for non-local deployments.
  default: false
  description: "Require API key for requests"

- section: lightrag.security.allowed_origins
  # Must stay quoted: an unquoted leading `*` is parsed as a YAML alias.
  default: "*"
  description: "CORS allowed origins (comma-separated)"

- section: lightrag.security.rate_limit
  # Requests per minute per IP (see description).
  default: 100
  description: "API rate limit (requests per minute per IP)"