From ff0a18e08c720f7ac63a36bf6c3f40dd2897d26e Mon Sep 17 00:00:00 2001 From: yangdx Date: Wed, 27 Aug 2025 12:23:22 +0800 Subject: [PATCH] Unify SUMMARY_LANGUAGE and ENTITY_TYPES implementation method --- env.example | 11 +++++++---- lightrag/api/config.py | 2 +- lightrag/api/lightrag_server.py | 10 ++++++++-- lightrag/constants.py | 12 ++++++++++-- lightrag/lightrag.py | 7 +++++-- lightrag/operate.py | 7 +++---- lightrag/prompt.py | 1 - 7 files changed, 34 insertions(+), 16 deletions(-) diff --git a/env.example b/env.example index a00b8f88..6cb0dce7 100644 --- a/env.example +++ b/env.example @@ -119,9 +119,14 @@ RERANK_BINDING=null ######################################## ### Document processing configuration ######################################## -### Language: English, Chinese, French, German ... -SUMMARY_LANGUAGE=English ENABLE_LLM_CACHE_FOR_EXTRACT=true + +### Document processing output language: English, Chinese, French, German ... +SUMMARY_LANGUAGE=English + +### Entity types that the LLM will attempt to recognize +# ENTITY_TYPES=["person", "organization", "location", "event", "concept"] + ### Chunk size for document splitting, 500~1500 is recommended # CHUNK_SIZE=1200 # CHUNK_OVERLAP_SIZE=100 @@ -134,8 +139,6 @@ ENABLE_LLM_CACHE_FOR_EXTRACT=true # SUMMARY_LENGTH_RECOMMENDED_=600 ### Maximum context size sent to LLM for description summary # SUMMARY_CONTEXT_SIZE=12000 -### Customize the entities that the LLM will attempt to recognize -# ENTITY_TYPES=["person", "organization", "location", "event", "concept"] ############################### ### Concurrency Configuration diff --git a/lightrag/api/config.py b/lightrag/api/config.py index 70b855f2..eae2f45b 100644 --- a/lightrag/api/config.py +++ b/lightrag/api/config.py @@ -38,7 +38,7 @@ from lightrag.constants import ( DEFAULT_OLLAMA_MODEL_NAME, DEFAULT_OLLAMA_MODEL_TAG, DEFAULT_RERANK_BINDING, - DEFAULT_ENTITY_TYPES + DEFAULT_ENTITY_TYPES, ) # use the .env that is inside the current folder diff --git 
a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py index 26c99961..a2a4d848 100644 --- a/lightrag/api/lightrag_server.py +++ b/lightrag/api/lightrag_server.py @@ -499,7 +499,10 @@ def create_app(args): rerank_model_func=rerank_model_func, max_parallel_insert=args.max_parallel_insert, max_graph_nodes=args.max_graph_nodes, - addon_params={"language": args.summary_language, "entity_types": args.entity_types}, + addon_params={ + "language": args.summary_language, + "entity_types": args.entity_types, + }, ollama_server_infos=ollama_server_infos, ) else: # azure_openai @@ -526,7 +529,10 @@ def create_app(args): rerank_model_func=rerank_model_func, max_parallel_insert=args.max_parallel_insert, max_graph_nodes=args.max_graph_nodes, - addon_params={"language": args.summary_language, "entity_types": args.entity_types}, + addon_params={ + "language": args.summary_language, + "entity_types": args.entity_types, + }, ollama_server_infos=ollama_server_infos, ) diff --git a/lightrag/constants.py b/lightrag/constants.py index d0271be4..4e85325b 100644 --- a/lightrag/constants.py +++ b/lightrag/constants.py @@ -11,7 +11,7 @@ DEFAULT_WOKERS = 2 DEFAULT_MAX_GRAPH_NODES = 1000 # Default values for extraction settings -DEFAULT_SUMMARY_LANGUAGE = "English" # Default language for summaries +DEFAULT_SUMMARY_LANGUAGE = "English" # Default language for document processing DEFAULT_MAX_GLEANING = 1 # Number of description fragments to trigger LLM summary @@ -23,7 +23,15 @@ DEFAULT_SUMMARY_LENGTH_RECOMMENDED = 600 # Maximum token size sent to LLM for summary DEFAULT_SUMMARY_CONTEXT_SIZE = 12000 # Default entities to extract if ENTITY_TYPES is not specified in .env -DEFAULT_ENTITY_TYPES = ["organization", "person", "geo", "event", "category"] +DEFAULT_ENTITY_TYPES = [ + "organization", + "person", + "geo", + "event", + "category", + "Equipment", + "Location", +] # Separator for graph fields GRAPH_FIELD_SEP = "" diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 
235345e8..34ff87e6 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -39,7 +39,8 @@ from lightrag.constants import ( DEFAULT_MAX_ASYNC, DEFAULT_MAX_PARALLEL_INSERT, DEFAULT_MAX_GRAPH_NODES, - DEFAULT_ENTITY_TYPES + DEFAULT_ENTITY_TYPES, + DEFAULT_SUMMARY_LANGUAGE, ) from lightrag.utils import get_env_value @@ -348,7 +349,9 @@ class LightRAG: addon_params: dict[str, Any] = field( default_factory=lambda: { - "language": get_env_value("SUMMARY_LANGUAGE", "English", str), + "language": get_env_value( + "SUMMARY_LANGUAGE", DEFAULT_SUMMARY_LANGUAGE, str + ), "entity_types": get_env_value("ENTITY_TYPES", DEFAULT_ENTITY_TYPES, list), } ) diff --git a/lightrag/operate.py b/lightrag/operate.py index 76a0b2c1..38771f7b 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -47,7 +47,8 @@ from .constants import ( DEFAULT_MAX_TOTAL_TOKENS, DEFAULT_RELATED_CHUNK_NUMBER, DEFAULT_KG_CHUNK_PICK_METHOD, - DEFAULT_ENTITY_TYPES + DEFAULT_ENTITY_TYPES, + DEFAULT_SUMMARY_LANGUAGE, ) from .kg.shared_storage import get_storage_keyed_lock import time @@ -1651,9 +1652,7 @@ async def extract_entities( ordered_chunks = list(chunks.items()) # add language and example number params to prompt - language = global_config["addon_params"].get( - "language", PROMPTS["DEFAULT_LANGUAGE"] - ) + language = global_config["addon_params"].get("language", DEFAULT_SUMMARY_LANGUAGE) entity_types = global_config["addon_params"].get( "entity_types", DEFAULT_ENTITY_TYPES ) diff --git a/lightrag/prompt.py b/lightrag/prompt.py index 69fb2ef3..f8ea6589 100644 --- a/lightrag/prompt.py +++ b/lightrag/prompt.py @@ -4,7 +4,6 @@ from typing import Any PROMPTS: dict[str, Any] = {} -PROMPTS["DEFAULT_LANGUAGE"] = "English" PROMPTS["DEFAULT_TUPLE_DELIMITER"] = "<|>" PROMPTS["DEFAULT_RECORD_DELIMITER"] = "##" PROMPTS["DEFAULT_COMPLETION_DELIMITER"] = "<|COMPLETE|>"