Unify SUMMARY_LANGUAGE and ENTITY_TYPES implementation method
This commit is contained in:
parent
c3aabfc251
commit
ff0a18e08c
7 changed files with 34 additions and 16 deletions
11
env.example
11
env.example
|
|
@ -119,9 +119,14 @@ RERANK_BINDING=null
|
||||||
########################################
|
########################################
|
||||||
### Document processing configuration
|
### Document processing configuration
|
||||||
########################################
|
########################################
|
||||||
### Language: English, Chinese, French, German ...
|
|
||||||
SUMMARY_LANGUAGE=English
|
|
||||||
ENABLE_LLM_CACHE_FOR_EXTRACT=true
|
ENABLE_LLM_CACHE_FOR_EXTRACT=true
|
||||||
|
|
||||||
|
### Document processing output language: English, Chinese, French, German ...
|
||||||
|
SUMMARY_LANGUAGE=English
|
||||||
|
|
||||||
|
### Entity types that the LLM will attempt to recognize
|
||||||
|
# ENTITY_TYPES=["person", "organization", "location", "event", "concept"]
|
||||||
|
|
||||||
### Chunk size for document splitting, 500~1500 is recommended
|
### Chunk size for document splitting, 500~1500 is recommended
|
||||||
# CHUNK_SIZE=1200
|
# CHUNK_SIZE=1200
|
||||||
# CHUNK_OVERLAP_SIZE=100
|
# CHUNK_OVERLAP_SIZE=100
|
||||||
|
|
@ -134,8 +139,6 @@ ENABLE_LLM_CACHE_FOR_EXTRACT=true
|
||||||
# SUMMARY_LENGTH_RECOMMENDED=600
|
# SUMMARY_LENGTH_RECOMMENDED=600
|
||||||
### Maximum context size sent to LLM for description summary
|
### Maximum context size sent to LLM for description summary
|
||||||
# SUMMARY_CONTEXT_SIZE=12000
|
# SUMMARY_CONTEXT_SIZE=12000
|
||||||
### Customize the entities that the LLM will attempt to recognize
|
|
||||||
# ENTITY_TYPES=["person", "organization", "location", "event", "concept"]
|
|
||||||
|
|
||||||
###############################
|
###############################
|
||||||
### Concurrency Configuration
|
### Concurrency Configuration
|
||||||
|
|
|
||||||
|
|
@ -38,7 +38,7 @@ from lightrag.constants import (
|
||||||
DEFAULT_OLLAMA_MODEL_NAME,
|
DEFAULT_OLLAMA_MODEL_NAME,
|
||||||
DEFAULT_OLLAMA_MODEL_TAG,
|
DEFAULT_OLLAMA_MODEL_TAG,
|
||||||
DEFAULT_RERANK_BINDING,
|
DEFAULT_RERANK_BINDING,
|
||||||
DEFAULT_ENTITY_TYPES
|
DEFAULT_ENTITY_TYPES,
|
||||||
)
|
)
|
||||||
|
|
||||||
# use the .env that is inside the current folder
|
# use the .env that is inside the current folder
|
||||||
|
|
|
||||||
|
|
@ -499,7 +499,10 @@ def create_app(args):
|
||||||
rerank_model_func=rerank_model_func,
|
rerank_model_func=rerank_model_func,
|
||||||
max_parallel_insert=args.max_parallel_insert,
|
max_parallel_insert=args.max_parallel_insert,
|
||||||
max_graph_nodes=args.max_graph_nodes,
|
max_graph_nodes=args.max_graph_nodes,
|
||||||
addon_params={"language": args.summary_language, "entity_types": args.entity_types},
|
addon_params={
|
||||||
|
"language": args.summary_language,
|
||||||
|
"entity_types": args.entity_types,
|
||||||
|
},
|
||||||
ollama_server_infos=ollama_server_infos,
|
ollama_server_infos=ollama_server_infos,
|
||||||
)
|
)
|
||||||
else: # azure_openai
|
else: # azure_openai
|
||||||
|
|
@ -526,7 +529,10 @@ def create_app(args):
|
||||||
rerank_model_func=rerank_model_func,
|
rerank_model_func=rerank_model_func,
|
||||||
max_parallel_insert=args.max_parallel_insert,
|
max_parallel_insert=args.max_parallel_insert,
|
||||||
max_graph_nodes=args.max_graph_nodes,
|
max_graph_nodes=args.max_graph_nodes,
|
||||||
addon_params={"language": args.summary_language, "entity_types": args.entity_types},
|
addon_params={
|
||||||
|
"language": args.summary_language,
|
||||||
|
"entity_types": args.entity_types,
|
||||||
|
},
|
||||||
ollama_server_infos=ollama_server_infos,
|
ollama_server_infos=ollama_server_infos,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -11,7 +11,7 @@ DEFAULT_WOKERS = 2
|
||||||
DEFAULT_MAX_GRAPH_NODES = 1000
|
DEFAULT_MAX_GRAPH_NODES = 1000
|
||||||
|
|
||||||
# Default values for extraction settings
|
# Default values for extraction settings
|
||||||
DEFAULT_SUMMARY_LANGUAGE = "English" # Default language for summaries
|
DEFAULT_SUMMARY_LANGUAGE = "English" # Default language for document processing
|
||||||
DEFAULT_MAX_GLEANING = 1
|
DEFAULT_MAX_GLEANING = 1
|
||||||
|
|
||||||
# Number of description fragments to trigger LLM summary
|
# Number of description fragments to trigger LLM summary
|
||||||
|
|
@ -23,7 +23,15 @@ DEFAULT_SUMMARY_LENGTH_RECOMMENDED = 600
|
||||||
# Maximum token size sent to LLM for summary
|
# Maximum token size sent to LLM for summary
|
||||||
DEFAULT_SUMMARY_CONTEXT_SIZE = 12000
|
DEFAULT_SUMMARY_CONTEXT_SIZE = 12000
|
||||||
# Default entities to extract if ENTITY_TYPES is not specified in .env
|
# Default entities to extract if ENTITY_TYPES is not specified in .env
|
||||||
DEFAULT_ENTITY_TYPES = ["organization", "person", "geo", "event", "category"]
|
DEFAULT_ENTITY_TYPES = [
|
||||||
|
"organization",
|
||||||
|
"person",
|
||||||
|
"geo",
|
||||||
|
"event",
|
||||||
|
"category",
|
||||||
|
"Equipment",
|
||||||
|
"Location",
|
||||||
|
]
|
||||||
|
|
||||||
# Separator for graph fields
|
# Separator for graph fields
|
||||||
GRAPH_FIELD_SEP = "<SEP>"
|
GRAPH_FIELD_SEP = "<SEP>"
|
||||||
|
|
|
||||||
|
|
@ -39,7 +39,8 @@ from lightrag.constants import (
|
||||||
DEFAULT_MAX_ASYNC,
|
DEFAULT_MAX_ASYNC,
|
||||||
DEFAULT_MAX_PARALLEL_INSERT,
|
DEFAULT_MAX_PARALLEL_INSERT,
|
||||||
DEFAULT_MAX_GRAPH_NODES,
|
DEFAULT_MAX_GRAPH_NODES,
|
||||||
DEFAULT_ENTITY_TYPES
|
DEFAULT_ENTITY_TYPES,
|
||||||
|
DEFAULT_SUMMARY_LANGUAGE,
|
||||||
)
|
)
|
||||||
from lightrag.utils import get_env_value
|
from lightrag.utils import get_env_value
|
||||||
|
|
||||||
|
|
@ -348,7 +349,9 @@ class LightRAG:
|
||||||
|
|
||||||
addon_params: dict[str, Any] = field(
|
addon_params: dict[str, Any] = field(
|
||||||
default_factory=lambda: {
|
default_factory=lambda: {
|
||||||
"language": get_env_value("SUMMARY_LANGUAGE", "English", str),
|
"language": get_env_value(
|
||||||
|
"SUMMARY_LANGUAGE", DEFAULT_SUMMARY_LANGUAGE, str
|
||||||
|
),
|
||||||
"entity_types": get_env_value("ENTITY_TYPES", DEFAULT_ENTITY_TYPES, list),
|
"entity_types": get_env_value("ENTITY_TYPES", DEFAULT_ENTITY_TYPES, list),
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -47,7 +47,8 @@ from .constants import (
|
||||||
DEFAULT_MAX_TOTAL_TOKENS,
|
DEFAULT_MAX_TOTAL_TOKENS,
|
||||||
DEFAULT_RELATED_CHUNK_NUMBER,
|
DEFAULT_RELATED_CHUNK_NUMBER,
|
||||||
DEFAULT_KG_CHUNK_PICK_METHOD,
|
DEFAULT_KG_CHUNK_PICK_METHOD,
|
||||||
DEFAULT_ENTITY_TYPES
|
DEFAULT_ENTITY_TYPES,
|
||||||
|
DEFAULT_SUMMARY_LANGUAGE,
|
||||||
)
|
)
|
||||||
from .kg.shared_storage import get_storage_keyed_lock
|
from .kg.shared_storage import get_storage_keyed_lock
|
||||||
import time
|
import time
|
||||||
|
|
@ -1651,9 +1652,7 @@ async def extract_entities(
|
||||||
|
|
||||||
ordered_chunks = list(chunks.items())
|
ordered_chunks = list(chunks.items())
|
||||||
# add language and example number params to prompt
|
# add language and example number params to prompt
|
||||||
language = global_config["addon_params"].get(
|
language = global_config["addon_params"].get("language", DEFAULT_SUMMARY_LANGUAGE)
|
||||||
"language", PROMPTS["DEFAULT_LANGUAGE"]
|
|
||||||
)
|
|
||||||
entity_types = global_config["addon_params"].get(
|
entity_types = global_config["addon_params"].get(
|
||||||
"entity_types", DEFAULT_ENTITY_TYPES
|
"entity_types", DEFAULT_ENTITY_TYPES
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -4,7 +4,6 @@ from typing import Any
|
||||||
|
|
||||||
PROMPTS: dict[str, Any] = {}
|
PROMPTS: dict[str, Any] = {}
|
||||||
|
|
||||||
PROMPTS["DEFAULT_LANGUAGE"] = "English"
|
|
||||||
PROMPTS["DEFAULT_TUPLE_DELIMITER"] = "<|>"
|
PROMPTS["DEFAULT_TUPLE_DELIMITER"] = "<|>"
|
||||||
PROMPTS["DEFAULT_RECORD_DELIMITER"] = "##"
|
PROMPTS["DEFAULT_RECORD_DELIMITER"] = "##"
|
||||||
PROMPTS["DEFAULT_COMPLETION_DELIMITER"] = "<|COMPLETE|>"
|
PROMPTS["DEFAULT_COMPLETION_DELIMITER"] = "<|COMPLETE|>"
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue