Add timestamp tracking for LLM responses and entity/relationship data

- Track timestamps for cache hits/misses
- Add timestamp to entity/relationship objects
- Sort descriptions by timestamp order
- Preserve temporal ordering in merges
This commit is contained in:
yangdx 2025-09-12 04:34:12 +08:00
parent 40688def20
commit 8660bf34e4
2 changed files with 87 additions and 45 deletions

View file

@ -300,7 +300,7 @@ async def _summarize_descriptions(
) )
# Use LLM function with cache (higher priority for summary generation) # Use LLM function with cache (higher priority for summary generation)
summary = await use_llm_func_with_cache( summary, _ = await use_llm_func_with_cache(
use_prompt, use_prompt,
use_llm_func, use_llm_func,
llm_response_cache=llm_response_cache, llm_response_cache=llm_response_cache,
@ -312,6 +312,7 @@ async def _summarize_descriptions(
async def _handle_single_entity_extraction( async def _handle_single_entity_extraction(
record_attributes: list[str], record_attributes: list[str],
chunk_key: str, chunk_key: str,
timestamp: int,
file_path: str = "unknown_source", file_path: str = "unknown_source",
): ):
if len(record_attributes) < 4 or "entity" not in record_attributes[0]: if len(record_attributes) < 4 or "entity" not in record_attributes[0]:
@ -364,6 +365,7 @@ async def _handle_single_entity_extraction(
description=entity_description, description=entity_description,
source_id=chunk_key, source_id=chunk_key,
file_path=file_path, file_path=file_path,
timestamp=timestamp,
) )
except ValueError as e: except ValueError as e:
@ -381,6 +383,7 @@ async def _handle_single_entity_extraction(
async def _handle_single_relationship_extraction( async def _handle_single_relationship_extraction(
record_attributes: list[str], record_attributes: list[str],
chunk_key: str, chunk_key: str,
timestamp: int,
file_path: str = "unknown_source", file_path: str = "unknown_source",
): ):
if len(record_attributes) < 5 or "relationship" not in record_attributes[0]: if len(record_attributes) < 5 or "relationship" not in record_attributes[0]:
@ -441,6 +444,7 @@ async def _handle_single_relationship_extraction(
keywords=edge_keywords, keywords=edge_keywords,
source_id=edge_source_id, source_id=edge_source_id,
file_path=file_path, file_path=file_path,
timestamp=timestamp,
) )
except ValueError as e: except ValueError as e:
@ -503,7 +507,7 @@ async def _rebuild_knowledge_from_chunks(
pipeline_status["history_messages"].append(status_message) pipeline_status["history_messages"].append(status_message)
# Get cached extraction results for these chunks using storage # Get cached extraction results for these chunks using storage
# cached_results chunk_id -> [list of extraction result from LLM cache sorted by created_at] # cached_results chunk_id -> [list of (extraction_result, create_time) from LLM cache sorted by create_time of the first extraction_result]
cached_results = await _get_cached_extraction_results( cached_results = await _get_cached_extraction_results(
llm_response_cache, llm_response_cache,
all_referenced_chunk_ids, all_referenced_chunk_ids,
@ -523,18 +527,19 @@ async def _rebuild_knowledge_from_chunks(
chunk_entities = {} # chunk_id -> {entity_name: [entity_data]} chunk_entities = {} # chunk_id -> {entity_name: [entity_data]}
chunk_relationships = {} # chunk_id -> {(src, tgt): [relationship_data]} chunk_relationships = {} # chunk_id -> {(src, tgt): [relationship_data]}
for chunk_id, extraction_results in cached_results.items(): for chunk_id, results in cached_results.items():
try: try:
# Handle multiple extraction results per chunk # Handle multiple extraction results per chunk
chunk_entities[chunk_id] = defaultdict(list) chunk_entities[chunk_id] = defaultdict(list)
chunk_relationships[chunk_id] = defaultdict(list) chunk_relationships[chunk_id] = defaultdict(list)
# process multiple LLM extraction results for a single chunk_id # process multiple LLM extraction results for a single chunk_id
for extraction_result in extraction_results: for result in results:
entities, relationships = await _rebuild_from_extraction_result( entities, relationships = await _rebuild_from_extraction_result(
text_chunks_storage=text_chunks_storage, text_chunks_storage=text_chunks_storage,
extraction_result=extraction_result,
chunk_id=chunk_id, chunk_id=chunk_id,
extraction_result=result[0],
timestamp=result[1],
) )
# Merge entities and relationships from this extraction result # Merge entities and relationships from this extraction result
@ -820,8 +825,6 @@ async def _get_cached_extraction_results(
cached_results[chunk_id].sort(key=lambda x: x[1]) cached_results[chunk_id].sort(key=lambda x: x[1])
# Store the earliest create_time for this chunk (first item after sorting) # Store the earliest create_time for this chunk (first item after sorting)
chunk_earliest_times[chunk_id] = cached_results[chunk_id][0][1] chunk_earliest_times[chunk_id] = cached_results[chunk_id][0][1]
# Extract only extraction_result (x[0])
cached_results[chunk_id] = [item[0] for item in cached_results[chunk_id]]
# Sort cached_results by the earliest create_time of each chunk # Sort cached_results by the earliest create_time of each chunk
sorted_chunk_ids = sorted( sorted_chunk_ids = sorted(
@ -836,12 +839,13 @@ async def _get_cached_extraction_results(
logger.info( logger.info(
f"Found {valid_entries} valid cache entries, {len(sorted_cached_results)} chunks with results" f"Found {valid_entries} valid cache entries, {len(sorted_cached_results)} chunks with results"
) )
return sorted_cached_results return sorted_cached_results # each item: list(extraction_result, create_time)
async def _process_extraction_result( async def _process_extraction_result(
result: str, result: str,
chunk_key: str, chunk_key: str,
timestamp: int,
file_path: str = "unknown_source", file_path: str = "unknown_source",
tuple_delimiter: str = "<|SEP|>", tuple_delimiter: str = "<|SEP|>",
completion_delimiter: str = "<|COMPLETE|>", completion_delimiter: str = "<|COMPLETE|>",
@ -886,7 +890,7 @@ async def _process_extraction_result(
# Try to parse as entity # Try to parse as entity
entity_data = await _handle_single_entity_extraction( entity_data = await _handle_single_entity_extraction(
record_attributes, chunk_key, file_path record_attributes, chunk_key, timestamp, file_path
) )
if entity_data is not None: if entity_data is not None:
maybe_nodes[entity_data["entity_name"]].append(entity_data) maybe_nodes[entity_data["entity_name"]].append(entity_data)
@ -894,7 +898,7 @@ async def _process_extraction_result(
# Try to parse as relationship # Try to parse as relationship
relationship_data = await _handle_single_relationship_extraction( relationship_data = await _handle_single_relationship_extraction(
record_attributes, chunk_key, file_path record_attributes, chunk_key, timestamp, file_path
) )
if relationship_data is not None: if relationship_data is not None:
maybe_edges[ maybe_edges[
@ -905,7 +909,10 @@ async def _process_extraction_result(
async def _rebuild_from_extraction_result( async def _rebuild_from_extraction_result(
text_chunks_storage: BaseKVStorage, extraction_result: str, chunk_id: str text_chunks_storage: BaseKVStorage,
extraction_result: str,
chunk_id: str,
timestamp: int,
) -> tuple[dict, dict]: ) -> tuple[dict, dict]:
"""Parse cached extraction result using the same logic as extract_entities """Parse cached extraction result using the same logic as extract_entities
@ -930,6 +937,7 @@ async def _rebuild_from_extraction_result(
return await _process_extraction_result( return await _process_extraction_result(
extraction_result, extraction_result,
chunk_id, chunk_id,
timestamp,
file_path, file_path,
tuple_delimiter=PROMPTS["DEFAULT_TUPLE_DELIMITER"], tuple_delimiter=PROMPTS["DEFAULT_TUPLE_DELIMITER"],
completion_delimiter=PROMPTS["DEFAULT_COMPLETION_DELIMITER"], completion_delimiter=PROMPTS["DEFAULT_COMPLETION_DELIMITER"],
@ -1256,13 +1264,19 @@ async def _merge_nodes_then_upsert(
reverse=True, reverse=True,
)[0][0] # Get the entity type with the highest count )[0][0] # Get the entity type with the highest count
# merge and deduplicate description # Deduplicate by description, keeping first occurrence
description_list = list( unique_nodes = {}
dict.fromkeys( for dp in nodes_data:
already_description desc = dp["description"]
+ [dp["description"] for dp in nodes_data if dp.get("description")] if desc not in unique_nodes:
) unique_nodes[desc] = dp
)
# Sort description by timestamp
sorted_nodes = sorted(unique_nodes.values(), key=lambda x: x.get("timestamp", 0))
sorted_descriptions = [dp["description"] for dp in sorted_nodes]
# Combine already_description with sorted new sorted descriptions
description_list = already_description + sorted_descriptions
num_fragment = len(description_list) num_fragment = len(description_list)
already_fragment = len(already_description) already_fragment = len(already_description)
@ -1378,12 +1392,20 @@ async def _merge_edges_then_upsert(
# Process edges_data with None checks # Process edges_data with None checks
weight = sum([dp["weight"] for dp in edges_data] + already_weights) weight = sum([dp["weight"] for dp in edges_data] + already_weights)
description_list = list( # Deduplicate by description, keeping first occurrence
dict.fromkeys( unique_edges = {}
already_description for dp in edges_data:
+ [dp["description"] for dp in edges_data if dp.get("description")] if dp.get("description"):
) desc = dp["description"]
) if desc not in unique_edges:
unique_edges[desc] = dp
# Sort description by timestamp
sorted_edges = sorted(unique_edges.values(), key=lambda x: x.get("timestamp", 0))
sorted_descriptions = [dp["description"] for dp in sorted_edges]
# Combine already_description with sorted new descriptions
description_list = already_description + sorted_descriptions
num_fragment = len(description_list) num_fragment = len(description_list)
already_fragment = len(already_description) already_fragment = len(already_description)
@ -1966,7 +1988,7 @@ async def extract_entities(
"entity_continue_extraction_user_prompt" "entity_continue_extraction_user_prompt"
].format(**{**context_base, "input_text": content}) ].format(**{**context_base, "input_text": content})
final_result = await use_llm_func_with_cache( final_result, timestamp = await use_llm_func_with_cache(
entity_extraction_user_prompt, entity_extraction_user_prompt,
use_llm_func, use_llm_func,
system_prompt=entity_extraction_system_prompt, system_prompt=entity_extraction_system_prompt,
@ -1976,7 +1998,6 @@ async def extract_entities(
cache_keys_collector=cache_keys_collector, cache_keys_collector=cache_keys_collector,
) )
# Store LLM cache reference in chunk (will be handled by use_llm_func_with_cache)
history = pack_user_ass_to_openai_messages( history = pack_user_ass_to_openai_messages(
entity_extraction_user_prompt, final_result entity_extraction_user_prompt, final_result
) )
@ -1985,6 +2006,7 @@ async def extract_entities(
maybe_nodes, maybe_edges = await _process_extraction_result( maybe_nodes, maybe_edges = await _process_extraction_result(
final_result, final_result,
chunk_key, chunk_key,
timestamp,
file_path, file_path,
tuple_delimiter=context_base["tuple_delimiter"], tuple_delimiter=context_base["tuple_delimiter"],
completion_delimiter=context_base["completion_delimiter"], completion_delimiter=context_base["completion_delimiter"],
@ -1992,7 +2014,7 @@ async def extract_entities(
# Process additional gleaning results only 1 time when entity_extract_max_gleaning is greater than zero. # Process additional gleaning results only 1 time when entity_extract_max_gleaning is greater than zero.
if entity_extract_max_gleaning > 0: if entity_extract_max_gleaning > 0:
glean_result = await use_llm_func_with_cache( glean_result, timestamp = await use_llm_func_with_cache(
entity_continue_extraction_user_prompt, entity_continue_extraction_user_prompt,
use_llm_func, use_llm_func,
system_prompt=entity_extraction_system_prompt, system_prompt=entity_extraction_system_prompt,
@ -2007,6 +2029,7 @@ async def extract_entities(
glean_nodes, glean_edges = await _process_extraction_result( glean_nodes, glean_edges = await _process_extraction_result(
glean_result, glean_result,
chunk_key, chunk_key,
timestamp,
file_path, file_path,
tuple_delimiter=context_base["tuple_delimiter"], tuple_delimiter=context_base["tuple_delimiter"],
completion_delimiter=context_base["completion_delimiter"], completion_delimiter=context_base["completion_delimiter"],
@ -2157,10 +2180,11 @@ async def kg_query(
query_param.user_prompt or "", query_param.user_prompt or "",
query_param.enable_rerank, query_param.enable_rerank,
) )
cached_response = await handle_cache( cached_result = await handle_cache(
hashing_kv, args_hash, query, query_param.mode, cache_type="query" hashing_kv, args_hash, query, query_param.mode, cache_type="query"
) )
if cached_response is not None: if cached_result is not None:
cached_response, _ = cached_result # Extract content, ignore timestamp
if not query_param.only_need_context and not query_param.only_need_prompt: if not query_param.only_need_context and not query_param.only_need_prompt:
return cached_response return cached_response
@ -2325,10 +2349,11 @@ async def extract_keywords_only(
param.hl_keywords or [], param.hl_keywords or [],
param.ll_keywords or [], param.ll_keywords or [],
) )
cached_response = await handle_cache( cached_result = await handle_cache(
hashing_kv, args_hash, text, param.mode, cache_type="keywords" hashing_kv, args_hash, text, param.mode, cache_type="keywords"
) )
if cached_response is not None: if cached_result is not None:
cached_response, _ = cached_result # Extract content, ignore timestamp
try: try:
keywords_data = json_repair.loads(cached_response) keywords_data = json_repair.loads(cached_response)
return keywords_data.get("high_level_keywords", []), keywords_data.get( return keywords_data.get("high_level_keywords", []), keywords_data.get(
@ -2958,10 +2983,11 @@ async def kg_search(
query_param.user_prompt or "", query_param.user_prompt or "",
query_param.enable_rerank, query_param.enable_rerank,
) )
cached_response = await handle_cache( cached_result = await handle_cache(
hashing_kv, args_hash, query, query_param.mode, cache_type="search" hashing_kv, args_hash, query, query_param.mode, cache_type="search"
) )
if cached_response is not None: if cached_result is not None:
cached_response, _ = cached_result # Extract content, ignore timestamp
try: try:
return json_repair.loads(cached_response) return json_repair.loads(cached_response)
except (json.JSONDecodeError, KeyError): except (json.JSONDecodeError, KeyError):
@ -3982,10 +4008,11 @@ async def naive_query(
query_param.user_prompt or "", query_param.user_prompt or "",
query_param.enable_rerank, query_param.enable_rerank,
) )
cached_response = await handle_cache( cached_result = await handle_cache(
hashing_kv, args_hash, query, query_param.mode, cache_type="query" hashing_kv, args_hash, query, query_param.mode, cache_type="query"
) )
if cached_response is not None: if cached_result is not None:
cached_response, _ = cached_result # Extract content, ignore timestamp
if not query_param.only_need_context and not query_param.only_need_prompt: if not query_param.only_need_context and not query_param.only_need_prompt:
return cached_response return cached_response

View file

@ -9,6 +9,7 @@ import logging
import logging.handlers import logging.handlers
import os import os
import re import re
import time
import uuid import uuid
from dataclasses import dataclass from dataclasses import dataclass
from datetime import datetime from datetime import datetime
@ -1035,8 +1036,12 @@ async def handle_cache(
prompt, prompt,
mode="default", mode="default",
cache_type="unknown", cache_type="unknown",
) -> str | None: ) -> tuple[str, int] | None:
"""Generic cache handling function with flattened cache keys""" """Generic cache handling function with flattened cache keys
Returns:
tuple[str, int] | None: (content, create_time) if cache hit, None if cache miss
"""
if hashing_kv is None: if hashing_kv is None:
return None return None
@ -1052,7 +1057,9 @@ async def handle_cache(
cache_entry = await hashing_kv.get_by_id(flattened_key) cache_entry = await hashing_kv.get_by_id(flattened_key)
if cache_entry: if cache_entry:
logger.debug(f"Flattened cache hit(key:{flattened_key})") logger.debug(f"Flattened cache hit(key:{flattened_key})")
return cache_entry["return"] content = cache_entry["return"]
timestamp = cache_entry.get("create_time", 0)
return content, timestamp
logger.debug(f"Cache missed(mode:{mode} type:{cache_type})") logger.debug(f"Cache missed(mode:{mode} type:{cache_type})")
return None return None
@ -1593,7 +1600,7 @@ async def use_llm_func_with_cache(
cache_type: str = "extract", cache_type: str = "extract",
chunk_id: str | None = None, chunk_id: str | None = None,
cache_keys_collector: list = None, cache_keys_collector: list = None,
) -> str: ) -> tuple[str, int]:
"""Call LLM function with cache support and text sanitization """Call LLM function with cache support and text sanitization
If cache is available and enabled (determined by handle_cache based on mode), If cache is available and enabled (determined by handle_cache based on mode),
@ -1613,7 +1620,9 @@ async def use_llm_func_with_cache(
cache_keys_collector: Optional list to collect cache keys for batch processing cache_keys_collector: Optional list to collect cache keys for batch processing
Returns: Returns:
LLM response text tuple[str, int]: (LLM response text, timestamp)
- For cache hits: (content, cache_create_time)
- For cache misses: (content, current_timestamp)
""" """
# Sanitize input text to prevent UTF-8 encoding errors for all LLM providers # Sanitize input text to prevent UTF-8 encoding errors for all LLM providers
safe_user_prompt = sanitize_text_for_encoding(user_prompt) safe_user_prompt = sanitize_text_for_encoding(user_prompt)
@ -1648,14 +1657,15 @@ async def use_llm_func_with_cache(
# Generate cache key for this LLM call # Generate cache key for this LLM call
cache_key = generate_cache_key("default", cache_type, arg_hash) cache_key = generate_cache_key("default", cache_type, arg_hash)
cached_return = await handle_cache( cached_result = await handle_cache(
llm_response_cache, llm_response_cache,
arg_hash, arg_hash,
_prompt, _prompt,
"default", "default",
cache_type=cache_type, cache_type=cache_type,
) )
if cached_return: if cached_result:
content, timestamp = cached_result
logger.debug(f"Found cache for {arg_hash}") logger.debug(f"Found cache for {arg_hash}")
statistic_data["llm_cache"] += 1 statistic_data["llm_cache"] += 1
@ -1663,7 +1673,7 @@ async def use_llm_func_with_cache(
if cache_keys_collector is not None: if cache_keys_collector is not None:
cache_keys_collector.append(cache_key) cache_keys_collector.append(cache_key)
return cached_return return content, timestamp
statistic_data["llm_call"] += 1 statistic_data["llm_call"] += 1
# Call LLM with sanitized input # Call LLM with sanitized input
@ -1679,6 +1689,9 @@ async def use_llm_func_with_cache(
res = remove_think_tags(res) res = remove_think_tags(res)
# Generate timestamp for cache miss (LLM call completion time)
current_timestamp = int(time.time())
if llm_response_cache.global_config.get("enable_llm_cache_for_entity_extract"): if llm_response_cache.global_config.get("enable_llm_cache_for_entity_extract"):
await save_to_cache( await save_to_cache(
llm_response_cache, llm_response_cache,
@ -1695,7 +1708,7 @@ async def use_llm_func_with_cache(
if cache_keys_collector is not None: if cache_keys_collector is not None:
cache_keys_collector.append(cache_key) cache_keys_collector.append(cache_key)
return res return res, current_timestamp
# When cache is disabled, directly call LLM with sanitized input # When cache is disabled, directly call LLM with sanitized input
kwargs = {} kwargs = {}
@ -1714,7 +1727,9 @@ async def use_llm_func_with_cache(
# Re-raise with the same exception type but modified message # Re-raise with the same exception type but modified message
raise type(e)(error_msg) from e raise type(e)(error_msg) from e
return remove_think_tags(res) # Generate timestamp for non-cached LLM call
current_timestamp = int(time.time())
return remove_think_tags(res), current_timestamp
def get_content_summary(content: str, max_length: int = 250) -> str: def get_content_summary(content: str, max_length: int = 250) -> str: