fix: Preserve complete metadata (source_id, created_at, file_path) in aquery_data responses
This commit is contained in:
parent
e71229698d
commit
37d01e2df8
2 changed files with 113 additions and 33 deletions
|
|
@ -2832,6 +2832,8 @@ async def _apply_token_truncation(
|
||||||
"relations_context": [],
|
"relations_context": [],
|
||||||
"filtered_entities": search_result["final_entities"],
|
"filtered_entities": search_result["final_entities"],
|
||||||
"filtered_relations": search_result["final_relations"],
|
"filtered_relations": search_result["final_relations"],
|
||||||
|
"entity_id_to_original": {},
|
||||||
|
"relation_id_to_original": {},
|
||||||
}
|
}
|
||||||
|
|
||||||
# Get token limits from query_param with fallbacks
|
# Get token limits from query_param with fallbacks
|
||||||
|
|
@ -2849,17 +2851,25 @@ async def _apply_token_truncation(
|
||||||
final_entities = search_result["final_entities"]
|
final_entities = search_result["final_entities"]
|
||||||
final_relations = search_result["final_relations"]
|
final_relations = search_result["final_relations"]
|
||||||
|
|
||||||
|
# Create mappings from entity/relation identifiers to original data
|
||||||
|
entity_id_to_original = {}
|
||||||
|
relation_id_to_original = {}
|
||||||
|
|
||||||
# Generate entities context for truncation
|
# Generate entities context for truncation
|
||||||
entities_context = []
|
entities_context = []
|
||||||
for i, entity in enumerate(final_entities):
|
for i, entity in enumerate(final_entities):
|
||||||
|
entity_name = entity["entity_name"]
|
||||||
created_at = entity.get("created_at", "UNKNOWN")
|
created_at = entity.get("created_at", "UNKNOWN")
|
||||||
if isinstance(created_at, (int, float)):
|
if isinstance(created_at, (int, float)):
|
||||||
created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at))
|
created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at))
|
||||||
|
|
||||||
|
# Store mapping from entity name to original data
|
||||||
|
entity_id_to_original[entity_name] = entity
|
||||||
|
|
||||||
entities_context.append(
|
entities_context.append(
|
||||||
{
|
{
|
||||||
"id": i + 1,
|
"id": i + 1,
|
||||||
"entity": entity["entity_name"],
|
"entity": entity_name,
|
||||||
"type": entity.get("entity_type", "UNKNOWN"),
|
"type": entity.get("entity_type", "UNKNOWN"),
|
||||||
"description": entity.get("description", "UNKNOWN"),
|
"description": entity.get("description", "UNKNOWN"),
|
||||||
"created_at": created_at,
|
"created_at": created_at,
|
||||||
|
|
@ -2880,6 +2890,10 @@ async def _apply_token_truncation(
|
||||||
else:
|
else:
|
||||||
entity1, entity2 = relation.get("src_id"), relation.get("tgt_id")
|
entity1, entity2 = relation.get("src_id"), relation.get("tgt_id")
|
||||||
|
|
||||||
|
# Store mapping from relation pair to original data
|
||||||
|
relation_key = (entity1, entity2)
|
||||||
|
relation_id_to_original[relation_key] = relation
|
||||||
|
|
||||||
relations_context.append(
|
relations_context.append(
|
||||||
{
|
{
|
||||||
"id": i + 1,
|
"id": i + 1,
|
||||||
|
|
@ -2898,12 +2912,15 @@ async def _apply_token_truncation(
|
||||||
# Apply token-based truncation
|
# Apply token-based truncation
|
||||||
if entities_context:
|
if entities_context:
|
||||||
# Remove file_path and created_at for token calculation
|
# Remove file_path and created_at for token calculation
|
||||||
|
entities_context_for_truncation = []
|
||||||
for entity in entities_context:
|
for entity in entities_context:
|
||||||
entity.pop("file_path", None)
|
entity_copy = entity.copy()
|
||||||
entity.pop("created_at", None)
|
entity_copy.pop("file_path", None)
|
||||||
|
entity_copy.pop("created_at", None)
|
||||||
|
entities_context_for_truncation.append(entity_copy)
|
||||||
|
|
||||||
entities_context = truncate_list_by_token_size(
|
entities_context = truncate_list_by_token_size(
|
||||||
entities_context,
|
entities_context_for_truncation,
|
||||||
key=lambda x: "\n".join(
|
key=lambda x: "\n".join(
|
||||||
json.dumps(item, ensure_ascii=False) for item in [x]
|
json.dumps(item, ensure_ascii=False) for item in [x]
|
||||||
),
|
),
|
||||||
|
|
@ -2913,12 +2930,15 @@ async def _apply_token_truncation(
|
||||||
|
|
||||||
if relations_context:
|
if relations_context:
|
||||||
# Remove file_path and created_at for token calculation
|
# Remove file_path and created_at for token calculation
|
||||||
|
relations_context_for_truncation = []
|
||||||
for relation in relations_context:
|
for relation in relations_context:
|
||||||
relation.pop("file_path", None)
|
relation_copy = relation.copy()
|
||||||
relation.pop("created_at", None)
|
relation_copy.pop("file_path", None)
|
||||||
|
relation_copy.pop("created_at", None)
|
||||||
|
relations_context_for_truncation.append(relation_copy)
|
||||||
|
|
||||||
relations_context = truncate_list_by_token_size(
|
relations_context = truncate_list_by_token_size(
|
||||||
relations_context,
|
relations_context_for_truncation,
|
||||||
key=lambda x: "\n".join(
|
key=lambda x: "\n".join(
|
||||||
json.dumps(item, ensure_ascii=False) for item in [x]
|
json.dumps(item, ensure_ascii=False) for item in [x]
|
||||||
),
|
),
|
||||||
|
|
@ -2932,6 +2952,7 @@ async def _apply_token_truncation(
|
||||||
|
|
||||||
# Create filtered original data based on truncated context
|
# Create filtered original data based on truncated context
|
||||||
filtered_entities = []
|
filtered_entities = []
|
||||||
|
filtered_entity_id_to_original = {}
|
||||||
if entities_context:
|
if entities_context:
|
||||||
final_entity_names = {e["entity"] for e in entities_context}
|
final_entity_names = {e["entity"] for e in entities_context}
|
||||||
seen_nodes = set()
|
seen_nodes = set()
|
||||||
|
|
@ -2939,9 +2960,11 @@ async def _apply_token_truncation(
|
||||||
name = entity.get("entity_name")
|
name = entity.get("entity_name")
|
||||||
if name in final_entity_names and name not in seen_nodes:
|
if name in final_entity_names and name not in seen_nodes:
|
||||||
filtered_entities.append(entity)
|
filtered_entities.append(entity)
|
||||||
|
filtered_entity_id_to_original[name] = entity
|
||||||
seen_nodes.add(name)
|
seen_nodes.add(name)
|
||||||
|
|
||||||
filtered_relations = []
|
filtered_relations = []
|
||||||
|
filtered_relation_id_to_original = {}
|
||||||
if relations_context:
|
if relations_context:
|
||||||
final_relation_pairs = {(r["entity1"], r["entity2"]) for r in relations_context}
|
final_relation_pairs = {(r["entity1"], r["entity2"]) for r in relations_context}
|
||||||
seen_edges = set()
|
seen_edges = set()
|
||||||
|
|
@ -2953,6 +2976,7 @@ async def _apply_token_truncation(
|
||||||
pair = (src, tgt)
|
pair = (src, tgt)
|
||||||
if pair in final_relation_pairs and pair not in seen_edges:
|
if pair in final_relation_pairs and pair not in seen_edges:
|
||||||
filtered_relations.append(relation)
|
filtered_relations.append(relation)
|
||||||
|
filtered_relation_id_to_original[pair] = relation
|
||||||
seen_edges.add(pair)
|
seen_edges.add(pair)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
|
@ -2962,6 +2986,8 @@ async def _apply_token_truncation(
|
||||||
"relations_context": relations_context, # Formatted and truncated for LLM
|
"relations_context": relations_context, # Formatted and truncated for LLM
|
||||||
"filtered_entities": filtered_entities, # Original entities that passed truncation
|
"filtered_entities": filtered_entities, # Original entities that passed truncation
|
||||||
"filtered_relations": filtered_relations, # Original relations that passed truncation
|
"filtered_relations": filtered_relations, # Original relations that passed truncation
|
||||||
|
"entity_id_to_original": filtered_entity_id_to_original, # Mapping for original data lookup
|
||||||
|
"relation_id_to_original": filtered_relation_id_to_original, # Mapping for original data lookup
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -3076,6 +3102,8 @@ async def _build_llm_context(
|
||||||
global_config: dict[str, str],
|
global_config: dict[str, str],
|
||||||
chunk_tracking: dict = None,
|
chunk_tracking: dict = None,
|
||||||
return_raw_data: bool = False,
|
return_raw_data: bool = False,
|
||||||
|
entity_id_to_original: dict = None,
|
||||||
|
relation_id_to_original: dict = None,
|
||||||
) -> str | tuple[str, dict[str, Any]]:
|
) -> str | tuple[str, dict[str, Any]]:
|
||||||
"""
|
"""
|
||||||
Build the final LLM context string with token processing.
|
Build the final LLM context string with token processing.
|
||||||
|
|
@ -3304,6 +3332,8 @@ async def _build_llm_context(
|
||||||
relations_context,
|
relations_context,
|
||||||
truncated_chunks,
|
truncated_chunks,
|
||||||
query_param.mode,
|
query_param.mode,
|
||||||
|
entity_id_to_original,
|
||||||
|
relation_id_to_original,
|
||||||
)
|
)
|
||||||
logger.debug(
|
logger.debug(
|
||||||
f"[_build_llm_context] Final data after conversion: {len(final_data.get('entities', []))} entities, {len(final_data.get('relationships', []))} relationships, {len(final_data.get('chunks', []))} chunks"
|
f"[_build_llm_context] Final data after conversion: {len(final_data.get('entities', []))} entities, {len(final_data.get('relationships', []))} relationships, {len(final_data.get('chunks', []))} chunks"
|
||||||
|
|
@ -3400,6 +3430,8 @@ async def _build_query_context(
|
||||||
global_config=text_chunks_db.global_config,
|
global_config=text_chunks_db.global_config,
|
||||||
chunk_tracking=search_result["chunk_tracking"],
|
chunk_tracking=search_result["chunk_tracking"],
|
||||||
return_raw_data=True,
|
return_raw_data=True,
|
||||||
|
entity_id_to_original=truncation_result["entity_id_to_original"],
|
||||||
|
relation_id_to_original=truncation_result["relation_id_to_original"],
|
||||||
)
|
)
|
||||||
|
|
||||||
# Convert keywords strings to lists and add complete metadata to raw_data
|
# Convert keywords strings to lists and add complete metadata to raw_data
|
||||||
|
|
|
||||||
|
|
@ -2718,40 +2718,88 @@ def _convert_to_user_format(
|
||||||
relations_context: list[dict],
|
relations_context: list[dict],
|
||||||
final_chunks: list[dict],
|
final_chunks: list[dict],
|
||||||
query_mode: str,
|
query_mode: str,
|
||||||
|
entity_id_to_original: dict = None,
|
||||||
|
relation_id_to_original: dict = None,
|
||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
"""Convert internal data format to user-friendly format"""
|
"""Convert internal data format to user-friendly format using original database data"""
|
||||||
|
|
||||||
# Convert entities format
|
# Convert entities format using original data when available
|
||||||
formatted_entities = []
|
formatted_entities = []
|
||||||
for entity in entities_context:
|
for entity in entities_context:
|
||||||
formatted_entities.append(
|
entity_name = entity.get("entity", "")
|
||||||
{
|
|
||||||
"entity_name": entity.get("entity", ""),
|
|
||||||
"entity_type": entity.get("type", "UNKNOWN"),
|
|
||||||
"description": entity.get("description", ""),
|
|
||||||
"source_id": entity.get("source_id", ""),
|
|
||||||
"file_path": entity.get("file_path", "unknown_source"),
|
|
||||||
"created_at": entity.get("created_at", ""),
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
# Convert relationships format
|
# Try to get original data first
|
||||||
|
original_entity = None
|
||||||
|
if entity_id_to_original and entity_name in entity_id_to_original:
|
||||||
|
original_entity = entity_id_to_original[entity_name]
|
||||||
|
|
||||||
|
if original_entity:
|
||||||
|
# Use original database data
|
||||||
|
formatted_entities.append(
|
||||||
|
{
|
||||||
|
"entity_name": original_entity.get("entity_name", entity_name),
|
||||||
|
"entity_type": original_entity.get("entity_type", "UNKNOWN"),
|
||||||
|
"description": original_entity.get("description", ""),
|
||||||
|
"source_id": original_entity.get("source_id", ""),
|
||||||
|
"file_path": original_entity.get("file_path", "unknown_source"),
|
||||||
|
"created_at": original_entity.get("created_at", ""),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Fallback to LLM context data (for backward compatibility)
|
||||||
|
formatted_entities.append(
|
||||||
|
{
|
||||||
|
"entity_name": entity_name,
|
||||||
|
"entity_type": entity.get("type", "UNKNOWN"),
|
||||||
|
"description": entity.get("description", ""),
|
||||||
|
"source_id": entity.get("source_id", ""),
|
||||||
|
"file_path": entity.get("file_path", "unknown_source"),
|
||||||
|
"created_at": entity.get("created_at", ""),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
# Convert relationships format using original data when available
|
||||||
formatted_relationships = []
|
formatted_relationships = []
|
||||||
for relation in relations_context:
|
for relation in relations_context:
|
||||||
formatted_relationships.append(
|
entity1 = relation.get("entity1", "")
|
||||||
{
|
entity2 = relation.get("entity2", "")
|
||||||
"src_id": relation.get("entity1", ""),
|
relation_key = (entity1, entity2)
|
||||||
"tgt_id": relation.get("entity2", ""),
|
|
||||||
"description": relation.get("description", ""),
|
|
||||||
"keywords": relation.get("keywords", ""),
|
|
||||||
"weight": relation.get("weight", 1.0),
|
|
||||||
"source_id": relation.get("source_id", ""),
|
|
||||||
"file_path": relation.get("file_path", "unknown_source"),
|
|
||||||
"created_at": relation.get("created_at", ""),
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
# Convert chunks format
|
# Try to get original data first
|
||||||
|
original_relation = None
|
||||||
|
if relation_id_to_original and relation_key in relation_id_to_original:
|
||||||
|
original_relation = relation_id_to_original[relation_key]
|
||||||
|
|
||||||
|
if original_relation:
|
||||||
|
# Use original database data
|
||||||
|
formatted_relationships.append(
|
||||||
|
{
|
||||||
|
"src_id": original_relation.get("src_id", entity1),
|
||||||
|
"tgt_id": original_relation.get("tgt_id", entity2),
|
||||||
|
"description": original_relation.get("description", ""),
|
||||||
|
"keywords": original_relation.get("keywords", ""),
|
||||||
|
"weight": original_relation.get("weight", 1.0),
|
||||||
|
"source_id": original_relation.get("source_id", ""),
|
||||||
|
"file_path": original_relation.get("file_path", "unknown_source"),
|
||||||
|
"created_at": original_relation.get("created_at", ""),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Fallback to LLM context data (for backward compatibility)
|
||||||
|
formatted_relationships.append(
|
||||||
|
{
|
||||||
|
"src_id": entity1,
|
||||||
|
"tgt_id": entity2,
|
||||||
|
"description": relation.get("description", ""),
|
||||||
|
"keywords": relation.get("keywords", ""),
|
||||||
|
"weight": relation.get("weight", 1.0),
|
||||||
|
"source_id": relation.get("source_id", ""),
|
||||||
|
"file_path": relation.get("file_path", "unknown_source"),
|
||||||
|
"created_at": relation.get("created_at", ""),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
# Convert chunks format (chunks already contain complete data)
|
||||||
formatted_chunks = []
|
formatted_chunks = []
|
||||||
for i, chunk in enumerate(final_chunks):
|
for i, chunk in enumerate(final_chunks):
|
||||||
chunk_data = {
|
chunk_data = {
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue