refactor: centralize metadata generation in query functions
- Remove processing_info generation from the _convert_to_user_format function
- Move all metadata generation (keywords, processing_info) to the kg_query and naive_query functions
- Simplify _convert_to_user_format to focus only on data format conversion
This commit is contained in:
parent
c0d5abba6b
commit
e71229698d
2 changed files with 70 additions and 52 deletions
|
|
@ -3076,8 +3076,6 @@ async def _build_llm_context(
|
||||||
global_config: dict[str, str],
|
global_config: dict[str, str],
|
||||||
chunk_tracking: dict = None,
|
chunk_tracking: dict = None,
|
||||||
return_raw_data: bool = False,
|
return_raw_data: bool = False,
|
||||||
hl_keywords: list[str] = None,
|
|
||||||
ll_keywords: list[str] = None,
|
|
||||||
) -> str | tuple[str, dict[str, Any]]:
|
) -> str | tuple[str, dict[str, Any]]:
|
||||||
"""
|
"""
|
||||||
Build the final LLM context string with token processing.
|
Build the final LLM context string with token processing.
|
||||||
|
|
@ -3239,9 +3237,10 @@ async def _build_llm_context(
|
||||||
if return_raw_data:
|
if return_raw_data:
|
||||||
# Return empty raw data structure when no entities/relations
|
# Return empty raw data structure when no entities/relations
|
||||||
empty_raw_data = _convert_to_user_format(
|
empty_raw_data = _convert_to_user_format(
|
||||||
[], [], [], query_param.mode,
|
[],
|
||||||
hl_keywords=hl_keywords,
|
[],
|
||||||
ll_keywords=ll_keywords,
|
[],
|
||||||
|
query_param.mode,
|
||||||
)
|
)
|
||||||
return None, empty_raw_data
|
return None, empty_raw_data
|
||||||
else:
|
else:
|
||||||
|
|
@ -3297,16 +3296,18 @@ async def _build_llm_context(
|
||||||
|
|
||||||
# If final data is requested, return both context and complete data structure
|
# If final data is requested, return both context and complete data structure
|
||||||
if return_raw_data:
|
if return_raw_data:
|
||||||
logger.debug(f"[_build_llm_context] Converting to user format: {len(entities_context)} entities, {len(relations_context)} relations, {len(truncated_chunks)} chunks")
|
logger.debug(
|
||||||
|
f"[_build_llm_context] Converting to user format: {len(entities_context)} entities, {len(relations_context)} relations, {len(truncated_chunks)} chunks"
|
||||||
|
)
|
||||||
final_data = _convert_to_user_format(
|
final_data = _convert_to_user_format(
|
||||||
entities_context,
|
entities_context,
|
||||||
relations_context,
|
relations_context,
|
||||||
truncated_chunks,
|
truncated_chunks,
|
||||||
query_param.mode,
|
query_param.mode,
|
||||||
hl_keywords=hl_keywords,
|
|
||||||
ll_keywords=ll_keywords,
|
|
||||||
)
|
)
|
||||||
logger.debug(f"[_build_llm_context] Final data after conversion: {len(final_data.get('entities', []))} entities, {len(final_data.get('relationships', []))} relationships, {len(final_data.get('chunks', []))} chunks")
|
logger.debug(
|
||||||
|
f"[_build_llm_context] Final data after conversion: {len(final_data.get('entities', []))} entities, {len(final_data.get('relationships', []))} relationships, {len(final_data.get('chunks', []))} chunks"
|
||||||
|
)
|
||||||
return result, final_data
|
return result, final_data
|
||||||
else:
|
else:
|
||||||
return result
|
return result
|
||||||
|
|
@ -3399,12 +3400,36 @@ async def _build_query_context(
|
||||||
global_config=text_chunks_db.global_config,
|
global_config=text_chunks_db.global_config,
|
||||||
chunk_tracking=search_result["chunk_tracking"],
|
chunk_tracking=search_result["chunk_tracking"],
|
||||||
return_raw_data=True,
|
return_raw_data=True,
|
||||||
hl_keywords=hl_keywords_list,
|
|
||||||
ll_keywords=ll_keywords_list,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
logger.debug(f"[_build_query_context] Context length: {len(context) if context else 0}")
|
# Convert keywords strings to lists and add complete metadata to raw_data
|
||||||
logger.debug(f"[_build_query_context] Raw data entities: {len(raw_data.get('entities', []))}, relationships: {len(raw_data.get('relationships', []))}, chunks: {len(raw_data.get('chunks', []))}")
|
hl_keywords_list = hl_keywords.split(", ") if hl_keywords else []
|
||||||
|
ll_keywords_list = ll_keywords.split(", ") if ll_keywords else []
|
||||||
|
|
||||||
|
# Add complete metadata to raw_data
|
||||||
|
raw_data["metadata"]["keywords"] = {
|
||||||
|
"high_level": hl_keywords_list,
|
||||||
|
"low_level": ll_keywords_list,
|
||||||
|
}
|
||||||
|
raw_data["metadata"]["processing_info"] = {
|
||||||
|
"total_entities_found": len(search_result.get("final_entities", [])),
|
||||||
|
"total_relations_found": len(search_result.get("final_relations", [])),
|
||||||
|
"entities_after_truncation": len(
|
||||||
|
truncation_result.get("filtered_entities", [])
|
||||||
|
),
|
||||||
|
"relations_after_truncation": len(
|
||||||
|
truncation_result.get("filtered_relations", [])
|
||||||
|
),
|
||||||
|
"merged_chunks_count": len(merged_chunks),
|
||||||
|
"final_chunks_count": len(raw_data.get("chunks", [])),
|
||||||
|
}
|
||||||
|
|
||||||
|
logger.debug(
|
||||||
|
f"[_build_query_context] Context length: {len(context) if context else 0}"
|
||||||
|
)
|
||||||
|
logger.debug(
|
||||||
|
f"[_build_query_context] Raw data entities: {len(raw_data.get('entities', []))}, relationships: {len(raw_data.get('relationships', []))}, chunks: {len(raw_data.get('chunks', []))}"
|
||||||
|
)
|
||||||
return context, raw_data
|
return context, raw_data
|
||||||
else:
|
else:
|
||||||
# Normal context building (existing logic)
|
# Normal context building (existing logic)
|
||||||
|
|
@ -4135,15 +4160,23 @@ async def naive_query(
|
||||||
# If only raw data is requested, return it directly
|
# If only raw data is requested, return it directly
|
||||||
if return_raw_data:
|
if return_raw_data:
|
||||||
# Build raw data structure for naive mode using processed chunks
|
# Build raw data structure for naive mode using processed chunks
|
||||||
raw_data = {
|
raw_data = _convert_to_user_format(
|
||||||
"entities": [], # naive mode has no entities
|
[], # naive mode has no entities
|
||||||
"relationships": [], # naive mode has no relationships
|
[], # naive mode has no relationships
|
||||||
"chunks": processed_chunks, # Use processed chunks (same as LLM)
|
processed_chunks,
|
||||||
"metadata": {
|
"naive",
|
||||||
"query_mode": "naive",
|
)
|
||||||
"keywords": {"high_level": [], "low_level": []},
|
|
||||||
},
|
# Add complete metadata for naive mode
|
||||||
|
raw_data["metadata"]["keywords"] = {
|
||||||
|
"high_level": [], # naive mode has no keyword extraction
|
||||||
|
"low_level": [], # naive mode has no keyword extraction
|
||||||
}
|
}
|
||||||
|
raw_data["metadata"]["processing_info"] = {
|
||||||
|
"total_chunks_found": len(chunks),
|
||||||
|
"final_chunks_count": len(processed_chunks),
|
||||||
|
}
|
||||||
|
|
||||||
return raw_data
|
return raw_data
|
||||||
|
|
||||||
# Build text_units_context from processed chunks
|
# Build text_units_context from processed chunks
|
||||||
|
|
|
||||||
|
|
@ -2718,11 +2718,6 @@ def _convert_to_user_format(
|
||||||
relations_context: list[dict],
|
relations_context: list[dict],
|
||||||
final_chunks: list[dict],
|
final_chunks: list[dict],
|
||||||
query_mode: str,
|
query_mode: str,
|
||||||
hl_keywords: list[str] = None,
|
|
||||||
ll_keywords: list[str] = None,
|
|
||||||
search_result: dict = None,
|
|
||||||
truncation_result: dict = None,
|
|
||||||
merged_chunks: list[dict] = None,
|
|
||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
"""Convert internal data format to user-friendly format"""
|
"""Convert internal data format to user-friendly format"""
|
||||||
|
|
||||||
|
|
@ -2766,29 +2761,19 @@ def _convert_to_user_format(
|
||||||
}
|
}
|
||||||
formatted_chunks.append(chunk_data)
|
formatted_chunks.append(chunk_data)
|
||||||
|
|
||||||
logger.debug(f"[_convert_to_user_format] Formatted {len(formatted_chunks)}/{len(final_chunks)} chunks")
|
logger.debug(
|
||||||
|
f"[_convert_to_user_format] Formatted {len(formatted_chunks)}/{len(final_chunks)} chunks"
|
||||||
|
)
|
||||||
|
|
||||||
# Build metadata with processing info
|
# Build basic metadata (metadata details will be added by calling functions)
|
||||||
metadata = {
|
metadata = {
|
||||||
"query_mode": query_mode,
|
"query_mode": query_mode,
|
||||||
"keywords": {"high_level": hl_keywords or [], "low_level": ll_keywords or []},
|
"keywords": {
|
||||||
|
"high_level": [],
|
||||||
|
"low_level": [],
|
||||||
|
}, # Placeholder, will be set by calling functions
|
||||||
}
|
}
|
||||||
|
|
||||||
# Add processing info if available
|
|
||||||
if search_result and truncation_result and merged_chunks is not None:
|
|
||||||
metadata["processing_info"] = {
|
|
||||||
"total_entities_found": len(search_result.get("final_entities", [])),
|
|
||||||
"total_relations_found": len(search_result.get("final_relations", [])),
|
|
||||||
"entities_after_truncation": len(
|
|
||||||
truncation_result.get("filtered_entities", [])
|
|
||||||
),
|
|
||||||
"relations_after_truncation": len(
|
|
||||||
truncation_result.get("filtered_relations", [])
|
|
||||||
),
|
|
||||||
"merged_chunks_count": len(merged_chunks),
|
|
||||||
"final_chunks_count": len(final_chunks),
|
|
||||||
}
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"entities": formatted_entities,
|
"entities": formatted_entities,
|
||||||
"relationships": formatted_relationships,
|
"relationships": formatted_relationships,
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue