Refactor JSON serialization to use newline-separated format
- Replace json.dumps with line-by-line format
- Apply to entities, relations, text units
- Update truncation key functions
- Maintain ensure_ascii=False setting
- Improve context readability
parent 2dd143c935
commit a49c8e4a0d
2 changed files with 33 additions and 10 deletions
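The change swaps a single json.dumps call over a whole list for one JSON object per line (JSON Lines style), while keeping ensure_ascii=False so non-ASCII text stays unescaped. A minimal sketch of the before/after serialization, using made-up entity rows (the field names below are illustrative, not taken from the repository):

import json

# Hypothetical entity rows, for illustration only
entities_context = [
    {"entity": "Alice", "type": "person"},
    {"entity": "京都", "type": "location"},
]

# Old format: the whole list serialized as one JSON array on a single line
old_str = json.dumps(entities_context, ensure_ascii=False)
# [{"entity": "Alice", "type": "person"}, {"entity": "京都", "type": "location"}]

# New format: one JSON object per line, easier to scan inside an LLM prompt
new_str = "\n".join(
    json.dumps(entity, ensure_ascii=False) for entity in entities_context
)
# {"entity": "Alice", "type": "person"}
# {"entity": "京都", "type": "location"}

The truncation key lambdas in the diff below are updated in the same way, so truncate_list_by_token_size counts tokens against the newline-separated representation rather than the single-line JSON array.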
@@ -2707,7 +2707,9 @@ async def _apply_token_truncation(
     entities_context = truncate_list_by_token_size(
         entities_context,
-        key=lambda x: json.dumps(x, ensure_ascii=False),
+        key=lambda x: "\n".join(
+            json.dumps(item, ensure_ascii=False) for item in [x]
+        ),
         max_token_size=max_entity_tokens,
         tokenizer=tokenizer,
     )
@@ -2720,7 +2722,9 @@ async def _apply_token_truncation(
     relations_context = truncate_list_by_token_size(
         relations_context,
-        key=lambda x: json.dumps(x, ensure_ascii=False),
+        key=lambda x: "\n".join(
+            json.dumps(item, ensure_ascii=False) for item in [x]
+        ),
         max_token_size=max_relation_tokens,
         tokenizer=tokenizer,
     )
@@ -3066,8 +3070,12 @@ async def _build_llm_context(
         logger.warning("No tokenizer found, building context without token limits")

         # Build basic context without token processing
-        entities_str = json.dumps(entities_context, ensure_ascii=False)
-        relations_str = json.dumps(relations_context, ensure_ascii=False)
+        entities_str = "\n".join(
+            json.dumps(entity, ensure_ascii=False) for entity in entities_context
+        )
+        relations_str = "\n".join(
+            json.dumps(relation, ensure_ascii=False) for relation in relations_context
+        )

         text_units_context = []
         for i, chunk in enumerate(merged_chunks):
@@ -3113,8 +3121,12 @@ async def _build_llm_context(
     if merged_chunks:
         # Calculate dynamic token limit for text chunks
-        entities_str = json.dumps(entities_context, ensure_ascii=False)
-        relations_str = json.dumps(relations_context, ensure_ascii=False)
+        entities_str = "\n".join(
+            json.dumps(entity, ensure_ascii=False) for entity in entities_context
+        )
+        relations_str = "\n".join(
+            json.dumps(relation, ensure_ascii=False) for relation in relations_context
+        )

         # Calculate base context tokens (entities + relations + template)
         kg_context_template = """-----Entities(KG)-----
@@ -3226,9 +3238,15 @@ async def _build_llm_context(
     if chunk_tracking_log:
         logger.info(f"chunks: {' '.join(chunk_tracking_log)}")

-    entities_str = json.dumps(entities_context, ensure_ascii=False)
-    relations_str = json.dumps(relations_context, ensure_ascii=False)
-    text_units_str = json.dumps(text_units_context, ensure_ascii=False)
+    entities_str = "\n".join(
+        json.dumps(entity, ensure_ascii=False) for entity in entities_context
+    )
+    relations_str = "\n".join(
+        json.dumps(relation, ensure_ascii=False) for relation in relations_context
+    )
+    text_units_str = "\n".join(
+        json.dumps(text_unit, ensure_ascii=False) for text_unit in text_units_context
+    )

     result = f"""-----Entities(KG)-----
@@ -4005,6 +4023,9 @@ async def naive_query(
             }
         )

+    text_units_str = "\n".join(
+        json.dumps(text_unit, ensure_ascii=False) for text_unit in text_units_context
+    )

     if query_param.only_need_context and not query_param.only_need_prompt:
         return f"""
@@ -2428,7 +2428,9 @@ async def process_chunks_unified(
     unique_chunks = truncate_list_by_token_size(
         unique_chunks,
-        key=lambda x: json.dumps(x, ensure_ascii=False),
+        key=lambda x: "\n".join(
+            json.dumps(item, ensure_ascii=False) for item in [x]
+        ),
         max_token_size=chunk_token_limit,
         tokenizer=tokenizer,
     )