Refactor conversation history handling to use LLM native message format

• Remove get_conversation_turns utility
• Pass history_messages to LLM directly
• Clean up prompt template formatting
yangdx 2025-09-10 11:56:58 +08:00
parent e078ab7103
commit 2dd143c935
3 changed files with 26 additions and 121 deletions
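The "LLM native message format" named in the title is the OpenAI-style list of role/content dicts. A minimal sketch of what `query_param.conversation_history` is now expected to hold (the sample turns are invented for illustration):

```python
# OpenAI-style chat messages, now handed to the LLM verbatim as
# `history_messages` instead of being flattened into the system prompt.
conversation_history = [
    {"role": "user", "content": "Which entities relate to GraphRAG?"},
    {"role": "assistant", "content": "GraphRAG is linked to ..."},
    {"role": "user", "content": "Expand on the second relationship."},
]
```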

lightrag/operate.py

@@ -21,7 +21,6 @@ from .utils import (
     handle_cache,
     save_to_cache,
     CacheData,
-    get_conversation_turns,
     use_llm_func_with_cache,
     update_chunk_cache_list,
     remove_think_tags,
@@ -2180,13 +2179,6 @@ async def kg_query(
     if context is None:
         return PROMPTS["fail_response"]
 
-    # Process conversation history
-    history_context = ""
-    if query_param.conversation_history:
-        history_context = get_conversation_turns(
-            query_param.conversation_history, query_param.history_turns
-        )
-
     # Build system prompt
     user_prompt = (
         query_param.user_prompt
@@ -2197,7 +2189,6 @@ async def kg_query(
     sys_prompt = sys_prompt_temp.format(
         context_data=context,
         response_type=query_param.response_type,
-        history=history_context,
         user_prompt=user_prompt,
     )
@@ -2213,8 +2204,9 @@ async def kg_query(
     response = await use_model_func(
         query,
         system_prompt=sys_prompt,
-        stream=query_param.stream,
+        history_messages=query_param.conversation_history,
+        enable_cot=True,
+        stream=query_param.stream,
     )
     if isinstance(response, str) and len(response) > len(sys_prompt):
         response = (
@@ -2327,14 +2319,7 @@ async def extract_keywords_only(
     language = global_config["addon_params"].get("language", DEFAULT_SUMMARY_LANGUAGE)
 
-    # 3. Process conversation history
-    # history_context = ""
-    # if param.conversation_history:
-    #     history_context = get_conversation_turns(
-    #         param.conversation_history, param.history_turns
-    #     )
-
-    # 4. Build the keyword-extraction prompt
+    # 3. Build the keyword-extraction prompt
     kw_prompt = PROMPTS["keywords_extraction"].format(
         query=text,
         examples=examples,
@@ -2347,7 +2332,7 @@ async def extract_keywords_only(
         f"[extract_keywords] Sending to LLM: {len_of_prompts:,} tokens (Prompt: {len_of_prompts})"
     )
 
-    # 5. Call the LLM for keyword extraction
+    # 4. Call the LLM for keyword extraction
     if param.model_func:
         use_model_func = param.model_func
     else:
@@ -2357,7 +2342,7 @@ async def extract_keywords_only(
     result = await use_model_func(kw_prompt, keyword_extraction=True)
 
-    # 6. Parse out JSON from the LLM response
+    # 5. Parse out JSON from the LLM response
     result = remove_think_tags(result)
     try:
         keywords_data = json_repair.loads(result)
@@ -2372,7 +2357,7 @@ async def extract_keywords_only(
     hl_keywords = keywords_data.get("high_level_keywords", [])
     ll_keywords = keywords_data.get("low_level_keywords", [])
 
-    # 7. Cache only the processed keywords with cache type
+    # 6. Cache only the processed keywords with cache type
     if hl_keywords or ll_keywords:
         cache_data = {
             "high_level_keywords": hl_keywords,
@@ -3171,7 +3156,6 @@ async def _build_llm_context(
     # Create sample system prompt for overhead calculation
     sample_sys_prompt = sys_prompt_template.format(
-        history="",  # History not included in context length calculation
         context_data="",  # Empty for overhead calculation
         response_type=response_type,
         user_prompt=user_prompt,
@@ -3963,14 +3947,6 @@ async def naive_query(
         global_config.get("max_total_tokens", DEFAULT_MAX_TOTAL_TOKENS),
     )
 
-    # Calculate conversation history tokens
-    history_context = ""
-    if query_param.conversation_history:
-        history_context = get_conversation_turns(
-            query_param.conversation_history, query_param.history_turns
-        )
-    history_tokens = len(tokenizer.encode(history_context)) if history_context else 0
-
     # Calculate system prompt template tokens (excluding content_data)
     user_prompt = query_param.user_prompt if query_param.user_prompt else ""
     response_type = (
@@ -3988,7 +3964,6 @@ async def naive_query(
     sample_sys_prompt = sys_prompt_template.format(
         content_data="",  # Empty for overhead calculation
         response_type=response_type,
-        history=history_context,
         user_prompt=user_prompt,
     )
     sys_prompt_template_tokens = len(tokenizer.encode(sample_sys_prompt))
@@ -4004,7 +3979,7 @@ async def naive_query(
     available_chunk_tokens = max_total_tokens - used_tokens
     logger.debug(
-        f"Naive query token allocation - Total: {max_total_tokens}, History: {history_tokens}, SysPrompt: {sys_prompt_overhead}, Buffer: {buffer_tokens}, Available for chunks: {available_chunk_tokens}"
+        f"Naive query token allocation - Total: {max_total_tokens}, SysPrompt: {sys_prompt_overhead}, Buffer: {buffer_tokens}, Available for chunks: {available_chunk_tokens}"
     )
 
     # Process chunks using unified processing with dynamic token limit
@@ -4040,12 +4015,6 @@ async def naive_query(
     ```
     """
 
-    # Process conversation history
-    history_context = ""
-    if query_param.conversation_history:
-        history_context = get_conversation_turns(
-            query_param.conversation_history, query_param.history_turns
-        )
 
     # Build system prompt
     user_prompt = (
@@ -4057,7 +4026,6 @@ async def naive_query(
     sys_prompt = sys_prompt_temp.format(
         content_data=text_units_str,
         response_type=query_param.response_type,
-        history=history_context,
         user_prompt=user_prompt,
     )
@@ -4072,8 +4040,9 @@ async def naive_query(
     response = await use_model_func(
         query,
         system_prompt=sys_prompt,
-        stream=query_param.stream,
+        history_messages=query_param.conversation_history,
+        enable_cot=True,
+        stream=query_param.stream,
     )
     if isinstance(response, str) and len(response) > len(sys_prompt):
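Both call sites above now pass the raw history through as `history_messages`. The diff does not show how the model function consumes it; the sketch below illustrates one plausible binding, where everything beyond the keyword arguments visible in the diff is an assumption:

```python
from typing import Any


async def example_model_func(
    prompt: str,
    system_prompt: str | None = None,
    history_messages: list[dict[str, Any]] | None = None,
    enable_cot: bool = False,
    stream: bool = False,
    **kwargs: Any,
) -> str:
    # Assemble the transcript in native chat order: system prompt first,
    # then prior turns verbatim, then the new user query.
    messages: list[dict[str, Any]] = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.extend(history_messages or [])
    messages.append({"role": "user", "content": prompt})
    # A real binding would forward `messages` (plus stream/enable_cot
    # handling) to a chat-completion client; we only echo the shape here.
    return f"assembled {len(messages)} messages for the provider"
```

The design point is that the provider-side binding, not the prompt template, now decides how prior turns are serialized and truncated, which is also why the history token bookkeeping disappears from `naive_query` above.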

lightrag/prompt.py

@@ -176,32 +176,30 @@ You are a helpful assistant responding to user query about Knowledge Graph and Document Chunks
 Generate a concise response based on Knowledge Base and follow Response Rules, considering both current query and the conversation history if provided. Summarize all information in the provided Knowledge Base, and incorporating general knowledge relevant to the Knowledge Base. Do not include information not provided by Knowledge Base.
 
----Conversation History---
-{history}
-
 ---Knowledge Graph and Document Chunks---
 {context_data}
 
 ---Response Guidelines---
-**1. Content & Adherence:**
-- Strictly adhere to the provided context from the Knowledge Base. Do not invent, assume, or include any information not present in the source data.
-- If the answer cannot be found in the provided context, state that you do not have enough information to answer.
-- Ensure the response maintains continuity with the conversation history.
+1. **Content & Adherence:**
+  - Strictly adhere to the provided context from the Knowledge Base. Do not invent, assume, or include any information not present in the source data.
+  - If the answer cannot be found in the provided context, state that you do not have enough information to answer.
+  - Ensure the response maintains continuity with the conversation history.
 
-**2. Formatting & Language:**
-- Format the response using markdown with appropriate section headings.
-- The response language must in the same language as the user's question.
-- Target format and length: {response_type}
+2. **Formatting & Language:**
+  - Format the response using markdown with appropriate section headings.
+  - The response language must in the same language as the user's question.
+  - Target format and length: {response_type}
 
-**3. Citations / References:**
-- At the end of the response, under a "References" section, each citation must clearly indicate its origin (KG or DC).
-- The maximum number of citations is 5, including both KG and DC.
-- Use the following formats for citations:
-  - For a Knowledge Graph Entity: `[KG] <entity_name>`
-  - For a Knowledge Graph Relationship: `[KG] <entity1_name> - <entity2_name>`
-  - For a Document Chunk: `[DC] <file_path_or_document_name>`
+3. **Citations / References:**
+  - At the end of the response, under a "References" section, each citation must clearly indicate its origin (KG or DC).
+  - The maximum number of citations is 5, including both KG and DC.
+  - Use the following formats for citations:
+    - For a Knowledge Graph Entity: `[KG] <entity_name>`
+    - For a Knowledge Graph Relationship: `[KG] <entity1_name> ~ <entity2_name>`
+    - For a Document Chunk: `[DC] <file_path_or_document_name>`
 
----USER CONTEXT---
+---User Context---
 - Additional user prompt: {user_prompt}
 
 ---Response---
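With `{history}` gone from the template, the `history=` kwarg disappears from each `.format(...)` call mainly because the `history_context` variable itself no longer exists; note that `str.format` silently ignores extra keyword arguments, whereas a placeholder left in the template with no matching argument raises `KeyError`. A toy check of that contract (template text abbreviated):

```python
# Extra kwargs are ignored; missing placeholders raise.
template = "---User Context---\n- Additional user prompt: {user_prompt}\n"
print(template.format(user_prompt="", history="ignored"))  # works fine

try:
    "{history}".format(user_prompt="")  # placeholder with no argument
except KeyError as e:
    print("missing placeholder:", e)
```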

lightrag/utils.py

@@ -1144,68 +1144,6 @@ def exists_func(obj, func_name: str) -> bool:
     return False
 
-def get_conversation_turns(
-    conversation_history: list[dict[str, Any]], num_turns: int
-) -> str:
-    """
-    Process conversation history to get the specified number of complete turns.
-
-    Args:
-        conversation_history: List of conversation messages in chronological order
-        num_turns: Number of complete turns to include
-
-    Returns:
-        Formatted string of the conversation history
-    """
-    # Check if num_turns is valid
-    if num_turns <= 0:
-        return ""
-
-    # Group messages into turns
-    turns: list[list[dict[str, Any]]] = []
-    messages: list[dict[str, Any]] = []
-
-    # First, filter out keyword extraction messages
-    for msg in conversation_history:
-        if msg["role"] == "assistant" and (
-            msg["content"].startswith('{ "high_level_keywords"')
-            or msg["content"].startswith("{'high_level_keywords'")
-        ):
-            continue
-        messages.append(msg)
-
-    # Then process messages in chronological order
-    i = 0
-    while i < len(messages) - 1:
-        msg1 = messages[i]
-        msg2 = messages[i + 1]
-
-        # Check if we have a user-assistant or assistant-user pair
-        if (msg1["role"] == "user" and msg2["role"] == "assistant") or (
-            msg1["role"] == "assistant" and msg2["role"] == "user"
-        ):
-            # Always put user message first in the turn
-            if msg1["role"] == "assistant":
-                turn = [msg2, msg1]  # user, assistant
-            else:
-                turn = [msg1, msg2]  # user, assistant
-            turns.append(turn)
-        i += 2
-
-    # Keep only the most recent num_turns
-    if len(turns) > num_turns:
-        turns = turns[-num_turns:]
-
-    # Format the turns into a string
-    formatted_turns: list[str] = []
-    for turn in turns:
-        formatted_turns.extend(
-            [f"user: {turn[0]['content']}", f"assistant: {turn[1]['content']}"]
-        )
-
-    return "\n".join(formatted_turns)
-
 def always_get_an_event_loop() -> asyncio.AbstractEventLoop:
     """
     Ensure that there is always an event loop available.
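Callers that still want the flattened `user:`/`assistant:` text the deleted helper produced can approximate it inline; the sketch below keeps the output format but drops the helper's turn pairing and keyword-extraction filtering (the function name is ours, not part of the codebase):

```python
from typing import Any


def flatten_history(history: list[dict[str, Any]], num_turns: int) -> str:
    """Rough stand-in for the removed get_conversation_turns."""
    if num_turns <= 0:
        return ""
    msgs = [m for m in history if m["role"] in ("user", "assistant")]
    # Two messages per turn; keep only the most recent turns.
    msgs = msgs[-2 * num_turns :]
    return "\n".join(f"{m['role']}: {m['content']}" for m in msgs)
```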