diff --git a/api/apps/conversation_app.py b/api/apps/conversation_app.py index a2ac131f3..01e20354f 100644 --- a/api/apps/conversation_app.py +++ b/api/apps/conversation_app.py @@ -402,7 +402,7 @@ async def related_questions(): if "parameter" in gen_conf: del gen_conf["parameter"] prompt = load_prompt("related_question") - ans = chat_mdl.chat( + ans = await chat_mdl.async_chat( prompt, [ { diff --git a/api/apps/sdk/session.py b/api/apps/sdk/session.py index ee81be7b7..e94f14fcc 100644 --- a/api/apps/sdk/session.py +++ b/api/apps/sdk/session.py @@ -788,7 +788,7 @@ Reason: - At the same time, related terms can also help search engines better understand user needs and return more accurate search results. """ - ans = chat_mdl.chat( + ans = await chat_mdl.async_chat( prompt, [ { @@ -1070,7 +1070,7 @@ async def related_questions_embedded(): gen_conf = search_config.get("llm_setting", {"temperature": 0.9}) prompt = load_prompt("related_question") - ans = chat_mdl.chat( + ans = await chat_mdl.async_chat( prompt, [ { diff --git a/rag/llm/chat_model.py b/rag/llm/chat_model.py index 1f38292ba..953b73942 100644 --- a/rag/llm/chat_model.py +++ b/rag/llm/chat_model.py @@ -1765,12 +1765,17 @@ class LiteLLMBase(ABC): yield ans, tol - async def async_chat(self, history, gen_conf, **kwargs): - logging.info("[HISTORY]" + json.dumps(history, ensure_ascii=False, indent=2)) + async def async_chat(self, system, history, gen_conf, **kwargs): + hist = list(history) if history else [] + if system: + if not hist or hist[0].get("role") != "system": + hist.insert(0, {"role": "system", "content": system}) + + logging.info("[HISTORY]" + json.dumps(hist, ensure_ascii=False, indent=2)) if self.model_name.lower().find("qwen3") >= 0: kwargs["extra_body"] = {"enable_thinking": False} - completion_args = self._construct_completion_args(history=history, stream=False, tools=False, **gen_conf) + completion_args = self._construct_completion_args(history=hist, stream=False, tools=False, **gen_conf) for attempt in range(self.max_retries + 1): try: