From c8f608b2ddfddb6d3159fedbf19abbd362943246 Mon Sep 17 00:00:00 2001 From: buua436 <66937541+buua436@users.noreply.github.com> Date: Wed, 3 Dec 2025 12:03:59 +0800 Subject: [PATCH 1/3] Feat:support tts in agent (#11675) ### What problem does this PR solve? change: support tts in agent ### Type of change - [x] New Feature (non-breaking change which adds functionality) --- agent/canvas.py | 106 ++++++++++++++++++++++++------ api/db/services/dialog_service.py | 39 ++++++++++- 2 files changed, 123 insertions(+), 22 deletions(-) diff --git a/agent/canvas.py b/agent/canvas.py index c447b77b3..667caec29 100644 --- a/agent/canvas.py +++ b/agent/canvas.py @@ -16,6 +16,7 @@ import asyncio import base64 import inspect +import binascii import json import logging import re @@ -28,7 +29,9 @@ from typing import Any, Union, Tuple from agent.component import component_class from agent.component.base import ComponentBase from api.db.services.file_service import FileService +from api.db.services.llm_service import LLMBundle from api.db.services.task_service import has_canceled +from common.constants import LLMType from common.misc_utils import get_uuid, hash_str2int from common.exceptions import TaskCanceledException from rag.prompts.generator import chunks_format @@ -356,8 +359,6 @@ class Canvas(Graph): self.globals[k] = "" else: self.globals[k] = "" - print(self.globals) - async def run(self, **kwargs): st = time.perf_counter() @@ -456,6 +457,7 @@ class Canvas(Graph): self.error = "" idx = len(self.path) - 1 partials = [] + tts_mdl = None while idx < len(self.path): to = len(self.path) for i in range(idx, to): @@ -473,31 +475,51 @@ class Canvas(Graph): cpn = self.get_component(self.path[i]) cpn_obj = self.get_component_obj(self.path[i]) if cpn_obj.component_name.lower() == "message": + if cpn_obj.get_param("auto_play"): + tts_mdl = LLMBundle(self._tenant_id, LLMType.TTS) if isinstance(cpn_obj.output("content"), partial): _m = "" + buff_m = "" stream = cpn_obj.output("content")() + async def _process_stream(m): + nonlocal buff_m, _m, tts_mdl + if not m: + return + if m == "": + return decorate("message", {"content": "", "start_to_think": True}) + + elif m == "": + return decorate("message", {"content": "", "end_to_think": True}) + + buff_m += m + _m += m + + if len(buff_m) > 16: + ev = decorate( + "message", + { + "content": m, + "audio_binary": self.tts(tts_mdl, buff_m) + } + ) + buff_m = "" + return ev + + return decorate("message", {"content": m}) + if inspect.isasyncgen(stream): async for m in stream: - if not m: - continue - if m == "": - yield decorate("message", {"content": "", "start_to_think": True}) - elif m == "": - yield decorate("message", {"content": "", "end_to_think": True}) - else: - yield decorate("message", {"content": m}) - _m += m + ev= await _process_stream(m) + if ev: + yield ev else: for m in stream: - if not m: - continue - if m == "": - yield decorate("message", {"content": "", "start_to_think": True}) - elif m == "": - yield decorate("message", {"content": "", "end_to_think": True}) - else: - yield decorate("message", {"content": m}) - _m += m + ev= await _process_stream(m) + if ev: + yield ev + if buff_m: + yield decorate("message", {"content": "", "audio_binary": self.tts(tts_mdl, buff_m)}) + buff_m = "" cpn_obj.set_output("content", _m) cite = re.search(r"\[ID:[ 0-9]+\]", _m) else: @@ -618,6 +640,50 @@ class Canvas(Graph): return False return True + + def tts(self,tts_mdl, text): + def clean_tts_text(text: str) -> str: + if not text: + return "" + + text = text.encode("utf-8", "ignore").decode("utf-8", "ignore") + + text = re.sub(r"[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]", "", text) + + emoji_pattern = re.compile( + "[\U0001F600-\U0001F64F" + "\U0001F300-\U0001F5FF" + "\U0001F680-\U0001F6FF" + "\U0001F1E0-\U0001F1FF" + "\U00002700-\U000027BF" + "\U0001F900-\U0001F9FF" + "\U0001FA70-\U0001FAFF" + "\U0001FAD0-\U0001FAFF]+", + flags=re.UNICODE + ) + text = emoji_pattern.sub("", text) + + text = re.sub(r"\s+", " ", text).strip() + + MAX_LEN = 500 + if len(text) > MAX_LEN: + text = text[:MAX_LEN] + + return text + if not tts_mdl or not text: + return None + text = clean_tts_text(text) + if not text: + return None + bin = b"" + try: + for chunk in tts_mdl.tts(text): + bin += chunk + except Exception as e: + logging.error(f"TTS failed: {e}, text={text!r}") + return None + return binascii.hexlify(bin).decode("utf-8") + def get_history(self, window_size): convs = [] if window_size <= 0: diff --git a/api/db/services/dialog_service.py b/api/db/services/dialog_service.py index f1b74ce82..4afdd1f3c 100644 --- a/api/db/services/dialog_service.py +++ b/api/db/services/dialog_service.py @@ -761,13 +761,48 @@ Please write the SQL, only SQL, without any other explanations or text. "prompt": sys_prompt, } +def clean_tts_text(text: str) -> str: + if not text: + return "" + + text = text.encode("utf-8", "ignore").decode("utf-8", "ignore") + + text = re.sub(r"[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]", "", text) + + emoji_pattern = re.compile( + "[\U0001F600-\U0001F64F" + "\U0001F300-\U0001F5FF" + "\U0001F680-\U0001F6FF" + "\U0001F1E0-\U0001F1FF" + "\U00002700-\U000027BF" + "\U0001F900-\U0001F9FF" + "\U0001FA70-\U0001FAFF" + "\U0001FAD0-\U0001FAFF]+", + flags=re.UNICODE + ) + text = emoji_pattern.sub("", text) + + text = re.sub(r"\s+", " ", text).strip() + + MAX_LEN = 500 + if len(text) > MAX_LEN: + text = text[:MAX_LEN] + + return text def tts(tts_mdl, text): if not tts_mdl or not text: return None + text = clean_tts_text(text) + if not text: + return None bin = b"" - for chunk in tts_mdl.tts(text): - bin += chunk + try: + for chunk in tts_mdl.tts(text): + bin += chunk + except Exception as e: + logging.error(f"TTS failed: {e}, text={text!r}") + return None return binascii.hexlify(bin).decode("utf-8") From 6fc7def5623799841a2ecd737d311c35599f9bd5 Mon Sep 17 00:00:00 2001 From: Billy Bao Date: Wed, 3 Dec 2025 12:22:01 +0800 Subject: [PATCH 2/3] Feat: optimize the information displayed when .doc preview is unavailable (#11684) ### What problem does this PR solve? Feat: optimize the information displayed when .doc preview is unavailable #11605 ### Type of change - [X] New Feature (non-breaking change which adds functionality) #### Performance (Before) image #### Performance (After) ![img_v3_02sk_c0fcaf74-4a26-4b6c-b0e0-8f8929426d9g](https://github.com/user-attachments/assets/8c8eea3e-2c8e-457c-ab2b-5ef205806f42) --- common/misc_utils.py | 4 +- .../document-preview/doc-preview.tsx | 46 ++++++++++++++++--- 2 files changed, 42 insertions(+), 8 deletions(-) diff --git a/common/misc_utils.py b/common/misc_utils.py index daebf4c8c..694b8b494 100644 --- a/common/misc_utils.py +++ b/common/misc_utils.py @@ -173,8 +173,8 @@ def install_mineru() -> None: Logging is used to indicate status. """ # Check if MinerU is enabled - use_mineru = os.getenv("USE_MINERU", "").strip().lower() - if use_mineru == "false": + use_mineru = os.getenv("USE_MINERU", "false").strip().lower() + if use_mineru != "true": logging.info("USE_MINERU=%r. Skipping MinerU installation.", use_mineru) return diff --git a/web/src/components/document-preview/doc-preview.tsx b/web/src/components/document-preview/doc-preview.tsx index 470765f6e..5f8dd0cf9 100644 --- a/web/src/components/document-preview/doc-preview.tsx +++ b/web/src/components/document-preview/doc-preview.tsx @@ -1,7 +1,5 @@ import message from '@/components/ui/message'; import { Spin } from '@/components/ui/spin'; -import { Authorization } from '@/constants/authorization'; -import { getAuthorization } from '@/utils/authorization-util'; import request from '@/utils/request'; import classNames from 'classnames'; import mammoth from 'mammoth'; @@ -16,22 +14,55 @@ export const DocPreviewer: React.FC = ({ className, url, }) => { - // const url = useGetDocumentUrl(); const [htmlContent, setHtmlContent] = useState(''); const [loading, setLoading] = useState(false); + const fetchDocument = async () => { + if (!url) return; + setLoading(true); + const res = await request(url, { method: 'GET', responseType: 'blob', - headers: { [Authorization]: getAuthorization() }, onError: () => { message.error('Document parsing failed'); console.error('Error loading document:', url); }, }); + try { - const arrayBuffer = await res.data.arrayBuffer(); + const blob: Blob = res.data; + const contentType: string = + blob.type || (res as any).headers?.['content-type'] || ''; + + // ---- Detect legacy .doc via MIME or URL ---- + const cleanUrl = url.split(/[?#]/)[0].toLowerCase(); + const isDocMime = /application\/msword/i.test(contentType); + const isLegacyDocByUrl = + cleanUrl.endsWith('.doc') && !cleanUrl.endsWith('.docx'); + const isLegacyDoc = isDocMime || isLegacyDocByUrl; + + if (isLegacyDoc) { + // Do not call mammoth and do not throw an error; instead, show a note in the preview area + setHtmlContent(` +
+
+

+ Preview not available for .doc files +

+

+ Mammoth does not support .doc documents.
+ Inline preview is unavailable. +

+
+
+ `); + return; + } + + // ---- Standard .docx preview path ---- + const arrayBuffer = await blob.arrayBuffer(); const result = await mammoth.convertToHtml( { arrayBuffer }, { includeDefaultStyleMap: true }, @@ -43,10 +74,12 @@ export const DocPreviewer: React.FC = ({ setHtmlContent(styledContent); } catch (err) { + // Only errors from the mammoth conversion path should surface here message.error('Document parsing failed'); console.error('Error parsing document:', err); + } finally { + setLoading(false); } - setLoading(false); }; useEffect(() => { @@ -54,6 +87,7 @@ export const DocPreviewer: React.FC = ({ fetchDocument(); } }, [url]); + return (
Date: Wed, 3 Dec 2025 12:27:50 +0800 Subject: [PATCH 3/3] Feat: support TOC transformer. (#11685) ### Type of change - [x] New Feature (non-breaking change which adds functionality) --- agent/tools/retrieval.py | 1 + rag/flow/extractor/extractor.py | 47 ++++++++++++++++++++++++++++++++- rag/svr/task_executor.py | 2 +- 3 files changed, 48 insertions(+), 2 deletions(-) diff --git a/agent/tools/retrieval.py b/agent/tools/retrieval.py index fd9096cb6..52e295fd0 100644 --- a/agent/tools/retrieval.py +++ b/agent/tools/retrieval.py @@ -198,6 +198,7 @@ class Retrieval(ToolBase, ABC): return if cks: kbinfos["chunks"] = cks + kbinfos["chunks"] = settings.retriever.retrieval_by_children(kbinfos["chunks"], [kb.tenant_id for kb in kbs]) if self._param.use_kg: ck = settings.kg_retriever.retrieval(query, [kb.tenant_id for kb in kbs], diff --git a/rag/flow/extractor/extractor.py b/rag/flow/extractor/extractor.py index 70464148a..45698b204 100644 --- a/rag/flow/extractor/extractor.py +++ b/rag/flow/extractor/extractor.py @@ -12,10 +12,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import json +import logging import random -from copy import deepcopy +from copy import deepcopy, copy + +import trio +import xxhash + from agent.component.llm import LLMParam, LLM from rag.flow.base import ProcessBase, ProcessParamBase +from rag.prompts.generator import run_toc_from_text class ExtractorParam(ProcessParamBase, LLMParam): @@ -31,6 +38,38 @@ class ExtractorParam(ProcessParamBase, LLMParam): class Extractor(ProcessBase, LLM): component_name = "Extractor" + def _build_TOC(self, docs): + self.callback(message="Start to generate table of content ...") + docs = sorted(docs, key=lambda d:( + d.get("page_num_int", 0)[0] if isinstance(d.get("page_num_int", 0), list) else d.get("page_num_int", 0), + d.get("top_int", 0)[0] if isinstance(d.get("top_int", 0), list) else d.get("top_int", 0) + )) + toc: list[dict] = trio.run(run_toc_from_text, [d["text"] for d in docs], self.chat_mdl) + logging.info("------------ T O C -------------\n"+json.dumps(toc, ensure_ascii=False, indent=' ')) + ii = 0 + while ii < len(toc): + try: + idx = int(toc[ii]["chunk_id"]) + del toc[ii]["chunk_id"] + toc[ii]["ids"] = [docs[idx]["id"]] + if ii == len(toc) -1: + break + for jj in range(idx+1, int(toc[ii+1]["chunk_id"])+1): + toc[ii]["ids"].append(docs[jj]["id"]) + except Exception as e: + logging.exception(e) + ii += 1 + + if toc: + d = copy.deepcopy(docs[-1]) + d["content_with_weight"] = json.dumps(toc, ensure_ascii=False) + d["toc_kwd"] = "toc" + d["available_int"] = 0 + d["page_num_int"] = [100000000] + d["id"] = xxhash.xxh64((d["content_with_weight"] + str(d["doc_id"])).encode("utf-8", "surrogatepass")).hexdigest() + return d + return None + async def _invoke(self, **kwargs): self.set_output("output_format", "chunks") self.callback(random.randint(1, 5) / 100.0, "Start to generate.") @@ -45,6 +84,12 @@ class Extractor(ProcessBase, LLM): chunks_key = k if chunks: + if self._param.field_name == "toc": + toc = self._build_TOC(chunks) + chunks.append(toc) + self.set_output("chunks", chunks) + return + prog = 0 for i, ck in enumerate(chunks): args[chunks_key] = ck["text"] diff --git a/rag/svr/task_executor.py b/rag/svr/task_executor.py index a91421007..25dea2b24 100644 --- a/rag/svr/task_executor.py +++ b/rag/svr/task_executor.py @@ -944,7 +944,7 @@ async def do_handle_task(task): logging.info(progress_message) progress_callback(msg=progress_message) if task["parser_id"].lower() == "naive" and task["parser_config"].get("toc_extraction", False): - toc_thread = executor.submit(build_TOC,task, chunks, progress_callback) + toc_thread = executor.submit(build_TOC, task, chunks, progress_callback) chunk_count = len(set([chunk["id"] for chunk in chunks])) start_ts = timer()