From c8f608b2ddfddb6d3159fedbf19abbd362943246 Mon Sep 17 00:00:00 2001
From: buua436 <66937541+buua436@users.noreply.github.com>
Date: Wed, 3 Dec 2025 12:03:59 +0800
Subject: [PATCH 1/3] Feat:support tts in agent (#11675)

### What problem does this PR solve?

change:
support tts in agent

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
---
 agent/canvas.py                   | 106 ++++++++++++++++++++++++------
 api/db/services/dialog_service.py |  39 ++++++++++-
 2 files changed, 123 insertions(+), 22 deletions(-)
diff --git a/agent/canvas.py b/agent/canvas.py
index c447b77b3..667caec29 100644
--- a/agent/canvas.py
+++ b/agent/canvas.py
@@ -16,6 +16,7 @@
 import asyncio
 import base64
 import inspect
+import binascii
 import json
 import logging
 import re
@@ -28,7 +29,9 @@ from typing import Any, Union, Tuple
 from agent.component import component_class
 from agent.component.base import ComponentBase
 from api.db.services.file_service import FileService
+from api.db.services.llm_service import LLMBundle
 from api.db.services.task_service import has_canceled
+from common.constants import LLMType
 from common.misc_utils import get_uuid, hash_str2int
 from common.exceptions import TaskCanceledException
 from rag.prompts.generator import chunks_format
@@ -356,8 +359,6 @@ class Canvas(Graph):
                             self.globals[k] = ""
                 else:
                     self.globals[k] = ""
-        print(self.globals)
-                
 
     async def run(self, **kwargs):
         st = time.perf_counter()
@@ -456,6 +457,7 @@ class Canvas(Graph):
         self.error = ""
         idx = len(self.path) - 1
         partials = []
+        tts_mdl = None
         while idx < len(self.path):
             to = len(self.path)
             for i in range(idx, to):
@@ -473,31 +475,51 @@ class Canvas(Graph):
                 cpn = self.get_component(self.path[i])
                 cpn_obj = self.get_component_obj(self.path[i])
                 if cpn_obj.component_name.lower() == "message":
+                    if cpn_obj.get_param("auto_play"):
+                        tts_mdl = LLMBundle(self._tenant_id, LLMType.TTS)
                     if isinstance(cpn_obj.output("content"), partial):
                         _m = ""
+                        buff_m = ""
                         stream = cpn_obj.output("content")()
+                        async def _process_stream(m):
+                            nonlocal buff_m, _m, tts_mdl
+                            if not m:
+                                return
+                            if m == "<think>":
+                                return decorate("message", {"content": "", "start_to_think": True})
+
+                            elif m == "</think>":
+                                return decorate("message", {"content": "", "end_to_think": True})
+
+                            buff_m += m
+                            _m += m
+
+                            if len(buff_m) > 16:
+                                ev = decorate(
+                                    "message",
+                                    {
+                                        "content": m,
+                                        "audio_binary": self.tts(tts_mdl, buff_m)
+                                    }
+                                )
+                                buff_m = ""
+                                return ev
+
+                            return decorate("message", {"content": m})
+
                         if inspect.isasyncgen(stream):
                             async for m in stream:
-                                if not m:
-                                    continue
-                                if m == "<think>":
-                                    yield decorate("message", {"content": "", "start_to_think": True})
-                                elif m == "</think>":
-                                    yield decorate("message", {"content": "", "end_to_think": True})
-                                else:
-                                    yield decorate("message", {"content": m})
-                                    _m += m
+                                ev= await _process_stream(m)
+                                if ev:
+                                    yield ev
                         else:
                             for m in stream:
-                                if not m:
-                                    continue
-                                if m == "<think>":
-                                    yield decorate("message", {"content": "", "start_to_think": True})
-                                elif m == "</think>":
-                                    yield decorate("message", {"content": "", "end_to_think": True})
-                                else:
-                                    yield decorate("message", {"content": m})
-                                    _m += m
+                                ev= await _process_stream(m)
+                                if ev:
+                                    yield ev
+                        if buff_m:
+                            yield decorate("message", {"content": "", "audio_binary": self.tts(tts_mdl, buff_m)})
+                            buff_m = ""
                         cpn_obj.set_output("content", _m)
                         cite = re.search(r"\[ID:[ 0-9]+\]", _m)
                     else:
@@ -618,6 +640,50 @@ class Canvas(Graph):
             return False
         return True
 
+
+    def tts(self,tts_mdl, text):
+        def clean_tts_text(text: str) -> str:
+            if not text:
+                return ""
+
+            text = text.encode("utf-8", "ignore").decode("utf-8", "ignore")
+
+            text = re.sub(r"[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]", "", text)
+
+            emoji_pattern = re.compile(
+                "[\U0001F600-\U0001F64F"
+                "\U0001F300-\U0001F5FF"
+                "\U0001F680-\U0001F6FF"
+                "\U0001F1E0-\U0001F1FF"
+                "\U00002700-\U000027BF"
+                "\U0001F900-\U0001F9FF"
+                "\U0001FA70-\U0001FAFF"
+                "\U0001FAD0-\U0001FAFF]+",
+                flags=re.UNICODE
+            )
+            text = emoji_pattern.sub("", text)
+
+            text = re.sub(r"\s+", " ", text).strip()
+
+            MAX_LEN = 500
+            if len(text) > MAX_LEN:
+                text = text[:MAX_LEN]
+
+            return text
+        if not tts_mdl or not text:
+            return None
+        text = clean_tts_text(text)
+        if not text:
+            return None
+        bin = b""
+        try:
+            for chunk in tts_mdl.tts(text):
+                bin += chunk
+        except Exception as e:
+            logging.error(f"TTS failed: {e}, text={text!r}")
+            return None
+        return binascii.hexlify(bin).decode("utf-8")
+
     def get_history(self, window_size):
         convs = []
         if window_size <= 0:
diff --git a/api/db/services/dialog_service.py b/api/db/services/dialog_service.py
index f1b74ce82..4afdd1f3c 100644
--- a/api/db/services/dialog_service.py
+++ b/api/db/services/dialog_service.py
@@ -761,13 +761,48 @@ Please write the SQL, only SQL, without any other explanations or text.
         "prompt": sys_prompt,
     }
 
+def clean_tts_text(text: str) -> str:
+    if not text:
+        return ""
+
+    text = text.encode("utf-8", "ignore").decode("utf-8", "ignore")
+
+    text = re.sub(r"[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]", "", text)
+
+    emoji_pattern = re.compile(
+        "[\U0001F600-\U0001F64F"
+        "\U0001F300-\U0001F5FF"
+        "\U0001F680-\U0001F6FF"
+        "\U0001F1E0-\U0001F1FF"
+        "\U00002700-\U000027BF"
+        "\U0001F900-\U0001F9FF"
+        "\U0001FA70-\U0001FAFF"
+        "\U0001FAD0-\U0001FAFF]+",
+        flags=re.UNICODE
+    )
+    text = emoji_pattern.sub("", text)
+
+    text = re.sub(r"\s+", " ", text).strip()
+
+    MAX_LEN = 500
+    if len(text) > MAX_LEN:
+        text = text[:MAX_LEN]
+
+    return text
 
 def tts(tts_mdl, text):
     if not tts_mdl or not text:
         return None
+    text = clean_tts_text(text)
+    if not text:
+        return None
     bin = b""
-    for chunk in tts_mdl.tts(text):
-        bin += chunk
+    try:
+        for chunk in tts_mdl.tts(text):
+            bin += chunk
+    except Exception as e:
+        logging.error(f"TTS failed: {e}, text={text!r}")
+        return None
     return binascii.hexlify(bin).decode("utf-8")
 
 

From 6fc7def5623799841a2ecd737d311c35599f9bd5 Mon Sep 17 00:00:00 2001
From: Billy Bao <newyorkupperbay@gmail.com>
Date: Wed, 3 Dec 2025 12:22:01 +0800
Subject: [PATCH 2/3] Feat: optimize the information displayed when .doc
 preview is unavailable (#11684)

### What problem does this PR solve?

Feat: optimize the information displayed when .doc preview is
unavailable #11605

### Type of change

- [X] New Feature (non-breaking change which adds functionality)


#### Performance (Before)
<img width="700" alt="image"
src="https://github.com/user-attachments/assets/15cf69ee-3698-4e18-8e8f-bb75c321334d"
/>

#### Performance (After)

![img_v3_02sk_c0fcaf74-4a26-4b6c-b0e0-8f8929426d9g](https://github.com/user-attachments/assets/8c8eea3e-2c8e-457c-ab2b-5ef205806f42)
---
 common/misc_utils.py                          |  4 +-
 .../document-preview/doc-preview.tsx          | 46 ++++++++++++++++---
 2 files changed, 42 insertions(+), 8 deletions(-)

diff --git a/common/misc_utils.py b/common/misc_utils.py
index daebf4c8c..694b8b494 100644
--- a/common/misc_utils.py
+++ b/common/misc_utils.py
@@ -173,8 +173,8 @@ def install_mineru() -> None:
         Logging is used to indicate status.
     """
     # Check if MinerU is enabled
-    use_mineru = os.getenv("USE_MINERU", "").strip().lower()
-    if use_mineru == "false":
+    use_mineru = os.getenv("USE_MINERU", "false").strip().lower()
+    if use_mineru != "true":
         logging.info("USE_MINERU=%r. Skipping MinerU installation.", use_mineru)
         return
 
diff --git a/web/src/components/document-preview/doc-preview.tsx b/web/src/components/document-preview/doc-preview.tsx
index 470765f6e..5f8dd0cf9 100644
--- a/web/src/components/document-preview/doc-preview.tsx
+++ b/web/src/components/document-preview/doc-preview.tsx
@@ -1,7 +1,5 @@
 import message from '@/components/ui/message';
 import { Spin } from '@/components/ui/spin';
-import { Authorization } from '@/constants/authorization';
-import { getAuthorization } from '@/utils/authorization-util';
 import request from '@/utils/request';
 import classNames from 'classnames';
 import mammoth from 'mammoth';
@@ -16,22 +14,55 @@ export const DocPreviewer: React.FC<DocPreviewerProps> = ({
   className,
   url,
 }) => {
-  // const url = useGetDocumentUrl();
   const [htmlContent, setHtmlContent] = useState<string>('');
   const [loading, setLoading] = useState(false);
+
   const fetchDocument = async () => {
+    if (!url) return;
+
     setLoading(true);
+
     const res = await request(url, {
       method: 'GET',
       responseType: 'blob',
-      headers: { [Authorization]: getAuthorization() },
       onError: () => {
         message.error('Document parsing failed');
         console.error('Error loading document:', url);
       },
     });
+
     try {
-      const arrayBuffer = await res.data.arrayBuffer();
+      const blob: Blob = res.data;
+      const contentType: string =
+        blob.type || (res as any).headers?.['content-type'] || '';
+
+      // ---- Detect legacy .doc via MIME or URL ----
+      const cleanUrl = url.split(/[?#]/)[0].toLowerCase();
+      const isDocMime = /application\/msword/i.test(contentType);
+      const isLegacyDocByUrl =
+        cleanUrl.endsWith('.doc') && !cleanUrl.endsWith('.docx');
+      const isLegacyDoc = isDocMime || isLegacyDocByUrl;
+
+      if (isLegacyDoc) {
+        // Do not call mammoth and do not throw an error; instead, show a note in the preview area
+        setHtmlContent(`
+          <div class="flex h-full items-center justify-center">
+            <div class="border border-dashed border-border-normal rounded-xl p-8 max-w-2xl text-center">
+              <p class="text-2xl font-bold mb-4">
+                Preview not available for .doc files
+              </p>
+              <p class="italic text-sm text-muted-foreground leading-relaxed">
+                Mammoth does not support <code>.doc</code> documents.<br/>
+                Inline preview is unavailable.
+              </p>
+            </div>
+          </div>
+        `);
+        return;
+      }
+
+      // ---- Standard .docx preview path ----
+      const arrayBuffer = await blob.arrayBuffer();
       const result = await mammoth.convertToHtml(
         { arrayBuffer },
         { includeDefaultStyleMap: true },
@@ -43,10 +74,12 @@ export const DocPreviewer: React.FC<DocPreviewerProps> = ({
 
       setHtmlContent(styledContent);
     } catch (err) {
+      // Only errors from the mammoth conversion path should surface here
       message.error('Document parsing failed');
       console.error('Error parsing document:', err);
+    } finally {
+      setLoading(false);
     }
-    setLoading(false);
   };
 
   useEffect(() => {
@@ -54,6 +87,7 @@ export const DocPreviewer: React.FC<DocPreviewerProps> = ({
       fetchDocument();
     }
   }, [url]);
+
   return (
     <div
       className={classNames(

From b5ad7b7062628811b6bbe8d62b8f061e8031d1c5 Mon Sep 17 00:00:00 2001
From: Kevin Hu <kevinhu.sh@gmail.com>
Date: Wed, 3 Dec 2025 12:27:50 +0800
Subject: [PATCH 3/3] Feat: support TOC transformer. (#11685)

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
---
 agent/tools/retrieval.py        |  1 +
 rag/flow/extractor/extractor.py | 47 ++++++++++++++++++++++++++++++++-
 rag/svr/task_executor.py        |  2 +-
 3 files changed, 48 insertions(+), 2 deletions(-)

diff --git a/agent/tools/retrieval.py b/agent/tools/retrieval.py
index fd9096cb6..52e295fd0 100644
--- a/agent/tools/retrieval.py
+++ b/agent/tools/retrieval.py
@@ -198,6 +198,7 @@ class Retrieval(ToolBase, ABC):
                     return
                 if cks:
                     kbinfos["chunks"] = cks
+            kbinfos["chunks"] = settings.retriever.retrieval_by_children(kbinfos["chunks"], [kb.tenant_id for kb in kbs])
             if self._param.use_kg:
                 ck = settings.kg_retriever.retrieval(query,
                                                        [kb.tenant_id for kb in kbs],
diff --git a/rag/flow/extractor/extractor.py b/rag/flow/extractor/extractor.py
index 70464148a..45698b204 100644
--- a/rag/flow/extractor/extractor.py
+++ b/rag/flow/extractor/extractor.py
@@ -12,10 +12,17 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
+import json
+import logging
 import random
-from copy import deepcopy
+from copy import deepcopy, copy
+
+import trio
+import xxhash
+
 from agent.component.llm import LLMParam, LLM
 from rag.flow.base import ProcessBase, ProcessParamBase
+from rag.prompts.generator import run_toc_from_text
 
 
 class ExtractorParam(ProcessParamBase, LLMParam):
@@ -31,6 +38,38 @@ class ExtractorParam(ProcessParamBase, LLMParam):
 class Extractor(ProcessBase, LLM):
     component_name = "Extractor"
 
+    def _build_TOC(self, docs):
+        self.callback(message="Start to generate table of content ...")
+        docs = sorted(docs, key=lambda d:(
+            d.get("page_num_int", 0)[0] if isinstance(d.get("page_num_int", 0), list) else d.get("page_num_int", 0),
+            d.get("top_int", 0)[0] if isinstance(d.get("top_int", 0), list) else d.get("top_int", 0)
+        ))
+        toc: list[dict] = trio.run(run_toc_from_text, [d["text"] for d in docs], self.chat_mdl)
+        logging.info("------------ T O C -------------\n"+json.dumps(toc, ensure_ascii=False, indent='  '))
+        ii = 0
+        while ii < len(toc):
+            try:
+                idx = int(toc[ii]["chunk_id"])
+                del toc[ii]["chunk_id"]
+                toc[ii]["ids"] = [docs[idx]["id"]]
+                if ii == len(toc) -1:
+                    break
+                for jj in range(idx+1, int(toc[ii+1]["chunk_id"])+1):
+                    toc[ii]["ids"].append(docs[jj]["id"])
+            except Exception as e:
+                logging.exception(e)
+            ii += 1
+
+        if toc:
+            d = copy.deepcopy(docs[-1])
+            d["content_with_weight"] = json.dumps(toc, ensure_ascii=False)
+            d["toc_kwd"] = "toc"
+            d["available_int"] = 0
+            d["page_num_int"] = [100000000]
+            d["id"] = xxhash.xxh64((d["content_with_weight"] + str(d["doc_id"])).encode("utf-8", "surrogatepass")).hexdigest()
+            return d
+        return None
+
     async def _invoke(self, **kwargs):
         self.set_output("output_format", "chunks")
         self.callback(random.randint(1, 5) / 100.0, "Start to generate.")
@@ -45,6 +84,12 @@ class Extractor(ProcessBase, LLM):
                 chunks_key = k
 
         if chunks:
+            if self._param.field_name == "toc":
+                toc = self._build_TOC(chunks)
+                chunks.append(toc)
+                self.set_output("chunks", chunks)
+                return
+            
             prog = 0
             for i, ck in enumerate(chunks):
                 args[chunks_key] = ck["text"]
diff --git a/rag/svr/task_executor.py b/rag/svr/task_executor.py
index a91421007..25dea2b24 100644
--- a/rag/svr/task_executor.py
+++ b/rag/svr/task_executor.py
@@ -944,7 +944,7 @@ async def do_handle_task(task):
         logging.info(progress_message)
         progress_callback(msg=progress_message)
         if task["parser_id"].lower() == "naive" and task["parser_config"].get("toc_extraction", False):
-            toc_thread = executor.submit(build_TOC,task, chunks, progress_callback)
+            toc_thread = executor.submit(build_TOC, task, chunks, progress_callback)
 
     chunk_count = len(set([chunk["id"] for chunk in chunks]))
     start_ts = timer()