diff --git a/Dockerfile b/Dockerfile
index b16a0d7d5..6a8efed6e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -52,6 +52,8 @@ RUN --mount=type=cache,id=ragflow_apt,target=/var/cache/apt,sharing=locked \
     apt install -y libjemalloc-dev && \
     apt install -y python3-pip pipx nginx unzip curl wget git vim less && \
-    apt install -y ghostscript
+    apt install -y ghostscript && \
+    apt install -y pandoc && \
+    apt install -y texlive
 
 RUN if [ "$NEED_MIRROR" == "1" ]; then \
     pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple && \
diff --git a/agent/component/message.py b/agent/component/message.py
index 641198083..141f58448 100644
--- a/agent/component/message.py
+++ b/agent/component/message.py
@@ -17,6 +17,9 @@ import json
 import os
 import random
 import re
+import pypandoc
+import logging
+import tempfile
 from functools import partial
 from typing import Any
 
@@ -24,7 +27,8 @@ from agent.component.base import ComponentBase, ComponentParamBase
 from jinja2 import Template as Jinja2Template
 
 from common.connection_utils import timeout
-
+from common.misc_utils import get_uuid
+from common import settings
 
 class MessageParam(ComponentParamBase):
     """
@@ -34,6 +38,7 @@ class MessageParam(ComponentParamBase):
         super().__init__()
         self.content = []
         self.stream = True
+        self.output_format = None  # None = no conversion; else a pandoc target: "markdown", "html", "pdf", "docx"
         self.outputs = {
             "content": {
                 "type": "str"
@@ -146,7 +151,7 @@ class Message(ComponentBase):
             return
 
         rand_cnt = random.choice(self._param.content)
-        if self._param.stream and not self._is_jinjia2(rand_cnt):
+        if self._param.stream and not self._param.output_format and not self._is_jinjia2(rand_cnt):
             self.set_output("content", partial(self._stream, rand_cnt))
             return
 
@@ -164,6 +169,56 @@ class Message(ComponentBase):
             content = re.sub(n, v, content)
 
         self.set_output("content", content)
+        self._convert_content(content)
 
     def thoughts(self) -> str:
         return ""
+
+    def _convert_content(self, content):
+        """Convert the rendered message with pandoc and upload the result.
+
+        Nothing is converted unless ``output_format`` is set. Failures are
+        logged instead of raised, so the plain "content" output set by the
+        caller is unaffected.
+
+        Args:
+            content: Markdown source. A ``str`` is converted as text; any
+                other value is handed to ``pypandoc.convert_file``.
+
+        Returns:
+            The key the converted document was stored under, or ``None``
+            when nothing was converted or the conversion/upload failed.
+        """
+        output_format = self._param.output_format
+        if not output_format:
+            # Plain messages need no pandoc run (and no spurious error log).
+            return None
+
+        doc_id = get_uuid()
+        if isinstance(content, str):
+            convert = pypandoc.convert_text
+        else:
+            convert = pypandoc.convert_file
+
+        try:
+            if output_format in {"markdown", "html"}:
+                # Text targets are returned by pandoc as a string.
+                converted = convert(content, to=output_format, format="markdown")
+                binary_content = converted.encode("utf-8")
+            else:  # pdf, docx
+                # Binary targets must be written to a file; the directory
+                # and its contents are removed when the block exits.
+                with tempfile.TemporaryDirectory() as tmp_dir:
+                    tmp_name = os.path.join(tmp_dir, f"converted.{output_format}")
+                    convert(content, to=output_format, format="markdown", outputfile=tmp_name)
+                    with open(tmp_name, "rb") as f:
+                        binary_content = f.read()
+
+            settings.STORAGE_IMPL.put(self._canvas._tenant_id, doc_id, binary_content)
+            logging.info(f"Converted content uploaded as {doc_id} (format={output_format})")
+        except Exception:
+            # Best effort: log with traceback and keep the message flowing.
+            logging.exception(f"Error converting content to {output_format}")
+            return None
+
+        return doc_id
diff --git a/api/apps/document_app.py b/api/apps/document_app.py
index 12c19f978..c10fecdbf 100644
--- a/api/apps/document_app.py
+++ b/api/apps/document_app.py
@@ -517,6 +517,28 @@ def get(doc_id):
         return server_error_response(e)
 
 
+@manager.route("/download/<attachment_id>", methods=["GET"])  # noqa: F821
+@login_required
+def download_attachment(attachment_id):
+    """Send a stored attachment back to the logged-in user.
+
+    The ``ext`` query argument (default ``markdown``) selects the
+    Content-Type: the ``CONTENT_TYPE_MAP`` entry when there is one,
+    otherwise ``application/<ext>``.
+    """
+    try:
+        ext = request.args.get("ext", "markdown")
+        data = settings.STORAGE_IMPL.get(current_user.id, attachment_id)
+        response = flask.make_response(data)
+
+        response.headers.set("Content-Type", CONTENT_TYPE_MAP.get(ext, f"application/{ext}"))
+
+        return response
+
+    except Exception as e:
+        return server_error_response(e)
+
+
 @manager.route("/change_parser", methods=["POST"])  # noqa: F821
 @login_required
 @validate_request("doc_id")
diff --git a/pyproject.toml b/pyproject.toml
index 2ec792b90..c1210dfb4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -145,6 +145,7 @@ dependencies = [
     "markdownify>=1.2.0",
     "captcha>=0.7.1",
     "pip>=25.2",
+    "pypandoc>=1.16",
 ]
 
 [dependency-groups]
diff --git a/uv.lock b/uv.lock
index 166b34ce4..474ca510b 100644
--- a/uv.lock
+++ b/uv.lock
@@ -4892,6 +4892,14 @@ wheels = [
     { url = "https://pypi.tuna.tsinghua.edu.cn/packages/80/28/2659c02301b9500751f8d42f9a6632e1508aa5120de5e43042b8b30f8d5d/pyopenssl-25.1.0-py3-none-any.whl", hash = "sha256:2b11f239acc47ac2e5aca04fd7fa829800aeee22a2eb30d744572a157bd8a1ab", size = 56771, upload-time = "2025-05-17T16:28:29.197Z" },
 ]
 
+[[package]]
+name = "pypandoc"
+version = "1.16"
+source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
+wheels = [
+    { url = "https://pypi.tuna.tsinghua.edu.cn/packages/24/77/af1fc54740a0712988f9518e629d38edc7b8ffccd7549203f19c3d8a2db6/pypandoc-1.16-py3-none-any.whl", hash = "sha256:868f390d48388743e7a5885915cbbaa005dea36a825ecdfd571f8c523416c822", size = 19425, upload-time = "2025-11-08T15:44:38.429Z" },
+]
+
 [[package]]
 name = "pyparsing"
 version = "3.2.3"
@@ -5292,6 +5300,7 @@ dependencies = [
     { name = "pyicu" },
     { name = "pymysql" },
     { name = "pyodbc" },
+    { name = "pypandoc" },
     { name = "pypdf" },
     { name = "pypdf2" },
     { name = "python-calamine" },
@@ -5447,6 +5456,7 @@ requires-dist = [
     { name = "pyicu", specifier = ">=2.15.3,<3.0.0" },
     { name = "pymysql", specifier = ">=1.1.1,<2.0.0" },
     { name = "pyodbc", specifier = ">=5.2.0,<6.0.0" },
+    { name = "pypandoc", specifier = ">=1.16" },
     { name = "pypdf", specifier = "==6.0.0" },
     { name = "pypdf2", specifier = ">=3.0.1,<4.0.0" },
     { name = "python-calamine", specifier = ">=0.4.0" },