feat: add extract output to file

This commit is contained in:
Billy Bao 2025-11-13 16:45:37 +08:00
parent 70a0f081f6
commit 944d88de47
5 changed files with 90 additions and 2 deletions

View file

@ -52,6 +52,8 @@ RUN --mount=type=cache,id=ragflow_apt,target=/var/cache/apt,sharing=locked \
apt install -y libjemalloc-dev && \ apt install -y libjemalloc-dev && \
apt install -y python3-pip pipx nginx unzip curl wget git vim less && \ apt install -y python3-pip pipx nginx unzip curl wget git vim less && \
apt install -y ghostscript apt install -y ghostscript
apt install -y pandoc
apt install -y texlive
RUN if [ "$NEED_MIRROR" == "1" ]; then \ RUN if [ "$NEED_MIRROR" == "1" ]; then \
pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple && \ pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple && \

View file

@ -17,6 +17,9 @@ import json
import os import os
import random import random
import re import re
import pypandoc
import logging
import tempfile
from functools import partial from functools import partial
from typing import Any from typing import Any
@ -24,7 +27,8 @@ from agent.component.base import ComponentBase, ComponentParamBase
from jinja2 import Template as Jinja2Template from jinja2 import Template as Jinja2Template
from common.connection_utils import timeout from common.connection_utils import timeout
from common.misc_utils import get_uuid
from common import settings
class MessageParam(ComponentParamBase): class MessageParam(ComponentParamBase):
""" """
@ -34,6 +38,7 @@ class MessageParam(ComponentParamBase):
super().__init__() super().__init__()
self.content = [] self.content = []
self.stream = True self.stream = True
self.output_format = None # default output format
self.outputs = { self.outputs = {
"content": { "content": {
"type": "str" "type": "str"
@ -146,7 +151,7 @@ class Message(ComponentBase):
return return
rand_cnt = random.choice(self._param.content) rand_cnt = random.choice(self._param.content)
if self._param.stream and not self._is_jinjia2(rand_cnt): if self._param.stream and self._param.output_format is None and not self._is_jinjia2(rand_cnt):
self.set_output("content", partial(self._stream, rand_cnt)) self.set_output("content", partial(self._stream, rand_cnt))
return return
@ -164,6 +169,60 @@ class Message(ComponentBase):
content = re.sub(n, v, content) content = re.sub(n, v, content)
self.set_output("content", content) self.set_output("content", content)
self._convert_content(content)
def thoughts(self) -> str: def thoughts(self) -> str:
return "" return ""
def _convert_content(self, content):
doc_id = get_uuid()
try:
if self._param.output_format in {"markdown", "html"}:
if isinstance(content, str):
converted = pypandoc.convert_text(
content,
to=self._param.output_format,
format="markdown",
)
else:
converted = pypandoc.convert_file(
content,
to=self._param.output_format,
format="markdown",
)
binary_content = converted.encode("utf-8")
else: # pdf, docx
with tempfile.NamedTemporaryFile(suffix=f".{self._param.output_format}", delete=False) as tmp:
tmp_name = tmp.name
try:
if isinstance(content, str):
pypandoc.convert_text(
content,
to=self._param.output_format,
format="markdown",
outputfile=tmp_name,
)
else:
pypandoc.convert_file(
content,
to=self._param.output_format,
format="markdown",
outputfile=tmp_name,
)
with open(tmp_name, "rb") as f:
binary_content = f.read()
finally:
if os.path.exists(tmp_name):
os.remove(tmp_name)
settings.STORAGE_IMPL.put(self._canvas._tenant_id, doc_id, binary_content)
logging.info(f"Converted content uploaded as {doc_id} (format={self._param.output_format})")
except Exception as e:
logging.error(f"Error converting content to {self._param.output_format}: {e}")

View file

@ -517,6 +517,22 @@ def get(doc_id):
return server_error_response(e) return server_error_response(e)
@manager.route("/download/<attachment_id>", methods=["GET"])  # noqa: F821
@login_required
def download_attachment(attachment_id):
    """Return a stored attachment for the logged-in user as a file download.

    Args:
        attachment_id: Storage key of the attachment, taken from the URL path.

    Query args:
        ext: Format of the attachment, used only to choose the Content-Type
            (defaults to "markdown").

    Returns:
        A response carrying the stored bytes, or the result of
        ``server_error_response`` when the object is missing or the lookup
        fails.
    """
    try:
        ext = request.args.get("ext", "markdown")
        # NOTE(review): objects are looked up under current_user.id, so the
        # uploader must store them under the same id - confirm against the writer.
        data = settings.STORAGE_IMPL.get(current_user.id, attachment_id)
        if data is None:
            # Report a clear error instead of letting make_response(None) raise TypeError.
            return server_error_response(LookupError(f"Attachment {attachment_id} not found"))

        content_type = CONTENT_TYPE_MAP.get(ext)
        if content_type is None:
            # `ext` is caller-controlled: only a plain alphanumeric token may be
            # interpolated into the header; anything else becomes a generic binary type.
            if ext.isascii() and ext.isalnum():
                content_type = f"application/{ext}"
            else:
                content_type = "application/octet-stream"

        response = flask.make_response(data)
        response.headers.set("Content-Type", content_type)
        # The payload is user-generated: forbid MIME sniffing and force a
        # download so the browser never renders it inline on this origin.
        response.headers.set("X-Content-Type-Options", "nosniff")
        response.headers.set("Content-Disposition", "attachment")
        return response
    except Exception as e:
        return server_error_response(e)
@manager.route("/change_parser", methods=["POST"]) # noqa: F821 @manager.route("/change_parser", methods=["POST"]) # noqa: F821
@login_required @login_required
@validate_request("doc_id") @validate_request("doc_id")

View file

@ -145,6 +145,7 @@ dependencies = [
"markdownify>=1.2.0", "markdownify>=1.2.0",
"captcha>=0.7.1", "captcha>=0.7.1",
"pip>=25.2", "pip>=25.2",
"pypandoc>=1.16",
] ]
[dependency-groups] [dependency-groups]

10
uv.lock generated
View file

@ -4892,6 +4892,14 @@ wheels = [
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/80/28/2659c02301b9500751f8d42f9a6632e1508aa5120de5e43042b8b30f8d5d/pyopenssl-25.1.0-py3-none-any.whl", hash = "sha256:2b11f239acc47ac2e5aca04fd7fa829800aeee22a2eb30d744572a157bd8a1ab", size = 56771, upload-time = "2025-05-17T16:28:29.197Z" }, { url = "https://pypi.tuna.tsinghua.edu.cn/packages/80/28/2659c02301b9500751f8d42f9a6632e1508aa5120de5e43042b8b30f8d5d/pyopenssl-25.1.0-py3-none-any.whl", hash = "sha256:2b11f239acc47ac2e5aca04fd7fa829800aeee22a2eb30d744572a157bd8a1ab", size = 56771, upload-time = "2025-05-17T16:28:29.197Z" },
] ]
[[package]]
name = "pypandoc"
version = "1.16"
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
wheels = [
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/24/77/af1fc54740a0712988f9518e629d38edc7b8ffccd7549203f19c3d8a2db6/pypandoc-1.16-py3-none-any.whl", hash = "sha256:868f390d48388743e7a5885915cbbaa005dea36a825ecdfd571f8c523416c822", size = 19425, upload-time = "2025-11-08T15:44:38.429Z" },
]
[[package]] [[package]]
name = "pyparsing" name = "pyparsing"
version = "3.2.3" version = "3.2.3"
@ -5292,6 +5300,7 @@ dependencies = [
{ name = "pyicu" }, { name = "pyicu" },
{ name = "pymysql" }, { name = "pymysql" },
{ name = "pyodbc" }, { name = "pyodbc" },
{ name = "pypandoc" },
{ name = "pypdf" }, { name = "pypdf" },
{ name = "pypdf2" }, { name = "pypdf2" },
{ name = "python-calamine" }, { name = "python-calamine" },
@ -5447,6 +5456,7 @@ requires-dist = [
{ name = "pyicu", specifier = ">=2.15.3,<3.0.0" }, { name = "pyicu", specifier = ">=2.15.3,<3.0.0" },
{ name = "pymysql", specifier = ">=1.1.1,<2.0.0" }, { name = "pymysql", specifier = ">=1.1.1,<2.0.0" },
{ name = "pyodbc", specifier = ">=5.2.0,<6.0.0" }, { name = "pyodbc", specifier = ">=5.2.0,<6.0.0" },
{ name = "pypandoc", specifier = ">=1.16" },
{ name = "pypdf", specifier = "==6.0.0" }, { name = "pypdf", specifier = "==6.0.0" },
{ name = "pypdf2", specifier = ">=3.0.1,<4.0.0" }, { name = "pypdf2", specifier = ">=3.0.1,<4.0.0" },
{ name = "python-calamine", specifier = ">=0.4.0" }, { name = "python-calamine", specifier = ">=0.4.0" },