From 03d22b9e719e81f438bb53c4421b6e3760350600 Mon Sep 17 00:00:00 2001 From: user210 Date: Fri, 12 Dec 2025 20:07:34 +0200 Subject: [PATCH] Feat: Enhance MinerU options with language selection and parsing configurations --- api/db/services/task_service.py | 7 ++ deepdoc/parser/mineru_parser.py | 34 ++++++- rag/app/naive.py | 2 + rag/llm/ocr_model.py | 3 +- .../components/mineru-options-form-field.tsx | 97 +++++++++++++++++++ web/src/constants/common.ts | 10 +- web/src/locales/en.ts | 9 ++ web/src/locales/zh.ts | 9 ++ .../dataset-setting/configuration/naive.tsx | 2 + .../dataset/dataset-setting/form-schema.ts | 5 + .../dataset/dataset-setting/general-form.tsx | 23 +++++ .../pages/dataset/dataset-setting/index.tsx | 5 + .../datasets/dataset-creating-dialog.tsx | 46 ++++++++- 13 files changed, 246 insertions(+), 6 deletions(-) create mode 100644 web/src/components/mineru-options-form-field.tsx diff --git a/api/db/services/task_service.py b/api/db/services/task_service.py index 9c771223f..d9b6d29ed 100644 --- a/api/db/services/task_service.py +++ b/api/db/services/task_service.py @@ -121,6 +121,13 @@ class TaskService(CommonService): .where(cls.model.id == task_id) ) docs = list(docs.dicts()) + # Assuming docs = list(docs.dicts()) + if docs: + kb_config = docs[0]['kb_parser_config'] # Dict from Knowledgebase.parser_config + mineru_method = kb_config.get('mineru_parse_method', 'auto') + mineru_formula = kb_config.get('mineru_formula_enable', True) + mineru_table = kb_config.get('mineru_table_enable', True) + print(mineru_method, mineru_formula, mineru_table) if not docs: return None diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py index 2883bf881..7fff7822a 100644 --- a/deepdoc/parser/mineru_parser.py +++ b/deepdoc/parser/mineru_parser.py @@ -53,6 +53,30 @@ class MinerUContentType(StrEnum): DISCARDED = "discarded" +# Mapping from language names to MinerU language codes +LANGUAGE_TO_MINERU_MAP = { + 'English': 'en', + 'Chinese': 'ch', + 'Traditional Chinese': 'chinese_cht', + 'Russian': 'east_slavic', + 'Ukrainian': 'east_slavic', + 'Indonesian': 'latin', + 'Spanish': 'latin', + 'Vietnamese': 'latin', + 'Japanese': 'japan', + 'Korean': 'korean', + 'Portuguese BR': 'latin', + 'German': 'latin', + 'French': 'latin', + 'Italian': 'latin', + 'Tamil': 'ta', + 'Telugu': 'te', + 'Kannada': 'ka', + 'Thai': 'th', + 'Greek': 'el', + 'Hindi': 'devanagari', +} + class MinerUParser(RAGFlowPdfParser): def __init__(self, mineru_path: str = "mineru", mineru_api: str = "", mineru_server_url: str = ""): self.mineru_path = Path(mineru_path) @@ -576,17 +600,21 @@ class MinerUParser(RAGFlowPdfParser): *, output_dir: Optional[str] = None, backend: str = "pipeline", - lang: Optional[str] = None, - method: str = "auto", server_url: Optional[str] = None, delete_output: bool = True, parse_method: str = "raw", + **kwargs, ) -> tuple: import shutil temp_pdf = None created_tmp_dir = False + # Assuming the dict is defined as shown + lang = kwargs.get('lang', 'English') + mineru_lang_code = LANGUAGE_TO_MINERU_MAP.get(lang, 'ch') # Returns 'ch' if lang not found + # Use .get() to avoid KeyError if the key doesn't exist + # remove spaces, or mineru crash, and _read_output fail too file_path = Path(filepath) pdf_file_name = file_path.stem.replace(" ", "") + ".pdf" @@ -625,7 +653,7 @@ class MinerUParser(RAGFlowPdfParser): self.__images__(pdf, zoomin=1) try: - self._run_mineru(pdf, out_dir, method=method, backend=backend, lang=lang, server_url=server_url, callback=callback) + self._run_mineru(pdf, out_dir, method=method, backend=backend, lang=mineru_lang_code, server_url=server_url, callback=callback) outputs = self._read_output(out_dir, pdf.stem, method=method, backend=backend) self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.") if callback: diff --git a/rag/app/naive.py b/rag/app/naive.py index 353504d77..a85df3538 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -85,6 +85,8 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese" binary=binary, callback=callback, parse_method=parse_method, + lang=lang, + **kwargs ) return sections, tables, pdf_parser except Exception as e: diff --git a/rag/llm/ocr_model.py b/rag/llm/ocr_model.py index b18a16a36..15950131d 100644 --- a/rag/llm/ocr_model.py +++ b/rag/llm/ocr_model.py @@ -57,7 +57,7 @@ class MinerUOcrModel(Base, MinerUParser): server_url = server_url or self.mineru_server_url return self.check_installation(backend=backend, server_url=server_url) - def parse_pdf(self, filepath: str, binary=None, callback=None, parse_method: str = "raw", **kwargs): + def parse_pdf(self, filepath: str, binary=None, callback=None, parse_method: str = "raw",**kwargs): ok, reason = self.check_available() if not ok: raise RuntimeError(f"MinerU not found or server not accessible: {reason}. Please install it via: pip install -U 'mineru[core]'.") @@ -72,5 +72,6 @@ class MinerUOcrModel(Base, MinerUParser): server_url=self.mineru_server_url, delete_output=self.mineru_delete_output, parse_method=parse_method, + **kwargs ) return sections, tables diff --git a/web/src/components/mineru-options-form-field.tsx b/web/src/components/mineru-options-form-field.tsx new file mode 100644 index 000000000..5a64a7337 --- /dev/null +++ b/web/src/components/mineru-options-form-field.tsx @@ -0,0 +1,97 @@ +import { RAGFlowFormItem } from '@/components/ragflow-form'; +import { RAGFlowSelect } from '@/components/ui/select'; +import { Switch } from '@/components/ui/switch'; +import { LLMFactory } from '@/constants/llm'; +import { buildOptions } from '@/utils/form'; +import { useFormContext, useWatch } from 'react-hook-form'; +import { useTranslation } from 'react-i18next'; + +const parseMethodOptions = buildOptions(['auto', 'txt', 'ocr']); + +export function MinerUOptionsFormField() { + const form = useFormContext(); + const { t } = useTranslation(); + + const layoutRecognize = useWatch({ + control: form.control, + name: 'parser_config.layout_recognize', + }); + + // Check if MinerU is selected (the value contains 'MinerU' or matches the factory name) + const isMinerUSelected = + layoutRecognize?.includes(LLMFactory.MinerU) || + layoutRecognize?.toLowerCase()?.includes('mineru'); + + if (!isMinerUSelected) { + return null; + } + + return ( +
+
+ {t('knowledgeConfiguration.mineruOptions', 'MinerU Options')} +
+ + + {(field) => ( + + )} + + + + {(field) => ( + + )} + + + + {(field) => ( + + )} + +
+ ); +} diff --git a/web/src/constants/common.ts b/web/src/constants/common.ts index 18d06ca39..205c28f4c 100644 --- a/web/src/constants/common.ts +++ b/web/src/constants/common.ts @@ -103,14 +103,22 @@ export const LanguageTranslationMap = { Chinese: 'zh', 'Traditional Chinese': 'zh-TRADITIONAL', Russian: 'ru', - Indonesia: 'id', + Indonesian: 'id', Spanish: 'es', Vietnamese: 'vi', Japanese: 'ja', + Korean: 'ko', 'Portuguese BR': 'pt-br', German: 'de', French: 'fr', Italian: 'it', + Tamil: 'ta', + Telugu: 'te', + Kannada: 'ka', + Thai: 'th', + Greek: 'el', + Hindi: 'hi', + Ukrainian: 'uk', }; export enum FileMimeType { diff --git a/web/src/locales/en.ts b/web/src/locales/en.ts index fd9977257..8abb38838 100644 --- a/web/src/locales/en.ts +++ b/web/src/locales/en.ts @@ -329,6 +329,15 @@ export default { reRankModelWaring: 'Re-rank model is very time consuming.', }, knowledgeConfiguration: { + mineruOptions: 'MinerU Options', + mineruParseMethod: 'Parse Method', + mineruParseMethodTip: + 'Method for parsing PDF: auto (automatic detection), txt (text extraction), ocr (optical character recognition)', + mineruFormulaEnable: 'Formula Recognition', + mineruFormulaEnableTip: + 'Enable formula recognition. Note: This may not work correctly for Cyrillic documents.', + mineruTableEnable: 'Table Recognition', + mineruTableEnableTip: 'Enable table recognition and extraction.', overlappedPercent: 'Overlapped percent', generationScopeTip: 'Determines whether RAPTOR is generated for the entire dataset or for a single file.', diff --git a/web/src/locales/zh.ts b/web/src/locales/zh.ts index 50b2faae3..28563e09e 100644 --- a/web/src/locales/zh.ts +++ b/web/src/locales/zh.ts @@ -264,6 +264,15 @@ export default { theDocumentBeingParsedCannotBeDeleted: '正在解析的文档不能被删除', }, knowledgeConfiguration: { + mineruOptions: 'MinerU 选项', + mineruParseMethod: '解析方法', + mineruParseMethodTip: + 'PDF 解析方法:auto(自动检测)、txt(文本提取)、ocr(光学字符识别)', + mineruFormulaEnable: '公式识别', + mineruFormulaEnableTip: + '启用公式识别。注意:对于西里尔文档可能无法正常工作。', + mineruTableEnable: '表格识别', + mineruTableEnableTip: '启用表格识别和提取。', generationScopeTip: '选择 RAPTOR 的生成范围:整个知识库或单个文件。', generationScope: '生成范围', scopeSingleFile: '单文件', diff --git a/web/src/pages/dataset/dataset-setting/configuration/naive.tsx b/web/src/pages/dataset/dataset-setting/configuration/naive.tsx index 3742db39f..d14eca96f 100644 --- a/web/src/pages/dataset/dataset-setting/configuration/naive.tsx +++ b/web/src/pages/dataset/dataset-setting/configuration/naive.tsx @@ -6,6 +6,7 @@ import { DelimiterFormField } from '@/components/delimiter-form-field'; import { ExcelToHtmlFormField } from '@/components/excel-to-html-form-field'; import { LayoutRecognizeFormField } from '@/components/layout-recognize-form-field'; import { MaxTokenNumberFormField } from '@/components/max-token-number-from-field'; +import { MinerUOptionsFormField } from '@/components/mineru-options-form-field'; import { ConfigurationFormContainer, MainContainer, @@ -17,6 +18,7 @@ export function NaiveConfiguration() { + diff --git a/web/src/pages/dataset/dataset-setting/form-schema.ts b/web/src/pages/dataset/dataset-setting/form-schema.ts index ae7342ede..00c00a725 100644 --- a/web/src/pages/dataset/dataset-setting/form-schema.ts +++ b/web/src/pages/dataset/dataset-setting/form-schema.ts @@ -13,6 +13,7 @@ export const formSchema = z // avatar: z.instanceof(File), avatar: z.any().nullish(), permission: z.string().optional(), + language: z.string().optional(), parser_id: z.string(), pipeline_id: z.string().optional(), pipeline_name: z.string().optional(), @@ -30,6 +31,10 @@ export const formSchema = z topn_tags: z.number().optional(), toc_extraction: z.boolean().optional(), overlapped_percent: z.number().optional(), + // MinerU-specific options + mineru_parse_method: z.enum(['auto', 'txt', 'ocr']).optional(), + mineru_formula_enable: z.boolean().optional(), + mineru_table_enable: z.boolean().optional(), raptor: z .object({ use_raptor: z.boolean().optional(), diff --git a/web/src/pages/dataset/dataset-setting/general-form.tsx b/web/src/pages/dataset/dataset-setting/general-form.tsx index 110c03a3e..a93dae8be 100644 --- a/web/src/pages/dataset/dataset-setting/general-form.tsx +++ b/web/src/pages/dataset/dataset-setting/general-form.tsx @@ -1,5 +1,7 @@ import { AvatarUpload } from '@/components/avatar-upload'; +import { SelectWithSearch } from '@/components/originui/select-with-search'; import PageRankFormField from '@/components/page-rank-form-field'; +import { RAGFlowFormItem } from '@/components/ragflow-form'; import { FormControl, FormField, @@ -8,6 +10,8 @@ import { FormMessage, } from '@/components/ui/form'; import { Input } from '@/components/ui/input'; +import { LanguageTranslationMap } from '@/constants/common'; +import { useMemo } from 'react'; import { useFormContext } from 'react-hook-form'; import { useTranslation } from 'react-i18next'; import { TagItems } from './components/tag-item'; @@ -18,6 +22,13 @@ export function GeneralForm() { const form = useFormContext(); const { t } = useTranslation(); + const languageOptions = useMemo(() => { + return Object.keys(LanguageTranslationMap).map((x) => ({ + label: x, + value: x, + })); + }, []); + return ( <> )} /> +
+ + + +
) { const { t } = useTranslation(); + const languageOptions = useMemo(() => { + return Object.keys(LanguageTranslationMap).map((x) => ({ + label: x, + value: x, + })); + }, []); + const FormSchema = z .object({ name: z @@ -51,6 +66,7 @@ export function InputForm({ onOk }: IModalProps) { .trim(), parser_id: z.string().optional(), pipeline_id: z.string().optional(), + language: z.string().optional(), }) .superRefine((data, ctx) => { // When parseType === 1, parser_id is required @@ -83,6 +99,7 @@ export function InputForm({ onOk }: IModalProps) { parseType: 1, parser_id: '', embd_id: '', + language: 'English', }, }); @@ -130,6 +147,33 @@ export function InputForm({ onOk }: IModalProps) { )} /> + ( + + {t('common.language')} + + + + )} + /> + {parseType === 1 && }