diff --git a/api/db/services/task_service.py b/api/db/services/task_service.py
index 9c771223f..d9b6d29ed 100644
--- a/api/db/services/task_service.py
+++ b/api/db/services/task_service.py
@@ -121,6 +121,13 @@ class TaskService(CommonService):
.where(cls.model.id == task_id)
)
docs = list(docs.dicts())
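+ # NOTE(review): debug scaffolding - the MinerU options read below are only printed, never returned or stored; this runs before the `if not docs` guard and docs[0]['kb_parser_config'] raises KeyError if the query does not select that column. Wire the values up (or log them) or remove before merge.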
+ if docs:
+ kb_config = docs[0]['kb_parser_config'] # Dict from Knowledgebase.parser_config
+ mineru_method = kb_config.get('mineru_parse_method', 'auto')
+ mineru_formula = kb_config.get('mineru_formula_enable', True)
+ mineru_table = kb_config.get('mineru_table_enable', True)
+ print(mineru_method, mineru_formula, mineru_table)
if not docs:
return None
diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py
index 2883bf881..7fff7822a 100644
--- a/deepdoc/parser/mineru_parser.py
+++ b/deepdoc/parser/mineru_parser.py
@@ -53,6 +53,30 @@ class MinerUContentType(StrEnum):
DISCARDED = "discarded"
+# Maps human-readable language names (the same names used as keys of the web UI's LanguageTranslationMap) to the language code handed to MinerU; several names share one code (e.g. 'latin', 'east_slavic').
+LANGUAGE_TO_MINERU_MAP = {
+ 'English': 'en',
+ 'Chinese': 'ch',
+ 'Traditional Chinese': 'chinese_cht',
+ 'Russian': 'east_slavic',
+ 'Ukrainian': 'east_slavic',
+ 'Indonesian': 'latin',
+ 'Spanish': 'latin',
+ 'Vietnamese': 'latin',
+ 'Japanese': 'japan',
+ 'Korean': 'korean',
+ 'Portuguese BR': 'latin',
+ 'German': 'latin',
+ 'French': 'latin',
+ 'Italian': 'latin',
+ 'Tamil': 'ta',
+ 'Telugu': 'te',
+ 'Kannada': 'ka',
+ 'Thai': 'th',
+ 'Greek': 'el',
+ 'Hindi': 'devanagari',
+}
+
class MinerUParser(RAGFlowPdfParser):
def __init__(self, mineru_path: str = "mineru", mineru_api: str = "", mineru_server_url: str = ""):
self.mineru_path = Path(mineru_path)
@@ -576,17 +600,21 @@ class MinerUParser(RAGFlowPdfParser):
*,
output_dir: Optional[str] = None,
backend: str = "pipeline",
- lang: Optional[str] = None,
- method: str = "auto",
server_url: Optional[str] = None,
delete_output: bool = True,
parse_method: str = "raw",
+ **kwargs,
) -> tuple:
import shutil
temp_pdf = None
created_tmp_dir = False
+ # Translate the caller's language name (kwargs['lang'], defaulting to 'English') into the code MinerU expects.
+ lang = kwargs.get('lang', 'English')
+ mineru_lang_code = LANGUAGE_TO_MINERU_MAP.get(lang, 'ch') # Returns 'ch' if lang not found
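+ # NOTE(review): `method` was dropped from the signature but is still passed as method=method to _run_mineru/_read_output below; unless it is bound in lines not shown in this hunk that raises NameError - e.g. bind it with method = kwargs.get('method', 'auto').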
+
# remove spaces, or mineru crash, and _read_output fail too
file_path = Path(filepath)
pdf_file_name = file_path.stem.replace(" ", "") + ".pdf"
@@ -625,7 +653,7 @@ class MinerUParser(RAGFlowPdfParser):
self.__images__(pdf, zoomin=1)
try:
- self._run_mineru(pdf, out_dir, method=method, backend=backend, lang=lang, server_url=server_url, callback=callback)
+ self._run_mineru(pdf, out_dir, method=method, backend=backend, lang=mineru_lang_code, server_url=server_url, callback=callback)
outputs = self._read_output(out_dir, pdf.stem, method=method, backend=backend)
self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
if callback:
diff --git a/rag/app/naive.py b/rag/app/naive.py
index 353504d77..a85df3538 100644
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -85,6 +85,8 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese"
binary=binary,
callback=callback,
parse_method=parse_method,
+ lang=lang,
+ **kwargs
)
return sections, tables, pdf_parser
except Exception as e:
diff --git a/rag/llm/ocr_model.py b/rag/llm/ocr_model.py
index b18a16a36..15950131d 100644
--- a/rag/llm/ocr_model.py
+++ b/rag/llm/ocr_model.py
@@ -57,7 +57,7 @@ class MinerUOcrModel(Base, MinerUParser):
server_url = server_url or self.mineru_server_url
return self.check_installation(backend=backend, server_url=server_url)
- def parse_pdf(self, filepath: str, binary=None, callback=None, parse_method: str = "raw", **kwargs):
+ def parse_pdf(self, filepath: str, binary=None, callback=None, parse_method: str = "raw",**kwargs):
ok, reason = self.check_available()
if not ok:
raise RuntimeError(f"MinerU not found or server not accessible: {reason}. Please install it via: pip install -U 'mineru[core]'.")
@@ -72,5 +72,6 @@ class MinerUOcrModel(Base, MinerUParser):
server_url=self.mineru_server_url,
delete_output=self.mineru_delete_output,
parse_method=parse_method,
+ **kwargs
)
return sections, tables
diff --git a/web/src/components/mineru-options-form-field.tsx b/web/src/components/mineru-options-form-field.tsx
new file mode 100644
index 000000000..5a64a7337
--- /dev/null
+++ b/web/src/components/mineru-options-form-field.tsx
@@ -0,0 +1,97 @@
+import { RAGFlowFormItem } from '@/components/ragflow-form';
+import { RAGFlowSelect } from '@/components/ui/select';
+import { Switch } from '@/components/ui/switch';
+import { LLMFactory } from '@/constants/llm';
+import { buildOptions } from '@/utils/form';
+import { useFormContext, useWatch } from 'react-hook-form';
+import { useTranslation } from 'react-i18next';
+
+const parseMethodOptions = buildOptions(['auto', 'txt', 'ocr']);
+
+export function MinerUOptionsFormField() {
+ const form = useFormContext();
+ const { t } = useTranslation();
+
+ const layoutRecognize = useWatch({
+ control: form.control,
+ name: 'parser_config.layout_recognize',
+ });
+
+ // MinerU counts as selected when the layout_recognize value contains LLMFactory.MinerU or, case-insensitively, the substring 'mineru'.
+ const isMinerUSelected =
+ layoutRecognize?.includes(LLMFactory.MinerU) ||
+ layoutRecognize?.toLowerCase()?.includes('mineru');
+
+ if (!isMinerUSelected) {
+ return null;
+ }
+
+ return (
+
+
+ {t('knowledgeConfiguration.mineruOptions', 'MinerU Options')}
+
+
+
+ {(field) => (
+
+ )}
+
+
+
+ {(field) => (
+
+ )}
+
+
+
+ {(field) => (
+
+ )}
+
+
+ );
+}
diff --git a/web/src/constants/common.ts b/web/src/constants/common.ts
index 18d06ca39..205c28f4c 100644
--- a/web/src/constants/common.ts
+++ b/web/src/constants/common.ts
@@ -103,14 +103,22 @@ export const LanguageTranslationMap = {
Chinese: 'zh',
'Traditional Chinese': 'zh-TRADITIONAL',
Russian: 'ru',
- Indonesia: 'id',
+ Indonesian: 'id',
Spanish: 'es',
Vietnamese: 'vi',
Japanese: 'ja',
+ Korean: 'ko',
'Portuguese BR': 'pt-br',
German: 'de',
French: 'fr',
Italian: 'it',
+ Tamil: 'ta',
+ Telugu: 'te',
+ Kannada: 'ka',
+ Thai: 'th',
+ Greek: 'el',
+ Hindi: 'hi',
+ Ukrainian: 'uk',
};
export enum FileMimeType {
diff --git a/web/src/locales/en.ts b/web/src/locales/en.ts
index fd9977257..8abb38838 100644
--- a/web/src/locales/en.ts
+++ b/web/src/locales/en.ts
@@ -329,6 +329,15 @@ export default {
reRankModelWaring: 'Re-rank model is very time consuming.',
},
knowledgeConfiguration: {
+ mineruOptions: 'MinerU Options',
+ mineruParseMethod: 'Parse Method',
+ mineruParseMethodTip:
+ 'Method for parsing PDF: auto (automatic detection), txt (text extraction), ocr (optical character recognition)',
+ mineruFormulaEnable: 'Formula Recognition',
+ mineruFormulaEnableTip:
+ 'Enable formula recognition. Note: This may not work correctly for Cyrillic documents.',
+ mineruTableEnable: 'Table Recognition',
+ mineruTableEnableTip: 'Enable table recognition and extraction.',
overlappedPercent: 'Overlapped percent',
generationScopeTip:
'Determines whether RAPTOR is generated for the entire dataset or for a single file.',
diff --git a/web/src/locales/zh.ts b/web/src/locales/zh.ts
index 50b2faae3..28563e09e 100644
--- a/web/src/locales/zh.ts
+++ b/web/src/locales/zh.ts
@@ -264,6 +264,15 @@ export default {
theDocumentBeingParsedCannotBeDeleted: '正在解析的文档不能被删除',
},
knowledgeConfiguration: {
+ mineruOptions: 'MinerU 选项',
+ mineruParseMethod: '解析方法',
+ mineruParseMethodTip:
+ 'PDF 解析方法:auto(自动检测)、txt(文本提取)、ocr(光学字符识别)',
+ mineruFormulaEnable: '公式识别',
+ mineruFormulaEnableTip:
+ '启用公式识别。注意:对于西里尔文档可能无法正常工作。',
+ mineruTableEnable: '表格识别',
+ mineruTableEnableTip: '启用表格识别和提取。',
generationScopeTip: '选择 RAPTOR 的生成范围:整个知识库或单个文件。',
generationScope: '生成范围',
scopeSingleFile: '单文件',
diff --git a/web/src/pages/dataset/dataset-setting/configuration/naive.tsx b/web/src/pages/dataset/dataset-setting/configuration/naive.tsx
index 3742db39f..d14eca96f 100644
--- a/web/src/pages/dataset/dataset-setting/configuration/naive.tsx
+++ b/web/src/pages/dataset/dataset-setting/configuration/naive.tsx
@@ -6,6 +6,7 @@ import { DelimiterFormField } from '@/components/delimiter-form-field';
import { ExcelToHtmlFormField } from '@/components/excel-to-html-form-field';
import { LayoutRecognizeFormField } from '@/components/layout-recognize-form-field';
import { MaxTokenNumberFormField } from '@/components/max-token-number-from-field';
+import { MinerUOptionsFormField } from '@/components/mineru-options-form-field';
import {
ConfigurationFormContainer,
MainContainer,
@@ -17,6 +18,7 @@ export function NaiveConfiguration() {
+
diff --git a/web/src/pages/dataset/dataset-setting/form-schema.ts b/web/src/pages/dataset/dataset-setting/form-schema.ts
index ae7342ede..00c00a725 100644
--- a/web/src/pages/dataset/dataset-setting/form-schema.ts
+++ b/web/src/pages/dataset/dataset-setting/form-schema.ts
@@ -13,6 +13,7 @@ export const formSchema = z
// avatar: z.instanceof(File),
avatar: z.any().nullish(),
permission: z.string().optional(),
+ language: z.string().optional(),
parser_id: z.string(),
pipeline_id: z.string().optional(),
pipeline_name: z.string().optional(),
@@ -30,6 +31,10 @@ export const formSchema = z
topn_tags: z.number().optional(),
toc_extraction: z.boolean().optional(),
overlapped_percent: z.number().optional(),
+ // MinerU-specific options
+ mineru_parse_method: z.enum(['auto', 'txt', 'ocr']).optional(),
+ mineru_formula_enable: z.boolean().optional(),
+ mineru_table_enable: z.boolean().optional(),
raptor: z
.object({
use_raptor: z.boolean().optional(),
diff --git a/web/src/pages/dataset/dataset-setting/general-form.tsx b/web/src/pages/dataset/dataset-setting/general-form.tsx
index 110c03a3e..a93dae8be 100644
--- a/web/src/pages/dataset/dataset-setting/general-form.tsx
+++ b/web/src/pages/dataset/dataset-setting/general-form.tsx
@@ -1,5 +1,7 @@
import { AvatarUpload } from '@/components/avatar-upload';
+import { SelectWithSearch } from '@/components/originui/select-with-search';
import PageRankFormField from '@/components/page-rank-form-field';
+import { RAGFlowFormItem } from '@/components/ragflow-form';
import {
FormControl,
FormField,
@@ -8,6 +10,8 @@ import {
FormMessage,
} from '@/components/ui/form';
import { Input } from '@/components/ui/input';
+import { LanguageTranslationMap } from '@/constants/common';
+import { useMemo } from 'react';
import { useFormContext } from 'react-hook-form';
import { useTranslation } from 'react-i18next';
import { TagItems } from './components/tag-item';
@@ -18,6 +22,13 @@ export function GeneralForm() {
const form = useFormContext();
const { t } = useTranslation();
+ const languageOptions = useMemo(() => {
+ return Object.keys(LanguageTranslationMap).map((x) => ({
+ label: x,
+ value: x,
+ }));
+ }, []);
+
return (
<>
)}
/>
+
+
+
+
+
) {
const { t } = useTranslation();
+ const languageOptions = useMemo(() => {
+ return Object.keys(LanguageTranslationMap).map((x) => ({
+ label: x,
+ value: x,
+ }));
+ }, []);
+
const FormSchema = z
.object({
name: z
@@ -51,6 +66,7 @@ export function InputForm({ onOk }: IModalProps) {
.trim(),
parser_id: z.string().optional(),
pipeline_id: z.string().optional(),
+ language: z.string().optional(),
})
.superRefine((data, ctx) => {
// When parseType === 1, parser_id is required
@@ -83,6 +99,7 @@ export function InputForm({ onOk }: IModalProps) {
parseType: 1,
parser_id: '',
embd_id: '',
+ language: 'English',
},
});
@@ -130,6 +147,33 @@ export function InputForm({ onOk }: IModalProps) {
)}
/>
+ (
+
+ {t('common.language')}
+
+
+
+ )}
+ />
+
{parseType === 1 && }