diff --git a/api/apps/llm_app.py b/api/apps/llm_app.py index 1f46a4098..5b09ed1c9 100644 --- a/api/apps/llm_app.py +++ b/api/apps/llm_app.py @@ -193,7 +193,14 @@ async def add_llm(): api_key = apikey_json(["api_key", "provider_order"]) elif factory == "MinerU": - api_key = apikey_json(["api_key", "provider_order"]) + api_key = apikey_json([ + "llm_name", + "mineru_apiserver", + "mineru_output_dir", + "mineru_backend", + "mineru_server_url", + "mineru_delete_output", + ]) llm = { "tenant_id": current_user.id, diff --git a/conf/llm_factories.json b/conf/llm_factories.json index d363e7f06..e3c9d85fb 100644 --- a/conf/llm_factories.json +++ b/conf/llm_factories.json @@ -5496,14 +5496,6 @@ "model_type": "reranker" } ] - }, - { - "name": "MinerU", - "logo": "", - "tags": "OCR", - "status": "1", - "rank": "900", - "llm": [] } ] } diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py index 2883bf881..b46331cf9 100644 --- a/deepdoc/parser/mineru_parser.py +++ b/deepdoc/parser/mineru_parser.py @@ -185,14 +185,16 @@ class MinerUParser(RAGFlowPdfParser): return False, reason def _run_mineru( - self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, server_url: Optional[str] = None, callback: Optional[Callable] = None + self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, server_url: Optional[str] = None, callback: Optional[Callable] = None, + formula_enable: bool = True, table_enable: bool = True ): if self.using_api: - self._run_mineru_api(input_path, output_dir, method, backend, lang, callback) + self._run_mineru_api(input_path, output_dir, method, backend, lang, callback, formula_enable, table_enable) else: self._run_mineru_executable(input_path, output_dir, method, backend, lang, server_url, callback) - def _run_mineru_api(self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, callback: Optional[Callable] = None): + def _run_mineru_api(self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, callback: Optional[Callable] = None, + formula_enable: bool = True, table_enable: bool = True): output_zip_path = os.path.join(str(output_dir), "output.zip") pdf_file_path = str(input_path) @@ -201,7 +203,9 @@ class MinerUParser(RAGFlowPdfParser): raise RuntimeError(f"[MinerU] PDF file not exists: {pdf_file_path}") pdf_file_name = Path(pdf_file_path).stem.strip() - output_path = os.path.join(str(output_dir), pdf_file_name, method) + # FIX: MinerU API outputs to 'vlm/' when using VLM backend, not 'auto/' + output_subfolder = "vlm" if backend.startswith("vlm") else method + output_path = os.path.join(str(output_dir), pdf_file_name, output_subfolder) os.makedirs(output_path, exist_ok=True) files = {"files": (pdf_file_name + ".pdf", open(pdf_file_path, "rb"), "application/pdf")} @@ -211,8 +215,8 @@ class MinerUParser(RAGFlowPdfParser): "lang_list": lang, "backend": backend, "parse_method": method, - "formula_enable": True, - "table_enable": True, + "formula_enable": formula_enable, + "table_enable": table_enable, "server_url": None, "return_md": True, "return_middle_json": True, @@ -224,6 +228,11 @@ class MinerUParser(RAGFlowPdfParser): "end_page_id": 99999, } + # DEBUG: Log the exact request data being sent to MinerU + self.logger.info(f"[MinerU DEBUG] Request URL: {self.mineru_api}/file_parse") + self.logger.info(f"[MinerU DEBUG] Request data: {json.dumps(data, indent=2)}") + self.logger.info(f"[MinerU DEBUG] File: {pdf_file_name}.pdf") + headers = {"Accept": "application/json"} try: self.logger.info(f"[MinerU] invoke api: {self.mineru_api}/file_parse") @@ -581,6 +590,8 @@ class MinerUParser(RAGFlowPdfParser): server_url: Optional[str] = None, delete_output: bool = True, parse_method: str = "raw", + formula_enable: bool = True, + table_enable: bool = True, ) -> tuple: import shutil @@ -625,7 +636,8 @@ class MinerUParser(RAGFlowPdfParser): self.__images__(pdf, zoomin=1) try: - self._run_mineru(pdf, out_dir, method=method, backend=backend, lang=lang, server_url=server_url, callback=callback) + self._run_mineru(pdf, out_dir, method=method, backend=backend, lang=lang, server_url=server_url, callback=callback, + formula_enable=formula_enable, table_enable=table_enable) outputs = self._read_output(out_dir, pdf.stem, method=method, backend=backend) self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.") if callback: diff --git a/rag/app/naive.py b/rag/app/naive.py index 353504d77..59e66037a 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -39,6 +39,52 @@ from deepdoc.parser.tcadp_parser import TCADPParser from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table, attach_media_context +# MinerU OCR language mapping (RAGFlow language -> MinerU OCR code) +# See: https://github.com/opendatalab/MinerU for supported languages +MINERU_LANG_MAP = { + "chinese": "ch", + "english": "en", + "russian": "cyrillic", + "ukrainian": "cyrillic", + "belarusian": "cyrillic", + "bulgarian": "cyrillic", + "serbian": "cyrillic", + "korean": "korean", + "japanese": "japan", + "arabic": "arabic", + "thai": "th", + "greek": "el", + "hindi": "devanagari", + "tamil": "ta", + "telugu": "te", + "kannada": "ka", + "georgian": "ka", + "vietnamese": "latin", + "french": "latin", + "german": "latin", + "spanish": "latin", + "italian": "latin", + "portuguese": "latin", + "polish": "latin", + "dutch": "latin", + "turkish": "latin", +} + + +def _get_mineru_lang(lang: str) -> str: + """Convert RAGFlow language name to MinerU OCR language code. + + Args: + lang: RAGFlow language name (e.g., "Chinese", "Russian", "English") + + Returns: + MinerU OCR language code (e.g., "ch", "cyrillic", "en") + """ + if not lang: + return "latin" + return MINERU_LANG_MAP.get(lang.lower(), "latin") + + def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs): callback = callback binary = binary @@ -60,6 +106,15 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese" parse_method = kwargs.get("parse_method", "raw") mineru_llm_name = kwargs.get("mineru_llm_name") tenant_id = kwargs.get("tenant_id") + + # Get MinerU-specific settings from parser_config + parser_config = kwargs.get("parser_config", {}) + mineru_lang = parser_config.get("mineru_lang") or _get_mineru_lang(lang) + formula_enable = parser_config.get("mineru_formula_enable", True) + table_enable = parser_config.get("mineru_table_enable", True) + + logging.info(f"[MinerU] by_mineru called with lang={lang}, parser_config mineru_lang={parser_config.get('mineru_lang')}, resolved mineru_lang={mineru_lang}") + logging.info(f"[MinerU] formula_enable={formula_enable}, table_enable={table_enable}") pdf_parser = None if tenant_id: @@ -85,6 +140,9 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese" binary=binary, callback=callback, parse_method=parse_method, + lang=mineru_lang, + formula_enable=formula_enable, + table_enable=table_enable, ) return sections, tables, pdf_parser except Exception as e: diff --git a/rag/llm/ocr_model.py b/rag/llm/ocr_model.py index b18a16a36..278667593 100644 --- a/rag/llm/ocr_model.py +++ b/rag/llm/ocr_model.py @@ -13,10 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. # -import json import logging import os -from typing import Any, Optional, Tuple +from io import BytesIO +from os import PathLike +from typing import Callable, Optional from deepdoc.parser.mineru_parser import MinerUParser @@ -25,7 +26,22 @@ class Base: def __init__(self, key: str | dict, model_name: str, **kwargs): self.model_name = model_name - def parse_pdf(self, filepath: str, binary=None, **kwargs) -> Tuple[Any, Any]: + def parse_pdf( + self, + filepath: str | PathLike[str], + binary: BytesIO | bytes, + callback: Optional[Callable] = None, + *, + output_dir: Optional[str] = None, + backend: str = "pipeline", + lang: Optional[str] = None, + method: str = "auto", + server_url: Optional[str] = None, + delete_output: bool = True, + parse_method: str = "raw", + formula_enable: bool = True, + table_enable: bool = True, + ) -> tuple: raise NotImplementedError("Please implement parse_pdf!") @@ -34,30 +50,40 @@ class MinerUOcrModel(Base, MinerUParser): def __init__(self, key: str | dict, model_name: str, **kwargs): Base.__init__(self, key, model_name, **kwargs) - config = {} - if key: - try: - config = json.loads(key) - except Exception: - config = {} - config = config["api_key"] - self.mineru_api = config.get("mineru_apiserver", os.environ.get("MINERU_APISERVER", "")) - self.mineru_output_dir = config.get("mineru_output_dir", os.environ.get("MINERU_OUTPUT_DIR", "")) - self.mineru_backend = config.get("mineru_backend", os.environ.get("MINERU_BACKEND", "pipeline")) - self.mineru_server_url = config.get("mineru_server_url", os.environ.get("MINERU_SERVER_URL", "")) - self.mineru_delete_output = bool(int(config.get("mineru_delete_output", os.environ.get("MINERU_DELETE_OUTPUT", 1)))) + + # Use environment variables directly - no database config needed + self.mineru_api = os.environ.get("MINERU_APISERVER", "") + self.mineru_output_dir = os.environ.get("MINERU_OUTPUT_DIR", "") + self.mineru_backend = os.environ.get("MINERU_BACKEND", "pipeline") + self.mineru_server_url = os.environ.get("MINERU_SERVER_URL", "") + self.mineru_delete_output = os.environ.get("MINERU_DELETE_OUTPUT", "1") == "1" self.mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru") - logging.info(f"Parsed MinerU config: {config}") + logging.info(f"MinerU config from env: api={self.mineru_api}, backend={self.mineru_backend}, server_url={self.mineru_server_url}") MinerUParser.__init__(self, mineru_path=self.mineru_executable, mineru_api=self.mineru_api, mineru_server_url=self.mineru_server_url) - def check_available(self, backend: Optional[str] = None, server_url: Optional[str] = None) -> Tuple[bool, str]: + def check_available(self, backend: Optional[str] = None, server_url: Optional[str] = None) -> tuple[bool, str]: backend = backend or self.mineru_backend server_url = server_url or self.mineru_server_url return self.check_installation(backend=backend, server_url=server_url) - def parse_pdf(self, filepath: str, binary=None, callback=None, parse_method: str = "raw", **kwargs): + def parse_pdf( + self, + filepath: str | PathLike[str], + binary: BytesIO | bytes, + callback: Optional[Callable] = None, + *, + output_dir: Optional[str] = None, + backend: str = "pipeline", + lang: Optional[str] = None, + method: str = "auto", + server_url: Optional[str] = None, + delete_output: bool = True, + parse_method: str = "raw", + formula_enable: bool = True, + table_enable: bool = True, + ) -> tuple: ok, reason = self.check_available() if not ok: raise RuntimeError(f"MinerU not found or server not accessible: {reason}. Please install it via: pip install -U 'mineru[core]'.") @@ -65,12 +91,16 @@ class MinerUOcrModel(Base, MinerUParser): sections, tables = MinerUParser.parse_pdf( self, filepath=filepath, - binary=binary, + binary=binary, # type: ignore[arg-type] callback=callback, - output_dir=self.mineru_output_dir, - backend=self.mineru_backend, - server_url=self.mineru_server_url, - delete_output=self.mineru_delete_output, + output_dir=output_dir or self.mineru_output_dir, + backend=backend or self.mineru_backend, + lang=lang, + method=method, + server_url=server_url or self.mineru_server_url, + delete_output=delete_output if delete_output is not None else self.mineru_delete_output, parse_method=parse_method, + formula_enable=formula_enable, + table_enable=table_enable, ) return sections, tables diff --git a/web/src/components/mineru-config-form-field.tsx b/web/src/components/mineru-config-form-field.tsx new file mode 100644 index 000000000..661d248b6 --- /dev/null +++ b/web/src/components/mineru-config-form-field.tsx @@ -0,0 +1,156 @@ +import { useFormContext, useWatch } from 'react-hook-form'; +import { + FormControl, + FormField, + FormItem, + FormLabel, + FormMessage, +} from './ui/form'; +import { + Select, + SelectContent, + SelectItem, + SelectTrigger, + SelectValue, +} from './ui/select'; +import { Switch } from './ui/switch'; + +// MinerU OCR language options with human-readable labels +const MINERU_LANG_OPTIONS = [ + { value: 'ch', label: 'Chinese (Simplified)' }, + { value: 'en', label: 'English' }, + { value: 'cyrillic', label: 'Cyrillic (Russian, Ukrainian, etc.)' }, + { value: 'latin', label: 'Latin (French, German, Spanish, etc.)' }, + { value: 'korean', label: 'Korean' }, + { value: 'japan', label: 'Japanese' }, + { value: 'arabic', label: 'Arabic' }, + { value: 'th', label: 'Thai' }, + { value: 'el', label: 'Greek' }, + { value: 'devanagari', label: 'Hindi (Devanagari)' }, + { value: 'ta', label: 'Tamil' }, + { value: 'te', label: 'Telugu' }, + { value: 'ka', label: 'Georgian/Kannada' }, + { value: 'chinese_cht', label: 'Chinese (Traditional)' }, +]; + +/** + * Check if the current layout recognizer is MinerU + */ +function useIsMineruSelected() { + const form = useFormContext(); + const layoutRecognize = useWatch({ + control: form.control, + name: 'parser_config.layout_recognize', + }); + + // MinerU models have format like "model-name@MinerU" + return ( + typeof layoutRecognize === 'string' && + (layoutRecognize.toLowerCase().includes('mineru') || + layoutRecognize.toLowerCase().endsWith('@mineru')) + ); +} + +export function MineruConfigFormField() { + const form = useFormContext(); + const isMineruSelected = useIsMineruSelected(); + + if (!isMineruSelected) { + return null; + } + + return ( +
+
+ MinerU OCR Settings +
+ + {/* MinerU Language Selection */} + ( + +
+ + OCR Language + +
+ + + +
+
+ +
+ )} + /> + + {/* Formula Recognition Toggle */} + ( + +
+ + Formula Recognition + +
+ + + +
+
+
+ Disable for Cyrillic/stylized fonts to avoid incorrect LaTeX + conversion +
+ +
+ )} + /> + + {/* Table Recognition Toggle */} + ( + +
+ + Table Recognition + +
+ + + +
+
+ +
+ )} + /> +
+ ); +} diff --git a/web/src/interfaces/database/knowledge.ts b/web/src/interfaces/database/knowledge.ts index 602748669..87574843b 100644 --- a/web/src/interfaces/database/knowledge.ts +++ b/web/src/interfaces/database/knowledge.ts @@ -67,6 +67,11 @@ export interface ParserConfig { tag_kb_ids?: string[]; topn_tags?: number; graphrag?: { use_graphrag?: boolean }; + // MinerU-specific settings + mineru_lang?: string; + mineru_formula_enable?: boolean; + mineru_table_enable?: boolean; + mineru_parse_method?: string; } export interface IKnowledgeFileParserConfig { diff --git a/web/src/pages/dataset/dataset-setting/configuration/book.tsx b/web/src/pages/dataset/dataset-setting/configuration/book.tsx index 2ba4623c5..81535e717 100644 --- a/web/src/pages/dataset/dataset-setting/configuration/book.tsx +++ b/web/src/pages/dataset/dataset-setting/configuration/book.tsx @@ -3,6 +3,7 @@ import { AutoQuestionsFormField, } from '@/components/auto-keywords-form-field'; import { LayoutRecognizeFormField } from '@/components/layout-recognize-form-field'; +import { MineruConfigFormField } from '@/components/mineru-config-form-field'; import { ConfigurationFormContainer, MainContainer, @@ -13,6 +14,7 @@ export function BookConfiguration() { + diff --git a/web/src/pages/dataset/dataset-setting/configuration/naive.tsx b/web/src/pages/dataset/dataset-setting/configuration/naive.tsx index 3742db39f..c8797a8e8 100644 --- a/web/src/pages/dataset/dataset-setting/configuration/naive.tsx +++ b/web/src/pages/dataset/dataset-setting/configuration/naive.tsx @@ -6,6 +6,7 @@ import { DelimiterFormField } from '@/components/delimiter-form-field'; import { ExcelToHtmlFormField } from '@/components/excel-to-html-form-field'; import { LayoutRecognizeFormField } from '@/components/layout-recognize-form-field'; import { MaxTokenNumberFormField } from '@/components/max-token-number-from-field'; +import { MineruConfigFormField } from '@/components/mineru-config-form-field'; import { ConfigurationFormContainer, MainContainer, @@ -17,6 +18,7 @@ export function NaiveConfiguration() { + diff --git a/web/src/pages/dataset/dataset-setting/configuration/paper.tsx b/web/src/pages/dataset/dataset-setting/configuration/paper.tsx index 7c682264f..441abd7ca 100644 --- a/web/src/pages/dataset/dataset-setting/configuration/paper.tsx +++ b/web/src/pages/dataset/dataset-setting/configuration/paper.tsx @@ -3,6 +3,7 @@ import { AutoQuestionsFormField, } from '@/components/auto-keywords-form-field'; import { LayoutRecognizeFormField } from '@/components/layout-recognize-form-field'; +import { MineruConfigFormField } from '@/components/mineru-config-form-field'; import { ConfigurationFormContainer, MainContainer, @@ -13,6 +14,7 @@ export function PaperConfiguration() { + diff --git a/web/src/pages/dataset/dataset-setting/form-schema.ts b/web/src/pages/dataset/dataset-setting/form-schema.ts index ae7342ede..ea7c992e0 100644 --- a/web/src/pages/dataset/dataset-setting/form-schema.ts +++ b/web/src/pages/dataset/dataset-setting/form-schema.ts @@ -30,6 +30,10 @@ export const formSchema = z topn_tags: z.number().optional(), toc_extraction: z.boolean().optional(), overlapped_percent: z.number().optional(), + // MinerU-specific settings + mineru_lang: z.string().optional(), + mineru_formula_enable: z.boolean().optional(), + mineru_table_enable: z.boolean().optional(), raptor: z .object({ use_raptor: z.boolean().optional(), diff --git a/web/src/pages/dataset/dataset-setting/saving-button.tsx b/web/src/pages/dataset/dataset-setting/saving-button.tsx index 558150b4f..f8be140da 100644 --- a/web/src/pages/dataset/dataset-setting/saving-button.tsx +++ b/web/src/pages/dataset/dataset-setting/saving-button.tsx @@ -58,7 +58,7 @@ export function SavingButton() { onClick={() => { (async () => { try { - let beValid = await form.formControl.trigger(); + let beValid = await form.trigger(); if (beValid) { form.handleSubmit(async (values) => { console.log('saveKnowledgeConfiguration: ', values);