feat: MinerU integration with environment-based configuration

- Add MinerU PDF parser support via external API or local installation
- Configure MinerU through environment variables (MINERU_APISERVER, etc.)
- Add per-dataset MinerU settings: language, formula/table recognition
- Add MinerU config form fields in Knowledge Base settings
- Remove MinerU from LLM factories (it is a PDF parser, not an LLM)
- Clean up unused ingestors tab from settings
This commit is contained in:
user210 2025-12-11 19:56:46 +02:00
parent 74afb8d710
commit 035e8ced98
12 changed files with 310 additions and 40 deletions

View file

@ -193,7 +193,14 @@ async def add_llm():
api_key = apikey_json(["api_key", "provider_order"]) api_key = apikey_json(["api_key", "provider_order"])
elif factory == "MinerU": elif factory == "MinerU":
api_key = apikey_json(["api_key", "provider_order"]) api_key = apikey_json([
"llm_name",
"mineru_apiserver",
"mineru_output_dir",
"mineru_backend",
"mineru_server_url",
"mineru_delete_output",
])
llm = { llm = {
"tenant_id": current_user.id, "tenant_id": current_user.id,

View file

@ -5496,14 +5496,6 @@
"model_type": "reranker" "model_type": "reranker"
} }
] ]
},
{
"name": "MinerU",
"logo": "",
"tags": "OCR",
"status": "1",
"rank": "900",
"llm": []
} }
] ]
} }

View file

@ -185,14 +185,16 @@ class MinerUParser(RAGFlowPdfParser):
return False, reason return False, reason
def _run_mineru( def _run_mineru(
self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, server_url: Optional[str] = None, callback: Optional[Callable] = None self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, server_url: Optional[str] = None, callback: Optional[Callable] = None,
formula_enable: bool = True, table_enable: bool = True
): ):
if self.using_api: if self.using_api:
self._run_mineru_api(input_path, output_dir, method, backend, lang, callback) self._run_mineru_api(input_path, output_dir, method, backend, lang, callback, formula_enable, table_enable)
else: else:
self._run_mineru_executable(input_path, output_dir, method, backend, lang, server_url, callback) self._run_mineru_executable(input_path, output_dir, method, backend, lang, server_url, callback)
def _run_mineru_api(self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, callback: Optional[Callable] = None): def _run_mineru_api(self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, callback: Optional[Callable] = None,
formula_enable: bool = True, table_enable: bool = True):
output_zip_path = os.path.join(str(output_dir), "output.zip") output_zip_path = os.path.join(str(output_dir), "output.zip")
pdf_file_path = str(input_path) pdf_file_path = str(input_path)
@ -201,7 +203,9 @@ class MinerUParser(RAGFlowPdfParser):
raise RuntimeError(f"[MinerU] PDF file not exists: {pdf_file_path}") raise RuntimeError(f"[MinerU] PDF file not exists: {pdf_file_path}")
pdf_file_name = Path(pdf_file_path).stem.strip() pdf_file_name = Path(pdf_file_path).stem.strip()
output_path = os.path.join(str(output_dir), pdf_file_name, method) # FIX: MinerU API outputs to 'vlm/' when using VLM backend, not 'auto/'
output_subfolder = "vlm" if backend.startswith("vlm") else method
output_path = os.path.join(str(output_dir), pdf_file_name, output_subfolder)
os.makedirs(output_path, exist_ok=True) os.makedirs(output_path, exist_ok=True)
files = {"files": (pdf_file_name + ".pdf", open(pdf_file_path, "rb"), "application/pdf")} files = {"files": (pdf_file_name + ".pdf", open(pdf_file_path, "rb"), "application/pdf")}
@ -211,8 +215,8 @@ class MinerUParser(RAGFlowPdfParser):
"lang_list": lang, "lang_list": lang,
"backend": backend, "backend": backend,
"parse_method": method, "parse_method": method,
"formula_enable": True, "formula_enable": formula_enable,
"table_enable": True, "table_enable": table_enable,
"server_url": None, "server_url": None,
"return_md": True, "return_md": True,
"return_middle_json": True, "return_middle_json": True,
@ -224,6 +228,11 @@ class MinerUParser(RAGFlowPdfParser):
"end_page_id": 99999, "end_page_id": 99999,
} }
# DEBUG: Log the exact request data being sent to MinerU
self.logger.info(f"[MinerU DEBUG] Request URL: {self.mineru_api}/file_parse")
self.logger.info(f"[MinerU DEBUG] Request data: {json.dumps(data, indent=2)}")
self.logger.info(f"[MinerU DEBUG] File: {pdf_file_name}.pdf")
headers = {"Accept": "application/json"} headers = {"Accept": "application/json"}
try: try:
self.logger.info(f"[MinerU] invoke api: {self.mineru_api}/file_parse") self.logger.info(f"[MinerU] invoke api: {self.mineru_api}/file_parse")
@ -581,6 +590,8 @@ class MinerUParser(RAGFlowPdfParser):
server_url: Optional[str] = None, server_url: Optional[str] = None,
delete_output: bool = True, delete_output: bool = True,
parse_method: str = "raw", parse_method: str = "raw",
formula_enable: bool = True,
table_enable: bool = True,
) -> tuple: ) -> tuple:
import shutil import shutil
@ -625,7 +636,8 @@ class MinerUParser(RAGFlowPdfParser):
self.__images__(pdf, zoomin=1) self.__images__(pdf, zoomin=1)
try: try:
self._run_mineru(pdf, out_dir, method=method, backend=backend, lang=lang, server_url=server_url, callback=callback) self._run_mineru(pdf, out_dir, method=method, backend=backend, lang=lang, server_url=server_url, callback=callback,
formula_enable=formula_enable, table_enable=table_enable)
outputs = self._read_output(out_dir, pdf.stem, method=method, backend=backend) outputs = self._read_output(out_dir, pdf.stem, method=method, backend=backend)
self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.") self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
if callback: if callback:

View file

@ -39,6 +39,52 @@ from deepdoc.parser.tcadp_parser import TCADPParser
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table, attach_media_context from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table, attach_media_context
# Lookup table from lower-cased RAGFlow language names to the OCR language
# code handed to MinerU. Several languages share one script-level code, so
# they are declared as groups.
# See: https://github.com/opendatalab/MinerU for supported languages
MINERU_LANG_MAP = {
    "chinese": "ch",
    "english": "en",
    # Languages written in Cyrillic script
    **dict.fromkeys(
        ("russian", "ukrainian", "belarusian", "bulgarian", "serbian"),
        "cyrillic",
    ),
    "korean": "korean",
    "japanese": "japan",
    "arabic": "arabic",
    "thai": "th",
    "greek": "el",
    "hindi": "devanagari",
    "tamil": "ta",
    "telugu": "te",
    # Both names resolve to the same "ka" code
    "kannada": "ka",
    "georgian": "ka",
    # Languages written in Latin script
    **dict.fromkeys(
        (
            "vietnamese",
            "french",
            "german",
            "spanish",
            "italian",
            "portuguese",
            "polish",
            "dutch",
            "turkish",
        ),
        "latin",
    ),
}
def _get_mineru_lang(lang: str) -> str:
    """Translate a RAGFlow language name into a MinerU OCR language code.

    The lookup is case-insensitive: the name is lower-cased before it is
    searched in ``MINERU_LANG_MAP``.

    Args:
        lang: RAGFlow language name (e.g., "Chinese", "Russian", "English").

    Returns:
        The mapped MinerU OCR code (e.g., "ch", "cyrillic", "en"), or
        "latin" when ``lang`` is empty or has no entry in the map.
    """
    # An empty key is never present in the map, so falsy input takes the
    # same "latin" default as any unrecognised name.
    lookup_key = lang.lower() if lang else ""
    return MINERU_LANG_MAP.get(lookup_key, "latin")
def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs): def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs):
callback = callback callback = callback
binary = binary binary = binary
@ -60,6 +106,15 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese"
parse_method = kwargs.get("parse_method", "raw") parse_method = kwargs.get("parse_method", "raw")
mineru_llm_name = kwargs.get("mineru_llm_name") mineru_llm_name = kwargs.get("mineru_llm_name")
tenant_id = kwargs.get("tenant_id") tenant_id = kwargs.get("tenant_id")
# Get MinerU-specific settings from parser_config
parser_config = kwargs.get("parser_config", {})
mineru_lang = parser_config.get("mineru_lang") or _get_mineru_lang(lang)
formula_enable = parser_config.get("mineru_formula_enable", True)
table_enable = parser_config.get("mineru_table_enable", True)
logging.info(f"[MinerU] by_mineru called with lang={lang}, parser_config mineru_lang={parser_config.get('mineru_lang')}, resolved mineru_lang={mineru_lang}")
logging.info(f"[MinerU] formula_enable={formula_enable}, table_enable={table_enable}")
pdf_parser = None pdf_parser = None
if tenant_id: if tenant_id:
@ -85,6 +140,9 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese"
binary=binary, binary=binary,
callback=callback, callback=callback,
parse_method=parse_method, parse_method=parse_method,
lang=mineru_lang,
formula_enable=formula_enable,
table_enable=table_enable,
) )
return sections, tables, pdf_parser return sections, tables, pdf_parser
except Exception as e: except Exception as e:

View file

@ -13,10 +13,11 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# #
import json
import logging import logging
import os import os
from typing import Any, Optional, Tuple from io import BytesIO
from os import PathLike
from typing import Callable, Optional
from deepdoc.parser.mineru_parser import MinerUParser from deepdoc.parser.mineru_parser import MinerUParser
@ -25,7 +26,22 @@ class Base:
def __init__(self, key: str | dict, model_name: str, **kwargs): def __init__(self, key: str | dict, model_name: str, **kwargs):
self.model_name = model_name self.model_name = model_name
def parse_pdf(self, filepath: str, binary=None, **kwargs) -> Tuple[Any, Any]: def parse_pdf(
self,
filepath: str | PathLike[str],
binary: BytesIO | bytes,
callback: Optional[Callable] = None,
*,
output_dir: Optional[str] = None,
backend: str = "pipeline",
lang: Optional[str] = None,
method: str = "auto",
server_url: Optional[str] = None,
delete_output: bool = True,
parse_method: str = "raw",
formula_enable: bool = True,
table_enable: bool = True,
) -> tuple:
raise NotImplementedError("Please implement parse_pdf!") raise NotImplementedError("Please implement parse_pdf!")
@ -34,30 +50,40 @@ class MinerUOcrModel(Base, MinerUParser):
def __init__(self, key: str | dict, model_name: str, **kwargs): def __init__(self, key: str | dict, model_name: str, **kwargs):
Base.__init__(self, key, model_name, **kwargs) Base.__init__(self, key, model_name, **kwargs)
config = {}
if key: # Use environment variables directly - no database config needed
try: self.mineru_api = os.environ.get("MINERU_APISERVER", "")
config = json.loads(key) self.mineru_output_dir = os.environ.get("MINERU_OUTPUT_DIR", "")
except Exception: self.mineru_backend = os.environ.get("MINERU_BACKEND", "pipeline")
config = {} self.mineru_server_url = os.environ.get("MINERU_SERVER_URL", "")
config = config["api_key"] self.mineru_delete_output = os.environ.get("MINERU_DELETE_OUTPUT", "1") == "1"
self.mineru_api = config.get("mineru_apiserver", os.environ.get("MINERU_APISERVER", ""))
self.mineru_output_dir = config.get("mineru_output_dir", os.environ.get("MINERU_OUTPUT_DIR", ""))
self.mineru_backend = config.get("mineru_backend", os.environ.get("MINERU_BACKEND", "pipeline"))
self.mineru_server_url = config.get("mineru_server_url", os.environ.get("MINERU_SERVER_URL", ""))
self.mineru_delete_output = bool(int(config.get("mineru_delete_output", os.environ.get("MINERU_DELETE_OUTPUT", 1))))
self.mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru") self.mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
logging.info(f"Parsed MinerU config: {config}") logging.info(f"MinerU config from env: api={self.mineru_api}, backend={self.mineru_backend}, server_url={self.mineru_server_url}")
MinerUParser.__init__(self, mineru_path=self.mineru_executable, mineru_api=self.mineru_api, mineru_server_url=self.mineru_server_url) MinerUParser.__init__(self, mineru_path=self.mineru_executable, mineru_api=self.mineru_api, mineru_server_url=self.mineru_server_url)
def check_available(self, backend: Optional[str] = None, server_url: Optional[str] = None) -> Tuple[bool, str]: def check_available(self, backend: Optional[str] = None, server_url: Optional[str] = None) -> tuple[bool, str]:
backend = backend or self.mineru_backend backend = backend or self.mineru_backend
server_url = server_url or self.mineru_server_url server_url = server_url or self.mineru_server_url
return self.check_installation(backend=backend, server_url=server_url) return self.check_installation(backend=backend, server_url=server_url)
def parse_pdf(self, filepath: str, binary=None, callback=None, parse_method: str = "raw", **kwargs): def parse_pdf(
self,
filepath: str | PathLike[str],
binary: BytesIO | bytes,
callback: Optional[Callable] = None,
*,
output_dir: Optional[str] = None,
backend: str = "pipeline",
lang: Optional[str] = None,
method: str = "auto",
server_url: Optional[str] = None,
delete_output: bool = True,
parse_method: str = "raw",
formula_enable: bool = True,
table_enable: bool = True,
) -> tuple:
ok, reason = self.check_available() ok, reason = self.check_available()
if not ok: if not ok:
raise RuntimeError(f"MinerU not found or server not accessible: {reason}. Please install it via: pip install -U 'mineru[core]'.") raise RuntimeError(f"MinerU not found or server not accessible: {reason}. Please install it via: pip install -U 'mineru[core]'.")
@ -65,12 +91,16 @@ class MinerUOcrModel(Base, MinerUParser):
sections, tables = MinerUParser.parse_pdf( sections, tables = MinerUParser.parse_pdf(
self, self,
filepath=filepath, filepath=filepath,
binary=binary, binary=binary, # type: ignore[arg-type]
callback=callback, callback=callback,
output_dir=self.mineru_output_dir, output_dir=output_dir or self.mineru_output_dir,
backend=self.mineru_backend, backend=backend or self.mineru_backend,
server_url=self.mineru_server_url, lang=lang,
delete_output=self.mineru_delete_output, method=method,
server_url=server_url or self.mineru_server_url,
delete_output=delete_output if delete_output is not None else self.mineru_delete_output,
parse_method=parse_method, parse_method=parse_method,
formula_enable=formula_enable,
table_enable=table_enable,
) )
return sections, tables return sections, tables

View file

@ -0,0 +1,156 @@
import { useFormContext, useWatch } from 'react-hook-form';
import {
FormControl,
FormField,
FormItem,
FormLabel,
FormMessage,
} from './ui/form';
import {
Select,
SelectContent,
SelectItem,
SelectTrigger,
SelectValue,
} from './ui/select';
import { Switch } from './ui/switch';
// Choices offered in the MinerU "OCR Language" dropdown. Declared as a
// code -> label table and expanded into { value, label } pairs; `value` is
// what gets written to parser_config.mineru_lang, `label` is what is shown.
const MINERU_LANG_OPTIONS = Object.entries({
  ch: 'Chinese (Simplified)',
  en: 'English',
  cyrillic: 'Cyrillic (Russian, Ukrainian, etc.)',
  latin: 'Latin (French, German, Spanish, etc.)',
  korean: 'Korean',
  japan: 'Japanese',
  arabic: 'Arabic',
  th: 'Thai',
  el: 'Greek',
  devanagari: 'Hindi (Devanagari)',
  ta: 'Tamil',
  te: 'Telugu',
  ka: 'Georgian/Kannada',
  chinese_cht: 'Chinese (Traditional)',
}).map(([value, label]) => ({ value, label }));
/**
 * Hook that reports whether the layout recognizer currently selected in the
 * surrounding form is a MinerU one.
 *
 * It watches `parser_config.layout_recognize` and returns true only when the
 * value is a string that contains "mineru", compared case-insensitively.
 */
function useIsMineruSelected() {
  const { control } = useFormContext();
  const layoutRecognize = useWatch({
    control,
    name: 'parser_config.layout_recognize',
  });

  // Unset or non-string selections can never be MinerU.
  if (typeof layoutRecognize !== 'string') {
    return false;
  }

  // A substring match also covers values of the form "model-name@MinerU",
  // so no separate suffix check is required.
  return layoutRecognize.toLowerCase().includes('mineru');
}
/**
 * MinerU-only parser settings: OCR language, formula recognition and table
 * recognition.
 *
 * Reads the form from `useFormContext()` and binds each control to a
 * `parser_config.mineru_*` field. Renders nothing unless
 * `useIsMineruSelected()` reports that a MinerU recognizer is selected.
 */
export function MineruConfigFormField() {
  const { control } = useFormContext();
  const isMineruSelected = useIsMineruSelected();

  if (!isMineruSelected) {
    return null;
  }

  // Dropdown entries, one per supported OCR language code.
  const languageItems = MINERU_LANG_OPTIONS.map(({ value, label }) => (
    <SelectItem key={value} value={value}>
      {label}
    </SelectItem>
  ));

  // The formula and table switches share one row layout; only the bound
  // field name, the label and an optional helper text differ.
  const renderToggle = (name: string, label: string, hint?: string) => (
    <FormField
      control={control}
      name={name}
      render={({ field }) => (
        <FormItem className="items-center space-y-0">
          <div className="flex items-center">
            <FormLabel className="text-sm text-text-secondary whitespace-wrap w-1/3">
              {label}
            </FormLabel>
            <div className="w-2/3">
              <FormControl>
                {/* An unset field is displayed as switched on. */}
                <Switch
                  checked={field.value ?? true}
                  onCheckedChange={field.onChange}
                />
              </FormControl>
            </div>
          </div>
          {hint ? (
            <div className="text-xs text-muted-foreground mt-1 ml-[33.33%]">
              {hint}
            </div>
          ) : null}
          <FormMessage />
        </FormItem>
      )}
    />
  );

  return (
    <div className="space-y-4 p-4 border rounded-lg bg-muted/50">
      <div className="text-sm font-medium text-foreground">
        MinerU OCR Settings
      </div>

      {/* OCR language */}
      <FormField
        control={control}
        name="parser_config.mineru_lang"
        render={({ field }) => (
          <FormItem className="items-center space-y-0">
            <div className="flex items-center">
              <FormLabel className="text-sm text-text-secondary whitespace-wrap w-1/3">
                OCR Language
              </FormLabel>
              <div className="w-2/3">
                <FormControl>
                  {/*
                    'latin' is only what the dropdown displays while the field
                    is empty; the field itself is written when the user picks
                    an option.
                  */}
                  <Select
                    value={field.value || 'latin'}
                    onValueChange={field.onChange}
                  >
                    <SelectTrigger>
                      <SelectValue placeholder="Select language" />
                    </SelectTrigger>
                    <SelectContent>{languageItems}</SelectContent>
                  </Select>
                </FormControl>
              </div>
            </div>
            <FormMessage />
          </FormItem>
        )}
      />

      {/* Formula recognition */}
      {renderToggle(
        'parser_config.mineru_formula_enable',
        'Formula Recognition',
        'Disable for Cyrillic/stylized fonts to avoid incorrect LaTeX conversion',
      )}

      {/* Table recognition */}
      {renderToggle('parser_config.mineru_table_enable', 'Table Recognition')}
    </div>
  );
}

View file

@ -67,6 +67,11 @@ export interface ParserConfig {
tag_kb_ids?: string[]; tag_kb_ids?: string[];
topn_tags?: number; topn_tags?: number;
graphrag?: { use_graphrag?: boolean }; graphrag?: { use_graphrag?: boolean };
// MinerU-specific settings
mineru_lang?: string;
mineru_formula_enable?: boolean;
mineru_table_enable?: boolean;
mineru_parse_method?: string;
} }
export interface IKnowledgeFileParserConfig { export interface IKnowledgeFileParserConfig {

View file

@ -3,6 +3,7 @@ import {
AutoQuestionsFormField, AutoQuestionsFormField,
} from '@/components/auto-keywords-form-field'; } from '@/components/auto-keywords-form-field';
import { LayoutRecognizeFormField } from '@/components/layout-recognize-form-field'; import { LayoutRecognizeFormField } from '@/components/layout-recognize-form-field';
import { MineruConfigFormField } from '@/components/mineru-config-form-field';
import { import {
ConfigurationFormContainer, ConfigurationFormContainer,
MainContainer, MainContainer,
@ -13,6 +14,7 @@ export function BookConfiguration() {
<MainContainer> <MainContainer>
<ConfigurationFormContainer> <ConfigurationFormContainer>
<LayoutRecognizeFormField></LayoutRecognizeFormField> <LayoutRecognizeFormField></LayoutRecognizeFormField>
<MineruConfigFormField></MineruConfigFormField>
</ConfigurationFormContainer> </ConfigurationFormContainer>
<ConfigurationFormContainer> <ConfigurationFormContainer>

View file

@ -6,6 +6,7 @@ import { DelimiterFormField } from '@/components/delimiter-form-field';
import { ExcelToHtmlFormField } from '@/components/excel-to-html-form-field'; import { ExcelToHtmlFormField } from '@/components/excel-to-html-form-field';
import { LayoutRecognizeFormField } from '@/components/layout-recognize-form-field'; import { LayoutRecognizeFormField } from '@/components/layout-recognize-form-field';
import { MaxTokenNumberFormField } from '@/components/max-token-number-from-field'; import { MaxTokenNumberFormField } from '@/components/max-token-number-from-field';
import { MineruConfigFormField } from '@/components/mineru-config-form-field';
import { import {
ConfigurationFormContainer, ConfigurationFormContainer,
MainContainer, MainContainer,
@ -17,6 +18,7 @@ export function NaiveConfiguration() {
<MainContainer> <MainContainer>
<ConfigurationFormContainer> <ConfigurationFormContainer>
<LayoutRecognizeFormField></LayoutRecognizeFormField> <LayoutRecognizeFormField></LayoutRecognizeFormField>
<MineruConfigFormField></MineruConfigFormField>
<MaxTokenNumberFormField initialValue={512}></MaxTokenNumberFormField> <MaxTokenNumberFormField initialValue={512}></MaxTokenNumberFormField>
<DelimiterFormField></DelimiterFormField> <DelimiterFormField></DelimiterFormField>
<EnableTocToggle /> <EnableTocToggle />

View file

@ -3,6 +3,7 @@ import {
AutoQuestionsFormField, AutoQuestionsFormField,
} from '@/components/auto-keywords-form-field'; } from '@/components/auto-keywords-form-field';
import { LayoutRecognizeFormField } from '@/components/layout-recognize-form-field'; import { LayoutRecognizeFormField } from '@/components/layout-recognize-form-field';
import { MineruConfigFormField } from '@/components/mineru-config-form-field';
import { import {
ConfigurationFormContainer, ConfigurationFormContainer,
MainContainer, MainContainer,
@ -13,6 +14,7 @@ export function PaperConfiguration() {
<MainContainer> <MainContainer>
<ConfigurationFormContainer> <ConfigurationFormContainer>
<LayoutRecognizeFormField></LayoutRecognizeFormField> <LayoutRecognizeFormField></LayoutRecognizeFormField>
<MineruConfigFormField></MineruConfigFormField>
</ConfigurationFormContainer> </ConfigurationFormContainer>
<ConfigurationFormContainer> <ConfigurationFormContainer>

View file

@ -30,6 +30,10 @@ export const formSchema = z
topn_tags: z.number().optional(), topn_tags: z.number().optional(),
toc_extraction: z.boolean().optional(), toc_extraction: z.boolean().optional(),
overlapped_percent: z.number().optional(), overlapped_percent: z.number().optional(),
// MinerU-specific settings
mineru_lang: z.string().optional(),
mineru_formula_enable: z.boolean().optional(),
mineru_table_enable: z.boolean().optional(),
raptor: z raptor: z
.object({ .object({
use_raptor: z.boolean().optional(), use_raptor: z.boolean().optional(),

View file

@ -58,7 +58,7 @@ export function SavingButton() {
onClick={() => { onClick={() => {
(async () => { (async () => {
try { try {
let beValid = await form.formControl.trigger(); let beValid = await form.trigger();
if (beValid) { if (beValid) {
form.handleSubmit(async (values) => { form.handleSubmit(async (values) => {
console.log('saveKnowledgeConfiguration: ', values); console.log('saveKnowledgeConfiguration: ', values);