feat: MinerU integration with environment-based configuration

- Add MinerU PDF parser support via external API or local installation
- Configure MinerU through environment variables (MINERU_APISERVER, etc.)
- Add per-dataset MinerU settings: language, formula/table recognition
- Add MinerU config form fields in Knowledge Base settings
- Remove MinerU from LLM factories (it is not a typical LLM)
- Clean up unused ingestors tab from settings
This commit is contained in:
user210 2025-12-11 19:56:46 +02:00
parent 74afb8d710
commit 035e8ced98
12 changed files with 310 additions and 40 deletions

View file

@ -193,7 +193,14 @@ async def add_llm():
api_key = apikey_json(["api_key", "provider_order"])
elif factory == "MinerU":
api_key = apikey_json(["api_key", "provider_order"])
api_key = apikey_json([
"llm_name",
"mineru_apiserver",
"mineru_output_dir",
"mineru_backend",
"mineru_server_url",
"mineru_delete_output",
])
llm = {
"tenant_id": current_user.id,

View file

@ -5496,14 +5496,6 @@
"model_type": "reranker"
}
]
},
{
"name": "MinerU",
"logo": "",
"tags": "OCR",
"status": "1",
"rank": "900",
"llm": []
}
]
}

View file

@ -185,14 +185,16 @@ class MinerUParser(RAGFlowPdfParser):
return False, reason
def _run_mineru(
self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, server_url: Optional[str] = None, callback: Optional[Callable] = None
self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, server_url: Optional[str] = None, callback: Optional[Callable] = None,
formula_enable: bool = True, table_enable: bool = True
):
if self.using_api:
self._run_mineru_api(input_path, output_dir, method, backend, lang, callback)
self._run_mineru_api(input_path, output_dir, method, backend, lang, callback, formula_enable, table_enable)
else:
self._run_mineru_executable(input_path, output_dir, method, backend, lang, server_url, callback)
def _run_mineru_api(self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, callback: Optional[Callable] = None):
def _run_mineru_api(self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, callback: Optional[Callable] = None,
formula_enable: bool = True, table_enable: bool = True):
output_zip_path = os.path.join(str(output_dir), "output.zip")
pdf_file_path = str(input_path)
@ -201,7 +203,9 @@ class MinerUParser(RAGFlowPdfParser):
raise RuntimeError(f"[MinerU] PDF file not exists: {pdf_file_path}")
pdf_file_name = Path(pdf_file_path).stem.strip()
output_path = os.path.join(str(output_dir), pdf_file_name, method)
# FIX: MinerU API outputs to 'vlm/' when using VLM backend, not 'auto/'
output_subfolder = "vlm" if backend.startswith("vlm") else method
output_path = os.path.join(str(output_dir), pdf_file_name, output_subfolder)
os.makedirs(output_path, exist_ok=True)
files = {"files": (pdf_file_name + ".pdf", open(pdf_file_path, "rb"), "application/pdf")}
@ -211,8 +215,8 @@ class MinerUParser(RAGFlowPdfParser):
"lang_list": lang,
"backend": backend,
"parse_method": method,
"formula_enable": True,
"table_enable": True,
"formula_enable": formula_enable,
"table_enable": table_enable,
"server_url": None,
"return_md": True,
"return_middle_json": True,
@ -224,6 +228,11 @@ class MinerUParser(RAGFlowPdfParser):
"end_page_id": 99999,
}
# DEBUG: Log the exact request data being sent to MinerU
self.logger.info(f"[MinerU DEBUG] Request URL: {self.mineru_api}/file_parse")
self.logger.info(f"[MinerU DEBUG] Request data: {json.dumps(data, indent=2)}")
self.logger.info(f"[MinerU DEBUG] File: {pdf_file_name}.pdf")
headers = {"Accept": "application/json"}
try:
self.logger.info(f"[MinerU] invoke api: {self.mineru_api}/file_parse")
@ -581,6 +590,8 @@ class MinerUParser(RAGFlowPdfParser):
server_url: Optional[str] = None,
delete_output: bool = True,
parse_method: str = "raw",
formula_enable: bool = True,
table_enable: bool = True,
) -> tuple:
import shutil
@ -625,7 +636,8 @@ class MinerUParser(RAGFlowPdfParser):
self.__images__(pdf, zoomin=1)
try:
self._run_mineru(pdf, out_dir, method=method, backend=backend, lang=lang, server_url=server_url, callback=callback)
self._run_mineru(pdf, out_dir, method=method, backend=backend, lang=lang, server_url=server_url, callback=callback,
formula_enable=formula_enable, table_enable=table_enable)
outputs = self._read_output(out_dir, pdf.stem, method=method, backend=backend)
self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
if callback:

View file

@ -39,6 +39,52 @@ from deepdoc.parser.tcadp_parser import TCADPParser
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table, attach_media_context
# MinerU OCR language mapping (RAGFlow language -> MinerU OCR code)
# See: https://github.com/opendatalab/MinerU for supported languages
MINERU_LANG_MAP = {
"chinese": "ch",
"english": "en",
"russian": "cyrillic",
"ukrainian": "cyrillic",
"belarusian": "cyrillic",
"bulgarian": "cyrillic",
"serbian": "cyrillic",
"korean": "korean",
"japanese": "japan",
"arabic": "arabic",
"thai": "th",
"greek": "el",
"hindi": "devanagari",
"tamil": "ta",
"telugu": "te",
"kannada": "ka",
"georgian": "ka",
"vietnamese": "latin",
"french": "latin",
"german": "latin",
"spanish": "latin",
"italian": "latin",
"portuguese": "latin",
"polish": "latin",
"dutch": "latin",
"turkish": "latin",
}
def _get_mineru_lang(lang: str) -> str:
"""Convert RAGFlow language name to MinerU OCR language code.
Args:
lang: RAGFlow language name (e.g., "Chinese", "Russian", "English")
Returns:
MinerU OCR language code (e.g., "ch", "cyrillic", "en")
"""
if not lang:
return "latin"
return MINERU_LANG_MAP.get(lang.lower(), "latin")
def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs):
callback = callback
binary = binary
@ -60,6 +106,15 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese"
parse_method = kwargs.get("parse_method", "raw")
mineru_llm_name = kwargs.get("mineru_llm_name")
tenant_id = kwargs.get("tenant_id")
# Get MinerU-specific settings from parser_config
parser_config = kwargs.get("parser_config", {})
mineru_lang = parser_config.get("mineru_lang") or _get_mineru_lang(lang)
formula_enable = parser_config.get("mineru_formula_enable", True)
table_enable = parser_config.get("mineru_table_enable", True)
logging.info(f"[MinerU] by_mineru called with lang={lang}, parser_config mineru_lang={parser_config.get('mineru_lang')}, resolved mineru_lang={mineru_lang}")
logging.info(f"[MinerU] formula_enable={formula_enable}, table_enable={table_enable}")
pdf_parser = None
if tenant_id:
@ -85,6 +140,9 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese"
binary=binary,
callback=callback,
parse_method=parse_method,
lang=mineru_lang,
formula_enable=formula_enable,
table_enable=table_enable,
)
return sections, tables, pdf_parser
except Exception as e:

View file

@ -13,10 +13,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import json
import logging
import os
from typing import Any, Optional, Tuple
from io import BytesIO
from os import PathLike
from typing import Callable, Optional
from deepdoc.parser.mineru_parser import MinerUParser
@ -25,7 +26,22 @@ class Base:
def __init__(self, key: str | dict, model_name: str, **kwargs):
self.model_name = model_name
def parse_pdf(self, filepath: str, binary=None, **kwargs) -> Tuple[Any, Any]:
def parse_pdf(
self,
filepath: str | PathLike[str],
binary: BytesIO | bytes,
callback: Optional[Callable] = None,
*,
output_dir: Optional[str] = None,
backend: str = "pipeline",
lang: Optional[str] = None,
method: str = "auto",
server_url: Optional[str] = None,
delete_output: bool = True,
parse_method: str = "raw",
formula_enable: bool = True,
table_enable: bool = True,
) -> tuple:
raise NotImplementedError("Please implement parse_pdf!")
@ -34,30 +50,40 @@ class MinerUOcrModel(Base, MinerUParser):
def __init__(self, key: str | dict, model_name: str, **kwargs):
Base.__init__(self, key, model_name, **kwargs)
config = {}
if key:
try:
config = json.loads(key)
except Exception:
config = {}
config = config["api_key"]
self.mineru_api = config.get("mineru_apiserver", os.environ.get("MINERU_APISERVER", ""))
self.mineru_output_dir = config.get("mineru_output_dir", os.environ.get("MINERU_OUTPUT_DIR", ""))
self.mineru_backend = config.get("mineru_backend", os.environ.get("MINERU_BACKEND", "pipeline"))
self.mineru_server_url = config.get("mineru_server_url", os.environ.get("MINERU_SERVER_URL", ""))
self.mineru_delete_output = bool(int(config.get("mineru_delete_output", os.environ.get("MINERU_DELETE_OUTPUT", 1))))
# Use environment variables directly - no database config needed
self.mineru_api = os.environ.get("MINERU_APISERVER", "")
self.mineru_output_dir = os.environ.get("MINERU_OUTPUT_DIR", "")
self.mineru_backend = os.environ.get("MINERU_BACKEND", "pipeline")
self.mineru_server_url = os.environ.get("MINERU_SERVER_URL", "")
self.mineru_delete_output = os.environ.get("MINERU_DELETE_OUTPUT", "1") == "1"
self.mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
logging.info(f"Parsed MinerU config: {config}")
logging.info(f"MinerU config from env: api={self.mineru_api}, backend={self.mineru_backend}, server_url={self.mineru_server_url}")
MinerUParser.__init__(self, mineru_path=self.mineru_executable, mineru_api=self.mineru_api, mineru_server_url=self.mineru_server_url)
def check_available(self, backend: Optional[str] = None, server_url: Optional[str] = None) -> Tuple[bool, str]:
def check_available(self, backend: Optional[str] = None, server_url: Optional[str] = None) -> tuple[bool, str]:
backend = backend or self.mineru_backend
server_url = server_url or self.mineru_server_url
return self.check_installation(backend=backend, server_url=server_url)
def parse_pdf(self, filepath: str, binary=None, callback=None, parse_method: str = "raw", **kwargs):
def parse_pdf(
self,
filepath: str | PathLike[str],
binary: BytesIO | bytes,
callback: Optional[Callable] = None,
*,
output_dir: Optional[str] = None,
backend: str = "pipeline",
lang: Optional[str] = None,
method: str = "auto",
server_url: Optional[str] = None,
delete_output: bool = True,
parse_method: str = "raw",
formula_enable: bool = True,
table_enable: bool = True,
) -> tuple:
ok, reason = self.check_available()
if not ok:
raise RuntimeError(f"MinerU not found or server not accessible: {reason}. Please install it via: pip install -U 'mineru[core]'.")
@ -65,12 +91,16 @@ class MinerUOcrModel(Base, MinerUParser):
sections, tables = MinerUParser.parse_pdf(
self,
filepath=filepath,
binary=binary,
binary=binary, # type: ignore[arg-type]
callback=callback,
output_dir=self.mineru_output_dir,
backend=self.mineru_backend,
server_url=self.mineru_server_url,
delete_output=self.mineru_delete_output,
output_dir=output_dir or self.mineru_output_dir,
backend=backend or self.mineru_backend,
lang=lang,
method=method,
server_url=server_url or self.mineru_server_url,
delete_output=delete_output if delete_output is not None else self.mineru_delete_output,
parse_method=parse_method,
formula_enable=formula_enable,
table_enable=table_enable,
)
return sections, tables

View file

@ -0,0 +1,156 @@
import { useFormContext, useWatch } from 'react-hook-form';
import {
FormControl,
FormField,
FormItem,
FormLabel,
FormMessage,
} from './ui/form';
import {
Select,
SelectContent,
SelectItem,
SelectTrigger,
SelectValue,
} from './ui/select';
import { Switch } from './ui/switch';
// MinerU OCR language options with human-readable labels.
// `value` is the code the language <Select> in this file writes into
// `parser_config.mineru_lang`; `label` is the text shown for that option.
const MINERU_LANG_OPTIONS = [
{ value: 'ch', label: 'Chinese (Simplified)' },
{ value: 'en', label: 'English' },
{ value: 'cyrillic', label: 'Cyrillic (Russian, Ukrainian, etc.)' },
{ value: 'latin', label: 'Latin (French, German, Spanish, etc.)' },
{ value: 'korean', label: 'Korean' },
{ value: 'japan', label: 'Japanese' },
{ value: 'arabic', label: 'Arabic' },
{ value: 'th', label: 'Thai' },
{ value: 'el', label: 'Greek' },
{ value: 'devanagari', label: 'Hindi (Devanagari)' },
{ value: 'ta', label: 'Tamil' },
{ value: 'te', label: 'Telugu' },
{ value: 'ka', label: 'Georgian/Kannada' },
{ value: 'chinese_cht', label: 'Chinese (Traditional)' },
];
/**
 * Reports whether the layout recognizer currently selected in the enclosing
 * form is MinerU.
 *
 * Watches `parser_config.layout_recognize` and returns true only when the
 * value is a string that contains "mineru" in any letter case
 * (MinerU models have a format like "model-name@MinerU").
 */
function useIsMineruSelected() {
  const { control } = useFormContext();
  const recognizer = useWatch({
    control,
    name: 'parser_config.layout_recognize',
  });

  if (typeof recognizer !== 'string') {
    return false;
  }

  // Every value ending in "@mineru" also contains "mineru", so a single
  // case-insensitive substring test covers both spellings.
  return recognizer.toLowerCase().includes('mineru');
}
/**
 * MinerU-specific parser settings for the knowledge-base configuration form.
 *
 * Renders nothing unless `useIsMineruSelected()` reports that MinerU is the
 * selected layout recognizer. Otherwise renders three controls bound to the
 * surrounding react-hook-form context:
 *  - `parser_config.mineru_lang`           — OCR language select
 *  - `parser_config.mineru_formula_enable` — formula recognition switch
 *  - `parser_config.mineru_table_enable`   — table recognition switch
 *
 * Must be rendered inside a form provider, since it calls `useFormContext()`.
 */
export function MineruConfigFormField() {
const form = useFormContext();
const isMineruSelected = useIsMineruSelected();
if (!isMineruSelected) {
return null;
}
return (
<div className="space-y-4 p-4 border rounded-lg bg-muted/50">
<div className="text-sm font-medium text-foreground">
MinerU OCR Settings
</div>
{/* MinerU language selection. The Select displays 'latin' while the field
has no value; NOTE(review): that is a display fallback only — the form
value is written solely through onValueChange. */}
<FormField
control={form.control}
name="parser_config.mineru_lang"
render={({ field }) => (
<FormItem className="items-center space-y-0">
<div className="flex items-center">
<FormLabel className="text-sm text-text-secondary whitespace-wrap w-1/3">
OCR Language
</FormLabel>
<div className="w-2/3">
<FormControl>
<Select
value={field.value || 'latin'}
onValueChange={field.onChange}
>
<SelectTrigger>
<SelectValue placeholder="Select language" />
</SelectTrigger>
<SelectContent>
{MINERU_LANG_OPTIONS.map((option) => (
<SelectItem key={option.value} value={option.value}>
{option.label}
</SelectItem>
))}
</SelectContent>
</Select>
</FormControl>
</div>
</div>
<FormMessage />
</FormItem>
)}
/>
{/* Formula recognition toggle. Shown as checked when the field value is
null or undefined (`?? true`). */}
<FormField
control={form.control}
name="parser_config.mineru_formula_enable"
render={({ field }) => (
<FormItem className="items-center space-y-0">
<div className="flex items-center">
<FormLabel className="text-sm text-text-secondary whitespace-wrap w-1/3">
Formula Recognition
</FormLabel>
<div className="w-2/3">
<FormControl>
<Switch
checked={field.value ?? true}
onCheckedChange={field.onChange}
/>
</FormControl>
</div>
</div>
<div className="text-xs text-muted-foreground mt-1 ml-[33.33%]">
Disable for Cyrillic/stylized fonts to avoid incorrect LaTeX
conversion
</div>
<FormMessage />
</FormItem>
)}
/>
{/* Table recognition toggle. Shown as checked when the field value is
null or undefined (`?? true`). */}
<FormField
control={form.control}
name="parser_config.mineru_table_enable"
render={({ field }) => (
<FormItem className="items-center space-y-0">
<div className="flex items-center">
<FormLabel className="text-sm text-text-secondary whitespace-wrap w-1/3">
Table Recognition
</FormLabel>
<div className="w-2/3">
<FormControl>
<Switch
checked={field.value ?? true}
onCheckedChange={field.onChange}
/>
</FormControl>
</div>
</div>
<FormMessage />
</FormItem>
)}
/>
</div>
);
}

View file

@ -67,6 +67,11 @@ export interface ParserConfig {
tag_kb_ids?: string[];
topn_tags?: number;
graphrag?: { use_graphrag?: boolean };
// MinerU-specific settings
mineru_lang?: string;
mineru_formula_enable?: boolean;
mineru_table_enable?: boolean;
mineru_parse_method?: string;
}
export interface IKnowledgeFileParserConfig {

View file

@ -3,6 +3,7 @@ import {
AutoQuestionsFormField,
} from '@/components/auto-keywords-form-field';
import { LayoutRecognizeFormField } from '@/components/layout-recognize-form-field';
import { MineruConfigFormField } from '@/components/mineru-config-form-field';
import {
ConfigurationFormContainer,
MainContainer,
@ -13,6 +14,7 @@ export function BookConfiguration() {
<MainContainer>
<ConfigurationFormContainer>
<LayoutRecognizeFormField></LayoutRecognizeFormField>
<MineruConfigFormField></MineruConfigFormField>
</ConfigurationFormContainer>
<ConfigurationFormContainer>

View file

@ -6,6 +6,7 @@ import { DelimiterFormField } from '@/components/delimiter-form-field';
import { ExcelToHtmlFormField } from '@/components/excel-to-html-form-field';
import { LayoutRecognizeFormField } from '@/components/layout-recognize-form-field';
import { MaxTokenNumberFormField } from '@/components/max-token-number-from-field';
import { MineruConfigFormField } from '@/components/mineru-config-form-field';
import {
ConfigurationFormContainer,
MainContainer,
@ -17,6 +18,7 @@ export function NaiveConfiguration() {
<MainContainer>
<ConfigurationFormContainer>
<LayoutRecognizeFormField></LayoutRecognizeFormField>
<MineruConfigFormField></MineruConfigFormField>
<MaxTokenNumberFormField initialValue={512}></MaxTokenNumberFormField>
<DelimiterFormField></DelimiterFormField>
<EnableTocToggle />

View file

@ -3,6 +3,7 @@ import {
AutoQuestionsFormField,
} from '@/components/auto-keywords-form-field';
import { LayoutRecognizeFormField } from '@/components/layout-recognize-form-field';
import { MineruConfigFormField } from '@/components/mineru-config-form-field';
import {
ConfigurationFormContainer,
MainContainer,
@ -13,6 +14,7 @@ export function PaperConfiguration() {
<MainContainer>
<ConfigurationFormContainer>
<LayoutRecognizeFormField></LayoutRecognizeFormField>
<MineruConfigFormField></MineruConfigFormField>
</ConfigurationFormContainer>
<ConfigurationFormContainer>

View file

@ -30,6 +30,10 @@ export const formSchema = z
topn_tags: z.number().optional(),
toc_extraction: z.boolean().optional(),
overlapped_percent: z.number().optional(),
// MinerU-specific settings
mineru_lang: z.string().optional(),
mineru_formula_enable: z.boolean().optional(),
mineru_table_enable: z.boolean().optional(),
raptor: z
.object({
use_raptor: z.boolean().optional(),

View file

@ -58,7 +58,7 @@ export function SavingButton() {
onClick={() => {
(async () => {
try {
let beValid = await form.formControl.trigger();
let beValid = await form.trigger();
if (beValid) {
form.handleSubmit(async (values) => {
console.log('saveKnowledgeConfiguration: ', values);