feat: MinerU integration with environment-based configuration
- Add MinerU PDF parser support via external API or local installation
- Configure MinerU through environment variables (MINERU_APISERVER, etc.)
- Add per-dataset MinerU settings: language, formula/table recognition
- Add MinerU config form fields in Knowledge Base settings
- Remove MinerU from LLM factories (not a typical LLM model)
- Clean up unused ingestors tab from settings
This commit is contained in:
parent
74afb8d710
commit
035e8ced98
12 changed files with 310 additions and 40 deletions
|
|
@ -193,7 +193,14 @@ async def add_llm():
|
||||||
api_key = apikey_json(["api_key", "provider_order"])
|
api_key = apikey_json(["api_key", "provider_order"])
|
||||||
|
|
||||||
elif factory == "MinerU":
|
elif factory == "MinerU":
|
||||||
api_key = apikey_json(["api_key", "provider_order"])
|
api_key = apikey_json([
|
||||||
|
"llm_name",
|
||||||
|
"mineru_apiserver",
|
||||||
|
"mineru_output_dir",
|
||||||
|
"mineru_backend",
|
||||||
|
"mineru_server_url",
|
||||||
|
"mineru_delete_output",
|
||||||
|
])
|
||||||
|
|
||||||
llm = {
|
llm = {
|
||||||
"tenant_id": current_user.id,
|
"tenant_id": current_user.id,
|
||||||
|
|
|
||||||
|
|
@ -5496,14 +5496,6 @@
|
||||||
"model_type": "reranker"
|
"model_type": "reranker"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "MinerU",
|
|
||||||
"logo": "",
|
|
||||||
"tags": "OCR",
|
|
||||||
"status": "1",
|
|
||||||
"rank": "900",
|
|
||||||
"llm": []
|
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -185,14 +185,16 @@ class MinerUParser(RAGFlowPdfParser):
|
||||||
return False, reason
|
return False, reason
|
||||||
|
|
||||||
def _run_mineru(
|
def _run_mineru(
|
||||||
self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, server_url: Optional[str] = None, callback: Optional[Callable] = None
|
self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, server_url: Optional[str] = None, callback: Optional[Callable] = None,
|
||||||
|
formula_enable: bool = True, table_enable: bool = True
|
||||||
):
|
):
|
||||||
if self.using_api:
|
if self.using_api:
|
||||||
self._run_mineru_api(input_path, output_dir, method, backend, lang, callback)
|
self._run_mineru_api(input_path, output_dir, method, backend, lang, callback, formula_enable, table_enable)
|
||||||
else:
|
else:
|
||||||
self._run_mineru_executable(input_path, output_dir, method, backend, lang, server_url, callback)
|
self._run_mineru_executable(input_path, output_dir, method, backend, lang, server_url, callback)
|
||||||
|
|
||||||
def _run_mineru_api(self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, callback: Optional[Callable] = None):
|
def _run_mineru_api(self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, callback: Optional[Callable] = None,
|
||||||
|
formula_enable: bool = True, table_enable: bool = True):
|
||||||
output_zip_path = os.path.join(str(output_dir), "output.zip")
|
output_zip_path = os.path.join(str(output_dir), "output.zip")
|
||||||
|
|
||||||
pdf_file_path = str(input_path)
|
pdf_file_path = str(input_path)
|
||||||
|
|
@ -201,7 +203,9 @@ class MinerUParser(RAGFlowPdfParser):
|
||||||
raise RuntimeError(f"[MinerU] PDF file not exists: {pdf_file_path}")
|
raise RuntimeError(f"[MinerU] PDF file not exists: {pdf_file_path}")
|
||||||
|
|
||||||
pdf_file_name = Path(pdf_file_path).stem.strip()
|
pdf_file_name = Path(pdf_file_path).stem.strip()
|
||||||
output_path = os.path.join(str(output_dir), pdf_file_name, method)
|
# FIX: MinerU API outputs to 'vlm/' when using VLM backend, not 'auto/'
|
||||||
|
output_subfolder = "vlm" if backend.startswith("vlm") else method
|
||||||
|
output_path = os.path.join(str(output_dir), pdf_file_name, output_subfolder)
|
||||||
os.makedirs(output_path, exist_ok=True)
|
os.makedirs(output_path, exist_ok=True)
|
||||||
|
|
||||||
files = {"files": (pdf_file_name + ".pdf", open(pdf_file_path, "rb"), "application/pdf")}
|
files = {"files": (pdf_file_name + ".pdf", open(pdf_file_path, "rb"), "application/pdf")}
|
||||||
|
|
@ -211,8 +215,8 @@ class MinerUParser(RAGFlowPdfParser):
|
||||||
"lang_list": lang,
|
"lang_list": lang,
|
||||||
"backend": backend,
|
"backend": backend,
|
||||||
"parse_method": method,
|
"parse_method": method,
|
||||||
"formula_enable": True,
|
"formula_enable": formula_enable,
|
||||||
"table_enable": True,
|
"table_enable": table_enable,
|
||||||
"server_url": None,
|
"server_url": None,
|
||||||
"return_md": True,
|
"return_md": True,
|
||||||
"return_middle_json": True,
|
"return_middle_json": True,
|
||||||
|
|
@ -224,6 +228,11 @@ class MinerUParser(RAGFlowPdfParser):
|
||||||
"end_page_id": 99999,
|
"end_page_id": 99999,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# DEBUG: Log the exact request data being sent to MinerU
|
||||||
|
self.logger.info(f"[MinerU DEBUG] Request URL: {self.mineru_api}/file_parse")
|
||||||
|
self.logger.info(f"[MinerU DEBUG] Request data: {json.dumps(data, indent=2)}")
|
||||||
|
self.logger.info(f"[MinerU DEBUG] File: {pdf_file_name}.pdf")
|
||||||
|
|
||||||
headers = {"Accept": "application/json"}
|
headers = {"Accept": "application/json"}
|
||||||
try:
|
try:
|
||||||
self.logger.info(f"[MinerU] invoke api: {self.mineru_api}/file_parse")
|
self.logger.info(f"[MinerU] invoke api: {self.mineru_api}/file_parse")
|
||||||
|
|
@ -581,6 +590,8 @@ class MinerUParser(RAGFlowPdfParser):
|
||||||
server_url: Optional[str] = None,
|
server_url: Optional[str] = None,
|
||||||
delete_output: bool = True,
|
delete_output: bool = True,
|
||||||
parse_method: str = "raw",
|
parse_method: str = "raw",
|
||||||
|
formula_enable: bool = True,
|
||||||
|
table_enable: bool = True,
|
||||||
) -> tuple:
|
) -> tuple:
|
||||||
import shutil
|
import shutil
|
||||||
|
|
||||||
|
|
@ -625,7 +636,8 @@ class MinerUParser(RAGFlowPdfParser):
|
||||||
self.__images__(pdf, zoomin=1)
|
self.__images__(pdf, zoomin=1)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self._run_mineru(pdf, out_dir, method=method, backend=backend, lang=lang, server_url=server_url, callback=callback)
|
self._run_mineru(pdf, out_dir, method=method, backend=backend, lang=lang, server_url=server_url, callback=callback,
|
||||||
|
formula_enable=formula_enable, table_enable=table_enable)
|
||||||
outputs = self._read_output(out_dir, pdf.stem, method=method, backend=backend)
|
outputs = self._read_output(out_dir, pdf.stem, method=method, backend=backend)
|
||||||
self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
|
self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
|
||||||
if callback:
|
if callback:
|
||||||
|
|
|
||||||
|
|
@ -39,6 +39,52 @@ from deepdoc.parser.tcadp_parser import TCADPParser
|
||||||
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table, attach_media_context
|
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table, attach_media_context
|
||||||
|
|
||||||
|
|
||||||
|
# MinerU OCR language mapping (lower-cased RAGFlow language name -> MinerU OCR code)
# See: https://github.com/opendatalab/MinerU for supported languages
MINERU_LANG_MAP = {
    "chinese": "ch",
    # NOTE(review): assumes the dataset language is spelled "Traditional Chinese";
    # confirm against the UI language list. "chinese_cht" is the code the
    # Knowledge Base settings form offers for Traditional Chinese.
    "traditional chinese": "chinese_cht",
    "english": "en",
    "russian": "cyrillic",
    "ukrainian": "cyrillic",
    "belarusian": "cyrillic",
    "bulgarian": "cyrillic",
    "serbian": "cyrillic",
    "korean": "korean",
    "japanese": "japan",
    "arabic": "arabic",
    "thai": "th",
    "greek": "el",
    "hindi": "devanagari",
    "tamil": "ta",
    "telugu": "te",
    # NOTE(review): "kannada" and "georgian" both resolve to "ka"; confirm which
    # script MinerU's "ka" model actually covers.
    "kannada": "ka",
    "georgian": "ka",
    "vietnamese": "latin",
    "french": "latin",
    "german": "latin",
    "spanish": "latin",
    "italian": "latin",
    "portuguese": "latin",
    "polish": "latin",
    "dutch": "latin",
    "turkish": "latin",
}

# Every OCR code the map can produce; lets an already-converted value pass through.
_MINERU_LANG_CODES = frozenset(MINERU_LANG_MAP.values())


def _get_mineru_lang(lang: str) -> str:
    """Convert a RAGFlow language name to a MinerU OCR language code.

    Matching ignores case and surrounding whitespace. A value that is already
    one of the MinerU codes produced by ``MINERU_LANG_MAP`` is returned
    unchanged, so the conversion is idempotent.

    Args:
        lang: RAGFlow language name (e.g., "Chinese", "Russian", "English")
            or a MinerU OCR code (e.g., "ch", "cyrillic").

    Returns:
        MinerU OCR language code (e.g., "ch", "cyrillic", "en"). Empty,
        ``None`` or unrecognized input falls back to "latin".
    """
    if not lang:
        return "latin"
    key = lang.strip().lower()
    # Already a MinerU code: do not degrade it to the "latin" fallback.
    if key in _MINERU_LANG_CODES:
        return key
    return MINERU_LANG_MAP.get(key, "latin")
|
||||||
|
|
||||||
|
|
||||||
def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs):
|
def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs):
|
||||||
callback = callback
|
callback = callback
|
||||||
binary = binary
|
binary = binary
|
||||||
|
|
@ -60,6 +106,15 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese"
|
||||||
parse_method = kwargs.get("parse_method", "raw")
|
parse_method = kwargs.get("parse_method", "raw")
|
||||||
mineru_llm_name = kwargs.get("mineru_llm_name")
|
mineru_llm_name = kwargs.get("mineru_llm_name")
|
||||||
tenant_id = kwargs.get("tenant_id")
|
tenant_id = kwargs.get("tenant_id")
|
||||||
|
|
||||||
|
# Get MinerU-specific settings from parser_config
|
||||||
|
parser_config = kwargs.get("parser_config", {})
|
||||||
|
mineru_lang = parser_config.get("mineru_lang") or _get_mineru_lang(lang)
|
||||||
|
formula_enable = parser_config.get("mineru_formula_enable", True)
|
||||||
|
table_enable = parser_config.get("mineru_table_enable", True)
|
||||||
|
|
||||||
|
logging.info(f"[MinerU] by_mineru called with lang={lang}, parser_config mineru_lang={parser_config.get('mineru_lang')}, resolved mineru_lang={mineru_lang}")
|
||||||
|
logging.info(f"[MinerU] formula_enable={formula_enable}, table_enable={table_enable}")
|
||||||
|
|
||||||
pdf_parser = None
|
pdf_parser = None
|
||||||
if tenant_id:
|
if tenant_id:
|
||||||
|
|
@ -85,6 +140,9 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese"
|
||||||
binary=binary,
|
binary=binary,
|
||||||
callback=callback,
|
callback=callback,
|
||||||
parse_method=parse_method,
|
parse_method=parse_method,
|
||||||
|
lang=mineru_lang,
|
||||||
|
formula_enable=formula_enable,
|
||||||
|
table_enable=table_enable,
|
||||||
)
|
)
|
||||||
return sections, tables, pdf_parser
|
return sections, tables, pdf_parser
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
|
||||||
|
|
@ -13,10 +13,11 @@
|
||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
#
|
#
|
||||||
import json
|
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
from typing import Any, Optional, Tuple
|
from io import BytesIO
|
||||||
|
from os import PathLike
|
||||||
|
from typing import Callable, Optional
|
||||||
|
|
||||||
from deepdoc.parser.mineru_parser import MinerUParser
|
from deepdoc.parser.mineru_parser import MinerUParser
|
||||||
|
|
||||||
|
|
@ -25,7 +26,22 @@ class Base:
|
||||||
def __init__(self, key: str | dict, model_name: str, **kwargs):
|
def __init__(self, key: str | dict, model_name: str, **kwargs):
|
||||||
self.model_name = model_name
|
self.model_name = model_name
|
||||||
|
|
||||||
def parse_pdf(self, filepath: str, binary=None, **kwargs) -> Tuple[Any, Any]:
|
def parse_pdf(
|
||||||
|
self,
|
||||||
|
filepath: str | PathLike[str],
|
||||||
|
binary: BytesIO | bytes,
|
||||||
|
callback: Optional[Callable] = None,
|
||||||
|
*,
|
||||||
|
output_dir: Optional[str] = None,
|
||||||
|
backend: str = "pipeline",
|
||||||
|
lang: Optional[str] = None,
|
||||||
|
method: str = "auto",
|
||||||
|
server_url: Optional[str] = None,
|
||||||
|
delete_output: bool = True,
|
||||||
|
parse_method: str = "raw",
|
||||||
|
formula_enable: bool = True,
|
||||||
|
table_enable: bool = True,
|
||||||
|
) -> tuple:
|
||||||
raise NotImplementedError("Please implement parse_pdf!")
|
raise NotImplementedError("Please implement parse_pdf!")
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -34,30 +50,40 @@ class MinerUOcrModel(Base, MinerUParser):
|
||||||
|
|
||||||
def __init__(self, key: str | dict, model_name: str, **kwargs):
|
def __init__(self, key: str | dict, model_name: str, **kwargs):
|
||||||
Base.__init__(self, key, model_name, **kwargs)
|
Base.__init__(self, key, model_name, **kwargs)
|
||||||
config = {}
|
|
||||||
if key:
|
# Use environment variables directly - no database config needed
|
||||||
try:
|
self.mineru_api = os.environ.get("MINERU_APISERVER", "")
|
||||||
config = json.loads(key)
|
self.mineru_output_dir = os.environ.get("MINERU_OUTPUT_DIR", "")
|
||||||
except Exception:
|
self.mineru_backend = os.environ.get("MINERU_BACKEND", "pipeline")
|
||||||
config = {}
|
self.mineru_server_url = os.environ.get("MINERU_SERVER_URL", "")
|
||||||
config = config["api_key"]
|
self.mineru_delete_output = os.environ.get("MINERU_DELETE_OUTPUT", "1") == "1"
|
||||||
self.mineru_api = config.get("mineru_apiserver", os.environ.get("MINERU_APISERVER", ""))
|
|
||||||
self.mineru_output_dir = config.get("mineru_output_dir", os.environ.get("MINERU_OUTPUT_DIR", ""))
|
|
||||||
self.mineru_backend = config.get("mineru_backend", os.environ.get("MINERU_BACKEND", "pipeline"))
|
|
||||||
self.mineru_server_url = config.get("mineru_server_url", os.environ.get("MINERU_SERVER_URL", ""))
|
|
||||||
self.mineru_delete_output = bool(int(config.get("mineru_delete_output", os.environ.get("MINERU_DELETE_OUTPUT", 1))))
|
|
||||||
self.mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
|
self.mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
|
||||||
|
|
||||||
logging.info(f"Parsed MinerU config: {config}")
|
logging.info(f"MinerU config from env: api={self.mineru_api}, backend={self.mineru_backend}, server_url={self.mineru_server_url}")
|
||||||
|
|
||||||
MinerUParser.__init__(self, mineru_path=self.mineru_executable, mineru_api=self.mineru_api, mineru_server_url=self.mineru_server_url)
|
MinerUParser.__init__(self, mineru_path=self.mineru_executable, mineru_api=self.mineru_api, mineru_server_url=self.mineru_server_url)
|
||||||
|
|
||||||
def check_available(self, backend: Optional[str] = None, server_url: Optional[str] = None) -> Tuple[bool, str]:
|
def check_available(self, backend: Optional[str] = None, server_url: Optional[str] = None) -> tuple[bool, str]:
|
||||||
backend = backend or self.mineru_backend
|
backend = backend or self.mineru_backend
|
||||||
server_url = server_url or self.mineru_server_url
|
server_url = server_url or self.mineru_server_url
|
||||||
return self.check_installation(backend=backend, server_url=server_url)
|
return self.check_installation(backend=backend, server_url=server_url)
|
||||||
|
|
||||||
def parse_pdf(self, filepath: str, binary=None, callback=None, parse_method: str = "raw", **kwargs):
|
def parse_pdf(
|
||||||
|
self,
|
||||||
|
filepath: str | PathLike[str],
|
||||||
|
binary: BytesIO | bytes,
|
||||||
|
callback: Optional[Callable] = None,
|
||||||
|
*,
|
||||||
|
output_dir: Optional[str] = None,
|
||||||
|
backend: str = "pipeline",
|
||||||
|
lang: Optional[str] = None,
|
||||||
|
method: str = "auto",
|
||||||
|
server_url: Optional[str] = None,
|
||||||
|
delete_output: bool = True,
|
||||||
|
parse_method: str = "raw",
|
||||||
|
formula_enable: bool = True,
|
||||||
|
table_enable: bool = True,
|
||||||
|
) -> tuple:
|
||||||
ok, reason = self.check_available()
|
ok, reason = self.check_available()
|
||||||
if not ok:
|
if not ok:
|
||||||
raise RuntimeError(f"MinerU not found or server not accessible: {reason}. Please install it via: pip install -U 'mineru[core]'.")
|
raise RuntimeError(f"MinerU not found or server not accessible: {reason}. Please install it via: pip install -U 'mineru[core]'.")
|
||||||
|
|
@ -65,12 +91,16 @@ class MinerUOcrModel(Base, MinerUParser):
|
||||||
sections, tables = MinerUParser.parse_pdf(
|
sections, tables = MinerUParser.parse_pdf(
|
||||||
self,
|
self,
|
||||||
filepath=filepath,
|
filepath=filepath,
|
||||||
binary=binary,
|
binary=binary, # type: ignore[arg-type]
|
||||||
callback=callback,
|
callback=callback,
|
||||||
output_dir=self.mineru_output_dir,
|
output_dir=output_dir or self.mineru_output_dir,
|
||||||
backend=self.mineru_backend,
|
backend=backend or self.mineru_backend,
|
||||||
server_url=self.mineru_server_url,
|
lang=lang,
|
||||||
delete_output=self.mineru_delete_output,
|
method=method,
|
||||||
|
server_url=server_url or self.mineru_server_url,
|
||||||
|
delete_output=delete_output if delete_output is not None else self.mineru_delete_output,
|
||||||
parse_method=parse_method,
|
parse_method=parse_method,
|
||||||
|
formula_enable=formula_enable,
|
||||||
|
table_enable=table_enable,
|
||||||
)
|
)
|
||||||
return sections, tables
|
return sections, tables
|
||||||
|
|
|
||||||
156
web/src/components/mineru-config-form-field.tsx
Normal file
156
web/src/components/mineru-config-form-field.tsx
Normal file
|
|
@ -0,0 +1,156 @@
|
||||||
|
import { useFormContext, useWatch } from 'react-hook-form';
|
||||||
|
import {
|
||||||
|
FormControl,
|
||||||
|
FormField,
|
||||||
|
FormItem,
|
||||||
|
FormLabel,
|
||||||
|
FormMessage,
|
||||||
|
} from './ui/form';
|
||||||
|
import {
|
||||||
|
Select,
|
||||||
|
SelectContent,
|
||||||
|
SelectItem,
|
||||||
|
SelectTrigger,
|
||||||
|
SelectValue,
|
||||||
|
} from './ui/select';
|
||||||
|
import { Switch } from './ui/switch';
|
||||||
|
|
||||||
|
// OCR language codes understood by MinerU, each paired with the label shown
// in the dropdown. The order here is the order rendered in the select.
const MINERU_LANG_OPTIONS: { value: string; label: string }[] = (
  [
    ['ch', 'Chinese (Simplified)'],
    ['en', 'English'],
    ['cyrillic', 'Cyrillic (Russian, Ukrainian, etc.)'],
    ['latin', 'Latin (French, German, Spanish, etc.)'],
    ['korean', 'Korean'],
    ['japan', 'Japanese'],
    ['arabic', 'Arabic'],
    ['th', 'Thai'],
    ['el', 'Greek'],
    ['devanagari', 'Hindi (Devanagari)'],
    ['ta', 'Tamil'],
    ['te', 'Telugu'],
    ['ka', 'Georgian/Kannada'],
    ['chinese_cht', 'Chinese (Traditional)'],
  ] as [string, string][]
).map(([value, label]) => ({ value, label }));
|
||||||
|
|
||||||
|
/**
 * Report whether the layout recognizer currently selected in the enclosing
 * form is a MinerU model.
 *
 * Watches `parser_config.layout_recognize`. MinerU models have a format like
 * "model-name@MinerU", so one case-insensitive substring match on "mineru"
 * covers them; a separate "@mineru" suffix test would be redundant.
 *
 * @returns true when the watched value is a string containing "mineru"
 *   (any casing), false otherwise.
 */
function useIsMineruSelected() {
  const { control } = useFormContext();
  const layoutRecognize = useWatch({
    control,
    name: 'parser_config.layout_recognize',
  });

  return (
    typeof layoutRecognize === 'string' &&
    layoutRecognize.toLowerCase().includes('mineru')
  );
}
|
||||||
|
|
||||||
|
// Label styling shared by every row of the MinerU settings panel.
const MINERU_LABEL_CLASS = 'text-sm text-text-secondary whitespace-wrap w-1/3';

type MineruSwitchFieldProps = {
  /** Form path of the boolean setting, e.g. "parser_config.mineru_table_enable". */
  name: string;
  /** Text shown in the label column. */
  label: string;
  /** Optional helper text rendered under the switch. */
  hint?: string;
};

/**
 * One labelled on/off row bound to a boolean form field.
 * An unset field is rendered as switched on.
 */
function MineruSwitchField({ name, label, hint }: MineruSwitchFieldProps) {
  const form = useFormContext();

  return (
    <FormField
      control={form.control}
      name={name}
      render={({ field }) => (
        <FormItem className="items-center space-y-0">
          <div className="flex items-center">
            <FormLabel className={MINERU_LABEL_CLASS}>{label}</FormLabel>
            <div className="w-2/3">
              <FormControl>
                <Switch
                  checked={field.value ?? true}
                  onCheckedChange={field.onChange}
                />
              </FormControl>
            </div>
          </div>
          {hint && (
            <div className="text-xs text-muted-foreground mt-1 ml-[33.33%]">
              {hint}
            </div>
          )}
          <FormMessage />
        </FormItem>
      )}
    />
  );
}

/**
 * MinerU-specific parser settings (OCR language, formula recognition, table
 * recognition) for the Knowledge Base configuration form.
 *
 * Renders nothing unless the selected layout recognizer is a MinerU model.
 * Must be mounted inside a react-hook-form provider, because it reads the
 * form through `useFormContext`.
 */
export function MineruConfigFormField() {
  const form = useFormContext();
  const isMineruSelected = useIsMineruSelected();

  if (!isMineruSelected) {
    return null;
  }

  return (
    <div className="space-y-4 p-4 border rounded-lg bg-muted/50">
      <div className="text-sm font-medium text-foreground">
        MinerU OCR Settings
      </div>

      {/* MinerU Language Selection */}
      <FormField
        control={form.control}
        name="parser_config.mineru_lang"
        render={({ field }) => (
          <FormItem className="items-center space-y-0">
            <div className="flex items-center">
              <FormLabel className={MINERU_LABEL_CLASS}>OCR Language</FormLabel>
              <div className="w-2/3">
                <FormControl>
                  {/* NOTE(review): the 'latin' fallback only changes what the
                      dropdown displays; the form value stays unset until the
                      user picks an option. Confirm that is intended. */}
                  <Select
                    value={field.value || 'latin'}
                    onValueChange={field.onChange}
                  >
                    <SelectTrigger>
                      <SelectValue placeholder="Select language" />
                    </SelectTrigger>
                    <SelectContent>
                      {MINERU_LANG_OPTIONS.map((option) => (
                        <SelectItem key={option.value} value={option.value}>
                          {option.label}
                        </SelectItem>
                      ))}
                    </SelectContent>
                  </Select>
                </FormControl>
              </div>
            </div>
            <FormMessage />
          </FormItem>
        )}
      />

      {/* Formula Recognition Toggle */}
      <MineruSwitchField
        name="parser_config.mineru_formula_enable"
        label="Formula Recognition"
        hint="Disable for Cyrillic/stylized fonts to avoid incorrect LaTeX conversion"
      />

      {/* Table Recognition Toggle */}
      <MineruSwitchField
        name="parser_config.mineru_table_enable"
        label="Table Recognition"
      />
    </div>
  );
}
|
||||||
|
|
@ -67,6 +67,11 @@ export interface ParserConfig {
|
||||||
tag_kb_ids?: string[];
|
tag_kb_ids?: string[];
|
||||||
topn_tags?: number;
|
topn_tags?: number;
|
||||||
graphrag?: { use_graphrag?: boolean };
|
graphrag?: { use_graphrag?: boolean };
|
||||||
|
// MinerU-specific settings
|
||||||
|
mineru_lang?: string;
|
||||||
|
mineru_formula_enable?: boolean;
|
||||||
|
mineru_table_enable?: boolean;
|
||||||
|
mineru_parse_method?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface IKnowledgeFileParserConfig {
|
export interface IKnowledgeFileParserConfig {
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,7 @@ import {
|
||||||
AutoQuestionsFormField,
|
AutoQuestionsFormField,
|
||||||
} from '@/components/auto-keywords-form-field';
|
} from '@/components/auto-keywords-form-field';
|
||||||
import { LayoutRecognizeFormField } from '@/components/layout-recognize-form-field';
|
import { LayoutRecognizeFormField } from '@/components/layout-recognize-form-field';
|
||||||
|
import { MineruConfigFormField } from '@/components/mineru-config-form-field';
|
||||||
import {
|
import {
|
||||||
ConfigurationFormContainer,
|
ConfigurationFormContainer,
|
||||||
MainContainer,
|
MainContainer,
|
||||||
|
|
@ -13,6 +14,7 @@ export function BookConfiguration() {
|
||||||
<MainContainer>
|
<MainContainer>
|
||||||
<ConfigurationFormContainer>
|
<ConfigurationFormContainer>
|
||||||
<LayoutRecognizeFormField></LayoutRecognizeFormField>
|
<LayoutRecognizeFormField></LayoutRecognizeFormField>
|
||||||
|
<MineruConfigFormField></MineruConfigFormField>
|
||||||
</ConfigurationFormContainer>
|
</ConfigurationFormContainer>
|
||||||
|
|
||||||
<ConfigurationFormContainer>
|
<ConfigurationFormContainer>
|
||||||
|
|
|
||||||
|
|
@ -6,6 +6,7 @@ import { DelimiterFormField } from '@/components/delimiter-form-field';
|
||||||
import { ExcelToHtmlFormField } from '@/components/excel-to-html-form-field';
|
import { ExcelToHtmlFormField } from '@/components/excel-to-html-form-field';
|
||||||
import { LayoutRecognizeFormField } from '@/components/layout-recognize-form-field';
|
import { LayoutRecognizeFormField } from '@/components/layout-recognize-form-field';
|
||||||
import { MaxTokenNumberFormField } from '@/components/max-token-number-from-field';
|
import { MaxTokenNumberFormField } from '@/components/max-token-number-from-field';
|
||||||
|
import { MineruConfigFormField } from '@/components/mineru-config-form-field';
|
||||||
import {
|
import {
|
||||||
ConfigurationFormContainer,
|
ConfigurationFormContainer,
|
||||||
MainContainer,
|
MainContainer,
|
||||||
|
|
@ -17,6 +18,7 @@ export function NaiveConfiguration() {
|
||||||
<MainContainer>
|
<MainContainer>
|
||||||
<ConfigurationFormContainer>
|
<ConfigurationFormContainer>
|
||||||
<LayoutRecognizeFormField></LayoutRecognizeFormField>
|
<LayoutRecognizeFormField></LayoutRecognizeFormField>
|
||||||
|
<MineruConfigFormField></MineruConfigFormField>
|
||||||
<MaxTokenNumberFormField initialValue={512}></MaxTokenNumberFormField>
|
<MaxTokenNumberFormField initialValue={512}></MaxTokenNumberFormField>
|
||||||
<DelimiterFormField></DelimiterFormField>
|
<DelimiterFormField></DelimiterFormField>
|
||||||
<EnableTocToggle />
|
<EnableTocToggle />
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,7 @@ import {
|
||||||
AutoQuestionsFormField,
|
AutoQuestionsFormField,
|
||||||
} from '@/components/auto-keywords-form-field';
|
} from '@/components/auto-keywords-form-field';
|
||||||
import { LayoutRecognizeFormField } from '@/components/layout-recognize-form-field';
|
import { LayoutRecognizeFormField } from '@/components/layout-recognize-form-field';
|
||||||
|
import { MineruConfigFormField } from '@/components/mineru-config-form-field';
|
||||||
import {
|
import {
|
||||||
ConfigurationFormContainer,
|
ConfigurationFormContainer,
|
||||||
MainContainer,
|
MainContainer,
|
||||||
|
|
@ -13,6 +14,7 @@ export function PaperConfiguration() {
|
||||||
<MainContainer>
|
<MainContainer>
|
||||||
<ConfigurationFormContainer>
|
<ConfigurationFormContainer>
|
||||||
<LayoutRecognizeFormField></LayoutRecognizeFormField>
|
<LayoutRecognizeFormField></LayoutRecognizeFormField>
|
||||||
|
<MineruConfigFormField></MineruConfigFormField>
|
||||||
</ConfigurationFormContainer>
|
</ConfigurationFormContainer>
|
||||||
|
|
||||||
<ConfigurationFormContainer>
|
<ConfigurationFormContainer>
|
||||||
|
|
|
||||||
|
|
@ -30,6 +30,10 @@ export const formSchema = z
|
||||||
topn_tags: z.number().optional(),
|
topn_tags: z.number().optional(),
|
||||||
toc_extraction: z.boolean().optional(),
|
toc_extraction: z.boolean().optional(),
|
||||||
overlapped_percent: z.number().optional(),
|
overlapped_percent: z.number().optional(),
|
||||||
|
// MinerU-specific settings
|
||||||
|
mineru_lang: z.string().optional(),
|
||||||
|
mineru_formula_enable: z.boolean().optional(),
|
||||||
|
mineru_table_enable: z.boolean().optional(),
|
||||||
raptor: z
|
raptor: z
|
||||||
.object({
|
.object({
|
||||||
use_raptor: z.boolean().optional(),
|
use_raptor: z.boolean().optional(),
|
||||||
|
|
|
||||||
|
|
@ -58,7 +58,7 @@ export function SavingButton() {
|
||||||
onClick={() => {
|
onClick={() => {
|
||||||
(async () => {
|
(async () => {
|
||||||
try {
|
try {
|
||||||
let beValid = await form.formControl.trigger();
|
let beValid = await form.trigger();
|
||||||
if (beValid) {
|
if (beValid) {
|
||||||
form.handleSubmit(async (values) => {
|
form.handleSubmit(async (values) => {
|
||||||
console.log('saveKnowledgeConfiguration: ', values);
|
console.log('saveKnowledgeConfiguration: ', values);
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue