This commit is contained in:
zrguo 2025-06-05 17:37:11 +08:00
parent 962974589a
commit cc9040d70c
8 changed files with 673 additions and 517 deletions

View file

@ -10,10 +10,12 @@ This example shows how to:
import os import os
import argparse import argparse
from pathlib import Path
from lightrag.mineru_parser import MineruParser from lightrag.mineru_parser import MineruParser
def parse_document(file_path: str, output_dir: str = None, method: str = "auto", stats: bool = False):
def parse_document(
file_path: str, output_dir: str = None, method: str = "auto", stats: bool = False
):
""" """
Parse a document using MinerU parser Parse a document using MinerU parser
@ -26,9 +28,7 @@ def parse_document(file_path: str, output_dir: str = None, method: str = "auto",
try: try:
# Parse the document # Parse the document
content_list, md_content = MineruParser.parse_document( content_list, md_content = MineruParser.parse_document(
file_path=file_path, file_path=file_path, parse_method=method, output_dir=output_dir
parse_method=method,
output_dir=output_dir
) )
# Display statistics if requested # Display statistics if requested
@ -39,7 +39,7 @@ def parse_document(file_path: str, output_dir: str = None, method: str = "auto",
# Count different types of content # Count different types of content
content_types = {} content_types = {}
for item in content_list: for item in content_list:
content_type = item.get('type', 'unknown') content_type = item.get("type", "unknown")
content_types[content_type] = content_types.get(content_type, 0) + 1 content_types[content_type] = content_types.get(content_type, 0) + 1
print("\nContent Type Distribution:") print("\nContent Type Distribution:")
@ -52,17 +52,22 @@ def parse_document(file_path: str, output_dir: str = None, method: str = "auto",
print(f"Error parsing document: {str(e)}") print(f"Error parsing document: {str(e)}")
return None, None return None, None
def main(): def main():
"""Main function to run the example""" """Main function to run the example"""
parser = argparse.ArgumentParser(description='MinerU Parser Example') parser = argparse.ArgumentParser(description="MinerU Parser Example")
parser.add_argument('file_path', help='Path to the document to parse') parser.add_argument("file_path", help="Path to the document to parse")
parser.add_argument('--output', '-o', help='Output directory path') parser.add_argument("--output", "-o", help="Output directory path")
parser.add_argument('--method', '-m', parser.add_argument(
choices=['auto', 'ocr', 'txt'], "--method",
default='auto', "-m",
help='Parsing method (auto, ocr, txt)') choices=["auto", "ocr", "txt"],
parser.add_argument('--stats', action='store_true', default="auto",
help='Display content statistics') help="Parsing method (auto, ocr, txt)",
)
parser.add_argument(
"--stats", action="store_true", help="Display content statistics"
)
args = parser.parse_args() args = parser.parse_args()
@ -72,11 +77,9 @@ def main():
# Parse document # Parse document
content_list, md_content = parse_document( content_list, md_content = parse_document(
args.file_path, args.file_path, args.output, args.method, args.stats
args.output,
args.method,
args.stats
) )
if __name__ == '__main__':
if __name__ == "__main__":
main() main()

View file

@ -8,72 +8,90 @@ import asyncio
import argparse import argparse
from lightrag.llm.openai import openai_complete_if_cache, openai_embed from lightrag.llm.openai import openai_complete_if_cache, openai_embed
from lightrag.kg.shared_storage import initialize_pipeline_status from lightrag.kg.shared_storage import initialize_pipeline_status
from pathlib import Path
from lightrag import LightRAG from lightrag import LightRAG
from lightrag.modalprocessors import ( from lightrag.modalprocessors import (
ImageModalProcessor, ImageModalProcessor,
TableModalProcessor, TableModalProcessor,
EquationModalProcessor, EquationModalProcessor,
GenericModalProcessor
) )
WORKING_DIR = "./rag_storage" WORKING_DIR = "./rag_storage"
def get_llm_model_func(api_key: str, base_url: str = None):
return lambda prompt, system_prompt=None, history_messages=[], **kwargs: openai_complete_if_cache(
"gpt-4o-mini",
prompt,
system_prompt=system_prompt,
history_messages=history_messages,
api_key=api_key,
base_url=base_url,
**kwargs,
)
def get_vision_model_func(api_key: str, base_url: str = None): def get_llm_model_func(api_key: str, base_url: str = None):
return lambda prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs: openai_complete_if_cache( return (
"gpt-4o", lambda prompt,
"",
system_prompt=None, system_prompt=None,
history_messages=[], history_messages=[],
messages=[ **kwargs: openai_complete_if_cache(
{"role": "system", "content": system_prompt} if system_prompt else None, "gpt-4o-mini",
{"role": "user", "content": [ prompt,
{"type": "text", "text": prompt}, system_prompt=system_prompt,
{ history_messages=history_messages,
"type": "image_url", api_key=api_key,
"image_url": { base_url=base_url,
"url": f"data:image/jpeg;base64,{image_data}" **kwargs,
} )
}
]} if image_data else {"role": "user", "content": prompt}
],
api_key=api_key,
base_url=base_url,
**kwargs,
) if image_data else openai_complete_if_cache(
"gpt-4o-mini",
prompt,
system_prompt=system_prompt,
history_messages=history_messages,
api_key=api_key,
base_url=base_url,
**kwargs,
) )
def get_vision_model_func(api_key: str, base_url: str = None):
return (
lambda prompt,
system_prompt=None,
history_messages=[],
image_data=None,
**kwargs: openai_complete_if_cache(
"gpt-4o",
"",
system_prompt=None,
history_messages=[],
messages=[
{"role": "system", "content": system_prompt} if system_prompt else None,
{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{image_data}"
},
},
],
}
if image_data
else {"role": "user", "content": prompt},
],
api_key=api_key,
base_url=base_url,
**kwargs,
)
if image_data
else openai_complete_if_cache(
"gpt-4o-mini",
prompt,
system_prompt=system_prompt,
history_messages=history_messages,
api_key=api_key,
base_url=base_url,
**kwargs,
)
)
async def process_image_example(lightrag: LightRAG, vision_model_func): async def process_image_example(lightrag: LightRAG, vision_model_func):
"""Example of processing an image""" """Example of processing an image"""
# Create image processor # Create image processor
image_processor = ImageModalProcessor( image_processor = ImageModalProcessor(
lightrag=lightrag, lightrag=lightrag, modal_caption_func=vision_model_func
modal_caption_func=vision_model_func
) )
# Prepare image content # Prepare image content
image_content = { image_content = {
"img_path": "image.jpg", "img_path": "image.jpg",
"img_caption": ["Example image caption"], "img_caption": ["Example image caption"],
"img_footnote": ["Example image footnote"] "img_footnote": ["Example image footnote"],
} }
# Process image # Process image
@ -81,19 +99,19 @@ async def process_image_example(lightrag: LightRAG, vision_model_func):
modal_content=image_content, modal_content=image_content,
content_type="image", content_type="image",
file_path="image_example.jpg", file_path="image_example.jpg",
entity_name="Example Image" entity_name="Example Image",
) )
print("Image Processing Results:") print("Image Processing Results:")
print(f"Description: {description}") print(f"Description: {description}")
print(f"Entity Info: {entity_info}") print(f"Entity Info: {entity_info}")
async def process_table_example(lightrag: LightRAG, llm_model_func): async def process_table_example(lightrag: LightRAG, llm_model_func):
"""Example of processing a table""" """Example of processing a table"""
# Create table processor # Create table processor
table_processor = TableModalProcessor( table_processor = TableModalProcessor(
lightrag=lightrag, lightrag=lightrag, modal_caption_func=llm_model_func
modal_caption_func=llm_model_func
) )
# Prepare table content # Prepare table content
@ -105,7 +123,7 @@ async def process_table_example(lightrag: LightRAG, llm_model_func):
| Mary | 30 | Designer | | Mary | 30 | Designer |
""", """,
"table_caption": ["Employee Information Table"], "table_caption": ["Employee Information Table"],
"table_footnote": ["Data updated as of 2024"] "table_footnote": ["Data updated as of 2024"],
} }
# Process table # Process table
@ -113,39 +131,37 @@ async def process_table_example(lightrag: LightRAG, llm_model_func):
modal_content=table_content, modal_content=table_content,
content_type="table", content_type="table",
file_path="table_example.md", file_path="table_example.md",
entity_name="Employee Table" entity_name="Employee Table",
) )
print("\nTable Processing Results:") print("\nTable Processing Results:")
print(f"Description: {description}") print(f"Description: {description}")
print(f"Entity Info: {entity_info}") print(f"Entity Info: {entity_info}")
async def process_equation_example(lightrag: LightRAG, llm_model_func): async def process_equation_example(lightrag: LightRAG, llm_model_func):
"""Example of processing a mathematical equation""" """Example of processing a mathematical equation"""
# Create equation processor # Create equation processor
equation_processor = EquationModalProcessor( equation_processor = EquationModalProcessor(
lightrag=lightrag, lightrag=lightrag, modal_caption_func=llm_model_func
modal_caption_func=llm_model_func
) )
# Prepare equation content # Prepare equation content
equation_content = { equation_content = {"text": "E = mc^2", "text_format": "LaTeX"}
"text": "E = mc^2",
"text_format": "LaTeX"
}
# Process equation # Process equation
description, entity_info = await equation_processor.process_multimodal_content( description, entity_info = await equation_processor.process_multimodal_content(
modal_content=equation_content, modal_content=equation_content,
content_type="equation", content_type="equation",
file_path="equation_example.txt", file_path="equation_example.txt",
entity_name="Mass-Energy Equivalence" entity_name="Mass-Energy Equivalence",
) )
print("\nEquation Processing Results:") print("\nEquation Processing Results:")
print(f"Description: {description}") print(f"Description: {description}")
print(f"Entity Info: {entity_info}") print(f"Entity Info: {entity_info}")
async def initialize_rag(api_key: str, base_url: str = None): async def initialize_rag(api_key: str, base_url: str = None):
rag = LightRAG( rag = LightRAG(
working_dir=WORKING_DIR, working_dir=WORKING_DIR,
@ -155,7 +171,10 @@ async def initialize_rag(api_key: str, base_url: str = None):
api_key=api_key, api_key=api_key,
base_url=base_url, base_url=base_url,
), ),
llm_model_func=lambda prompt, system_prompt=None, history_messages=[], **kwargs: openai_complete_if_cache( llm_model_func=lambda prompt,
system_prompt=None,
history_messages=[],
**kwargs: openai_complete_if_cache(
"gpt-4o-mini", "gpt-4o-mini",
prompt, prompt,
system_prompt=system_prompt, system_prompt=system_prompt,
@ -171,18 +190,22 @@ async def initialize_rag(api_key: str, base_url: str = None):
return rag return rag
def main(): def main():
"""Main function to run the example""" """Main function to run the example"""
parser = argparse.ArgumentParser(description='Modal Processors Example') parser = argparse.ArgumentParser(description="Modal Processors Example")
parser.add_argument('--api-key', required=True, help='OpenAI API key') parser.add_argument("--api-key", required=True, help="OpenAI API key")
parser.add_argument('--base-url', help='Optional base URL for API') parser.add_argument("--base-url", help="Optional base URL for API")
parser.add_argument('--working-dir', '-w', default=WORKING_DIR, help='Working directory path') parser.add_argument(
"--working-dir", "-w", default=WORKING_DIR, help="Working directory path"
)
args = parser.parse_args() args = parser.parse_args()
# Run examples # Run examples
asyncio.run(main_async(args.api_key, args.base_url)) asyncio.run(main_async(args.api_key, args.base_url))
async def main_async(api_key: str, base_url: str = None): async def main_async(api_key: str, base_url: str = None):
# Initialize LightRAG # Initialize LightRAG
lightrag = await initialize_rag(api_key, base_url) lightrag = await initialize_rag(api_key, base_url)
@ -196,5 +219,6 @@ async def main_async(api_key: str, base_url: str = None):
await process_table_example(lightrag, llm_model_func) await process_table_example(lightrag, llm_model_func)
await process_equation_example(lightrag, llm_model_func) await process_equation_example(lightrag, llm_model_func)
if __name__ == "__main__": if __name__ == "__main__":
main() main()

View file

@ -11,12 +11,17 @@ This example shows how to:
import os import os
import argparse import argparse
import asyncio import asyncio
from pathlib import Path
from lightrag.mineru_parser import MineruParser
from lightrag.llm.openai import openai_complete_if_cache, openai_embed from lightrag.llm.openai import openai_complete_if_cache, openai_embed
from lightrag.raganything import RAGAnything from lightrag.raganything import RAGAnything
async def process_with_rag(file_path: str, output_dir: str, api_key: str, base_url: str = None, working_dir: str = None):
async def process_with_rag(
file_path: str,
output_dir: str,
api_key: str,
base_url: str = None,
working_dir: str = None,
):
""" """
Process document with RAGAnything Process document with RAGAnything
@ -30,7 +35,10 @@ async def process_with_rag(file_path: str, output_dir: str, api_key: str, base_u
# Initialize RAGAnything # Initialize RAGAnything
rag = RAGAnything( rag = RAGAnything(
working_dir=working_dir, working_dir=working_dir,
llm_model_func=lambda prompt, system_prompt=None, history_messages=[], **kwargs: openai_complete_if_cache( llm_model_func=lambda prompt,
system_prompt=None,
history_messages=[],
**kwargs: openai_complete_if_cache(
"gpt-4o-mini", "gpt-4o-mini",
prompt, prompt,
system_prompt=system_prompt, system_prompt=system_prompt,
@ -39,27 +47,40 @@ async def process_with_rag(file_path: str, output_dir: str, api_key: str, base_u
base_url=base_url, base_url=base_url,
**kwargs, **kwargs,
), ),
vision_model_func=lambda prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs: openai_complete_if_cache( vision_model_func=lambda prompt,
system_prompt=None,
history_messages=[],
image_data=None,
**kwargs: openai_complete_if_cache(
"gpt-4o", "gpt-4o",
"", "",
system_prompt=None, system_prompt=None,
history_messages=[], history_messages=[],
messages=[ messages=[
{"role": "system", "content": system_prompt} if system_prompt else None, {"role": "system", "content": system_prompt}
{"role": "user", "content": [ if system_prompt
{"type": "text", "text": prompt}, else None,
{ {
"type": "image_url", "role": "user",
"image_url": { "content": [
"url": f"data:image/jpeg;base64,{image_data}" {"type": "text", "text": prompt},
} {
} "type": "image_url",
]} if image_data else {"role": "user", "content": prompt} "image_url": {
"url": f"data:image/jpeg;base64,{image_data}"
},
},
],
}
if image_data
else {"role": "user", "content": prompt},
], ],
api_key=api_key, api_key=api_key,
base_url=base_url, base_url=base_url,
**kwargs, **kwargs,
) if image_data else openai_complete_if_cache( )
if image_data
else openai_complete_if_cache(
"gpt-4o-mini", "gpt-4o-mini",
prompt, prompt,
system_prompt=system_prompt, system_prompt=system_prompt,
@ -75,21 +96,19 @@ async def process_with_rag(file_path: str, output_dir: str, api_key: str, base_u
base_url=base_url, base_url=base_url,
), ),
embedding_dim=3072, embedding_dim=3072,
max_token_size=8192 max_token_size=8192,
) )
# Process document # Process document
await rag.process_document_complete( await rag.process_document_complete(
file_path=file_path, file_path=file_path, output_dir=output_dir, parse_method="auto"
output_dir=output_dir,
parse_method="auto"
) )
# Example queries # Example queries
queries = [ queries = [
"What is the main content of the document?", "What is the main content of the document?",
"Describe the images and figures in the document", "Describe the images and figures in the document",
"Tell me about the experimental results and data tables" "Tell me about the experimental results and data tables",
] ]
print("\nQuerying processed document:") print("\nQuerying processed document:")
@ -101,14 +120,21 @@ async def process_with_rag(file_path: str, output_dir: str, api_key: str, base_u
except Exception as e: except Exception as e:
print(f"Error processing with RAG: {str(e)}") print(f"Error processing with RAG: {str(e)}")
def main(): def main():
"""Main function to run the example""" """Main function to run the example"""
parser = argparse.ArgumentParser(description='MinerU RAG Example') parser = argparse.ArgumentParser(description="MinerU RAG Example")
parser.add_argument('file_path', help='Path to the document to process') parser.add_argument("file_path", help="Path to the document to process")
parser.add_argument('--working_dir', '-w', default="./rag_storage", help='Working directory path') parser.add_argument(
parser.add_argument('--output', '-o', default="./output", help='Output directory path') "--working_dir", "-w", default="./rag_storage", help="Working directory path"
parser.add_argument('--api-key', required=True, help='OpenAI API key for RAG processing') )
parser.add_argument('--base-url', help='Optional base URL for API') parser.add_argument(
"--output", "-o", default="./output", help="Output directory path"
)
parser.add_argument(
"--api-key", required=True, help="OpenAI API key for RAG processing"
)
parser.add_argument("--base-url", help="Optional base URL for API")
args = parser.parse_args() args = parser.parse_args()
@ -117,13 +143,12 @@ def main():
os.makedirs(args.output, exist_ok=True) os.makedirs(args.output, exist_ok=True)
# Process with RAG # Process with RAG
asyncio.run(process_with_rag( asyncio.run(
args.file_path, process_with_rag(
args.output, args.file_path, args.output, args.api_key, args.base_url, args.working_dir
args.api_key, )
args.base_url, )
args.working_dir
))
if __name__ == '__main__':
if __name__ == "__main__":
main() main()

View file

@ -1,4 +1,4 @@
# type: ignore # type: ignore
""" """
MinerU Document Parser Utility MinerU Document Parser Utility
@ -14,7 +14,18 @@ import os
import json import json
import argparse import argparse
from pathlib import Path from pathlib import Path
from typing import Dict, List, Optional, Union, Tuple, Any, TypeVar, cast, TYPE_CHECKING, ClassVar from typing import (
Dict,
List,
Optional,
Union,
Tuple,
Any,
TypeVar,
cast,
TYPE_CHECKING,
ClassVar,
)
# Type stubs for magic_pdf # Type stubs for magic_pdf
FileBasedDataWriter = Any FileBasedDataWriter = Any
@ -28,20 +39,27 @@ read_local_office = Any
read_local_images = Any read_local_images = Any
if TYPE_CHECKING: if TYPE_CHECKING:
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader from magic_pdf.data.data_reader_writer import (
FileBasedDataWriter,
FileBasedDataReader,
)
from magic_pdf.data.dataset import PymuDocDataset from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.data.read_api import read_local_office, read_local_images from magic_pdf.data.read_api import read_local_office, read_local_images
else: else:
# MinerU imports # MinerU imports
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader from magic_pdf.data.data_reader_writer import (
FileBasedDataWriter,
FileBasedDataReader,
)
from magic_pdf.data.dataset import PymuDocDataset from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.data.read_api import read_local_office, read_local_images from magic_pdf.data.read_api import read_local_office, read_local_images
T = TypeVar('T') T = TypeVar("T")
class MineruParser: class MineruParser:
""" """
@ -58,7 +76,11 @@ class MineruParser:
pass pass
@staticmethod @staticmethod
def safe_write(writer: Any, content: Union[str, bytes, Dict[str, Any], List[Any]], filename: str) -> None: def safe_write(
writer: Any,
content: Union[str, bytes, Dict[str, Any], List[Any]],
filename: str,
) -> None:
""" """
Safely write content to a file, ensuring the filename is valid Safely write content to a file, ensuring the filename is valid
@ -80,15 +102,22 @@ class MineruParser:
writer.write(content, filename) writer.write(content, filename)
except TypeError: except TypeError:
# If the writer expects bytes, convert string to bytes # If the writer expects bytes, convert string to bytes
writer.write(content.encode('utf-8'), filename) writer.write(content.encode("utf-8"), filename)
else: else:
# For dict/list content, always encode as JSON string first # For dict/list content, always encode as JSON string first
if isinstance(content, (dict, list)): if isinstance(content, (dict, list)):
try: try:
writer.write(json.dumps(content, ensure_ascii=False, indent=4), filename) writer.write(
json.dumps(content, ensure_ascii=False, indent=4), filename
)
except TypeError: except TypeError:
# If the writer expects bytes, convert JSON string to bytes # If the writer expects bytes, convert JSON string to bytes
writer.write(json.dumps(content, ensure_ascii=False, indent=4).encode('utf-8'), filename) writer.write(
json.dumps(content, ensure_ascii=False, indent=4).encode(
"utf-8"
),
filename,
)
else: else:
# Regular content (assumed to be bytes or compatible) # Regular content (assumed to be bytes or compatible)
writer.write(content, filename) writer.write(content, filename)
@ -97,7 +126,7 @@ class MineruParser:
def parse_pdf( def parse_pdf(
pdf_path: Union[str, Path], pdf_path: Union[str, Path],
output_dir: Optional[str] = None, output_dir: Optional[str] = None,
use_ocr: bool = False use_ocr: bool = False,
) -> Tuple[List[Dict[str, Any]], str]: ) -> Tuple[List[Dict[str, Any]], str]:
""" """
Parse PDF document Parse PDF document
@ -150,9 +179,15 @@ class MineruParser:
# Draw visualizations # Draw visualizations
try: try:
infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf")) # type: ignore infer_result.draw_model(
pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf")) # type: ignore os.path.join(local_md_dir, f"{name_without_suff}_model.pdf")
pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf")) # type: ignore ) # type: ignore
pipe_result.draw_layout(
os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf")
) # type: ignore
pipe_result.draw_span(
os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf")
) # type: ignore
except Exception as e: except Exception as e:
print(f"Warning: Failed to draw visualizations: {str(e)}") print(f"Warning: Failed to draw visualizations: {str(e)}")
@ -162,7 +197,9 @@ class MineruParser:
# Save files using dump methods (consistent with API) # Save files using dump methods (consistent with API)
pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir) # type: ignore pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir) # type: ignore
pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir) # type: ignore pipe_result.dump_content_list(
md_writer, f"{name_without_suff}_content_list.json", image_dir
) # type: ignore
pipe_result.dump_middle_json(md_writer, f"{name_without_suff}_middle.json") # type: ignore pipe_result.dump_middle_json(md_writer, f"{name_without_suff}_middle.json") # type: ignore
# Save model result - convert JSON string to bytes before writing # Save model result - convert JSON string to bytes before writing
@ -171,16 +208,24 @@ class MineruParser:
try: try:
# Try to write to a file manually to avoid FileBasedDataWriter issues # Try to write to a file manually to avoid FileBasedDataWriter issues
model_file_path = os.path.join(local_md_dir, f"{name_without_suff}_model.json") model_file_path = os.path.join(
with open(model_file_path, 'w', encoding='utf-8') as f: local_md_dir, f"{name_without_suff}_model.json"
)
with open(model_file_path, "w", encoding="utf-8") as f:
f.write(json_str) f.write(json_str)
except Exception as e: except Exception as e:
print(f"Warning: Failed to save model result using file write: {str(e)}") print(
f"Warning: Failed to save model result using file write: {str(e)}"
)
try: try:
# If direct file write fails, try using the writer with bytes encoding # If direct file write fails, try using the writer with bytes encoding
md_writer.write(json_str.encode('utf-8'), f"{name_without_suff}_model.json") # type: ignore md_writer.write(
json_str.encode("utf-8"), f"{name_without_suff}_model.json"
) # type: ignore
except Exception as e2: except Exception as e2:
print(f"Warning: Failed to save model result using writer: {str(e2)}") print(
f"Warning: Failed to save model result using writer: {str(e2)}"
)
return cast(Tuple[List[Dict[str, Any]], str], (content_list, md_content)) return cast(Tuple[List[Dict[str, Any]], str], (content_list, md_content))
@ -190,8 +235,7 @@ class MineruParser:
@staticmethod @staticmethod
def parse_office_doc( def parse_office_doc(
doc_path: Union[str, Path], doc_path: Union[str, Path], output_dir: Optional[str] = None
output_dir: Optional[str] = None
) -> Tuple[List[Dict[str, Any]], str]: ) -> Tuple[List[Dict[str, Any]], str]:
""" """
Parse office document (Word, PPT, etc.) Parse office document (Word, PPT, etc.)
@ -231,9 +275,9 @@ class MineruParser:
# Apply chain of operations according to API documentation # Apply chain of operations according to API documentation
# This follows the pattern shown in MS-Office example in the API docs # This follows the pattern shown in MS-Office example in the API docs
ds.apply(doc_analyze, ocr=True)\ ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
.pipe_txt_mode(image_writer)\ md_writer, f"{name_without_suff}.md", image_dir
.dump_md(md_writer, f"{name_without_suff}.md", image_dir) # type: ignore ) # type: ignore
# Re-execute for getting the content data # Re-execute for getting the content data
infer_result = ds.apply(doc_analyze, ocr=True) # type: ignore infer_result = ds.apply(doc_analyze, ocr=True) # type: ignore
@ -244,7 +288,9 @@ class MineruParser:
content_list = pipe_result.get_content_list(image_dir) # type: ignore content_list = pipe_result.get_content_list(image_dir) # type: ignore
# Save additional output files # Save additional output files
pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir) # type: ignore pipe_result.dump_content_list(
md_writer, f"{name_without_suff}_content_list.json", image_dir
) # type: ignore
pipe_result.dump_middle_json(md_writer, f"{name_without_suff}_middle.json") # type: ignore pipe_result.dump_middle_json(md_writer, f"{name_without_suff}_middle.json") # type: ignore
# Save model result - convert JSON string to bytes before writing # Save model result - convert JSON string to bytes before writing
@ -253,16 +299,24 @@ class MineruParser:
try: try:
# Try to write to a file manually to avoid FileBasedDataWriter issues # Try to write to a file manually to avoid FileBasedDataWriter issues
model_file_path = os.path.join(local_md_dir, f"{name_without_suff}_model.json") model_file_path = os.path.join(
with open(model_file_path, 'w', encoding='utf-8') as f: local_md_dir, f"{name_without_suff}_model.json"
)
with open(model_file_path, "w", encoding="utf-8") as f:
f.write(json_str) f.write(json_str)
except Exception as e: except Exception as e:
print(f"Warning: Failed to save model result using file write: {str(e)}") print(
f"Warning: Failed to save model result using file write: {str(e)}"
)
try: try:
# If direct file write fails, try using the writer with bytes encoding # If direct file write fails, try using the writer with bytes encoding
md_writer.write(json_str.encode('utf-8'), f"{name_without_suff}_model.json") # type: ignore md_writer.write(
json_str.encode("utf-8"), f"{name_without_suff}_model.json"
) # type: ignore
except Exception as e2: except Exception as e2:
print(f"Warning: Failed to save model result using writer: {str(e2)}") print(
f"Warning: Failed to save model result using writer: {str(e2)}"
)
return cast(Tuple[List[Dict[str, Any]], str], (content_list, md_content)) return cast(Tuple[List[Dict[str, Any]], str], (content_list, md_content))
@ -272,8 +326,7 @@ class MineruParser:
@staticmethod @staticmethod
def parse_image( def parse_image(
image_path: Union[str, Path], image_path: Union[str, Path], output_dir: Optional[str] = None
output_dir: Optional[str] = None
) -> Tuple[List[Dict[str, Any]], str]: ) -> Tuple[List[Dict[str, Any]], str]:
""" """
Parse image document Parse image document
@ -313,9 +366,9 @@ class MineruParser:
# Apply chain of operations according to API documentation # Apply chain of operations according to API documentation
# This follows the pattern shown in Image example in the API docs # This follows the pattern shown in Image example in the API docs
ds.apply(doc_analyze, ocr=True)\ ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
.pipe_ocr_mode(image_writer)\ md_writer, f"{name_without_suff}.md", image_dir
.dump_md(md_writer, f"{name_without_suff}.md", image_dir) # type: ignore ) # type: ignore
# Re-execute for getting the content data # Re-execute for getting the content data
infer_result = ds.apply(doc_analyze, ocr=True) # type: ignore infer_result = ds.apply(doc_analyze, ocr=True) # type: ignore
@ -326,7 +379,9 @@ class MineruParser:
content_list = pipe_result.get_content_list(image_dir) # type: ignore content_list = pipe_result.get_content_list(image_dir) # type: ignore
# Save additional output files # Save additional output files
pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir) # type: ignore pipe_result.dump_content_list(
md_writer, f"{name_without_suff}_content_list.json", image_dir
) # type: ignore
pipe_result.dump_middle_json(md_writer, f"{name_without_suff}_middle.json") # type: ignore pipe_result.dump_middle_json(md_writer, f"{name_without_suff}_middle.json") # type: ignore
# Save model result - convert JSON string to bytes before writing # Save model result - convert JSON string to bytes before writing
@ -335,16 +390,24 @@ class MineruParser:
try: try:
# Try to write to a file manually to avoid FileBasedDataWriter issues # Try to write to a file manually to avoid FileBasedDataWriter issues
model_file_path = os.path.join(local_md_dir, f"{name_without_suff}_model.json") model_file_path = os.path.join(
with open(model_file_path, 'w', encoding='utf-8') as f: local_md_dir, f"{name_without_suff}_model.json"
)
with open(model_file_path, "w", encoding="utf-8") as f:
f.write(json_str) f.write(json_str)
except Exception as e: except Exception as e:
print(f"Warning: Failed to save model result using file write: {str(e)}") print(
f"Warning: Failed to save model result using file write: {str(e)}"
)
try: try:
# If direct file write fails, try using the writer with bytes encoding # If direct file write fails, try using the writer with bytes encoding
md_writer.write(json_str.encode('utf-8'), f"{name_without_suff}_model.json") # type: ignore md_writer.write(
json_str.encode("utf-8"), f"{name_without_suff}_model.json"
) # type: ignore
except Exception as e2: except Exception as e2:
print(f"Warning: Failed to save model result using writer: {str(e2)}") print(
f"Warning: Failed to save model result using writer: {str(e2)}"
)
return cast(Tuple[List[Dict[str, Any]], str], (content_list, md_content)) return cast(Tuple[List[Dict[str, Any]], str], (content_list, md_content))
@ -357,7 +420,7 @@ class MineruParser:
file_path: Union[str, Path], file_path: Union[str, Path],
parse_method: str = "auto", parse_method: str = "auto",
output_dir: Optional[str] = None, output_dir: Optional[str] = None,
save_results: bool = True save_results: bool = True,
) -> Tuple[List[Dict[str, Any]], str]: ) -> Tuple[List[Dict[str, Any]], str]:
""" """
Parse document using MinerU based on file extension Parse document using MinerU based on file extension
@ -382,51 +445,46 @@ class MineruParser:
# Choose appropriate parser based on file type # Choose appropriate parser based on file type
if ext in [".pdf"]: if ext in [".pdf"]:
return MineruParser.parse_pdf( return MineruParser.parse_pdf(
file_path, file_path, output_dir, use_ocr=(parse_method == "ocr")
output_dir,
use_ocr=(parse_method == "ocr")
) )
elif ext in [".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif"]: elif ext in [".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif"]:
return MineruParser.parse_image( return MineruParser.parse_image(file_path, output_dir)
file_path,
output_dir
)
elif ext in [".doc", ".docx", ".ppt", ".pptx"]: elif ext in [".doc", ".docx", ".ppt", ".pptx"]:
return MineruParser.parse_office_doc( return MineruParser.parse_office_doc(file_path, output_dir)
file_path,
output_dir
)
else: else:
# For unsupported file types, default to PDF parsing # For unsupported file types, default to PDF parsing
print(f"Warning: Unsupported file extension '{ext}', trying generic PDF parser") print(
return MineruParser.parse_pdf( f"Warning: Unsupported file extension '{ext}', trying generic PDF parser"
file_path,
output_dir,
use_ocr=(parse_method == "ocr")
) )
return MineruParser.parse_pdf(
file_path, output_dir, use_ocr=(parse_method == "ocr")
)
def main(): def main():
""" """
Main function to run the MinerU parser from command line Main function to run the MinerU parser from command line
""" """
parser = argparse.ArgumentParser(description='Parse documents using MinerU') parser = argparse.ArgumentParser(description="Parse documents using MinerU")
parser.add_argument('file_path', help='Path to the document to parse') parser.add_argument("file_path", help="Path to the document to parse")
parser.add_argument('--output', '-o', help='Output directory path') parser.add_argument("--output", "-o", help="Output directory path")
parser.add_argument('--method', '-m', parser.add_argument(
choices=['auto', 'ocr', 'txt'], "--method",
default='auto', "-m",
help='Parsing method (auto, ocr, txt)') choices=["auto", "ocr", "txt"],
parser.add_argument('--stats', action='store_true', default="auto",
help='Display content statistics') help="Parsing method (auto, ocr, txt)",
)
parser.add_argument(
"--stats", action="store_true", help="Display content statistics"
)
args = parser.parse_args() args = parser.parse_args()
try: try:
# Parse the document # Parse the document
content_list, md_content = MineruParser.parse_document( content_list, md_content = MineruParser.parse_document(
file_path=args.file_path, file_path=args.file_path, parse_method=args.method, output_dir=args.output
parse_method=args.method,
output_dir=args.output
) )
# Display statistics if requested # Display statistics if requested
@ -437,7 +495,7 @@ def main():
# Count different types of content # Count different types of content
content_types = {} content_types = {}
for item in content_list: for item in content_list:
content_type = item.get('type', 'unknown') content_type = item.get("type", "unknown")
content_types[content_type] = content_types.get(content_type, 0) + 1 content_types[content_type] = content_types.get(content_type, 0) + 1
print("\nContent Type Distribution:") print("\nContent Type Distribution:")
@ -450,5 +508,6 @@ def main():
return 0 return 0
if __name__ == '__main__':
if __name__ == "__main__":
exit(main()) exit(main())

View file

@ -65,8 +65,8 @@ class BaseModalProcessor:
raise NotImplementedError("Subclasses must implement this method") raise NotImplementedError("Subclasses must implement this method")
async def _create_entity_and_chunk( async def _create_entity_and_chunk(
self, modal_chunk: str, entity_info: Dict[str, Any], self, modal_chunk: str, entity_info: Dict[str, Any], file_path: str
file_path: str) -> Tuple[str, Dict[str, Any]]: ) -> Tuple[str, Dict[str, Any]]:
"""Create entity and text chunk""" """Create entity and text chunk"""
# Create chunk # Create chunk
chunk_id = compute_mdhash_id(str(modal_chunk), prefix="chunk-") chunk_id = compute_mdhash_id(str(modal_chunk), prefix="chunk-")
@ -93,16 +93,16 @@ class BaseModalProcessor:
"created_at": int(time.time()), "created_at": int(time.time()),
} }
await self.knowledge_graph_inst.upsert_node(entity_info["entity_name"], await self.knowledge_graph_inst.upsert_node(
node_data) entity_info["entity_name"], node_data
)
# Insert entity into vector database # Insert entity into vector database
entity_vdb_data = { entity_vdb_data = {
compute_mdhash_id(entity_info["entity_name"], prefix="ent-"): { compute_mdhash_id(entity_info["entity_name"], prefix="ent-"): {
"entity_name": entity_info["entity_name"], "entity_name": entity_info["entity_name"],
"entity_type": entity_info["entity_type"], "entity_type": entity_info["entity_type"],
"content": "content": f"{entity_info['entity_name']}\n{entity_info['summary']}",
f"{entity_info['entity_name']}\n{entity_info['summary']}",
"source_id": chunk_id, "source_id": chunk_id,
"file_path": file_path, "file_path": file_path,
} }
@ -110,8 +110,7 @@ class BaseModalProcessor:
await self.entities_vdb.upsert(entity_vdb_data) await self.entities_vdb.upsert(entity_vdb_data)
# Process entity and relationship extraction # Process entity and relationship extraction
await self._process_chunk_for_extraction(chunk_id, await self._process_chunk_for_extraction(chunk_id, entity_info["entity_name"])
entity_info["entity_name"])
# Ensure all storage updates are complete # Ensure all storage updates are complete
await self._insert_done() await self._insert_done()
@ -120,11 +119,12 @@ class BaseModalProcessor:
"entity_name": entity_info["entity_name"], "entity_name": entity_info["entity_name"],
"entity_type": entity_info["entity_type"], "entity_type": entity_info["entity_type"],
"description": entity_info["summary"], "description": entity_info["summary"],
"chunk_id": chunk_id "chunk_id": chunk_id,
} }
async def _process_chunk_for_extraction(self, chunk_id: str, async def _process_chunk_for_extraction(
modal_entity_name: str): self, chunk_id: str, modal_entity_name: str
):
"""Process chunk for entity and relationship extraction""" """Process chunk for entity and relationship extraction"""
chunk_data = await self.text_chunks_db.get_by_id(chunk_id) chunk_data = await self.text_chunks_db.get_by_id(chunk_id)
if not chunk_data: if not chunk_data:
@ -168,37 +168,27 @@ class BaseModalProcessor:
if entity_name != modal_entity_name: # Skip self-relationship if entity_name != modal_entity_name: # Skip self-relationship
# Create belongs_to relationship # Create belongs_to relationship
relation_data = { relation_data = {
"description": "description": f"Entity {entity_name} belongs to {modal_entity_name}",
f"Entity {entity_name} belongs to {modal_entity_name}", "keywords": "belongs_to,part_of,contained_in",
"keywords": "source_id": chunk_id,
"belongs_to,part_of,contained_in", "weight": 10.0,
"source_id": "file_path": chunk_data.get("file_path", "manual_creation"),
chunk_id,
"weight":
10.0,
"file_path":
chunk_data.get("file_path", "manual_creation"),
} }
await self.knowledge_graph_inst.upsert_edge( await self.knowledge_graph_inst.upsert_edge(
entity_name, modal_entity_name, relation_data) entity_name, modal_entity_name, relation_data
)
relation_id = compute_mdhash_id(entity_name + relation_id = compute_mdhash_id(
modal_entity_name, entity_name + modal_entity_name, prefix="rel-"
prefix="rel-") )
relation_vdb_data = { relation_vdb_data = {
relation_id: { relation_id: {
"src_id": "src_id": entity_name,
entity_name, "tgt_id": modal_entity_name,
"tgt_id": "keywords": relation_data["keywords"],
modal_entity_name, "content": f"{relation_data['keywords']}\t{entity_name}\n{modal_entity_name}\n{relation_data['description']}",
"keywords": "source_id": chunk_id,
relation_data["keywords"], "file_path": chunk_data.get("file_path", "manual_creation"),
"content":
f"{relation_data['keywords']}\t{entity_name}\n{modal_entity_name}\n{relation_data['description']}",
"source_id":
chunk_id,
"file_path":
chunk_data.get("file_path", "manual_creation"),
} }
} }
await self.relationships_vdb.upsert(relation_vdb_data) await self.relationships_vdb.upsert(relation_vdb_data)
@ -215,16 +205,18 @@ class BaseModalProcessor:
) )
async def _insert_done(self) -> None: async def _insert_done(self) -> None:
await asyncio.gather(*[ await asyncio.gather(
cast(StorageNameSpace, storage_inst).index_done_callback() *[
for storage_inst in [ cast(StorageNameSpace, storage_inst).index_done_callback()
self.text_chunks_db, for storage_inst in [
self.chunks_vdb, self.text_chunks_db,
self.entities_vdb, self.chunks_vdb,
self.relationships_vdb, self.entities_vdb,
self.knowledge_graph_inst, self.relationships_vdb,
self.knowledge_graph_inst,
]
] ]
]) )
class ImageModalProcessor(BaseModalProcessor): class ImageModalProcessor(BaseModalProcessor):
@ -243,8 +235,7 @@ class ImageModalProcessor(BaseModalProcessor):
"""Encode image to base64""" """Encode image to base64"""
try: try:
with open(image_path, "rb") as image_file: with open(image_path, "rb") as image_file:
encoded_string = base64.b64encode( encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
image_file.read()).decode('utf-8')
return encoded_string return encoded_string
except Exception as e: except Exception as e:
logger.error(f"Failed to encode image {image_path}: {e}") logger.error(f"Failed to encode image {image_path}: {e}")
@ -309,8 +300,7 @@ class ImageModalProcessor(BaseModalProcessor):
response = await self.modal_caption_func( response = await self.modal_caption_func(
vision_prompt, vision_prompt,
image_data=image_base64, image_data=image_base64,
system_prompt= system_prompt="You are an expert image analyst. Provide detailed, accurate descriptions.",
"You are an expert image analyst. Provide detailed, accurate descriptions."
) )
else: else:
# Analyze based on existing text information # Analyze based on existing text information
@ -324,13 +314,11 @@ class ImageModalProcessor(BaseModalProcessor):
response = await self.modal_caption_func( response = await self.modal_caption_func(
text_prompt, text_prompt,
system_prompt= system_prompt="You are an expert image analyst. Provide detailed analysis based on available information.",
"You are an expert image analyst. Provide detailed analysis based on available information."
) )
# Parse response # Parse response
enhanced_caption, entity_info = self._parse_response( enhanced_caption, entity_info = self._parse_response(response, entity_name)
response, entity_name)
# Build complete image content # Build complete image content
modal_chunk = f""" modal_chunk = f"""
@ -341,27 +329,30 @@ class ImageModalProcessor(BaseModalProcessor):
Visual Analysis: {enhanced_caption}""" Visual Analysis: {enhanced_caption}"""
return await self._create_entity_and_chunk(modal_chunk, return await self._create_entity_and_chunk(
entity_info, file_path) modal_chunk, entity_info, file_path
)
except Exception as e: except Exception as e:
logger.error(f"Error processing image content: {e}") logger.error(f"Error processing image content: {e}")
# Fallback processing # Fallback processing
fallback_entity = { fallback_entity = {
"entity_name": entity_name if entity_name else "entity_name": entity_name
f"image_{compute_mdhash_id(str(modal_content))}", if entity_name
else f"image_{compute_mdhash_id(str(modal_content))}",
"entity_type": "image", "entity_type": "image",
"summary": f"Image content: {str(modal_content)[:100]}" "summary": f"Image content: {str(modal_content)[:100]}",
} }
return str(modal_content), fallback_entity return str(modal_content), fallback_entity
def _parse_response(self, def _parse_response(
response: str, self, response: str, entity_name: str = None
entity_name: str = None) -> Tuple[str, Dict[str, Any]]: ) -> Tuple[str, Dict[str, Any]]:
"""Parse model response""" """Parse model response"""
try: try:
response_data = json.loads( response_data = json.loads(
re.search(r"\{.*\}", response, re.DOTALL).group(0)) re.search(r"\{.*\}", response, re.DOTALL).group(0)
)
description = response_data.get("detailed_description", "") description = response_data.get("detailed_description", "")
entity_data = response_data.get("entity_info", {}) entity_data = response_data.get("entity_info", {})
@ -369,11 +360,14 @@ class ImageModalProcessor(BaseModalProcessor):
if not description or not entity_data: if not description or not entity_data:
raise ValueError("Missing required fields in response") raise ValueError("Missing required fields in response")
if not all(key in entity_data if not all(
for key in ["entity_name", "entity_type", "summary"]): key in entity_data for key in ["entity_name", "entity_type", "summary"]
):
raise ValueError("Missing required fields in entity_info") raise ValueError("Missing required fields in entity_info")
entity_data["entity_name"] = entity_data["entity_name"] + f" ({entity_data['entity_type']})" entity_data["entity_name"] = (
entity_data["entity_name"] + f" ({entity_data['entity_type']})"
)
if entity_name: if entity_name:
entity_data["entity_name"] = entity_name entity_data["entity_name"] = entity_name
@ -382,13 +376,11 @@ class ImageModalProcessor(BaseModalProcessor):
except (json.JSONDecodeError, AttributeError, ValueError) as e: except (json.JSONDecodeError, AttributeError, ValueError) as e:
logger.error(f"Error parsing image analysis response: {e}") logger.error(f"Error parsing image analysis response: {e}")
fallback_entity = { fallback_entity = {
"entity_name": "entity_name": entity_name
entity_name if entity_name
if entity_name else f"image_{compute_mdhash_id(response)}", else f"image_{compute_mdhash_id(response)}",
"entity_type": "entity_type": "image",
"image", "summary": response[:100] + "..." if len(response) > 100 else response,
"summary":
response[:100] + "..." if len(response) > 100 else response
} }
return response, fallback_entity return response, fallback_entity
@ -447,15 +439,15 @@ class TableModalProcessor(BaseModalProcessor):
response = await self.modal_caption_func( response = await self.modal_caption_func(
table_prompt, table_prompt,
system_prompt= system_prompt="You are an expert data analyst. Provide detailed table analysis with specific insights.",
"You are an expert data analyst. Provide detailed table analysis with specific insights."
) )
# Parse response # Parse response
enhanced_caption, entity_info = self._parse_table_response( enhanced_caption, entity_info = self._parse_table_response(
response, entity_name) response, entity_name
)
#TODO: Add Retry Mechanism # TODO: Add Retry Mechanism
# Build complete table content # Build complete table content
modal_chunk = f"""Table Analysis: modal_chunk = f"""Table Analysis:
@ -466,17 +458,16 @@ class TableModalProcessor(BaseModalProcessor):
Analysis: {enhanced_caption}""" Analysis: {enhanced_caption}"""
return await self._create_entity_and_chunk(modal_chunk, entity_info, return await self._create_entity_and_chunk(modal_chunk, entity_info, file_path)
file_path)
def _parse_table_response( def _parse_table_response(
self, self, response: str, entity_name: str = None
response: str, ) -> Tuple[str, Dict[str, Any]]:
entity_name: str = None) -> Tuple[str, Dict[str, Any]]:
"""Parse table analysis response""" """Parse table analysis response"""
try: try:
response_data = json.loads( response_data = json.loads(
re.search(r"\{.*\}", response, re.DOTALL).group(0)) re.search(r"\{.*\}", response, re.DOTALL).group(0)
)
description = response_data.get("detailed_description", "") description = response_data.get("detailed_description", "")
entity_data = response_data.get("entity_info", {}) entity_data = response_data.get("entity_info", {})
@ -484,11 +475,14 @@ class TableModalProcessor(BaseModalProcessor):
if not description or not entity_data: if not description or not entity_data:
raise ValueError("Missing required fields in response") raise ValueError("Missing required fields in response")
if not all(key in entity_data if not all(
for key in ["entity_name", "entity_type", "summary"]): key in entity_data for key in ["entity_name", "entity_type", "summary"]
):
raise ValueError("Missing required fields in entity_info") raise ValueError("Missing required fields in entity_info")
entity_data["entity_name"] = entity_data["entity_name"] + f" ({entity_data['entity_type']})" entity_data["entity_name"] = (
entity_data["entity_name"] + f" ({entity_data['entity_type']})"
)
if entity_name: if entity_name:
entity_data["entity_name"] = entity_name entity_data["entity_name"] = entity_name
@ -497,13 +491,11 @@ class TableModalProcessor(BaseModalProcessor):
except (json.JSONDecodeError, AttributeError, ValueError) as e: except (json.JSONDecodeError, AttributeError, ValueError) as e:
logger.error(f"Error parsing table analysis response: {e}") logger.error(f"Error parsing table analysis response: {e}")
fallback_entity = { fallback_entity = {
"entity_name": "entity_name": entity_name
entity_name if entity_name
if entity_name else f"table_{compute_mdhash_id(response)}", else f"table_{compute_mdhash_id(response)}",
"entity_type": "entity_type": "table",
"table", "summary": response[:100] + "..." if len(response) > 100 else response,
"summary":
response[:100] + "..." if len(response) > 100 else response
} }
return response, fallback_entity return response, fallback_entity
@ -559,13 +551,13 @@ class EquationModalProcessor(BaseModalProcessor):
response = await self.modal_caption_func( response = await self.modal_caption_func(
equation_prompt, equation_prompt,
system_prompt= system_prompt="You are an expert mathematician. Provide detailed mathematical analysis.",
"You are an expert mathematician. Provide detailed mathematical analysis."
) )
# Parse response # Parse response
enhanced_caption, entity_info = self._parse_equation_response( enhanced_caption, entity_info = self._parse_equation_response(
response, entity_name) response, entity_name
)
# Build complete equation content # Build complete equation content
modal_chunk = f"""Mathematical Equation Analysis: modal_chunk = f"""Mathematical Equation Analysis:
@ -574,17 +566,16 @@ class EquationModalProcessor(BaseModalProcessor):
Mathematical Analysis: {enhanced_caption}""" Mathematical Analysis: {enhanced_caption}"""
return await self._create_entity_and_chunk(modal_chunk, entity_info, return await self._create_entity_and_chunk(modal_chunk, entity_info, file_path)
file_path)
def _parse_equation_response( def _parse_equation_response(
self, self, response: str, entity_name: str = None
response: str, ) -> Tuple[str, Dict[str, Any]]:
entity_name: str = None) -> Tuple[str, Dict[str, Any]]:
"""Parse equation analysis response""" """Parse equation analysis response"""
try: try:
response_data = json.loads( response_data = json.loads(
re.search(r"\{.*\}", response, re.DOTALL).group(0)) re.search(r"\{.*\}", response, re.DOTALL).group(0)
)
description = response_data.get("detailed_description", "") description = response_data.get("detailed_description", "")
entity_data = response_data.get("entity_info", {}) entity_data = response_data.get("entity_info", {})
@ -592,11 +583,14 @@ class EquationModalProcessor(BaseModalProcessor):
if not description or not entity_data: if not description or not entity_data:
raise ValueError("Missing required fields in response") raise ValueError("Missing required fields in response")
if not all(key in entity_data if not all(
for key in ["entity_name", "entity_type", "summary"]): key in entity_data for key in ["entity_name", "entity_type", "summary"]
):
raise ValueError("Missing required fields in entity_info") raise ValueError("Missing required fields in entity_info")
entity_data["entity_name"] = entity_data["entity_name"] + f" ({entity_data['entity_type']})" entity_data["entity_name"] = (
entity_data["entity_name"] + f" ({entity_data['entity_type']})"
)
if entity_name: if entity_name:
entity_data["entity_name"] = entity_name entity_data["entity_name"] = entity_name
@ -605,13 +599,11 @@ class EquationModalProcessor(BaseModalProcessor):
except (json.JSONDecodeError, AttributeError, ValueError) as e: except (json.JSONDecodeError, AttributeError, ValueError) as e:
logger.error(f"Error parsing equation analysis response: {e}") logger.error(f"Error parsing equation analysis response: {e}")
fallback_entity = { fallback_entity = {
"entity_name": "entity_name": entity_name
entity_name if entity_name
if entity_name else f"equation_{compute_mdhash_id(response)}", else f"equation_{compute_mdhash_id(response)}",
"entity_type": "entity_type": "equation",
"equation", "summary": response[:100] + "..." if len(response) > 100 else response,
"summary":
response[:100] + "..." if len(response) > 100 else response
} }
return response, fallback_entity return response, fallback_entity
@ -651,13 +643,13 @@ class GenericModalProcessor(BaseModalProcessor):
response = await self.modal_caption_func( response = await self.modal_caption_func(
generic_prompt, generic_prompt,
system_prompt= system_prompt=f"You are an expert content analyst specializing in {content_type} content.",
f"You are an expert content analyst specializing in {content_type} content."
) )
# Parse response # Parse response
enhanced_caption, entity_info = self._parse_generic_response( enhanced_caption, entity_info = self._parse_generic_response(
response, entity_name, content_type) response, entity_name, content_type
)
# Build complete content # Build complete content
modal_chunk = f"""{content_type.title()} Content Analysis: modal_chunk = f"""{content_type.title()} Content Analysis:
@ -665,18 +657,16 @@ class GenericModalProcessor(BaseModalProcessor):
Analysis: {enhanced_caption}""" Analysis: {enhanced_caption}"""
return await self._create_entity_and_chunk(modal_chunk, entity_info, return await self._create_entity_and_chunk(modal_chunk, entity_info, file_path)
file_path)
def _parse_generic_response( def _parse_generic_response(
self, self, response: str, entity_name: str = None, content_type: str = "content"
response: str, ) -> Tuple[str, Dict[str, Any]]:
entity_name: str = None,
content_type: str = "content") -> Tuple[str, Dict[str, Any]]:
"""Parse generic analysis response""" """Parse generic analysis response"""
try: try:
response_data = json.loads( response_data = json.loads(
re.search(r"\{.*\}", response, re.DOTALL).group(0)) re.search(r"\{.*\}", response, re.DOTALL).group(0)
)
description = response_data.get("detailed_description", "") description = response_data.get("detailed_description", "")
entity_data = response_data.get("entity_info", {}) entity_data = response_data.get("entity_info", {})
@ -684,11 +674,14 @@ class GenericModalProcessor(BaseModalProcessor):
if not description or not entity_data: if not description or not entity_data:
raise ValueError("Missing required fields in response") raise ValueError("Missing required fields in response")
if not all(key in entity_data if not all(
for key in ["entity_name", "entity_type", "summary"]): key in entity_data for key in ["entity_name", "entity_type", "summary"]
):
raise ValueError("Missing required fields in entity_info") raise ValueError("Missing required fields in entity_info")
entity_data["entity_name"] = entity_data["entity_name"] + f" ({entity_data['entity_type']})" entity_data["entity_name"] = (
entity_data["entity_name"] + f" ({entity_data['entity_type']})"
)
if entity_name: if entity_name:
entity_data["entity_name"] = entity_name entity_data["entity_name"] = entity_name
@ -697,12 +690,10 @@ class GenericModalProcessor(BaseModalProcessor):
except (json.JSONDecodeError, AttributeError, ValueError) as e: except (json.JSONDecodeError, AttributeError, ValueError) as e:
logger.error(f"Error parsing generic analysis response: {e}") logger.error(f"Error parsing generic analysis response: {e}")
fallback_entity = { fallback_entity = {
"entity_name": "entity_name": entity_name
entity_name if entity_name else if entity_name
f"{content_type}_{compute_mdhash_id(response)}", else f"{content_type}_{compute_mdhash_id(response)}",
"entity_type": "entity_type": content_type,
content_type, "summary": response[:100] + "..." if len(response) > 100 else response,
"summary":
response[:100] + "..." if len(response) > 100 else response
} }
return response, fallback_entity return response, fallback_entity

View file

@ -28,7 +28,7 @@ from lightrag.modalprocessors import (
ImageModalProcessor, ImageModalProcessor,
TableModalProcessor, TableModalProcessor,
EquationModalProcessor, EquationModalProcessor,
GenericModalProcessor GenericModalProcessor,
) )
@ -43,7 +43,7 @@ class RAGAnything:
embedding_func: Optional[Callable] = None, embedding_func: Optional[Callable] = None,
working_dir: str = "./rag_storage", working_dir: str = "./rag_storage",
embedding_dim: int = 3072, embedding_dim: int = 3072,
max_token_size: int = 8192 max_token_size: int = 8192,
): ):
""" """
Initialize Multimodal Document Processing Pipeline Initialize Multimodal Document Processing Pipeline
@ -83,26 +83,25 @@ class RAGAnything:
def _initialize_processors(self): def _initialize_processors(self):
"""Initialize multimodal processors with appropriate model functions""" """Initialize multimodal processors with appropriate model functions"""
if self.lightrag is None: if self.lightrag is None:
raise ValueError("LightRAG instance must be initialized before creating processors") raise ValueError(
"LightRAG instance must be initialized before creating processors"
)
# Create different multimodal processors # Create different multimodal processors
self.modal_processors = { self.modal_processors = {
"image": ImageModalProcessor( "image": ImageModalProcessor(
lightrag=self.lightrag, lightrag=self.lightrag,
modal_caption_func=self.vision_model_func or self.llm_model_func modal_caption_func=self.vision_model_func or self.llm_model_func,
), ),
"table": TableModalProcessor( "table": TableModalProcessor(
lightrag=self.lightrag, lightrag=self.lightrag, modal_caption_func=self.llm_model_func
modal_caption_func=self.llm_model_func
), ),
"equation": EquationModalProcessor( "equation": EquationModalProcessor(
lightrag=self.lightrag, lightrag=self.lightrag, modal_caption_func=self.llm_model_func
modal_caption_func=self.llm_model_func
), ),
"generic": GenericModalProcessor( "generic": GenericModalProcessor(
lightrag=self.lightrag, lightrag=self.lightrag, modal_caption_func=self.llm_model_func
modal_caption_func=self.llm_model_func ),
)
} }
self.logger.info("Multimodal processors initialized") self.logger.info("Multimodal processors initialized")
@ -115,9 +114,13 @@ class RAGAnything:
# Validate required functions # Validate required functions
if self.llm_model_func is None: if self.llm_model_func is None:
raise ValueError("llm_model_func must be provided when LightRAG is not pre-initialized") raise ValueError(
"llm_model_func must be provided when LightRAG is not pre-initialized"
)
if self.embedding_func is None: if self.embedding_func is None:
raise ValueError("embedding_func must be provided when LightRAG is not pre-initialized") raise ValueError(
"embedding_func must be provided when LightRAG is not pre-initialized"
)
from lightrag.kg.shared_storage import initialize_pipeline_status from lightrag.kg.shared_storage import initialize_pipeline_status
@ -145,7 +148,7 @@ class RAGAnything:
file_path: str, file_path: str,
output_dir: str = "./output", output_dir: str = "./output",
parse_method: str = "auto", parse_method: str = "auto",
display_stats: bool = True display_stats: bool = True,
) -> Tuple[List[Dict[str, Any]], str]: ) -> Tuple[List[Dict[str, Any]], str]:
""" """
Parse document using MinerU Parse document using MinerU
@ -170,31 +173,29 @@ class RAGAnything:
try: try:
if ext in [".pdf"]: if ext in [".pdf"]:
self.logger.info(f"Detected PDF file, using PDF parser (OCR={parse_method == 'ocr'})...") self.logger.info(
f"Detected PDF file, using PDF parser (OCR={parse_method == 'ocr'})..."
)
content_list, md_content = MineruParser.parse_pdf( content_list, md_content = MineruParser.parse_pdf(
file_path, file_path, output_dir, use_ocr=(parse_method == "ocr")
output_dir,
use_ocr=(parse_method == "ocr")
) )
elif ext in [".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif"]: elif ext in [".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif"]:
self.logger.info("Detected image file, using image parser...") self.logger.info("Detected image file, using image parser...")
content_list, md_content = MineruParser.parse_image( content_list, md_content = MineruParser.parse_image(
file_path, file_path, output_dir
output_dir
) )
elif ext in [".doc", ".docx", ".ppt", ".pptx"]: elif ext in [".doc", ".docx", ".ppt", ".pptx"]:
self.logger.info("Detected Office document, using Office parser...") self.logger.info("Detected Office document, using Office parser...")
content_list, md_content = MineruParser.parse_office_doc( content_list, md_content = MineruParser.parse_office_doc(
file_path, file_path, output_dir
output_dir
) )
else: else:
# For other or unknown formats, use generic parser # For other or unknown formats, use generic parser
self.logger.info(f"Using generic parser for {ext} file (method={parse_method})...") self.logger.info(
f"Using generic parser for {ext} file (method={parse_method})..."
)
content_list, md_content = MineruParser.parse_document( content_list, md_content = MineruParser.parse_document(
file_path, file_path, parse_method=parse_method, output_dir=output_dir
parse_method=parse_method,
output_dir=output_dir
) )
except Exception as e: except Exception as e:
@ -202,12 +203,12 @@ class RAGAnything:
self.logger.warning("Falling back to generic parser...") self.logger.warning("Falling back to generic parser...")
# If specific parser fails, fall back to generic parser # If specific parser fails, fall back to generic parser
content_list, md_content = MineruParser.parse_document( content_list, md_content = MineruParser.parse_document(
file_path, file_path, parse_method=parse_method, output_dir=output_dir
parse_method=parse_method,
output_dir=output_dir
) )
self.logger.info(f"Parsing complete! Extracted {len(content_list)} content blocks") self.logger.info(
f"Parsing complete! Extracted {len(content_list)} content blocks"
)
self.logger.info(f"Markdown text length: {len(md_content)} characters") self.logger.info(f"Markdown text length: {len(md_content)} characters")
# Display content statistics if requested # Display content statistics if requested
@ -230,7 +231,9 @@ class RAGAnything:
return content_list, md_content return content_list, md_content
def _separate_content(self, content_list: List[Dict[str, Any]]) -> Tuple[str, List[Dict[str, Any]]]: def _separate_content(
self, content_list: List[Dict[str, Any]]
) -> Tuple[str, List[Dict[str, Any]]]:
""" """
Separate text content and multimodal content Separate text content and multimodal content
@ -258,7 +261,7 @@ class RAGAnything:
# Merge all text content # Merge all text content
text_content = "\n\n".join(text_parts) text_content = "\n\n".join(text_parts)
self.logger.info(f"Content separation complete:") self.logger.info("Content separation complete:")
self.logger.info(f" - Text content length: {len(text_content)} characters") self.logger.info(f" - Text content length: {len(text_content)} characters")
self.logger.info(f" - Multimodal items count: {len(multimodal_items)}") self.logger.info(f" - Multimodal items count: {len(multimodal_items)}")
@ -301,12 +304,14 @@ class RAGAnything:
file_paths=file_paths, file_paths=file_paths,
split_by_character=split_by_character, split_by_character=split_by_character,
split_by_character_only=split_by_character_only, split_by_character_only=split_by_character_only,
ids=ids ids=ids,
) )
self.logger.info("Text content insertion complete") self.logger.info("Text content insertion complete")
async def _process_multimodal_content(self, multimodal_items: List[Dict[str, Any]], file_path: str): async def _process_multimodal_content(
self, multimodal_items: List[Dict[str, Any]], file_path: str
):
""" """
Process multimodal content (using specialized processors) Process multimodal content (using specialized processors)
@ -325,20 +330,29 @@ class RAGAnything:
for i, item in enumerate(multimodal_items): for i, item in enumerate(multimodal_items):
try: try:
content_type = item.get("type", "unknown") content_type = item.get("type", "unknown")
self.logger.info(f"Processing item {i+1}/{len(multimodal_items)}: {content_type} content") self.logger.info(
f"Processing item {i+1}/{len(multimodal_items)}: {content_type} content"
)
# Select appropriate processor # Select appropriate processor
processor = self._get_processor_for_type(content_type) processor = self._get_processor_for_type(content_type)
if processor: if processor:
enhanced_caption, entity_info = await processor.process_multimodal_content( (
enhanced_caption,
entity_info,
) = await processor.process_multimodal_content(
modal_content=item, modal_content=item,
content_type=content_type, content_type=content_type,
file_path=file_name file_path=file_name,
)
self.logger.info(
f"{content_type} processing complete: {entity_info.get('entity_name', 'Unknown')}"
) )
self.logger.info(f"{content_type} processing complete: {entity_info.get('entity_name', 'Unknown')}")
else: else:
self.logger.warning(f"No suitable processor found for {content_type} type content") self.logger.warning(
f"No suitable processor found for {content_type} type content"
)
except Exception as e: except Exception as e:
self.logger.error(f"Error processing multimodal content: {str(e)}") self.logger.error(f"Error processing multimodal content: {str(e)}")
@ -376,7 +390,7 @@ class RAGAnything:
display_stats: bool = True, display_stats: bool = True,
split_by_character: str | None = None, split_by_character: str | None = None,
split_by_character_only: bool = False, split_by_character_only: bool = False,
doc_id: str | None = None doc_id: str | None = None,
): ):
""" """
Complete document processing workflow Complete document processing workflow
@ -397,10 +411,7 @@ class RAGAnything:
# Step 1: Parse document using MinerU # Step 1: Parse document using MinerU
content_list, md_content = self.parse_document( content_list, md_content = self.parse_document(
file_path, file_path, output_dir, parse_method, display_stats
output_dir,
parse_method,
display_stats
) )
# Step 2: Separate text and multimodal content # Step 2: Separate text and multimodal content
@ -414,7 +425,7 @@ class RAGAnything:
file_paths=file_name, file_paths=file_name,
split_by_character=split_by_character, split_by_character=split_by_character,
split_by_character_only=split_by_character_only, split_by_character_only=split_by_character_only,
ids=doc_id ids=doc_id,
) )
# Step 4: Process multimodal content (using specialized processors) # Step 4: Process multimodal content (using specialized processors)
@ -433,7 +444,7 @@ class RAGAnything:
split_by_character_only: bool = False, split_by_character_only: bool = False,
file_extensions: Optional[List[str]] = None, file_extensions: Optional[List[str]] = None,
recursive: bool = True, recursive: bool = True,
max_workers: int = 1 max_workers: int = 1,
): ):
""" """
Process all files in a folder in batch Process all files in a folder in batch
@ -454,12 +465,25 @@ class RAGAnything:
folder_path = Path(folder_path) folder_path = Path(folder_path)
if not folder_path.exists() or not folder_path.is_dir(): if not folder_path.exists() or not folder_path.is_dir():
raise ValueError(f"Folder does not exist or is not a valid directory: {folder_path}") raise ValueError(
f"Folder does not exist or is not a valid directory: {folder_path}"
)
# Supported file formats # Supported file formats
supported_extensions = { supported_extensions = {
".pdf", ".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".pdf",
".doc", ".docx", ".ppt", ".pptx", ".txt", ".md" ".jpg",
".jpeg",
".png",
".bmp",
".tiff",
".tif",
".doc",
".docx",
".ppt",
".pptx",
".txt",
".md",
} }
# Use specified extensions or all supported formats # Use specified extensions or all supported formats
@ -468,7 +492,9 @@ class RAGAnything:
# Validate if all are supported formats # Validate if all are supported formats
unsupported = target_extensions - supported_extensions unsupported = target_extensions - supported_extensions
if unsupported: if unsupported:
self.logger.warning(f"The following file formats may not be fully supported: {unsupported}") self.logger.warning(
f"The following file formats may not be fully supported: {unsupported}"
)
else: else:
target_extensions = supported_extensions target_extensions = supported_extensions
@ -478,12 +504,18 @@ class RAGAnything:
if recursive: if recursive:
# Recursively traverse all subfolders # Recursively traverse all subfolders
for file_path in folder_path.rglob("*"): for file_path in folder_path.rglob("*"):
if file_path.is_file() and file_path.suffix.lower() in target_extensions: if (
file_path.is_file()
and file_path.suffix.lower() in target_extensions
):
files_to_process.append(file_path) files_to_process.append(file_path)
else: else:
# Process only current folder # Process only current folder
for file_path in folder_path.glob("*"): for file_path in folder_path.glob("*"):
if file_path.is_file() and file_path.suffix.lower() in target_extensions: if (
file_path.is_file()
and file_path.suffix.lower() in target_extensions
):
files_to_process.append(file_path) files_to_process.append(file_path)
if not files_to_process: if not files_to_process:
@ -491,7 +523,7 @@ class RAGAnything:
return return
self.logger.info(f"Found {len(files_to_process)} files to process") self.logger.info(f"Found {len(files_to_process)} files to process")
self.logger.info(f"File type distribution:") self.logger.info("File type distribution:")
# Count file types # Count file types
file_type_count = {} file_type_count = {}
@ -514,7 +546,9 @@ class RAGAnything:
async with semaphore: async with semaphore:
nonlocal processed_count nonlocal processed_count
try: try:
self.logger.info(f"[{index}/{len(files_to_process)}] Processing: {file_path}") self.logger.info(
f"[{index}/{len(files_to_process)}] Processing: {file_path}"
)
# Create separate output directory for each file # Create separate output directory for each file
file_output_dir = Path(output_dir) / file_path.stem file_output_dir = Path(output_dir) / file_path.stem
@ -527,14 +561,18 @@ class RAGAnything:
parse_method=parse_method, parse_method=parse_method,
display_stats=display_stats, display_stats=display_stats,
split_by_character=split_by_character, split_by_character=split_by_character,
split_by_character_only=split_by_character_only split_by_character_only=split_by_character_only,
) )
processed_count += 1 processed_count += 1
self.logger.info(f"[{index}/{len(files_to_process)}] Successfully processed: {file_path}") self.logger.info(
f"[{index}/{len(files_to_process)}] Successfully processed: {file_path}"
)
except Exception as e: except Exception as e:
self.logger.error(f"[{index}/{len(files_to_process)}] Failed to process: {file_path}") self.logger.error(
f"[{index}/{len(files_to_process)}] Failed to process: {file_path}"
)
self.logger.error(f"Error: {str(e)}") self.logger.error(f"Error: {str(e)}")
failed_files.append((file_path, str(e))) failed_files.append((file_path, str(e)))
@ -562,14 +600,10 @@ class RAGAnything:
"total": len(files_to_process), "total": len(files_to_process),
"success": processed_count, "success": processed_count,
"failed": len(failed_files), "failed": len(failed_files),
"failed_files": failed_files "failed_files": failed_files,
} }
async def query_with_multimodal( async def query_with_multimodal(self, query: str, mode: str = "hybrid") -> str:
self,
query: str,
mode: str = "hybrid"
) -> str:
""" """
Query with multimodal content support Query with multimodal content support
@ -589,10 +623,7 @@ class RAGAnything:
"to create and populate the LightRAG instance." "to create and populate the LightRAG instance."
) )
result = await self.lightrag.aquery( result = await self.lightrag.aquery(query, param=QueryParam(mode=mode))
query,
param=QueryParam(mode=mode)
)
return result return result
@ -605,16 +636,22 @@ class RAGAnything:
"status": "Initialized", "status": "Initialized",
"processors": {}, "processors": {},
"models": { "models": {
"llm_model": "External function" if self.llm_model_func else "Not provided", "llm_model": "External function"
"vision_model": "External function" if self.vision_model_func else "Not provided", if self.llm_model_func
"embedding_model": "External function" if self.embedding_func else "Not provided" else "Not provided",
} "vision_model": "External function"
if self.vision_model_func
else "Not provided",
"embedding_model": "External function"
if self.embedding_func
else "Not provided",
},
} }
for proc_type, processor in self.modal_processors.items(): for proc_type, processor in self.modal_processors.items():
info["processors"][proc_type] = { info["processors"][proc_type] = {
"class": processor.__class__.__name__, "class": processor.__class__.__name__,
"supports": self._get_processor_supports(proc_type) "supports": self._get_processor_supports(proc_type),
} }
return info return info
@ -622,11 +659,28 @@ class RAGAnything:
def _get_processor_supports(self, proc_type: str) -> List[str]: def _get_processor_supports(self, proc_type: str) -> List[str]:
"""Get processor supported features""" """Get processor supported features"""
supports_map = { supports_map = {
"image": ["Image content analysis", "Visual understanding", "Image description generation", "Image entity extraction"], "image": [
"table": ["Table structure analysis", "Data statistics", "Trend identification", "Table entity extraction"], "Image content analysis",
"equation": ["Mathematical formula parsing", "Variable identification", "Formula meaning explanation", "Formula entity extraction"], "Visual understanding",
"generic": ["General content analysis", "Structured processing", "Entity extraction"] "Image description generation",
"Image entity extraction",
],
"table": [
"Table structure analysis",
"Data statistics",
"Trend identification",
"Table entity extraction",
],
"equation": [
"Mathematical formula parsing",
"Variable identification",
"Formula meaning explanation",
"Formula entity extraction",
],
"generic": [
"General content analysis",
"Structured processing",
"Entity extraction",
],
} }
return supports_map.get(proc_type, ["Basic processing"]) return supports_map.get(proc_type, ["Basic processing"])