Add example of directly using modal processors

2025-06-05 17:36:05 +08:00 · 2025-06-05 17:36:05 +08:00 · 962974589a
commit 962974589a
parent 8a726f6e08
3 changed files with 429 additions and 2 deletions
--- a/docs/mineru_integration_en.md
+++ b/docs/mineru_integration_en.md
@ -243,4 +243,118 @@ The MinerU configuration file `magic-pdf.json` supports various customization op
 - GPU acceleration settings
 - Cache settings
-For complete configuration options, refer to the [MinerU official documentation](https://mineru.readthedocs.io/). 
+For complete configuration options, refer to the [MinerU official documentation](https://mineru.readthedocs.io/).
 ### Using Modal Processors Directly
 You can also use LightRAG's modal processors directly without going through MinerU. This is useful when you want to process specific types of content or have more control over the processing pipeline.
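 Each modal processor's `process_multimodal_content` method is a coroutine (call it with `await` inside an async function) and returns a tuple containing: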
 1. A description of the processed content
 2. Entity information that can be used for further processing or storage
 The processors support different types of content:
 - `ImageModalProcessor`: Processes images with captions and footnotes
 - `TableModalProcessor`: Processes tables with captions and footnotes
 - `EquationModalProcessor`: Processes mathematical equations in LaTeX format
 - `GenericModalProcessor`: A base processor that can be extended for custom content types 
 > **Note**: A complete working example can be found in `examples/modalprocessors_example.py`. You can run it using:
 > ```bash
 > python examples/modalprocessors_example.py --api-key YOUR_API_KEY
 > ```
 <details>
 <summary> Here's an example of how to use different modal processors: </summary>
 ```python
 from lightrag.modalprocessors import (
    ImageModalProcessor,
    TableModalProcessor,
    EquationModalProcessor,
    GenericModalProcessor
 )
 # Initialize LightRAG
 lightrag = LightRAG(
    working_dir="./rag_storage",
    embedding_func=lambda texts: openai_embed(
        texts,
        model="text-embedding-3-large",
        api_key="your-api-key",
        base_url="your-base-url",
    ),
    llm_model_func=lambda prompt, system_prompt=None, history_messages=[], **kwargs: openai_complete_if_cache(
        "gpt-4o-mini",
        prompt,
        system_prompt=system_prompt,
        history_messages=history_messages,
        api_key="your-api-key",
        base_url="your-base-url",
        **kwargs,
    ),
 )
 # Process an image
 image_processor = ImageModalProcessor(
    lightrag=lightrag,
    modal_caption_func=vision_model_func
 )
 image_content = {
    "img_path": "image.jpg",
    "img_caption": ["Example image caption"],
    "img_footnote": ["Example image footnote"]
 }
 description, entity_info = await image_processor.process_multimodal_content(
    modal_content=image_content,
    content_type="image",
    file_path="image_example.jpg",
    entity_name="Example Image"
 )
 # Process a table
 table_processor = TableModalProcessor(
    lightrag=lightrag,
    modal_caption_func=llm_model_func
 )
 table_content = {
    "table_body": """
    | Name | Age | Occupation |
    |------|-----|------------|
    | John | 25  | Engineer   |
    | Mary | 30  | Designer   |
    """,
    "table_caption": ["Employee Information Table"],
    "table_footnote": ["Data updated as of 2024"]
 }
 description, entity_info = await table_processor.process_multimodal_content(
    modal_content=table_content,
    content_type="table",
    file_path="table_example.md",
    entity_name="Employee Table"
 )
 # Process an equation
 equation_processor = EquationModalProcessor(
    lightrag=lightrag,
    modal_caption_func=llm_model_func
 )
 equation_content = {
    "text": "E = mc^2",
    "text_format": "LaTeX"
 }
 description, entity_info = await equation_processor.process_multimodal_content(
    modal_content=equation_content,
    content_type="equation",
    file_path="equation_example.txt",
    entity_name="Mass-Energy Equivalence"
 )
 ```
 </details>
--- a/docs/mineru_integration_zh.md
+++ b/docs/mineru_integration_zh.md
@ -242,4 +242,117 @@ MinerU 配置文件 `magic-pdf.json` 支持多种自定义选项，包括：
 - GPU 加速设置
 - 缓存设置
-有关完整的配置选项，请参阅 [MinerU 官方文档](https://mineru.readthedocs.io/)。 
+有关完整的配置选项，请参阅 [MinerU 官方文档](https://mineru.readthedocs.io/)。
 ### 直接使用模态处理器
 您也可以直接使用 LightRAG 的模态处理器，而不需要通过 MinerU。这在您想要处理特定类型的内容或对处理流程有更多控制时特别有用。
 每个模态处理器的 `process_multimodal_content` 方法是一个协程（需要在异步函数中使用 `await` 调用），它会返回一个包含以下内容的元组：
 1. 处理后内容的描述
 2. 可用于进一步处理或存储的实体信息
 处理器支持不同类型的内容：
 - `ImageModalProcessor`：处理带有标题和脚注的图像
 - `TableModalProcessor`：处理带有标题和脚注的表格
 - `EquationModalProcessor`：处理 LaTeX 格式的数学公式
 - `GenericModalProcessor`：可用于扩展自定义内容类型的基础处理器 
 > **注意**：完整的可运行示例可以在 `examples/modalprocessors_example.py` 中找到。您可以使用以下命令运行它：
 > ```bash
 > python examples/modalprocessors_example.py --api-key YOUR_API_KEY
 > ```
 <details>
 <summary> 使用不同模态处理器的示例 </summary>
 ```python
 from lightrag.modalprocessors import (
    ImageModalProcessor,
    TableModalProcessor,
    EquationModalProcessor,
    GenericModalProcessor
 )
 # 初始化 LightRAG
 lightrag = LightRAG(
    working_dir="./rag_storage",
    embedding_func=lambda texts: openai_embed(
        texts,
        model="text-embedding-3-large",
        api_key="your-api-key",
        base_url="your-base-url",
    ),
    llm_model_func=lambda prompt, system_prompt=None, history_messages=[], **kwargs: openai_complete_if_cache(
        "gpt-4o-mini",
        prompt,
        system_prompt=system_prompt,
        history_messages=history_messages,
        api_key="your-api-key",
        base_url="your-base-url",
        **kwargs,
    ),
 )
 # 处理图像
 image_processor = ImageModalProcessor(
    lightrag=lightrag,
    modal_caption_func=vision_model_func
 )
 image_content = {
    "img_path": "image.jpg",
    "img_caption": ["示例图像标题"],
    "img_footnote": ["示例图像脚注"]
 }
 description, entity_info = await image_processor.process_multimodal_content(
    modal_content=image_content,
    content_type="image",
    file_path="image_example.jpg",
    entity_name="示例图像"
 )
 # 处理表格
 table_processor = TableModalProcessor(
    lightrag=lightrag,
    modal_caption_func=llm_model_func
 )
 table_content = {
    "table_body": """
    | 姓名 | 年龄 | 职业 |
    |------|-----|------|
    | 张三 | 25  | 工程师 |
    | 李四 | 30  | 设计师 |
    """,
    "table_caption": ["员工信息表"],
    "table_footnote": ["数据更新至2024年"]
 }
 description, entity_info = await table_processor.process_multimodal_content(
    modal_content=table_content,
    content_type="table",
    file_path="table_example.md",
    entity_name="员工表格"
 )
 # 处理公式
 equation_processor = EquationModalProcessor(
    lightrag=lightrag,
    modal_caption_func=llm_model_func
 )
 equation_content = {
    "text": "E = mc^2",
    "text_format": "LaTeX"
 }
 description, entity_info = await equation_processor.process_multimodal_content(
    modal_content=equation_content,
    content_type="equation",
    file_path="equation_example.txt",
    entity_name="质能方程"
 )
 ```
 </details>
--- a/examples/modalprocessors_example.py
+++ b/examples/modalprocessors_example.py
@ -0,0 +1,200 @@
 """
 Example of directly using modal processors
 This example demonstrates how to use LightRAG's modal processors directly without going through MinerU.
 """
 import asyncio
 import argparse
 from lightrag.llm.openai import openai_complete_if_cache, openai_embed
 from lightrag.kg.shared_storage import initialize_pipeline_status
 from pathlib import Path
 from lightrag import LightRAG
 from lightrag.modalprocessors import (
    ImageModalProcessor,
    TableModalProcessor,
    EquationModalProcessor,
    GenericModalProcessor
 )
 # Default LightRAG working directory; also the default value of the --working-dir CLI option.
 WORKING_DIR = "./rag_storage"
 def get_llm_model_func(api_key: str, base_url: str = None):
    """Build the text-completion callable used by LightRAG and the modal processors.

    Args:
        api_key: API key forwarded to ``openai_complete_if_cache``.
        base_url: Optional API base URL; forwarded unchanged (``None`` when omitted).

    Returns:
        A function ``(prompt, system_prompt=None, history_messages=None, **kwargs)``
        that forwards the request to ``openai_complete_if_cache`` with
        ``"gpt-4o-mini"`` as its first argument and returns that call's result
        (presumably an awaitable, since callers in this file await the
        processors that use it; confirm against ``lightrag.llm.openai``).
    """
    def llm_model_func(prompt, system_prompt=None, history_messages=None, **kwargs):
        # ``None`` sentinel instead of a shared mutable ``[]`` default: every call
        # gets its own empty history list when the caller passes none.
        return openai_complete_if_cache(
            "gpt-4o-mini",
            prompt,
            system_prompt=system_prompt,
            history_messages=[] if history_messages is None else history_messages,
            api_key=api_key,
            base_url=base_url,
            **kwargs,
        )

    return llm_model_func
 def get_vision_model_func(api_key: str, base_url: str = None):
    """Build the caption callable used by the image processor.

    Args:
        api_key: API key forwarded to ``openai_complete_if_cache``.
        base_url: Optional API base URL; forwarded unchanged (``None`` when omitted).

    Returns:
        A function ``(prompt, system_prompt=None, history_messages=None,
        image_data=None, **kwargs)`` that returns the result of
        ``openai_complete_if_cache``:

        * with ``image_data``: a ``"gpt-4o"`` request whose user message carries
          the prompt text plus the image as a ``data:image/jpeg;base64,...`` URL;
          ``history_messages`` is not forwarded on this path.
        * without ``image_data``: a plain ``"gpt-4o-mini"`` text request.
    """
    def vision_model_func(prompt, system_prompt=None, history_messages=None, image_data=None, **kwargs):
        if not image_data:
            # No image supplied: fall back to an ordinary text completion.
            return openai_complete_if_cache(
                "gpt-4o-mini",
                prompt,
                system_prompt=system_prompt,
                history_messages=[] if history_messages is None else history_messages,
                api_key=api_key,
                base_url=base_url,
                **kwargs,
            )

        # Only add the system message when there is one; a literal ``None`` entry
        # in the list is not a valid chat message.
        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        messages.append(
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        # presumably image_data is a base64-encoded JPEG; verify against caller
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{image_data}"
                        },
                    },
                ],
            }
        )
        # NOTE(review): relies on openai_complete_if_cache accepting a prebuilt
        # ``messages`` keyword alongside an empty prompt; confirm.
        return openai_complete_if_cache(
            "gpt-4o",
            "",
            system_prompt=None,
            history_messages=[],
            messages=messages,
            api_key=api_key,
            base_url=base_url,
            **kwargs,
        )

    return vision_model_func
 async def process_image_example(lightrag: LightRAG, vision_model_func):
    """Run ``ImageModalProcessor`` on a sample image entry and print the result.

    Args:
        lightrag: LightRAG instance handed to the processor.
        vision_model_func: Callable passed to the processor as ``modal_caption_func``.
    """
    # Sample payload: image path plus caption and footnote lists.
    sample_image = {
        "img_path": "image.jpg",
        "img_caption": ["Example image caption"],
        "img_footnote": ["Example image footnote"],
    }
    processor = ImageModalProcessor(lightrag=lightrag, modal_caption_func=vision_model_func)

    # The processor yields a (description, entity_info) pair.
    outcome = await processor.process_multimodal_content(
        modal_content=sample_image,
        content_type="image",
        file_path="image_example.jpg",
        entity_name="Example Image",
    )
    description, entity_info = outcome

    print("Image Processing Results:")
    print(f"Description: {description}")
    print(f"Entity Info: {entity_info}")
 async def process_table_example(lightrag: LightRAG, llm_model_func):
    """Run ``TableModalProcessor`` on a sample Markdown table and print the result.

    Args:
        lightrag: LightRAG instance handed to the processor.
        llm_model_func: Callable passed to the processor as ``modal_caption_func``.
    """
    # Sample payload: Markdown table body plus caption and footnote lists.
    sample_table = {
        "table_body": """
        | Name | Age | Occupation |
        |------|-----|------------|
        | John | 25  | Engineer   |
        | Mary | 30  | Designer   |
        """,
        "table_caption": ["Employee Information Table"],
        "table_footnote": ["Data updated as of 2024"],
    }
    processor = TableModalProcessor(lightrag=lightrag, modal_caption_func=llm_model_func)

    # The processor yields a (description, entity_info) pair.
    outcome = await processor.process_multimodal_content(
        modal_content=sample_table,
        content_type="table",
        file_path="table_example.md",
        entity_name="Employee Table",
    )
    description, entity_info = outcome

    print("\nTable Processing Results:")
    print(f"Description: {description}")
    print(f"Entity Info: {entity_info}")
 async def process_equation_example(lightrag: LightRAG, llm_model_func):
    """Run ``EquationModalProcessor`` on a sample equation and print the result.

    Args:
        lightrag: LightRAG instance handed to the processor.
        llm_model_func: Callable passed to the processor as ``modal_caption_func``.
    """
    # Sample payload: equation text and the name of its format.
    sample_equation = {"text": "E = mc^2", "text_format": "LaTeX"}
    processor = EquationModalProcessor(lightrag=lightrag, modal_caption_func=llm_model_func)

    # The processor yields a (description, entity_info) pair.
    outcome = await processor.process_multimodal_content(
        modal_content=sample_equation,
        content_type="equation",
        file_path="equation_example.txt",
        entity_name="Mass-Energy Equivalence",
    )
    description, entity_info = outcome

    print("\nEquation Processing Results:")
    print(f"Description: {description}")
    print(f"Entity Info: {entity_info}")
 async def initialize_rag(api_key: str, base_url: str = None, working_dir: str = None):
    """Create a LightRAG instance and initialize its storages and pipeline status.

    Args:
        api_key: API key forwarded to the embedding and completion helpers.
        base_url: Optional API base URL; forwarded unchanged (``None`` when omitted).
        working_dir: Directory passed to ``LightRAG`` as ``working_dir``. Falls back
            to the module-level ``WORKING_DIR`` when omitted.

    Returns:
        The initialized ``LightRAG`` instance.
    """
    rag = LightRAG(
        working_dir=working_dir if working_dir is not None else WORKING_DIR,
        embedding_func=lambda texts: openai_embed(
            texts,
            model="text-embedding-3-large",
            api_key=api_key,
            base_url=base_url,
        ),
        # Reuse the shared factory instead of repeating the same completion wrapper.
        llm_model_func=get_llm_model_func(api_key, base_url),
    )
    await rag.initialize_storages()
    await initialize_pipeline_status()
    return rag
 def main():
    """Parse command-line arguments and run the modal processor examples."""
    parser = argparse.ArgumentParser(description='Modal Processors Example')
    parser.add_argument('--api-key', required=True, help='OpenAI API key')
    parser.add_argument('--base-url', help='Optional base URL for API')
    parser.add_argument('--working-dir', '-w', default=WORKING_DIR, help='Working directory path')
    args = parser.parse_args()
    # Forward --working-dir as well; it was previously parsed but never used.
    asyncio.run(main_async(args.api_key, args.base_url, args.working_dir))
 async def main_async(api_key: str, base_url: str = None, working_dir: str = None):
    """Initialize LightRAG and run the image, table and equation examples in order.

    Args:
        api_key: API key used for every model call.
        base_url: Optional API base URL.
        working_dir: Optional LightRAG working directory; ``None`` lets
            ``initialize_rag`` use the module default.
    """
    # Initialize LightRAG
    lightrag = await initialize_rag(api_key, base_url, working_dir)
    # Get model functions
    llm_model_func = get_llm_model_func(api_key, base_url)
    vision_model_func = get_vision_model_func(api_key, base_url)
    # Run examples
    await process_image_example(lightrag, vision_model_func)
    await process_table_example(lightrag, llm_model_func)
    await process_equation_example(lightrag, llm_model_func)
 # Run the example only when this file is executed as a script, not when imported.
 if __name__ == "__main__":
    main()