Add example of directly using modal processors

This commit is contained in:
zrguo 2025-06-05 17:36:05 +08:00
parent 8a726f6e08
commit 962974589a
3 changed files with 429 additions and 2 deletions

View file

@ -243,4 +243,118 @@ The MinerU configuration file `magic-pdf.json` supports various customization op
- GPU acceleration settings
- Cache settings
For complete configuration options, refer to the [MinerU official documentation](https://mineru.readthedocs.io/).
### Using Modal Processors Directly
You can also use LightRAG's modal processors directly without going through MinerU. This is useful when you want to process specific types of content or have more control over the processing pipeline.
Each modal processor returns a tuple containing:
1. A description of the processed content
2. Entity information that can be used for further processing or storage
The processors support different types of content:
- `ImageModalProcessor`: Processes images with captions and footnotes
- `TableModalProcessor`: Processes tables with captions and footnotes
- `EquationModalProcessor`: Processes mathematical equations in LaTeX format
- `GenericModalProcessor`: A base processor that can be extended for custom content types
> **Note**: A complete working example can be found in `examples/modalprocessors_example.py`. You can run it using:
> ```bash
> python examples/modalprocessors_example.py --api-key YOUR_API_KEY
> ```
<details>
<summary> Here's an example of how to use different modal processors: </summary>
```python
from lightrag.modalprocessors import (
ImageModalProcessor,
TableModalProcessor,
EquationModalProcessor,
GenericModalProcessor
)
# Initialize LightRAG
lightrag = LightRAG(
working_dir="./rag_storage",
embedding_func=lambda texts: openai_embed(
texts,
model="text-embedding-3-large",
api_key="your-api-key",
base_url="your-base-url",
),
llm_model_func=lambda prompt, system_prompt=None, history_messages=[], **kwargs: openai_complete_if_cache(
"gpt-4o-mini",
prompt,
system_prompt=system_prompt,
history_messages=history_messages,
api_key="your-api-key",
base_url="your-base-url",
**kwargs,
),
)
# Process an image
image_processor = ImageModalProcessor(
lightrag=lightrag,
modal_caption_func=vision_model_func
)
image_content = {
"img_path": "image.jpg",
"img_caption": ["Example image caption"],
"img_footnote": ["Example image footnote"]
}
description, entity_info = await image_processor.process_multimodal_content(
modal_content=image_content,
content_type="image",
file_path="image_example.jpg",
entity_name="Example Image"
)
# Process a table
table_processor = TableModalProcessor(
lightrag=lightrag,
modal_caption_func=llm_model_func
)
table_content = {
"table_body": """
| Name | Age | Occupation |
|------|-----|------------|
| John | 25 | Engineer |
| Mary | 30 | Designer |
""",
"table_caption": ["Employee Information Table"],
"table_footnote": ["Data updated as of 2024"]
}
description, entity_info = await table_processor.process_multimodal_content(
modal_content=table_content,
content_type="table",
file_path="table_example.md",
entity_name="Employee Table"
)
# Process an equation
equation_processor = EquationModalProcessor(
lightrag=lightrag,
modal_caption_func=llm_model_func
)
equation_content = {
"text": "E = mc^2",
"text_format": "LaTeX"
}
description, entity_info = await equation_processor.process_multimodal_content(
modal_content=equation_content,
content_type="equation",
file_path="equation_example.txt",
entity_name="Mass-Energy Equivalence"
)
```
</details>

View file

@ -242,4 +242,117 @@ MinerU 配置文件 `magic-pdf.json` 支持多种自定义选项,包括:
- GPU 加速设置
- 缓存设置
有关完整的配置选项，请参阅 [MinerU 官方文档](https://mineru.readthedocs.io/)。
### 直接使用模态处理器
您也可以直接使用 LightRAG 的模态处理器,而不需要通过 MinerU。这在您想要处理特定类型的内容或对处理流程有更多控制时特别有用。
每个模态处理器都会返回一个包含以下内容的元组:
1. 处理后内容的描述
2. 可用于进一步处理或存储的实体信息
处理器支持不同类型的内容:
- `ImageModalProcessor`:处理带有标题和脚注的图像
- `TableModalProcessor`:处理带有标题和脚注的表格
- `EquationModalProcessor`:处理 LaTeX 格式的数学公式
- `GenericModalProcessor`:可用于扩展自定义内容类型的基础处理器
> **注意**:完整的可运行示例可以在 `examples/modalprocessors_example.py` 中找到。您可以使用以下命令运行它:
> ```bash
> python examples/modalprocessors_example.py --api-key YOUR_API_KEY
> ```
<details>
<summary> 使用不同模态处理器的示例 </summary>
```python
from lightrag.modalprocessors import (
ImageModalProcessor,
TableModalProcessor,
EquationModalProcessor,
GenericModalProcessor
)
# 初始化 LightRAG
lightrag = LightRAG(
working_dir="./rag_storage",
embedding_func=lambda texts: openai_embed(
texts,
model="text-embedding-3-large",
api_key="your-api-key",
base_url="your-base-url",
),
llm_model_func=lambda prompt, system_prompt=None, history_messages=[], **kwargs: openai_complete_if_cache(
"gpt-4o-mini",
prompt,
system_prompt=system_prompt,
history_messages=history_messages,
api_key="your-api-key",
base_url="your-base-url",
**kwargs,
),
)
# 处理图像
image_processor = ImageModalProcessor(
lightrag=lightrag,
modal_caption_func=vision_model_func
)
image_content = {
"img_path": "image.jpg",
"img_caption": ["示例图像标题"],
"img_footnote": ["示例图像脚注"]
}
description, entity_info = await image_processor.process_multimodal_content(
modal_content=image_content,
content_type="image",
file_path="image_example.jpg",
entity_name="示例图像"
)
# 处理表格
table_processor = TableModalProcessor(
lightrag=lightrag,
modal_caption_func=llm_model_func
)
table_content = {
"table_body": """
| 姓名 | 年龄 | 职业 |
|------|-----|------|
| 张三 | 25 | 工程师 |
| 李四 | 30 | 设计师 |
""",
"table_caption": ["员工信息表"],
"table_footnote": ["数据更新至2024年"]
}
description, entity_info = await table_processor.process_multimodal_content(
modal_content=table_content,
content_type="table",
file_path="table_example.md",
entity_name="员工表格"
)
# 处理公式
equation_processor = EquationModalProcessor(
lightrag=lightrag,
modal_caption_func=llm_model_func
)
equation_content = {
"text": "E = mc^2",
"text_format": "LaTeX"
}
description, entity_info = await equation_processor.process_multimodal_content(
modal_content=equation_content,
content_type="equation",
file_path="equation_example.txt",
entity_name="质能方程"
)
```
</details>

View file

@ -0,0 +1,200 @@
"""
Example of directly using modal processors
This example demonstrates how to use LightRAG's modal processors directly without going through MinerU.
"""
import asyncio
import argparse
from lightrag.llm.openai import openai_complete_if_cache, openai_embed
from lightrag.kg.shared_storage import initialize_pipeline_status
from pathlib import Path
from lightrag import LightRAG
from lightrag.modalprocessors import (
ImageModalProcessor,
TableModalProcessor,
EquationModalProcessor,
GenericModalProcessor
)
# Default LightRAG working directory; also the default value of --working-dir.
WORKING_DIR = "./rag_storage"
def get_llm_model_func(api_key: str, base_url: str = None):
    """Build a text-completion callable bound to the given credentials.

    Args:
        api_key: API key forwarded to every ``openai_complete_if_cache`` call.
        base_url: Optional API base URL; ``None`` is forwarded unchanged.

    Returns:
        A callable ``(prompt, system_prompt=None, history_messages=None,
        **kwargs)`` that forwards to ``openai_complete_if_cache`` with the
        "gpt-4o-mini" model and returns whatever that call returns.
    """

    def llm_model_func(prompt, system_prompt=None, history_messages=None, **kwargs):
        # A fresh list per call replaces the former shared ``[]`` default, so
        # nothing can leak between calls through the default argument.
        if history_messages is None:
            history_messages = []
        return openai_complete_if_cache(
            "gpt-4o-mini",
            prompt,
            system_prompt=system_prompt,
            history_messages=history_messages,
            api_key=api_key,
            base_url=base_url,
            **kwargs,
        )

    return llm_model_func
def get_vision_model_func(api_key: str, base_url: str = None):
    """Build a vision-capable completion callable bound to the given credentials.

    Args:
        api_key: API key forwarded to every ``openai_complete_if_cache`` call.
        base_url: Optional API base URL; ``None`` is forwarded unchanged.

    Returns:
        A callable ``(prompt, system_prompt=None, history_messages=None,
        image_data=None, **kwargs)``:

        * With a truthy ``image_data`` it calls ``openai_complete_if_cache``
          with model "gpt-4o", an empty prompt and an explicit ``messages``
          list (optional system message, then a user message carrying the
          prompt text and the image). ``history_messages`` is not forwarded
          in this branch.
        * Otherwise it makes a plain text call with model "gpt-4o-mini".
    """

    def vision_model_func(
        prompt, system_prompt=None, history_messages=None, image_data=None, **kwargs
    ):
        if not image_data:
            # No image supplied: fall back to an ordinary text completion.
            return openai_complete_if_cache(
                "gpt-4o-mini",
                prompt,
                system_prompt=system_prompt,
                history_messages=[] if history_messages is None else history_messages,
                api_key=api_key,
                base_url=base_url,
                **kwargs,
            )

        # Only add the system message when there is one; the original put a
        # literal ``None`` into the list, which is not a valid chat message.
        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        messages.append(
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        # image_data is presumably a base64-encoded JPEG
                        # string; it is interpolated into a data URL as-is.
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{image_data}"
                        },
                    },
                ],
            }
        )
        return openai_complete_if_cache(
            "gpt-4o",
            "",
            system_prompt=None,
            history_messages=[],
            messages=messages,
            api_key=api_key,
            base_url=base_url,
            **kwargs,
        )

    return vision_model_func
async def process_image_example(lightrag: LightRAG, vision_model_func):
    """Run one sample image through ``ImageModalProcessor`` and print the result.

    Args:
        lightrag: LightRAG instance handed to the processor.
        vision_model_func: Callable handed to the processor as
            ``modal_caption_func``.
    """
    # Sample payload: image path plus caption and footnote lists.
    sample_image = dict(
        img_path="image.jpg",
        img_caption=["Example image caption"],
        img_footnote=["Example image footnote"],
    )

    processor = ImageModalProcessor(
        lightrag=lightrag, modal_caption_func=vision_model_func
    )
    result = await processor.process_multimodal_content(
        modal_content=sample_image,
        content_type="image",
        file_path="image_example.jpg",
        entity_name="Example Image",
    )
    description, entity_info = result

    # Emit the header and both result fields, one per line.
    report = (
        "Image Processing Results:",
        f"Description: {description}",
        f"Entity Info: {entity_info}",
    )
    for line in report:
        print(line)
async def process_table_example(lightrag: LightRAG, llm_model_func):
    """Run one sample Markdown table through ``TableModalProcessor`` and print the result.

    Args:
        lightrag: LightRAG instance handed to the processor.
        llm_model_func: Callable handed to the processor as
            ``modal_caption_func``.
    """
    # Sample payload: the table body starts and ends with a newline.
    sample_table = dict(
        table_body=(
            "\n"
            "| Name | Age | Occupation |\n"
            "|------|-----|------------|\n"
            "| John | 25 | Engineer |\n"
            "| Mary | 30 | Designer |\n"
        ),
        table_caption=["Employee Information Table"],
        table_footnote=["Data updated as of 2024"],
    )

    processor = TableModalProcessor(
        lightrag=lightrag, modal_caption_func=llm_model_func
    )
    result = await processor.process_multimodal_content(
        modal_content=sample_table,
        content_type="table",
        file_path="table_example.md",
        entity_name="Employee Table",
    )
    description, entity_info = result

    # Emit the header (preceded by a blank line) and both result fields.
    report = (
        "\nTable Processing Results:",
        f"Description: {description}",
        f"Entity Info: {entity_info}",
    )
    for line in report:
        print(line)
async def process_equation_example(lightrag: LightRAG, llm_model_func):
    """Run one sample equation through ``EquationModalProcessor`` and print the result.

    Args:
        lightrag: LightRAG instance handed to the processor.
        llm_model_func: Callable handed to the processor as
            ``modal_caption_func``.
    """
    # Sample payload: equation text plus its declared format.
    sample_equation = dict(text="E = mc^2", text_format="LaTeX")

    processor = EquationModalProcessor(
        lightrag=lightrag, modal_caption_func=llm_model_func
    )
    result = await processor.process_multimodal_content(
        modal_content=sample_equation,
        content_type="equation",
        file_path="equation_example.txt",
        entity_name="Mass-Energy Equivalence",
    )
    description, entity_info = result

    # Emit the header (preceded by a blank line) and both result fields.
    report = (
        "\nEquation Processing Results:",
        f"Description: {description}",
        f"Entity Info: {entity_info}",
    )
    for line in report:
        print(line)
async def initialize_rag(api_key: str, base_url: str = None, working_dir: str = None):
    """Create a LightRAG instance and run its initialization steps.

    Args:
        api_key: API key used by the embedding and LLM callables.
        base_url: Optional API base URL; ``None`` is forwarded unchanged.
        working_dir: Directory passed to ``LightRAG``; falls back to
            ``WORKING_DIR`` when omitted or empty.

    Returns:
        The ``LightRAG`` instance after ``initialize_storages()`` and
        ``initialize_pipeline_status()`` have been awaited.
    """
    rag = LightRAG(
        working_dir=working_dir or WORKING_DIR,
        embedding_func=lambda texts: openai_embed(
            texts,
            model="text-embedding-3-large",
            api_key=api_key,
            base_url=base_url,
        ),
        # Reuse the shared factory instead of duplicating its wrapper here.
        llm_model_func=get_llm_model_func(api_key, base_url),
    )
    await rag.initialize_storages()
    await initialize_pipeline_status()
    return rag


def main():
    """Parse command-line arguments and run all modal-processor examples."""
    parser = argparse.ArgumentParser(description="Modal Processors Example")
    parser.add_argument("--api-key", required=True, help="OpenAI API key")
    parser.add_argument("--base-url", help="Optional base URL for API")
    parser.add_argument(
        "--working-dir", "-w", default=WORKING_DIR, help="Working directory path"
    )
    args = parser.parse_args()

    # Forward --working-dir too; it was previously parsed but never used.
    asyncio.run(main_async(args.api_key, args.base_url, args.working_dir))


async def main_async(api_key: str, base_url: str = None, working_dir: str = None):
    """Initialize LightRAG, then run the image, table and equation examples in order.

    Args:
        api_key: API key used for all model calls.
        base_url: Optional API base URL.
        working_dir: Optional LightRAG working directory; ``initialize_rag``
            falls back to ``WORKING_DIR`` when it is omitted.
    """
    lightrag = await initialize_rag(api_key, base_url, working_dir)

    # Build the model callables the processors use for captioning.
    llm_model_func = get_llm_model_func(api_key, base_url)
    vision_model_func = get_vision_model_func(api_key, base_url)

    await process_image_example(lightrag, vision_model_func)
    await process_table_example(lightrag, llm_model_func)
    await process_equation_example(lightrag, llm_model_func)
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()