This commit is contained in:
hzywhite 2025-09-05 15:02:49 +08:00
parent e3ea87da24
commit 8d800239d6
2 changed files with 38 additions and 19 deletions

View file

@ -25,6 +25,7 @@ from lightrag.api.utils_api import (
display_splash_screen,
check_env_file,
)
from lightrag.llm.openai import openai_complete_if_cache, openai_embed
from .config import (
global_args,
update_uvicorn_mode_config,
@ -645,7 +646,7 @@ def create_app(args):
embedding_func=raganything_embedding_func,
)
logger.info("Check the download status of the RAGANything parser...")
logger.info("Check the download status of the RAGAnything parser...")
rag_anything.verify_parser_installation_once()
RAGManager.set_rag(rag_anything)

View file

@ -111,17 +111,6 @@ def sanitize_filename(filename: str, input_dir: Path) -> str:
return clean_name
class ScanRequest(BaseModel):
    """Payload accepted when a document scan is requested.

    Attributes:
        framework: Name of the processing framework to scan with, either
            "lightrag" or "raganything". Optional; defaults to None, in
            which case the default framework is used.
    """

    framework: str | None = None
class SchemeConfig(BaseModel):
"""Configuration model for processing schemes.
@ -135,10 +124,16 @@ class SchemeConfig(BaseModel):
- "mineru": MinerU parser for comprehensive document parsing
- "docling": Docling parser for office document processing
- "": Default/automatic extractor selection
modelSource (Literal["huggingface", "modelscope", "local", ""]): The model source used by MinerU.
- "huggingface": Use pre-trained models from the Hugging Face model hub
- "modelscope": Use model resources from the ModelScope platform
- "local": Use custom models deployed locally
- "": Keep the system's default model source configuration (usually huggingface)
"""
framework: Literal["lightrag", "raganything"]
extractor: Literal["mineru", "docling", ""] = "" # 默认值
modelSource: Literal["huggingface", "modelscope", "local", ""] = ""
class Scheme(BaseModel):
@ -185,6 +180,12 @@ class SchemesResponse(BaseModel):
data: Optional[List[Dict[str, Any]]] = Field(None, description="List of schemes")
class ScanRequest(BaseModel):
    """Payload accepted when a document scan is requested.

    Attributes:
        schemeConfig: Required SchemeConfig describing the scheme the scan
            should run with.
    """

    schemeConfig: SchemeConfig = Field(..., description="Scanning scheme configuration")
class ScanResponse(BaseModel):
"""Response model for document scanning operation
@ -1403,8 +1404,9 @@ async def pipeline_index_files(
async def pipeline_index_files_raganything(
rag_anything: RAGAnything,
file_paths: List[Path],
track_id: str = None,
scheme_name: str = None,
parser: str = None,
source: str = None
):
"""Index multiple files using RAGAnything framework for multimodal processing.
@ -1414,6 +1416,10 @@ async def pipeline_index_files_raganything(
track_id (str, optional): Tracking ID for batch monitoring. Defaults to None.
scheme_name (str, optional): Processing scheme name for categorization.
Defaults to None.
parser (str, optional): Document extraction tool to use.
Defaults to None.
source (str, optional): The model source used by MinerU.
Defaults to None.
Note:
- Uses RAGAnything's process_document_complete_lightrag_api method for each file
@ -1438,6 +1444,8 @@ async def pipeline_index_files_raganything(
output_dir="./output",
parse_method="auto",
scheme_name=scheme_name,
parser=parser,
source=source
)
if success:
pass
@ -1481,7 +1489,7 @@ async def run_scanning_process(
rag_anything: RAGAnything,
doc_manager: DocumentManager,
track_id: str = None,
scheme_name: str = None,
schemeConfig = None,
):
"""Background task to scan and index documents
@ -1490,8 +1498,8 @@ async def run_scanning_process(
rag_anything: RAGAnything instance
doc_manager: DocumentManager instance
track_id: Optional tracking ID to pass to all scanned files
scheme_name (str, optional): Processing scheme name for categorization.
Defaults to None
schemeConfig: Scanning scheme configuration.
Defaults to None
"""
try:
new_files = doc_manager.scan_directory_for_new_files()
@ -1504,6 +1512,10 @@ async def run_scanning_process(
is_pipeline_scan_busy = pipeline_status.get("scan_disabled", False)
is_pipeline_busy = pipeline_status.get("busy", False)
scheme_name = schemeConfig.framework
extractor = schemeConfig.extractor
modelSource = schemeConfig.modelSource
if new_files:
# Process all files at once with track_id
if is_pipeline_busy:
@ -1525,7 +1537,7 @@ async def run_scanning_process(
)
elif scheme_name == "raganything":
await pipeline_index_files_raganything(
rag_anything, new_files, track_id, scheme_name=scheme_name
rag_anything, new_files, scheme_name=scheme_name, parser=extractor, source=modelSource
)
logger.info(
f"Scanning process completed with raganything: {total_files} files Processed."
@ -1834,6 +1846,7 @@ def create_document_routes(
"config": {
"framework": schemes[0].config.framework,
"extractor": schemes[0].config.extractor,
"modelSource": schemes[0].config.modelSource,
},
}
# Save the new scheme
@ -1842,6 +1855,7 @@ def create_document_routes(
item["name"] = updated_item["name"]
item["config"]["framework"] = updated_item["config"]["framework"]
item["config"]["extractor"] = updated_item["config"]["extractor"]
item["config"]["modelSource"] = updated_item["config"]["modelSource"]
break
# Write back to the file
@ -1909,6 +1923,7 @@ def create_document_routes(
"config": {
"framework": scheme.config.framework,
"extractor": scheme.config.extractor,
"modelSource": scheme.config.modelSource,
},
}
@ -1989,7 +2004,6 @@ def create_document_routes(
Returns:
ScanResponse: A response object containing the scanning status and track_id
"""
scheme_name = request.framework
# Generate track_id with "scan" prefix for scanning operation
track_id = generate_track_id("scan")
@ -2000,7 +2014,7 @@ def create_document_routes(
rag_anything,
doc_manager,
track_id,
scheme_name=scheme_name,
schemeConfig=request.schemeConfig,
)
return ScanResponse(
status="scanning_started",
@ -2077,6 +2091,8 @@ def create_document_routes(
config = load_config()
current_framework = config.get("framework")
current_extractor = config.get("extractor")
current_modelSource = config.get("modelSource")
doc_pre_id = f"doc-pre-{safe_filename}"
if current_framework and current_framework == "lightrag":
@ -2095,6 +2111,8 @@ def create_document_routes(
output_dir="./output",
parse_method="auto",
scheme_name=current_framework,
parser=current_extractor,
source=current_modelSource
)
await rag.doc_status.upsert(