merge
This commit is contained in:
parent
e3ea87da24
commit
8d800239d6
2 changed files with 38 additions and 19 deletions
|
|
@ -25,6 +25,7 @@ from lightrag.api.utils_api import (
|
||||||
display_splash_screen,
|
display_splash_screen,
|
||||||
check_env_file,
|
check_env_file,
|
||||||
)
|
)
|
||||||
|
from lightrag.llm.openai import openai_complete_if_cache, openai_embed
|
||||||
from .config import (
|
from .config import (
|
||||||
global_args,
|
global_args,
|
||||||
update_uvicorn_mode_config,
|
update_uvicorn_mode_config,
|
||||||
|
|
@ -645,7 +646,7 @@ def create_app(args):
|
||||||
embedding_func=raganything_embedding_func,
|
embedding_func=raganything_embedding_func,
|
||||||
)
|
)
|
||||||
|
|
||||||
logger.info("Check the download status of the RAGANything parser...")
|
logger.info("Check the download status of the RAGAnything parser...")
|
||||||
rag_anything.verify_parser_installation_once()
|
rag_anything.verify_parser_installation_once()
|
||||||
|
|
||||||
RAGManager.set_rag(rag_anything)
|
RAGManager.set_rag(rag_anything)
|
||||||
|
|
|
||||||
|
|
@ -111,17 +111,6 @@ def sanitize_filename(filename: str, input_dir: Path) -> str:
|
||||||
return clean_name
|
return clean_name
|
||||||
|
|
||||||
|
|
||||||
class ScanRequest(BaseModel):
|
|
||||||
"""Request model for document scanning operations.
|
|
||||||
|
|
||||||
Attributes:
|
|
||||||
framework (str | None): Processing framework to use for scanning.
|
|
||||||
Can be "lightrag" or "raganything". If None, uses default framework.
|
|
||||||
"""
|
|
||||||
|
|
||||||
framework: str | None = None
|
|
||||||
|
|
||||||
|
|
||||||
class SchemeConfig(BaseModel):
|
class SchemeConfig(BaseModel):
|
||||||
"""Configuration model for processing schemes.
|
"""Configuration model for processing schemes.
|
||||||
|
|
||||||
|
|
@ -135,10 +124,16 @@ class SchemeConfig(BaseModel):
|
||||||
- "mineru": MinerU parser for comprehensive document parsing
|
- "mineru": MinerU parser for comprehensive document parsing
|
||||||
- "docling": Docling parser for office document processing
|
- "docling": Docling parser for office document processing
|
||||||
- "": Default/automatic extractor selection
|
- "": Default/automatic extractor selection
|
||||||
|
modelSource (Literal["huggingface", "modelscope", "local", ""]): The model source used by MinerU.
|
||||||
|
- "huggingface": Use pre-trained models from the Hugging Face model hub
|
||||||
|
- "modelscope": Use model resources from the ModelScope platform
|
||||||
|
- "local": Use custom models deployed locally
|
||||||
|
- "": Keep the system's default model source configuration (usually huggingface)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
framework: Literal["lightrag", "raganything"]
|
framework: Literal["lightrag", "raganything"]
|
||||||
extractor: Literal["mineru", "docling", ""] = "" # 默认值
|
extractor: Literal["mineru", "docling", ""] = "" # 默认值
|
||||||
|
modelSource: Literal["huggingface", "modelscope", "local", ""] = ""
|
||||||
|
|
||||||
|
|
||||||
class Scheme(BaseModel):
|
class Scheme(BaseModel):
|
||||||
|
|
@ -185,6 +180,12 @@ class SchemesResponse(BaseModel):
|
||||||
data: Optional[List[Dict[str, Any]]] = Field(None, description="List of schemes")
|
data: Optional[List[Dict[str, Any]]] = Field(None, description="List of schemes")
|
||||||
|
|
||||||
|
|
||||||
|
class ScanRequest(BaseModel):
|
||||||
|
"""Request model for document scanning operations.
|
||||||
|
"""
|
||||||
|
schemeConfig: SchemeConfig = Field(..., description="Scanning scheme configuration")
|
||||||
|
|
||||||
|
|
||||||
class ScanResponse(BaseModel):
|
class ScanResponse(BaseModel):
|
||||||
"""Response model for document scanning operation
|
"""Response model for document scanning operation
|
||||||
|
|
||||||
|
|
@ -1403,8 +1404,9 @@ async def pipeline_index_files(
|
||||||
async def pipeline_index_files_raganything(
|
async def pipeline_index_files_raganything(
|
||||||
rag_anything: RAGAnything,
|
rag_anything: RAGAnything,
|
||||||
file_paths: List[Path],
|
file_paths: List[Path],
|
||||||
track_id: str = None,
|
|
||||||
scheme_name: str = None,
|
scheme_name: str = None,
|
||||||
|
parser: str = None,
|
||||||
|
source: str = None
|
||||||
):
|
):
|
||||||
"""Index multiple files using RAGAnything framework for multimodal processing.
|
"""Index multiple files using RAGAnything framework for multimodal processing.
|
||||||
|
|
||||||
|
|
@ -1414,6 +1416,10 @@ async def pipeline_index_files_raganything(
|
||||||
track_id (str, optional): Tracking ID for batch monitoring. Defaults to None.
|
track_id (str, optional): Tracking ID for batch monitoring. Defaults to None.
|
||||||
scheme_name (str, optional): Processing scheme name for categorization.
|
scheme_name (str, optional): Processing scheme name for categorization.
|
||||||
Defaults to None.
|
Defaults to None.
|
||||||
|
parser (str, optional): Document extraction tool to use.
|
||||||
|
Defaults to None.
|
||||||
|
source (str, optional): The model source used by MinerU.
|
||||||
|
Defaults to None.
|
||||||
|
|
||||||
Note:
|
Note:
|
||||||
- Uses RAGAnything's process_document_complete_lightrag_api method for each file
|
- Uses RAGAnything's process_document_complete_lightrag_api method for each file
|
||||||
|
|
@ -1438,6 +1444,8 @@ async def pipeline_index_files_raganything(
|
||||||
output_dir="./output",
|
output_dir="./output",
|
||||||
parse_method="auto",
|
parse_method="auto",
|
||||||
scheme_name=scheme_name,
|
scheme_name=scheme_name,
|
||||||
|
parser=parser,
|
||||||
|
source=source
|
||||||
)
|
)
|
||||||
if success:
|
if success:
|
||||||
pass
|
pass
|
||||||
|
|
@ -1481,7 +1489,7 @@ async def run_scanning_process(
|
||||||
rag_anything: RAGAnything,
|
rag_anything: RAGAnything,
|
||||||
doc_manager: DocumentManager,
|
doc_manager: DocumentManager,
|
||||||
track_id: str = None,
|
track_id: str = None,
|
||||||
scheme_name: str = None,
|
schemeConfig = None,
|
||||||
):
|
):
|
||||||
"""Background task to scan and index documents
|
"""Background task to scan and index documents
|
||||||
|
|
||||||
|
|
@ -1490,8 +1498,8 @@ async def run_scanning_process(
|
||||||
rag_anything: RAGAnything instance
|
rag_anything: RAGAnything instance
|
||||||
doc_manager: DocumentManager instance
|
doc_manager: DocumentManager instance
|
||||||
track_id: Optional tracking ID to pass to all scanned files
|
track_id: Optional tracking ID to pass to all scanned files
|
||||||
scheme_name (str, optional): Processing scheme name for categorization.
|
schemeConfig: Scanning scheme configuration.
|
||||||
Defaults to None
|
Defaults to None
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
new_files = doc_manager.scan_directory_for_new_files()
|
new_files = doc_manager.scan_directory_for_new_files()
|
||||||
|
|
@ -1504,6 +1512,10 @@ async def run_scanning_process(
|
||||||
is_pipeline_scan_busy = pipeline_status.get("scan_disabled", False)
|
is_pipeline_scan_busy = pipeline_status.get("scan_disabled", False)
|
||||||
is_pipeline_busy = pipeline_status.get("busy", False)
|
is_pipeline_busy = pipeline_status.get("busy", False)
|
||||||
|
|
||||||
|
scheme_name = schemeConfig.framework
|
||||||
|
extractor = schemeConfig.extractor
|
||||||
|
modelSource = schemeConfig.modelSource
|
||||||
|
|
||||||
if new_files:
|
if new_files:
|
||||||
# Process all files at once with track_id
|
# Process all files at once with track_id
|
||||||
if is_pipeline_busy:
|
if is_pipeline_busy:
|
||||||
|
|
@ -1525,7 +1537,7 @@ async def run_scanning_process(
|
||||||
)
|
)
|
||||||
elif scheme_name == "raganything":
|
elif scheme_name == "raganything":
|
||||||
await pipeline_index_files_raganything(
|
await pipeline_index_files_raganything(
|
||||||
rag_anything, new_files, track_id, scheme_name=scheme_name
|
rag_anything, new_files, scheme_name=scheme_name, parser=extractor, source=modelSource
|
||||||
)
|
)
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Scanning process completed with raganything: {total_files} files Processed."
|
f"Scanning process completed with raganything: {total_files} files Processed."
|
||||||
|
|
@ -1834,6 +1846,7 @@ def create_document_routes(
|
||||||
"config": {
|
"config": {
|
||||||
"framework": schemes[0].config.framework,
|
"framework": schemes[0].config.framework,
|
||||||
"extractor": schemes[0].config.extractor,
|
"extractor": schemes[0].config.extractor,
|
||||||
|
"modelSource": schemes[0].config.modelSource,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
# 保存新方案
|
# 保存新方案
|
||||||
|
|
@ -1842,6 +1855,7 @@ def create_document_routes(
|
||||||
item["name"] = updated_item["name"]
|
item["name"] = updated_item["name"]
|
||||||
item["config"]["framework"] = updated_item["config"]["framework"]
|
item["config"]["framework"] = updated_item["config"]["framework"]
|
||||||
item["config"]["extractor"] = updated_item["config"]["extractor"]
|
item["config"]["extractor"] = updated_item["config"]["extractor"]
|
||||||
|
item["config"]["modelSource"] = updated_item["config"]["modelSource"]
|
||||||
break
|
break
|
||||||
|
|
||||||
# 写回文件
|
# 写回文件
|
||||||
|
|
@ -1909,6 +1923,7 @@ def create_document_routes(
|
||||||
"config": {
|
"config": {
|
||||||
"framework": scheme.config.framework,
|
"framework": scheme.config.framework,
|
||||||
"extractor": scheme.config.extractor,
|
"extractor": scheme.config.extractor,
|
||||||
|
"modelSource": scheme.config.modelSource,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -1989,7 +2004,6 @@ def create_document_routes(
|
||||||
Returns:
|
Returns:
|
||||||
ScanResponse: A response object containing the scanning status and track_id
|
ScanResponse: A response object containing the scanning status and track_id
|
||||||
"""
|
"""
|
||||||
scheme_name = request.framework
|
|
||||||
# Generate track_id with "scan" prefix for scanning operation
|
# Generate track_id with "scan" prefix for scanning operation
|
||||||
track_id = generate_track_id("scan")
|
track_id = generate_track_id("scan")
|
||||||
|
|
||||||
|
|
@ -2000,7 +2014,7 @@ def create_document_routes(
|
||||||
rag_anything,
|
rag_anything,
|
||||||
doc_manager,
|
doc_manager,
|
||||||
track_id,
|
track_id,
|
||||||
scheme_name=scheme_name,
|
schemeConfig=request.schemeConfig,
|
||||||
)
|
)
|
||||||
return ScanResponse(
|
return ScanResponse(
|
||||||
status="scanning_started",
|
status="scanning_started",
|
||||||
|
|
@ -2077,6 +2091,8 @@ def create_document_routes(
|
||||||
|
|
||||||
config = load_config()
|
config = load_config()
|
||||||
current_framework = config.get("framework")
|
current_framework = config.get("framework")
|
||||||
|
current_extractor = config.get("extractor")
|
||||||
|
current_modelSource = config.get("modelSource")
|
||||||
doc_pre_id = f"doc-pre-{safe_filename}"
|
doc_pre_id = f"doc-pre-{safe_filename}"
|
||||||
|
|
||||||
if current_framework and current_framework == "lightrag":
|
if current_framework and current_framework == "lightrag":
|
||||||
|
|
@ -2095,6 +2111,8 @@ def create_document_routes(
|
||||||
output_dir="./output",
|
output_dir="./output",
|
||||||
parse_method="auto",
|
parse_method="auto",
|
||||||
scheme_name=current_framework,
|
scheme_name=current_framework,
|
||||||
|
parser=current_extractor,
|
||||||
|
source=current_modelSource
|
||||||
)
|
)
|
||||||
|
|
||||||
await rag.doc_status.upsert(
|
await rag.doc_status.upsert(
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue