Add Unicode collation for sorting Chinese file names during document scanning
This commit is contained in:
parent
df1d6b31ed
commit
31bd274601
3 changed files with 10 additions and 1 deletion
|
|
@ -16,6 +16,7 @@ python-dotenv
|
|||
python-jose[cryptography]
|
||||
python-multipart
|
||||
pytz
|
||||
pyuca
|
||||
tenacity
|
||||
tiktoken
|
||||
uvicorn
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ This module contains all document-related routes for the LightRAG API.
|
|||
"""
|
||||
|
||||
import asyncio
|
||||
from pyuca import Collator
|
||||
from lightrag.utils import logger
|
||||
import aiofiles
|
||||
import shutil
|
||||
|
|
@ -614,8 +615,12 @@ async def pipeline_index_files(rag: LightRAG, file_paths: List[Path]):
|
|||
try:
|
||||
enqueued = False
|
||||
|
||||
# Create Collator for Unicode sorting
|
||||
collator = Collator()
|
||||
sorted_file_paths = sorted(file_paths, key=lambda p: collator.sort_key(str(p)))
|
||||
|
||||
# Process files sequentially
|
||||
for file_path in file_paths:
|
||||
for file_path in sorted_file_paths:
|
||||
if await pipeline_enqueue_file(rag, file_path):
|
||||
enqueued = True
|
||||
|
||||
|
|
|
|||
|
|
@ -11,6 +11,9 @@ pipmaster
|
|||
pydantic
|
||||
python-dotenv
|
||||
|
||||
# Unicode Collation Algorithm for proper Chinese sorting
|
||||
pyuca
|
||||
|
||||
setuptools
|
||||
tenacity
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue