From 14e083a1a6ca44753c89cf709097a23c77f2def7 Mon Sep 17 00:00:00 2001 From: yangdx Date: Sun, 17 Aug 2025 15:21:24 +0800 Subject: [PATCH] fix: replace pyuca with pypinyin for Chinese pinyin sorting and add file_path sort --- lightrag/api/routers/document_routes.py | 2 +- lightrag/utils.py | 53 +++++++++++++------------ pyproject.toml | 4 +- 3 files changed, 30 insertions(+), 29 deletions(-) diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index 4bbfbf6c..7f092440 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -491,7 +491,7 @@ class DocumentsRequest(BaseModel): status_filter: Filter by document status, None for all statuses page: Page number (1-based) page_size: Number of documents per page (10-200) - sort_field: Field to sort by ('created_at', 'updated_at', 'id') + sort_field: Field to sort by ('created_at', 'updated_at', 'id', 'file_path') sort_direction: Sort direction ('asc' or 'desc') """ diff --git a/lightrag/utils.py b/lightrag/utils.py index 49b7136f..055a2b27 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -18,19 +18,6 @@ from typing import Any, Protocol, Callable, TYPE_CHECKING, List import numpy as np from dotenv import load_dotenv -# Import pyuca for Chinese pinyin sorting -try: - import pyuca - - _pinyin_collator = pyuca.Collator() - _pyuca_available = True -except ImportError: - _pinyin_collator = None - _pyuca_available = False -except Exception: - _pinyin_collator = None - _pyuca_available = False - from lightrag.constants import ( DEFAULT_LOG_MAX_BYTES, DEFAULT_LOG_BACKUP_COUNT, @@ -40,6 +27,21 @@ from lightrag.constants import ( DEFAULT_MAX_FILE_PATH_LENGTH, ) +# Global import for pypinyin with startup-time logging +try: + import pypinyin + + _PYPINYIN_AVAILABLE = True + logger = logging.getLogger("lightrag") + logger.info("pypinyin loaded successfully for Chinese pinyin sorting") +except ImportError: + pypinyin = None + _PYPINYIN_AVAILABLE = False + logger = logging.getLogger("lightrag") + logger.warning( + "pypinyin is not installed. Chinese pinyin sorting will use simple string sorting." + ) + def get_env_value( env_key: str, default: any, value_type: type = str, special_none: bool = False @@ -2078,9 +2080,8 @@ def generate_track_id(prefix: str = "upload") -> str: def get_pinyin_sort_key(text: str) -> str: """Generate sort key for Chinese pinyin sorting - This function uses pyuca (Python Unicode Collation Algorithm) to generate - sort keys that handle Chinese characters by their pinyin pronunciation. - For non-Chinese text, it falls back to standard Unicode sorting. + This function uses pypinyin for true Chinese pinyin sorting. + If pypinyin is not available, it falls back to simple lowercase string sorting. Args: text: Text to generate sort key for @@ -2091,14 +2092,14 @@ def get_pinyin_sort_key(text: str) -> str: if not text: return "" - # Use the globally initialized collator - if _pyuca_available and _pinyin_collator is not None: + if _PYPINYIN_AVAILABLE: try: - return _pinyin_collator.sort_key(text) - except Exception as e: - logger.warning( - f"Failed to generate pinyin sort key for '{text}': {e}. Using fallback." - ) - - # Fallback to standard string sorting - return text.lower() + # Convert Chinese characters to pinyin, keep non-Chinese as-is + pinyin_list = pypinyin.lazy_pinyin(text, style=pypinyin.Style.NORMAL) + return "".join(pinyin_list).lower() + except Exception: + # Silently fall back to simple string sorting on any error + return text.lower() + else: + # pypinyin not available, use simple string sorting + return text.lower() diff --git a/pyproject.toml b/pyproject.toml index 18e525d5..9963b354 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,8 +32,8 @@ dependencies = [ "pandas>=2.0.0", "pipmaster", "pydantic", + "pypinyin", "python-dotenv", - "pyuca", "setuptools", "tenacity", "tiktoken", @@ -52,8 +52,8 @@ api = [ "pandas>=2.0.0", "pipmaster", "pydantic", + "pypinyin", "python-dotenv", - "pyuca", "setuptools", "tenacity", "tiktoken",