fix: replace pyuca with pypinyin for Chinese pinyin sorting and add file_path sort

This commit is contained in:
yangdx 2025-08-17 15:21:24 +08:00
parent 1941df9cf6
commit 14e083a1a6
3 changed files with 30 additions and 29 deletions

View file

@ -491,7 +491,7 @@ class DocumentsRequest(BaseModel):
status_filter: Filter by document status, None for all statuses
page: Page number (1-based)
page_size: Number of documents per page (10-200)
sort_field: Field to sort by ('created_at', 'updated_at', 'id')
sort_field: Field to sort by ('created_at', 'updated_at', 'id', 'file_path')
sort_direction: Sort direction ('asc' or 'desc')
"""

View file

@ -18,19 +18,6 @@ from typing import Any, Protocol, Callable, TYPE_CHECKING, List
import numpy as np
from dotenv import load_dotenv
# Import pyuca for Chinese pinyin sorting
try:
import pyuca
_pinyin_collator = pyuca.Collator()
_pyuca_available = True
except ImportError:
_pinyin_collator = None
_pyuca_available = False
except Exception:
_pinyin_collator = None
_pyuca_available = False
from lightrag.constants import (
DEFAULT_LOG_MAX_BYTES,
DEFAULT_LOG_BACKUP_COUNT,
@ -40,6 +27,21 @@ from lightrag.constants import (
DEFAULT_MAX_FILE_PATH_LENGTH,
)
# Optional dependency: pypinyin is used for Chinese pinyin sorting.
# Availability is resolved once, at import time, and reported through the
# "lightrag" logger so a missing package is visible at startup.
logger = logging.getLogger("lightrag")

try:
    # Keep the guarded region to the import itself so that only a missing
    # package is treated as "pypinyin unavailable".
    import pypinyin
except ImportError:
    # Leave a placeholder binding so the module-level name always exists.
    pypinyin = None
    _PYPINYIN_AVAILABLE = False
    logger.warning(
        "pypinyin is not installed. Chinese pinyin sorting will use simple string sorting."
    )
else:
    _PYPINYIN_AVAILABLE = True
    logger.info("pypinyin loaded successfully for Chinese pinyin sorting")
def get_env_value(
env_key: str, default: any, value_type: type = str, special_none: bool = False
@ -2078,9 +2080,8 @@ def generate_track_id(prefix: str = "upload") -> str:
def get_pinyin_sort_key(text: str) -> str:
"""Generate sort key for Chinese pinyin sorting
This function uses pyuca (Python Unicode Collation Algorithm) to generate
sort keys that handle Chinese characters by their pinyin pronunciation.
For non-Chinese text, it falls back to standard Unicode sorting.
This function uses pypinyin for true Chinese pinyin sorting.
If pypinyin is not available, it falls back to simple lowercase string sorting.
Args:
text: Text to generate sort key for
@ -2091,14 +2092,14 @@ def get_pinyin_sort_key(text: str) -> str:
if not text:
return ""
# Use the globally initialized collator
if _pyuca_available and _pinyin_collator is not None:
if _PYPINYIN_AVAILABLE:
try:
return _pinyin_collator.sort_key(text)
except Exception as e:
logger.warning(
f"Failed to generate pinyin sort key for '{text}': {e}. Using fallback."
)
# Fallback to standard string sorting
# Convert Chinese characters to pinyin, keep non-Chinese as-is
pinyin_list = pypinyin.lazy_pinyin(text, style=pypinyin.Style.NORMAL)
return "".join(pinyin_list).lower()
except Exception:
# Silently fall back to simple string sorting on any error
return text.lower()
else:
# pypinyin not available, use simple string sorting
return text.lower()

View file

@ -32,8 +32,8 @@ dependencies = [
"pandas>=2.0.0",
"pipmaster",
"pydantic",
"pypinyin",
"python-dotenv",
"pyuca",
"setuptools",
"tenacity",
"tiktoken",
@ -52,8 +52,8 @@ api = [
"pandas>=2.0.0",
"pipmaster",
"pydantic",
"pypinyin",
"python-dotenv",
"pyuca",
"setuptools",
"tenacity",
"tiktoken",