fix: replace pyuca with pypinyin for Chinese pinyin sorting and add file_path sort
This commit is contained in:
parent
1941df9cf6
commit
14e083a1a6
3 changed files with 30 additions and 29 deletions
|
|
@@ -491,7 +491,7 @@ class DocumentsRequest(BaseModel):
|
|||
status_filter: Filter by document status, None for all statuses
|
||||
page: Page number (1-based)
|
||||
page_size: Number of documents per page (10-200)
|
||||
sort_field: Field to sort by ('created_at', 'updated_at', 'id')
|
||||
sort_field: Field to sort by ('created_at', 'updated_at', 'id', 'file_path')
|
||||
sort_direction: Sort direction ('asc' or 'desc')
|
||||
"""
|
||||
|
||||
|
|
|
|||
|
|
@@ -18,19 +18,6 @@ from typing import Any, Protocol, Callable, TYPE_CHECKING, List
|
|||
import numpy as np
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Import pyuca for Chinese pinyin sorting
|
||||
try:
|
||||
import pyuca
|
||||
|
||||
_pinyin_collator = pyuca.Collator()
|
||||
_pyuca_available = True
|
||||
except ImportError:
|
||||
_pinyin_collator = None
|
||||
_pyuca_available = False
|
||||
except Exception:
|
||||
_pinyin_collator = None
|
||||
_pyuca_available = False
|
||||
|
||||
from lightrag.constants import (
|
||||
DEFAULT_LOG_MAX_BYTES,
|
||||
DEFAULT_LOG_BACKUP_COUNT,
|
||||
|
|
@@ -40,6 +27,21 @@ from lightrag.constants import (
|
|||
DEFAULT_MAX_FILE_PATH_LENGTH,
|
||||
)
|
||||
|
||||
# Global import for pypinyin with startup-time logging
|
||||
try:
|
||||
import pypinyin
|
||||
|
||||
_PYPINYIN_AVAILABLE = True
|
||||
logger = logging.getLogger("lightrag")
|
||||
logger.info("pypinyin loaded successfully for Chinese pinyin sorting")
|
||||
except ImportError:
|
||||
pypinyin = None
|
||||
_PYPINYIN_AVAILABLE = False
|
||||
logger = logging.getLogger("lightrag")
|
||||
logger.warning(
|
||||
"pypinyin is not installed. Chinese pinyin sorting will use simple string sorting."
|
||||
)
|
||||
|
||||
|
||||
def get_env_value(
|
||||
env_key: str, default: any, value_type: type = str, special_none: bool = False
|
||||
|
|
@@ -2078,9 +2080,8 @@ def generate_track_id(prefix: str = "upload") -> str:
|
|||
def get_pinyin_sort_key(text: str) -> str:
|
||||
"""Generate sort key for Chinese pinyin sorting
|
||||
|
||||
This function uses pyuca (Python Unicode Collation Algorithm) to generate
|
||||
sort keys that handle Chinese characters by their pinyin pronunciation.
|
||||
For non-Chinese text, it falls back to standard Unicode sorting.
|
||||
This function uses pypinyin for true Chinese pinyin sorting.
|
||||
If pypinyin is not available, it falls back to simple lowercase string sorting.
|
||||
|
||||
Args:
|
||||
text: Text to generate sort key for
|
||||
|
|
@@ -2091,14 +2092,14 @@ def get_pinyin_sort_key(text: str) -> str:
|
|||
if not text:
|
||||
return ""
|
||||
|
||||
# Use the globally initialized collator
|
||||
if _pyuca_available and _pinyin_collator is not None:
|
||||
if _PYPINYIN_AVAILABLE:
|
||||
try:
|
||||
return _pinyin_collator.sort_key(text)
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
f"Failed to generate pinyin sort key for '{text}': {e}. Using fallback."
|
||||
)
|
||||
|
||||
# Fallback to standard string sorting
|
||||
# Convert Chinese characters to pinyin, keep non-Chinese as-is
|
||||
pinyin_list = pypinyin.lazy_pinyin(text, style=pypinyin.Style.NORMAL)
|
||||
return "".join(pinyin_list).lower()
|
||||
except Exception:
|
||||
# Silently fall back to simple string sorting on any error
|
||||
return text.lower()
|
||||
else:
|
||||
# pypinyin not available, use simple string sorting
|
||||
return text.lower()
|
||||
|
|
|
|||
|
|
@@ -32,8 +32,8 @@ dependencies = [
|
|||
"pandas>=2.0.0",
|
||||
"pipmaster",
|
||||
"pydantic",
|
||||
"pypinyin",
|
||||
"python-dotenv",
|
||||
"pyuca",
|
||||
"setuptools",
|
||||
"tenacity",
|
||||
"tiktoken",
|
||||
|
|
@@ -52,8 +52,8 @@ api = [
|
|||
"pandas>=2.0.0",
|
||||
"pipmaster",
|
||||
"pydantic",
|
||||
"pypinyin",
|
||||
"python-dotenv",
|
||||
"pyuca",
|
||||
"setuptools",
|
||||
"tenacity",
|
||||
"tiktoken",
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue