fix: replace pyuca with pypinyin for Chinese pinyin sorting and add file_path sort
This commit is contained in:
parent
1941df9cf6
commit
14e083a1a6
3 changed files with 30 additions and 29 deletions
|
|
@ -491,7 +491,7 @@ class DocumentsRequest(BaseModel):
|
||||||
status_filter: Filter by document status, None for all statuses
|
status_filter: Filter by document status, None for all statuses
|
||||||
page: Page number (1-based)
|
page: Page number (1-based)
|
||||||
page_size: Number of documents per page (10-200)
|
page_size: Number of documents per page (10-200)
|
||||||
sort_field: Field to sort by ('created_at', 'updated_at', 'id')
|
sort_field: Field to sort by ('created_at', 'updated_at', 'id', 'file_path')
|
||||||
sort_direction: Sort direction ('asc' or 'desc')
|
sort_direction: Sort direction ('asc' or 'desc')
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -18,19 +18,6 @@ from typing import Any, Protocol, Callable, TYPE_CHECKING, List
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
# Import pyuca for Chinese pinyin sorting
|
|
||||||
try:
|
|
||||||
import pyuca
|
|
||||||
|
|
||||||
_pinyin_collator = pyuca.Collator()
|
|
||||||
_pyuca_available = True
|
|
||||||
except ImportError:
|
|
||||||
_pinyin_collator = None
|
|
||||||
_pyuca_available = False
|
|
||||||
except Exception:
|
|
||||||
_pinyin_collator = None
|
|
||||||
_pyuca_available = False
|
|
||||||
|
|
||||||
from lightrag.constants import (
|
from lightrag.constants import (
|
||||||
DEFAULT_LOG_MAX_BYTES,
|
DEFAULT_LOG_MAX_BYTES,
|
||||||
DEFAULT_LOG_BACKUP_COUNT,
|
DEFAULT_LOG_BACKUP_COUNT,
|
||||||
|
|
@ -40,6 +27,21 @@ from lightrag.constants import (
|
||||||
DEFAULT_MAX_FILE_PATH_LENGTH,
|
DEFAULT_MAX_FILE_PATH_LENGTH,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Global import for pypinyin with startup-time logging
|
||||||
|
try:
|
||||||
|
import pypinyin
|
||||||
|
|
||||||
|
_PYPINYIN_AVAILABLE = True
|
||||||
|
logger = logging.getLogger("lightrag")
|
||||||
|
logger.info("pypinyin loaded successfully for Chinese pinyin sorting")
|
||||||
|
except ImportError:
|
||||||
|
pypinyin = None
|
||||||
|
_PYPINYIN_AVAILABLE = False
|
||||||
|
logger = logging.getLogger("lightrag")
|
||||||
|
logger.warning(
|
||||||
|
"pypinyin is not installed. Chinese pinyin sorting will use simple string sorting."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def get_env_value(
|
def get_env_value(
|
||||||
env_key: str, default: any, value_type: type = str, special_none: bool = False
|
env_key: str, default: any, value_type: type = str, special_none: bool = False
|
||||||
|
|
@ -2078,9 +2080,8 @@ def generate_track_id(prefix: str = "upload") -> str:
|
||||||
def get_pinyin_sort_key(text: str) -> str:
|
def get_pinyin_sort_key(text: str) -> str:
|
||||||
"""Generate sort key for Chinese pinyin sorting
|
"""Generate sort key for Chinese pinyin sorting
|
||||||
|
|
||||||
This function uses pyuca (Python Unicode Collation Algorithm) to generate
|
This function uses pypinyin for true Chinese pinyin sorting.
|
||||||
sort keys that handle Chinese characters by their pinyin pronunciation.
|
If pypinyin is not available, it falls back to simple lowercase string sorting.
|
||||||
For non-Chinese text, it falls back to standard Unicode sorting.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
text: Text to generate sort key for
|
text: Text to generate sort key for
|
||||||
|
|
@ -2091,14 +2092,14 @@ def get_pinyin_sort_key(text: str) -> str:
|
||||||
if not text:
|
if not text:
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
# Use the globally initialized collator
|
if _PYPINYIN_AVAILABLE:
|
||||||
if _pyuca_available and _pinyin_collator is not None:
|
|
||||||
try:
|
try:
|
||||||
return _pinyin_collator.sort_key(text)
|
# Convert Chinese characters to pinyin, keep non-Chinese as-is
|
||||||
except Exception as e:
|
pinyin_list = pypinyin.lazy_pinyin(text, style=pypinyin.Style.NORMAL)
|
||||||
logger.warning(
|
return "".join(pinyin_list).lower()
|
||||||
f"Failed to generate pinyin sort key for '{text}': {e}. Using fallback."
|
except Exception:
|
||||||
)
|
# Silently fall back to simple string sorting on any error
|
||||||
|
return text.lower()
|
||||||
# Fallback to standard string sorting
|
else:
|
||||||
return text.lower()
|
# pypinyin not available, use simple string sorting
|
||||||
|
return text.lower()
|
||||||
|
|
|
||||||
|
|
@ -32,8 +32,8 @@ dependencies = [
|
||||||
"pandas>=2.0.0",
|
"pandas>=2.0.0",
|
||||||
"pipmaster",
|
"pipmaster",
|
||||||
"pydantic",
|
"pydantic",
|
||||||
|
"pypinyin",
|
||||||
"python-dotenv",
|
"python-dotenv",
|
||||||
"pyuca",
|
|
||||||
"setuptools",
|
"setuptools",
|
||||||
"tenacity",
|
"tenacity",
|
||||||
"tiktoken",
|
"tiktoken",
|
||||||
|
|
@ -52,8 +52,8 @@ api = [
|
||||||
"pandas>=2.0.0",
|
"pandas>=2.0.0",
|
||||||
"pipmaster",
|
"pipmaster",
|
||||||
"pydantic",
|
"pydantic",
|
||||||
|
"pypinyin",
|
||||||
"python-dotenv",
|
"python-dotenv",
|
||||||
"pyuca",
|
|
||||||
"setuptools",
|
"setuptools",
|
||||||
"tenacity",
|
"tenacity",
|
||||||
"tiktoken",
|
"tiktoken",
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue