Merge branch 'main' into tongda/main
This commit is contained in:
commit
297e460740
16 changed files with 616 additions and 89 deletions
41
env.example
41
env.example
|
|
@ -29,7 +29,7 @@ WEBUI_DESCRIPTION="Simple and Fast Graph Based RAG System"
|
||||||
# OLLAMA_EMULATING_MODEL_NAME=lightrag
|
# OLLAMA_EMULATING_MODEL_NAME=lightrag
|
||||||
OLLAMA_EMULATING_MODEL_TAG=latest
|
OLLAMA_EMULATING_MODEL_TAG=latest
|
||||||
|
|
||||||
### Max nodes return from graph retrieval in webui
|
### Max nodes for graph retrieval (also update the WebUI local setting, whose value is capped by this limit)
|
||||||
# MAX_GRAPH_NODES=1000
|
# MAX_GRAPH_NODES=1000
|
||||||
|
|
||||||
### Logging level
|
### Logging level
|
||||||
|
|
@ -172,6 +172,8 @@ MAX_PARALLEL_INSERT=2
|
||||||
### LLM Configuration
|
### LLM Configuration
|
||||||
### LLM_BINDING type: openai, ollama, lollms, azure_openai, aws_bedrock, gemini
|
### LLM_BINDING type: openai, ollama, lollms, azure_openai, aws_bedrock, gemini
|
||||||
### LLM_BINDING_HOST: host only for Ollama, endpoint for other LLM service
|
### LLM_BINDING_HOST: host only for Ollama, endpoint for other LLM service
|
||||||
|
### If LightRAG is deployed in Docker,
|
||||||
|
### use host.docker.internal instead of localhost in LLM_BINDING_HOST
|
||||||
###########################################################################
|
###########################################################################
|
||||||
### LLM request timeout setting for all LLMs (0 means no timeout for Ollama)
|
### LLM request timeout setting for all LLMs (0 means no timeout for Ollama)
|
||||||
# LLM_TIMEOUT=180
|
# LLM_TIMEOUT=180
|
||||||
|
|
@ -181,7 +183,7 @@ LLM_MODEL=gpt-4o
|
||||||
LLM_BINDING_HOST=https://api.openai.com/v1
|
LLM_BINDING_HOST=https://api.openai.com/v1
|
||||||
LLM_BINDING_API_KEY=your_api_key
|
LLM_BINDING_API_KEY=your_api_key
|
||||||
|
|
||||||
### Optional for Azure
|
### Env vars for Azure openai
|
||||||
# AZURE_OPENAI_API_VERSION=2024-08-01-preview
|
# AZURE_OPENAI_API_VERSION=2024-08-01-preview
|
||||||
# AZURE_OPENAI_DEPLOYMENT=gpt-4o
|
# AZURE_OPENAI_DEPLOYMENT=gpt-4o
|
||||||
|
|
||||||
|
|
@ -196,22 +198,16 @@ LLM_BINDING_API_KEY=your_api_key
|
||||||
# LLM_MODEL=gemini-flash-latest
|
# LLM_MODEL=gemini-flash-latest
|
||||||
# LLM_BINDING_API_KEY=your_gemini_api_key
|
# LLM_BINDING_API_KEY=your_gemini_api_key
|
||||||
# LLM_BINDING_HOST=https://generativelanguage.googleapis.com
|
# LLM_BINDING_HOST=https://generativelanguage.googleapis.com
|
||||||
GEMINI_LLM_THINKING_CONFIG='{"thinking_budget": 0, "include_thoughts": false}'
|
|
||||||
|
### Use the following command to see all supported options for Gemini
|
||||||
|
### lightrag-server --llm-binding gemini --help
|
||||||
|
### Gemini Specific Parameters
|
||||||
# GEMINI_LLM_MAX_OUTPUT_TOKENS=9000
|
# GEMINI_LLM_MAX_OUTPUT_TOKENS=9000
|
||||||
# GEMINI_LLM_TEMPERATURE=0.7
|
# GEMINI_LLM_TEMPERATURE=0.7
|
||||||
|
### Enable Thinking
|
||||||
### OpenAI Compatible API Specific Parameters
|
# GEMINI_LLM_THINKING_CONFIG='{"thinking_budget": -1, "include_thoughts": true}'
|
||||||
### Increased temperature values may mitigate infinite inference loops in certain LLM, such as Qwen3-30B.
|
### Disable Thinking
|
||||||
# OPENAI_LLM_TEMPERATURE=0.9
|
# GEMINI_LLM_THINKING_CONFIG='{"thinking_budget": 0, "include_thoughts": false}'
|
||||||
### Set the max_tokens to mitigate endless output of some LLM (less than LLM_TIMEOUT * llm_output_tokens/second, i.e. 9000 = 180s * 50 tokens/s)
|
|
||||||
### Typically, max_tokens does not include prompt content, though some models, such as Gemini Models, are exceptions
|
|
||||||
### For vLLM/SGLang deployed models, or most of OpenAI compatible API provider
|
|
||||||
# OPENAI_LLM_MAX_TOKENS=9000
|
|
||||||
### For OpenAI o1-mini or newer models
|
|
||||||
OPENAI_LLM_MAX_COMPLETION_TOKENS=9000
|
|
||||||
|
|
||||||
#### OpenAI's new API utilizes max_completion_tokens instead of max_tokens
|
|
||||||
# OPENAI_LLM_MAX_COMPLETION_TOKENS=9000
|
|
||||||
|
|
||||||
### use the following command to see all support options for OpenAI, azure_openai or OpenRouter
|
### use the following command to see all support options for OpenAI, azure_openai or OpenRouter
|
||||||
### lightrag-server --llm-binding openai --help
|
### lightrag-server --llm-binding openai --help
|
||||||
|
|
@ -222,8 +218,17 @@ OPENAI_LLM_MAX_COMPLETION_TOKENS=9000
|
||||||
### Qwen3 Specific Parameters deploy by vLLM
|
### Qwen3 Specific Parameters deploy by vLLM
|
||||||
# OPENAI_LLM_EXTRA_BODY='{"chat_template_kwargs": {"enable_thinking": false}}'
|
# OPENAI_LLM_EXTRA_BODY='{"chat_template_kwargs": {"enable_thinking": false}}'
|
||||||
|
|
||||||
|
### OpenAI Compatible API Specific Parameters
|
||||||
|
### Increased temperature values may mitigate infinite inference loops in certain LLM, such as Qwen3-30B.
|
||||||
|
# OPENAI_LLM_TEMPERATURE=0.9
|
||||||
|
### Set max_tokens to mitigate endless output from some LLMs (keep it below LLM_TIMEOUT * llm_output_tokens/second, e.g. 9000 = 180s * 50 tokens/s)
|
||||||
|
### Typically, max_tokens does not include prompt content
|
||||||
|
### For vLLM/SGLang deployed models, or most of OpenAI compatible API provider
|
||||||
|
# OPENAI_LLM_MAX_TOKENS=9000
|
||||||
|
### For OpenAI o1-mini or newer models, which use max_completion_tokens instead of max_tokens
|
||||||
|
OPENAI_LLM_MAX_COMPLETION_TOKENS=9000
|
||||||
|
|
||||||
### use the following command to see all support options for Ollama LLM
|
### use the following command to see all support options for Ollama LLM
|
||||||
### If LightRAG deployed in Docker uses host.docker.internal instead of localhost in LLM_BINDING_HOST
|
|
||||||
### lightrag-server --llm-binding ollama --help
|
### lightrag-server --llm-binding ollama --help
|
||||||
### Ollama Server Specific Parameters
|
### Ollama Server Specific Parameters
|
||||||
### OLLAMA_LLM_NUM_CTX must be provided and should be at least MAX_TOTAL_TOKENS + 2000
|
### OLLAMA_LLM_NUM_CTX must be provided and should be at least MAX_TOTAL_TOKENS + 2000
|
||||||
|
|
@ -240,6 +245,8 @@ OLLAMA_LLM_NUM_CTX=32768
|
||||||
### Embedding Configuration (Should not be changed after the first file processed)
|
### Embedding Configuration (Should not be changed after the first file processed)
|
||||||
### EMBEDDING_BINDING: ollama, openai, azure_openai, jina, lollms, aws_bedrock
|
### EMBEDDING_BINDING: ollama, openai, azure_openai, jina, lollms, aws_bedrock
|
||||||
### EMBEDDING_BINDING_HOST: host only for Ollama, endpoint for other Embedding service
|
### EMBEDDING_BINDING_HOST: host only for Ollama, endpoint for other Embedding service
|
||||||
|
### If LightRAG is deployed in Docker,
|
||||||
|
### use host.docker.internal instead of localhost in EMBEDDING_BINDING_HOST
|
||||||
#######################################################################################
|
#######################################################################################
|
||||||
# EMBEDDING_TIMEOUT=30
|
# EMBEDDING_TIMEOUT=30
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1 +1 @@
|
||||||
__api_version__ = "0252"
|
__api_version__ = "0254"
|
||||||
|
|
|
||||||
|
|
@ -1081,11 +1081,11 @@ async def pipeline_enqueue_file(
|
||||||
result = converter.convert(file_path)
|
result = converter.convert(file_path)
|
||||||
content = result.document.export_to_markdown()
|
content = result.document.export_to_markdown()
|
||||||
else:
|
else:
|
||||||
if not pm.is_installed("pypdf2"): # type: ignore
|
if not pm.is_installed("pypdf"): # type: ignore
|
||||||
pm.install("pypdf2")
|
pm.install("pypdf")
|
||||||
if not pm.is_installed("pycryptodome"): # type: ignore
|
if not pm.is_installed("pycryptodome"): # type: ignore
|
||||||
pm.install("pycryptodome")
|
pm.install("pycryptodome")
|
||||||
from PyPDF2 import PdfReader # type: ignore
|
from pypdf import PdfReader # type: ignore
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
|
|
||||||
pdf_file = BytesIO(file)
|
pdf_file = BytesIO(file)
|
||||||
|
|
|
||||||
|
|
@ -161,7 +161,20 @@ class JsonDocStatusStorage(DocStatusStorage):
|
||||||
logger.debug(
|
logger.debug(
|
||||||
f"[{self.workspace}] Process {os.getpid()} doc status writting {len(data_dict)} records to {self.namespace}"
|
f"[{self.workspace}] Process {os.getpid()} doc status writting {len(data_dict)} records to {self.namespace}"
|
||||||
)
|
)
|
||||||
write_json(data_dict, self._file_name)
|
|
||||||
|
# Write JSON and check if sanitization was applied
|
||||||
|
needs_reload = write_json(data_dict, self._file_name)
|
||||||
|
|
||||||
|
# If data was sanitized, reload cleaned data to update shared memory
|
||||||
|
if needs_reload:
|
||||||
|
logger.info(
|
||||||
|
f"[{self.workspace}] Reloading sanitized data into shared memory for {self.namespace}"
|
||||||
|
)
|
||||||
|
cleaned_data = load_json(self._file_name)
|
||||||
|
if cleaned_data is not None:
|
||||||
|
self._data.clear()
|
||||||
|
self._data.update(cleaned_data)
|
||||||
|
|
||||||
await clear_all_update_flags(self.final_namespace)
|
await clear_all_update_flags(self.final_namespace)
|
||||||
|
|
||||||
async def upsert(self, data: dict[str, dict[str, Any]]) -> None:
|
async def upsert(self, data: dict[str, dict[str, Any]]) -> None:
|
||||||
|
|
|
||||||
|
|
@ -81,7 +81,20 @@ class JsonKVStorage(BaseKVStorage):
|
||||||
logger.debug(
|
logger.debug(
|
||||||
f"[{self.workspace}] Process {os.getpid()} KV writting {data_count} records to {self.namespace}"
|
f"[{self.workspace}] Process {os.getpid()} KV writting {data_count} records to {self.namespace}"
|
||||||
)
|
)
|
||||||
write_json(data_dict, self._file_name)
|
|
||||||
|
# Write JSON and check if sanitization was applied
|
||||||
|
needs_reload = write_json(data_dict, self._file_name)
|
||||||
|
|
||||||
|
# If data was sanitized, reload cleaned data to update shared memory
|
||||||
|
if needs_reload:
|
||||||
|
logger.info(
|
||||||
|
f"[{self.workspace}] Reloading sanitized data into shared memory for {self.namespace}"
|
||||||
|
)
|
||||||
|
cleaned_data = load_json(self._file_name)
|
||||||
|
if cleaned_data is not None:
|
||||||
|
self._data.clear()
|
||||||
|
self._data.update(cleaned_data)
|
||||||
|
|
||||||
await clear_all_update_flags(self.final_namespace)
|
await clear_all_update_flags(self.final_namespace)
|
||||||
|
|
||||||
async def get_by_id(self, id: str) -> dict[str, Any] | None:
|
async def get_by_id(self, id: str) -> dict[str, Any] | None:
|
||||||
|
|
@ -224,7 +237,7 @@ class JsonKVStorage(BaseKVStorage):
|
||||||
data: Original data dictionary that may contain legacy structure
|
data: Original data dictionary that may contain legacy structure
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Migrated data dictionary with flattened cache keys
|
Migrated data dictionary with flattened cache keys (sanitized if needed)
|
||||||
"""
|
"""
|
||||||
from lightrag.utils import generate_cache_key
|
from lightrag.utils import generate_cache_key
|
||||||
|
|
||||||
|
|
@ -261,8 +274,17 @@ class JsonKVStorage(BaseKVStorage):
|
||||||
logger.info(
|
logger.info(
|
||||||
f"[{self.workspace}] Migrated {migration_count} legacy cache entries to flattened structure"
|
f"[{self.workspace}] Migrated {migration_count} legacy cache entries to flattened structure"
|
||||||
)
|
)
|
||||||
# Persist migrated data immediately
|
# Persist migrated data immediately and check if sanitization was applied
|
||||||
write_json(migrated_data, self._file_name)
|
needs_reload = write_json(migrated_data, self._file_name)
|
||||||
|
|
||||||
|
# If data was sanitized during write, reload cleaned data
|
||||||
|
if needs_reload:
|
||||||
|
logger.info(
|
||||||
|
f"[{self.workspace}] Reloading sanitized migration data for {self.namespace}"
|
||||||
|
)
|
||||||
|
cleaned_data = load_json(self._file_name)
|
||||||
|
if cleaned_data is not None:
|
||||||
|
return cleaned_data # Return cleaned data to update shared memory
|
||||||
|
|
||||||
return migrated_data
|
return migrated_data
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -873,6 +873,31 @@ class CleanupTool:
|
||||||
|
|
||||||
storage_name = STORAGE_TYPES[choice]
|
storage_name = STORAGE_TYPES[choice]
|
||||||
|
|
||||||
|
# Special warning for JsonKVStorage about concurrent access
|
||||||
|
if storage_name == "JsonKVStorage":
|
||||||
|
print("\n" + "=" * 60)
|
||||||
|
print(f"{BOLD_RED}⚠️ IMPORTANT WARNING - JsonKVStorage Concurrency{RESET}")
|
||||||
|
print("=" * 60)
|
||||||
|
print("\nJsonKVStorage is an in-memory database that does NOT support")
|
||||||
|
print("concurrent access to the same file by multiple programs.")
|
||||||
|
print("\nBefore proceeding, please ensure that:")
|
||||||
|
print(" • LightRAG Server is completely shut down")
|
||||||
|
print(" • No other programs are accessing the storage files")
|
||||||
|
print("\n" + "=" * 60)
|
||||||
|
|
||||||
|
confirm = (
|
||||||
|
input("\nHas LightRAG Server been shut down? (yes/no): ")
|
||||||
|
.strip()
|
||||||
|
.lower()
|
||||||
|
)
|
||||||
|
if confirm != "yes":
|
||||||
|
print(
|
||||||
|
"\n✓ Operation cancelled - Please shut down LightRAG Server first"
|
||||||
|
)
|
||||||
|
return None, None, None
|
||||||
|
|
||||||
|
print("✓ Proceeding with JsonKVStorage cleanup...")
|
||||||
|
|
||||||
# Check configuration (warnings only, doesn't block)
|
# Check configuration (warnings only, doesn't block)
|
||||||
print("\nChecking configuration...")
|
print("\nChecking configuration...")
|
||||||
self.check_env_vars(storage_name)
|
self.check_env_vars(storage_name)
|
||||||
|
|
|
||||||
|
|
@ -56,6 +56,9 @@ if not logger.handlers:
|
||||||
# Set httpx logging level to WARNING
|
# Set httpx logging level to WARNING
|
||||||
logging.getLogger("httpx").setLevel(logging.WARNING)
|
logging.getLogger("httpx").setLevel(logging.WARNING)
|
||||||
|
|
||||||
|
# Precompile regex pattern for JSON sanitization (module-level, compiled once)
|
||||||
|
_SURROGATE_PATTERN = re.compile(r"[\uD800-\uDFFF\uFFFE\uFFFF]")
|
||||||
|
|
||||||
# Global import for pypinyin with startup-time logging
|
# Global import for pypinyin with startup-time logging
|
||||||
try:
|
try:
|
||||||
import pypinyin
|
import pypinyin
|
||||||
|
|
@ -927,9 +930,123 @@ def load_json(file_name):
|
||||||
return json.load(f)
|
return json.load(f)
|
||||||
|
|
||||||
|
|
||||||
|
def _sanitize_string_for_json(text: str) -> str:
|
||||||
|
"""Remove characters that cannot be encoded in UTF-8 for JSON serialization.
|
||||||
|
|
||||||
|
Uses regex for optimal performance with zero-copy optimization for clean strings.
|
||||||
|
Fast detection path for clean strings (99% of cases) with efficient removal for dirty strings.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
text: String to sanitize
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Original string if clean (zero-copy), sanitized string if dirty
|
||||||
|
"""
|
||||||
|
if not text:
|
||||||
|
return text
|
||||||
|
|
||||||
|
# Fast path: Check if sanitization is needed using C-level regex search
|
||||||
|
if not _SURROGATE_PATTERN.search(text):
|
||||||
|
return text # Zero-copy for clean strings - most common case
|
||||||
|
|
||||||
|
# Slow path: Remove problematic characters using C-level regex substitution
|
||||||
|
return _SURROGATE_PATTERN.sub("", text)
|
||||||
|
|
||||||
|
|
||||||
|
class SanitizingJSONEncoder(json.JSONEncoder):
|
||||||
|
"""
|
||||||
|
Custom JSON encoder that sanitizes data during serialization.
|
||||||
|
|
||||||
|
This encoder cleans strings during the encoding process without creating
|
||||||
|
a full copy of the data structure, making it memory-efficient for large datasets.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def encode(self, o):
|
||||||
|
"""Override encode method to handle simple string cases"""
|
||||||
|
if isinstance(o, str):
|
||||||
|
return json.encoder.encode_basestring(_sanitize_string_for_json(o))
|
||||||
|
return super().encode(o)
|
||||||
|
|
||||||
|
def iterencode(self, o, _one_shot=False):
|
||||||
|
"""
|
||||||
|
Override iterencode to sanitize strings during serialization.
|
||||||
|
This is the core method that handles complex nested structures.
|
||||||
|
"""
|
||||||
|
# Preprocess: sanitize all strings in the object
|
||||||
|
sanitized = self._sanitize_for_encoding(o)
|
||||||
|
|
||||||
|
# Call parent's iterencode with sanitized data
|
||||||
|
for chunk in super().iterencode(sanitized, _one_shot):
|
||||||
|
yield chunk
|
||||||
|
|
||||||
|
def _sanitize_for_encoding(self, obj):
|
||||||
|
"""
|
||||||
|
Recursively sanitize strings in an object.
|
||||||
|
Creates new objects only when necessary to avoid deep copies.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
obj: Object to sanitize
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Sanitized object with cleaned strings
|
||||||
|
"""
|
||||||
|
if isinstance(obj, str):
|
||||||
|
return _sanitize_string_for_json(obj)
|
||||||
|
|
||||||
|
elif isinstance(obj, dict):
|
||||||
|
# Create new dict with sanitized keys and values
|
||||||
|
new_dict = {}
|
||||||
|
for k, v in obj.items():
|
||||||
|
clean_k = _sanitize_string_for_json(k) if isinstance(k, str) else k
|
||||||
|
clean_v = self._sanitize_for_encoding(v)
|
||||||
|
new_dict[clean_k] = clean_v
|
||||||
|
return new_dict
|
||||||
|
|
||||||
|
elif isinstance(obj, (list, tuple)):
|
||||||
|
# Sanitize list/tuple elements
|
||||||
|
cleaned = [self._sanitize_for_encoding(item) for item in obj]
|
||||||
|
return type(obj)(cleaned) if isinstance(obj, tuple) else cleaned
|
||||||
|
|
||||||
|
else:
|
||||||
|
# Numbers, booleans, None, etc. remain unchanged
|
||||||
|
return obj
|
||||||
|
|
||||||
|
|
||||||
def write_json(json_obj, file_name):
|
def write_json(json_obj, file_name):
|
||||||
|
"""
|
||||||
|
Write JSON data to file with optimized sanitization strategy.
|
||||||
|
|
||||||
|
This function uses a two-stage approach:
|
||||||
|
1. Fast path: Try direct serialization (works for clean data ~99% of time)
|
||||||
|
2. Slow path: Use custom encoder that sanitizes during serialization
|
||||||
|
|
||||||
|
The custom encoder approach avoids creating a deep copy of the data,
|
||||||
|
making it memory-efficient. When sanitization occurs, the caller should
|
||||||
|
reload the cleaned data from the file to update shared memory.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
json_obj: Object to serialize (may be a shallow copy from shared memory)
|
||||||
|
file_name: Output file path
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
bool: True if sanitization was applied (caller should reload data),
|
||||||
|
False if direct write succeeded (no reload needed)
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
# Strategy 1: Fast path - try direct serialization
|
||||||
|
with open(file_name, "w", encoding="utf-8") as f:
|
||||||
|
json.dump(json_obj, f, indent=2, ensure_ascii=False)
|
||||||
|
return False # No sanitization needed, no reload required
|
||||||
|
|
||||||
|
except (UnicodeEncodeError, UnicodeDecodeError) as e:
|
||||||
|
logger.debug(f"Direct JSON write failed, using sanitizing encoder: {e}")
|
||||||
|
|
||||||
|
# Strategy 2: Use custom encoder (sanitizes during serialization, zero memory copy)
|
||||||
with open(file_name, "w", encoding="utf-8") as f:
|
with open(file_name, "w", encoding="utf-8") as f:
|
||||||
json.dump(json_obj, f, indent=2, ensure_ascii=False)
|
json.dump(json_obj, f, indent=2, ensure_ascii=False, cls=SanitizingJSONEncoder)
|
||||||
|
|
||||||
|
logger.info(f"JSON sanitization applied during write: {file_name}")
|
||||||
|
return True # Sanitization applied, reload recommended
|
||||||
|
|
||||||
|
|
||||||
class TokenizerInterface(Protocol):
|
class TokenizerInterface(Protocol):
|
||||||
|
|
|
||||||
|
|
@ -40,7 +40,6 @@ export default function QuerySettings() {
|
||||||
// Default values for reset functionality
|
// Default values for reset functionality
|
||||||
const defaultValues = useMemo(() => ({
|
const defaultValues = useMemo(() => ({
|
||||||
mode: 'mix' as QueryMode,
|
mode: 'mix' as QueryMode,
|
||||||
response_type: 'Multiple Paragraphs',
|
|
||||||
top_k: 40,
|
top_k: 40,
|
||||||
chunk_top_k: 20,
|
chunk_top_k: 20,
|
||||||
max_entity_tokens: 6000,
|
max_entity_tokens: 6000,
|
||||||
|
|
@ -153,46 +152,6 @@ export default function QuerySettings() {
|
||||||
</div>
|
</div>
|
||||||
</>
|
</>
|
||||||
|
|
||||||
{/* Response Format */}
|
|
||||||
<>
|
|
||||||
<TooltipProvider>
|
|
||||||
<Tooltip>
|
|
||||||
<TooltipTrigger asChild>
|
|
||||||
<label htmlFor="response_format_select" className="ml-1 cursor-help">
|
|
||||||
{t('retrievePanel.querySettings.responseFormat')}
|
|
||||||
</label>
|
|
||||||
</TooltipTrigger>
|
|
||||||
<TooltipContent side="left">
|
|
||||||
<p>{t('retrievePanel.querySettings.responseFormatTooltip')}</p>
|
|
||||||
</TooltipContent>
|
|
||||||
</Tooltip>
|
|
||||||
</TooltipProvider>
|
|
||||||
<div className="flex items-center gap-1">
|
|
||||||
<Select
|
|
||||||
value={querySettings.response_type}
|
|
||||||
onValueChange={(v) => handleChange('response_type', v)}
|
|
||||||
>
|
|
||||||
<SelectTrigger
|
|
||||||
id="response_format_select"
|
|
||||||
className="hover:bg-primary/5 h-9 cursor-pointer focus:ring-0 focus:ring-offset-0 focus:outline-0 active:right-0 flex-1 text-left [&>span]:break-all [&>span]:line-clamp-1"
|
|
||||||
>
|
|
||||||
<SelectValue />
|
|
||||||
</SelectTrigger>
|
|
||||||
<SelectContent>
|
|
||||||
<SelectGroup>
|
|
||||||
<SelectItem value="Multiple Paragraphs">{t('retrievePanel.querySettings.responseFormatOptions.multipleParagraphs')}</SelectItem>
|
|
||||||
<SelectItem value="Single Paragraph">{t('retrievePanel.querySettings.responseFormatOptions.singleParagraph')}</SelectItem>
|
|
||||||
<SelectItem value="Bullet Points">{t('retrievePanel.querySettings.responseFormatOptions.bulletPoints')}</SelectItem>
|
|
||||||
</SelectGroup>
|
|
||||||
</SelectContent>
|
|
||||||
</Select>
|
|
||||||
<ResetButton
|
|
||||||
onClick={() => handleReset('response_type')}
|
|
||||||
title="Reset to default (Multiple Paragraphs)"
|
|
||||||
/>
|
|
||||||
</div>
|
|
||||||
</>
|
|
||||||
|
|
||||||
{/* Top K */}
|
{/* Top K */}
|
||||||
<>
|
<>
|
||||||
<TooltipProvider>
|
<TooltipProvider>
|
||||||
|
|
|
||||||
|
|
@ -357,6 +357,7 @@ export default function RetrievalTesting() {
|
||||||
const queryParams = {
|
const queryParams = {
|
||||||
...state.querySettings,
|
...state.querySettings,
|
||||||
query: actualQuery,
|
query: actualQuery,
|
||||||
|
response_type: 'Multiple Paragraphs',
|
||||||
conversation_history: effectiveHistoryTurns > 0
|
conversation_history: effectiveHistoryTurns > 0
|
||||||
? prevMessages
|
? prevMessages
|
||||||
.filter((m) => m.isError !== true)
|
.filter((m) => m.isError !== true)
|
||||||
|
|
|
||||||
|
|
@ -123,7 +123,6 @@ const useSettingsStoreBase = create<SettingsState>()(
|
||||||
|
|
||||||
querySettings: {
|
querySettings: {
|
||||||
mode: 'global',
|
mode: 'global',
|
||||||
response_type: 'Multiple Paragraphs',
|
|
||||||
top_k: 40,
|
top_k: 40,
|
||||||
chunk_top_k: 20,
|
chunk_top_k: 20,
|
||||||
max_entity_tokens: 6000,
|
max_entity_tokens: 6000,
|
||||||
|
|
@ -239,7 +238,7 @@ const useSettingsStoreBase = create<SettingsState>()(
|
||||||
{
|
{
|
||||||
name: 'settings-storage',
|
name: 'settings-storage',
|
||||||
storage: createJSONStorage(() => localStorage),
|
storage: createJSONStorage(() => localStorage),
|
||||||
version: 18,
|
version: 19,
|
||||||
migrate: (state: any, version: number) => {
|
migrate: (state: any, version: number) => {
|
||||||
if (version < 2) {
|
if (version < 2) {
|
||||||
state.showEdgeLabel = false
|
state.showEdgeLabel = false
|
||||||
|
|
@ -336,6 +335,12 @@ const useSettingsStoreBase = create<SettingsState>()(
|
||||||
// Add userPromptHistory field for older versions
|
// Add userPromptHistory field for older versions
|
||||||
state.userPromptHistory = []
|
state.userPromptHistory = []
|
||||||
}
|
}
|
||||||
|
if (version < 19) {
|
||||||
|
// Remove deprecated response_type parameter
|
||||||
|
if (state.querySettings) {
|
||||||
|
delete state.querySettings.response_type
|
||||||
|
}
|
||||||
|
}
|
||||||
return state
|
return state
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -86,7 +86,7 @@ offline-docs = [
|
||||||
# Document processing dependencies
|
# Document processing dependencies
|
||||||
"openpyxl>=3.0.0,<4.0.0",
|
"openpyxl>=3.0.0,<4.0.0",
|
||||||
"pycryptodome>=3.0.0,<4.0.0",
|
"pycryptodome>=3.0.0,<4.0.0",
|
||||||
"pypdf2>=3.0.0",
|
"pypdf>=6.1.0",
|
||||||
"python-docx>=0.8.11,<2.0.0",
|
"python-docx>=0.8.11,<2.0.0",
|
||||||
"python-pptx>=0.6.21,<2.0.0",
|
"python-pptx>=0.6.21,<2.0.0",
|
||||||
]
|
]
|
||||||
|
|
@ -98,7 +98,7 @@ offline-storage = [
|
||||||
"pymilvus>=2.6.2,<3.0.0",
|
"pymilvus>=2.6.2,<3.0.0",
|
||||||
"pymongo>=4.0.0,<5.0.0",
|
"pymongo>=4.0.0,<5.0.0",
|
||||||
"asyncpg>=0.29.0,<1.0.0",
|
"asyncpg>=0.29.0,<1.0.0",
|
||||||
"qdrant-client>=1.7.0,<2.0.0",
|
"qdrant-client>=1.11.0,<2.0.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
offline-llm = [
|
offline-llm = [
|
||||||
|
|
|
||||||
|
|
@ -10,6 +10,6 @@
|
||||||
# Document processing dependencies (with version constraints matching pyproject.toml)
|
# Document processing dependencies (with version constraints matching pyproject.toml)
|
||||||
openpyxl>=3.0.0,<4.0.0
|
openpyxl>=3.0.0,<4.0.0
|
||||||
pycryptodome>=3.0.0,<4.0.0
|
pycryptodome>=3.0.0,<4.0.0
|
||||||
pypdf2>=3.0.0
|
pypdf>=6.1.0
|
||||||
python-docx>=0.8.11,<2.0.0
|
python-docx>=0.8.11,<2.0.0
|
||||||
python-pptx>=0.6.21,<2.0.0
|
python-pptx>=0.6.21,<2.0.0
|
||||||
|
|
|
||||||
|
|
@ -12,5 +12,5 @@ asyncpg>=0.29.0,<1.0.0
|
||||||
neo4j>=5.0.0,<7.0.0
|
neo4j>=5.0.0,<7.0.0
|
||||||
pymilvus>=2.6.2,<3.0.0
|
pymilvus>=2.6.2,<3.0.0
|
||||||
pymongo>=4.0.0,<5.0.0
|
pymongo>=4.0.0,<5.0.0
|
||||||
qdrant-client>=1.7.0,<2.0.0
|
qdrant-client>=1.11.0,<2.0.0
|
||||||
redis>=5.0.0,<8.0.0
|
redis>=5.0.0,<8.0.0
|
||||||
|
|
|
||||||
|
|
@ -24,10 +24,10 @@ openpyxl>=3.0.0,<4.0.0
|
||||||
pycryptodome>=3.0.0,<4.0.0
|
pycryptodome>=3.0.0,<4.0.0
|
||||||
pymilvus>=2.6.2,<3.0.0
|
pymilvus>=2.6.2,<3.0.0
|
||||||
pymongo>=4.0.0,<5.0.0
|
pymongo>=4.0.0,<5.0.0
|
||||||
pypdf2>=3.0.0
|
pypdf>=6.1.0
|
||||||
python-docx>=0.8.11,<2.0.0
|
python-docx>=0.8.11,<2.0.0
|
||||||
python-pptx>=0.6.21,<2.0.0
|
python-pptx>=0.6.21,<2.0.0
|
||||||
qdrant-client>=1.7.0,<2.0.0
|
qdrant-client>=1.11.0,<2.0.0
|
||||||
redis>=5.0.0,<8.0.0
|
redis>=5.0.0,<8.0.0
|
||||||
voyageai>=0.2.0,<1.0.0
|
voyageai>=0.2.0,<1.0.0
|
||||||
zhipuai>=2.0.0,<3.0.0
|
zhipuai>=2.0.0,<3.0.0
|
||||||
|
|
|
||||||
387
tests/test_write_json_optimization.py
Normal file
387
tests/test_write_json_optimization.py
Normal file
|
|
@ -0,0 +1,387 @@
|
||||||
|
"""
|
||||||
|
Test suite for write_json optimization
|
||||||
|
|
||||||
|
This test verifies:
|
||||||
|
1. Fast path works for clean data (no sanitization)
|
||||||
|
2. Slow path applies sanitization for dirty data
|
||||||
|
3. Sanitization is done during encoding (memory-efficient)
|
||||||
|
4. Reloading updates shared memory with cleaned data
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
import tempfile
|
||||||
|
from lightrag.utils import write_json, load_json, SanitizingJSONEncoder
|
||||||
|
|
||||||
|
|
||||||
|
class TestWriteJsonOptimization:
|
||||||
|
"""Test write_json optimization with two-stage approach"""
|
||||||
|
|
||||||
|
def test_fast_path_clean_data(self):
|
||||||
|
"""Test that clean data takes the fast path without sanitization"""
|
||||||
|
clean_data = {
|
||||||
|
"name": "John Doe",
|
||||||
|
"age": 30,
|
||||||
|
"items": ["apple", "banana", "cherry"],
|
||||||
|
"nested": {"key": "value", "number": 42},
|
||||||
|
}
|
||||||
|
|
||||||
|
with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json") as f:
|
||||||
|
temp_file = f.name
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Write clean data - should return False (no sanitization)
|
||||||
|
needs_reload = write_json(clean_data, temp_file)
|
||||||
|
assert not needs_reload, "Clean data should not require sanitization"
|
||||||
|
|
||||||
|
# Verify data was written correctly
|
||||||
|
loaded_data = load_json(temp_file)
|
||||||
|
assert loaded_data == clean_data, "Loaded data should match original"
|
||||||
|
finally:
|
||||||
|
os.unlink(temp_file)
|
||||||
|
|
||||||
|
def test_slow_path_dirty_data(self):
|
||||||
|
"""Test that dirty data triggers sanitization"""
|
||||||
|
# Create data with surrogate characters (U+D800 to U+DFFF)
|
||||||
|
dirty_string = "Hello\ud800World" # Contains surrogate character
|
||||||
|
dirty_data = {"text": dirty_string, "number": 123}
|
||||||
|
|
||||||
|
with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json") as f:
|
||||||
|
temp_file = f.name
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Write dirty data - should return True (sanitization applied)
|
||||||
|
needs_reload = write_json(dirty_data, temp_file)
|
||||||
|
assert needs_reload, "Dirty data should trigger sanitization"
|
||||||
|
|
||||||
|
# Verify data was written and sanitized
|
||||||
|
loaded_data = load_json(temp_file)
|
||||||
|
assert loaded_data is not None, "Data should be written"
|
||||||
|
assert loaded_data["number"] == 123, "Clean fields should remain unchanged"
|
||||||
|
# Surrogate character should be removed
|
||||||
|
assert (
|
||||||
|
"\ud800" not in loaded_data["text"]
|
||||||
|
), "Surrogate character should be removed"
|
||||||
|
finally:
|
||||||
|
os.unlink(temp_file)
|
||||||
|
|
||||||
|
def test_sanitizing_encoder_removes_surrogates(self):
|
||||||
|
"""Test that SanitizingJSONEncoder removes surrogate characters"""
|
||||||
|
data_with_surrogates = {
|
||||||
|
"text": "Hello\ud800\udc00World", # Contains surrogate pair
|
||||||
|
"clean": "Clean text",
|
||||||
|
"nested": {"dirty_key\ud801": "value", "clean_key": "clean\ud802value"},
|
||||||
|
}
|
||||||
|
|
||||||
|
# Encode using custom encoder
|
||||||
|
encoded = json.dumps(
|
||||||
|
data_with_surrogates, cls=SanitizingJSONEncoder, ensure_ascii=False
|
||||||
|
)
|
||||||
|
|
||||||
|
# Verify no surrogate characters in output
|
||||||
|
assert "\ud800" not in encoded, "Surrogate U+D800 should be removed"
|
||||||
|
assert "\udc00" not in encoded, "Surrogate U+DC00 should be removed"
|
||||||
|
assert "\ud801" not in encoded, "Surrogate U+D801 should be removed"
|
||||||
|
assert "\ud802" not in encoded, "Surrogate U+D802 should be removed"
|
||||||
|
|
||||||
|
# Verify clean parts remain
|
||||||
|
assert "Clean text" in encoded, "Clean text should remain"
|
||||||
|
assert "clean_key" in encoded, "Clean keys should remain"
|
||||||
|
|
||||||
|
def test_nested_structure_sanitization(self):
|
||||||
|
"""Test sanitization of deeply nested structures"""
|
||||||
|
nested_data = {
|
||||||
|
"level1": {
|
||||||
|
"level2": {
|
||||||
|
"level3": {"dirty": "text\ud800here", "clean": "normal text"},
|
||||||
|
"list": ["item1", "item\ud801dirty", "item3"],
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json") as f:
|
||||||
|
temp_file = f.name
|
||||||
|
|
||||||
|
try:
|
||||||
|
needs_reload = write_json(nested_data, temp_file)
|
||||||
|
assert needs_reload, "Nested dirty data should trigger sanitization"
|
||||||
|
|
||||||
|
# Verify nested structure is preserved
|
||||||
|
loaded_data = load_json(temp_file)
|
||||||
|
assert "level1" in loaded_data
|
||||||
|
assert "level2" in loaded_data["level1"]
|
||||||
|
assert "level3" in loaded_data["level1"]["level2"]
|
||||||
|
|
||||||
|
# Verify surrogates are removed
|
||||||
|
dirty_text = loaded_data["level1"]["level2"]["level3"]["dirty"]
|
||||||
|
assert "\ud800" not in dirty_text, "Nested surrogate should be removed"
|
||||||
|
|
||||||
|
# Verify list items are sanitized
|
||||||
|
list_items = loaded_data["level1"]["level2"]["list"]
|
||||||
|
assert (
|
||||||
|
"\ud801" not in list_items[1]
|
||||||
|
), "List item surrogates should be removed"
|
||||||
|
finally:
|
||||||
|
os.unlink(temp_file)
|
||||||
|
|
||||||
|
def test_unicode_non_characters_removed(self):
    """Check that U+FFFE and U+FFFF do not cause encoding errors on write.

    Despite the method name, this test expects the two non-characters to
    survive: they are encodable as UTF-8 (though discouraged), so write_json
    is expected to report that no sanitization was needed and the reloaded
    data must equal the input. Per the original note, they are only stripped
    when SanitizingJSONEncoder is used explicitly.
    """
    payload = {"text1": "Hello\ufffeWorld", "text2": "Test\uffffString"}

    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json") as handle:
        json_path = handle.name

    try:
        # Encodable input is expected to go through the fast path
        was_sanitized = write_json(payload, json_path)
        assert not was_sanitized, "U+FFFE/U+FFFF are valid UTF-8 characters"

        # Fast path writes the data untouched, so the round trip is lossless
        round_tripped = load_json(json_path)
        assert round_tripped == payload
    finally:
        os.unlink(json_path)
|
||||||
|
|
||||||
|
def test_mixed_clean_dirty_data(self):
    """Check a payload that mixes clean values with surrogate-bearing ones.

    The write must report sanitization, the clean values (string, number,
    boolean, null, list) must come back as written, and the two dirty strings
    must come back without their surrogate code points.
    """
    payload = {
        "clean_field": "This is perfectly fine",
        "dirty_field": "This has\ud800issues",
        "number": 42,
        "boolean": True,
        "null_value": None,
        "clean_list": [1, 2, 3],
        "dirty_list": ["clean", "dirty\ud801item"],
    }

    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json") as handle:
        json_path = handle.name

    try:
        was_sanitized = write_json(payload, json_path)
        assert was_sanitized, "Mixed data with dirty fields should trigger sanitization"

        reloaded = load_json(json_path)

        # Values that never contained a surrogate must be untouched
        assert reloaded["clean_field"] == "This is perfectly fine"
        assert reloaded["number"] == 42
        assert reloaded["boolean"]
        assert reloaded["null_value"] is None
        assert reloaded["clean_list"] == [1, 2, 3]

        # Values that did contain one must have lost it
        cleaned_string = reloaded["dirty_field"]
        assert "\ud800" not in cleaned_string
        cleaned_item = reloaded["dirty_list"][1]
        assert "\ud801" not in cleaned_item
    finally:
        os.unlink(json_path)
|
||||||
|
|
||||||
|
def test_empty_and_none_strings(self):
    """Check that empty and falsy values round-trip without sanitization.

    Covers the empty string, None, 0, False, an empty list and an empty dict.
    """
    payload = {
        "empty": "",
        "none": None,
        "zero": 0,
        "false": False,
        "empty_list": [],
        "empty_dict": {},
    }

    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json") as handle:
        json_path = handle.name

    try:
        was_sanitized = write_json(payload, json_path)
        assert not was_sanitized, "Clean empty values should not trigger sanitization"

        round_tripped = load_json(json_path)
        assert round_tripped == payload, "Empty/None values should be preserved"
    finally:
        os.unlink(json_path)
|
||||||
|
|
||||||
|
def test_specific_surrogate_udc9a(self):
    """Regression check for the lone surrogate U+DC9A named in the issue.

    The reported failure was a UnicodeEncodeError from the 'utf-8' codec on
    this exact code point, so the payload embeds it in an otherwise ordinary
    string next to clean fields.
    """
    payload = {
        "text": "Some text with surrogate\udc9acharacter",
        "position": 201,  # position quoted in the original error report
        "clean_field": "Normal text",
    }

    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json") as handle:
        json_path = handle.name

    try:
        # The write is expected to take the sanitizing path
        was_sanitized = write_json(payload, json_path)
        assert was_sanitized, "Data with \\udc9a should trigger sanitization"

        # The file must load and the offending code point must be gone
        reloaded = load_json(json_path)
        assert reloaded is not None
        assert "\udc9a" not in reloaded["text"], "\\udc9a should be removed"
        untouched = reloaded["clean_field"]
        assert untouched == "Normal text", "Clean fields should remain"
    finally:
        os.unlink(json_path)
|
||||||
|
|
||||||
|
def test_migration_with_surrogate_sanitization(self):
    """Check that legacy-style cache data with surrogates ends up clean on disk.

    Models a legacy cache holding surrogate characters: two cache entries,
    one dirty and one clean, are dumped with SanitizingJSONEncoder and then
    reloaded to confirm the surrogates are gone and the clean entry is intact.
    """
    # Shape mimics legacy cache entries; only the first one is dirty
    legacy_cache = {
        "cache_entry_1": {
            "return": "Result with\ud800surrogate",
            "cache_type": "extract",
            "original_prompt": "Some\udc9aprompt",
        },
        "cache_entry_2": {
            "return": "Clean result",
            "cache_type": "query",
            "original_prompt": "Clean prompt",
        },
    }

    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json") as handle:
        json_path = handle.name

    try:
        # Dump straight to disk; the custom encoder lets the write succeed
        # even though the payload contains surrogates
        with open(json_path, "w", encoding="utf-8") as sink:
            json.dump(
                legacy_cache,
                sink,
                cls=SanitizingJSONEncoder,
                ensure_ascii=False,
            )

        # Whatever was written must be loadable
        reloaded = load_json(json_path)
        assert reloaded is not None

        # Dirty entry: both surrogate-bearing fields must be clean now
        dirty_entry = reloaded["cache_entry_1"]
        assert "\ud800" not in dirty_entry["return"], "Surrogate in return should be removed"
        assert (
            "\udc9a" not in reloaded["cache_entry_1"]["original_prompt"]
        ), "Surrogate in prompt should be removed"

        # Clean entry: value must be exactly what was written
        clean_return = reloaded["cache_entry_2"]["return"]
        assert clean_return == "Clean result", "Clean data should remain"

    finally:
        os.unlink(json_path)
|
||||||
|
|
||||||
|
def test_empty_values_after_sanitization(self):
    """Check the reload edge case where sanitized data is empty or falsy.

    Callers must test the reloaded data with ``is not None`` rather than by
    truthiness, because the truth value of a dict depends on its content and
    not on whether it exists. Two situations are pinned here: a dict whose
    values sanitize down to empty strings (still truthy, it has keys), and a
    genuinely empty dict (falsy, yet not None).
    """
    # Every value consists solely of surrogate code points
    surrogate_only = {
        "key1": "\ud800\udc00\ud801",
        "key2": "\ud802\ud803",
    }

    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json") as handle:
        json_path = handle.name

    try:
        # The write is expected to take the sanitizing path
        was_sanitized = write_json(surrogate_only, json_path)
        assert was_sanitized, "All-dirty data should trigger sanitization"

        reloaded = load_json(json_path)

        assert reloaded is not None, "Cleaned data should not be None"
        # Keys survive; their values collapse to empty strings
        expected_after_cleaning = {"key1": "", "key2": ""}
        assert reloaded == expected_after_cleaning, "Surrogates should be removed, keys preserved"
        # A dict with keys is truthy no matter how empty its values are
        assert reloaded, "Dict with keys is truthy"

        # Now the real edge case: nothing in the dict at all
        second_write_sanitized = write_json({}, json_path)
        assert not second_write_sanitized, "Empty dict is clean"

        empty_reloaded = load_json(json_path)
        assert empty_reloaded is not None, "Empty dict should not be None"
        assert empty_reloaded == {}, "Empty dict should remain empty"
        assert not empty_reloaded, "Empty dict evaluates to False (the critical check)"

    finally:
        os.unlink(json_path)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: run the listed tests in order without a test runner.
    # The first failing assertion aborts the run with its traceback.
    test = TestWriteJsonOptimization()

    for case_name in (
        "test_fast_path_clean_data",
        "test_slow_path_dirty_data",
        "test_sanitizing_encoder_removes_surrogates",
        "test_nested_structure_sanitization",
        "test_unicode_non_characters_removed",
        "test_mixed_clean_dirty_data",
        "test_empty_and_none_strings",
        "test_specific_surrogate_udc9a",
        "test_migration_with_surrogate_sanitization",
        "test_empty_values_after_sanitization",
    ):
        print("Running " + case_name + "...")
        getattr(test, case_name)()
        print("✓ Passed")

    print("\n✅ All tests passed!")
|
||||||
17
uv.lock
generated
17
uv.lock
generated
|
|
@ -1981,7 +1981,7 @@ offline = [
|
||||||
{ name = "pycryptodome" },
|
{ name = "pycryptodome" },
|
||||||
{ name = "pymilvus" },
|
{ name = "pymilvus" },
|
||||||
{ name = "pymongo" },
|
{ name = "pymongo" },
|
||||||
{ name = "pypdf2" },
|
{ name = "pypdf" },
|
||||||
{ name = "python-docx" },
|
{ name = "python-docx" },
|
||||||
{ name = "python-pptx" },
|
{ name = "python-pptx" },
|
||||||
{ name = "qdrant-client" },
|
{ name = "qdrant-client" },
|
||||||
|
|
@ -1992,7 +1992,7 @@ offline = [
|
||||||
offline-docs = [
|
offline-docs = [
|
||||||
{ name = "openpyxl" },
|
{ name = "openpyxl" },
|
||||||
{ name = "pycryptodome" },
|
{ name = "pycryptodome" },
|
||||||
{ name = "pypdf2" },
|
{ name = "pypdf" },
|
||||||
{ name = "python-docx" },
|
{ name = "python-docx" },
|
||||||
{ name = "python-pptx" },
|
{ name = "python-pptx" },
|
||||||
]
|
]
|
||||||
|
|
@ -2071,7 +2071,7 @@ requires-dist = [
|
||||||
{ name = "pyjwt", marker = "extra == 'api'", specifier = ">=2.8.0,<3.0.0" },
|
{ name = "pyjwt", marker = "extra == 'api'", specifier = ">=2.8.0,<3.0.0" },
|
||||||
{ name = "pymilvus", marker = "extra == 'offline-storage'", specifier = ">=2.6.2,<3.0.0" },
|
{ name = "pymilvus", marker = "extra == 'offline-storage'", specifier = ">=2.6.2,<3.0.0" },
|
||||||
{ name = "pymongo", marker = "extra == 'offline-storage'", specifier = ">=4.0.0,<5.0.0" },
|
{ name = "pymongo", marker = "extra == 'offline-storage'", specifier = ">=4.0.0,<5.0.0" },
|
||||||
{ name = "pypdf2", marker = "extra == 'offline-docs'", specifier = ">=3.0.0" },
|
{ name = "pypdf", marker = "extra == 'offline-docs'", specifier = ">=6.1.0" },
|
||||||
{ name = "pypinyin" },
|
{ name = "pypinyin" },
|
||||||
{ name = "pypinyin", marker = "extra == 'api'" },
|
{ name = "pypinyin", marker = "extra == 'api'" },
|
||||||
{ name = "pytest", marker = "extra == 'evaluation'", specifier = ">=8.4.2" },
|
{ name = "pytest", marker = "extra == 'evaluation'", specifier = ">=8.4.2" },
|
||||||
|
|
@ -2083,7 +2083,7 @@ requires-dist = [
|
||||||
{ name = "python-multipart", marker = "extra == 'api'" },
|
{ name = "python-multipart", marker = "extra == 'api'" },
|
||||||
{ name = "python-pptx", marker = "extra == 'offline-docs'", specifier = ">=0.6.21,<2.0.0" },
|
{ name = "python-pptx", marker = "extra == 'offline-docs'", specifier = ">=0.6.21,<2.0.0" },
|
||||||
{ name = "pytz", marker = "extra == 'api'" },
|
{ name = "pytz", marker = "extra == 'api'" },
|
||||||
{ name = "qdrant-client", marker = "extra == 'offline-storage'", specifier = ">=1.7.0,<2.0.0" },
|
{ name = "qdrant-client", marker = "extra == 'offline-storage'", specifier = ">=1.11.0,<2.0.0" },
|
||||||
{ name = "ragas", marker = "extra == 'evaluation'", specifier = ">=0.3.7" },
|
{ name = "ragas", marker = "extra == 'evaluation'", specifier = ">=0.3.7" },
|
||||||
{ name = "redis", marker = "extra == 'offline-storage'", specifier = ">=5.0.0,<8.0.0" },
|
{ name = "redis", marker = "extra == 'offline-storage'", specifier = ">=5.0.0,<8.0.0" },
|
||||||
{ name = "setuptools" },
|
{ name = "setuptools" },
|
||||||
|
|
@ -3977,15 +3977,6 @@ wheels = [
|
||||||
{ url = "https://files.pythonhosted.org/packages/fa/ed/494fd0cc1190a7c335e6958eeaee6f373a281869830255c2ed4785dac135/pypdf-6.1.3-py3-none-any.whl", hash = "sha256:eb049195e46f014fc155f566fa20e09d70d4646a9891164ac25fa0cbcfcdbcb5", size = 323863, upload-time = "2025-10-22T16:13:44.174Z" },
|
{ url = "https://files.pythonhosted.org/packages/fa/ed/494fd0cc1190a7c335e6958eeaee6f373a281869830255c2ed4785dac135/pypdf-6.1.3-py3-none-any.whl", hash = "sha256:eb049195e46f014fc155f566fa20e09d70d4646a9891164ac25fa0cbcfcdbcb5", size = 323863, upload-time = "2025-10-22T16:13:44.174Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "pypdf2"
|
|
||||||
version = "3.0.1"
|
|
||||||
source = { registry = "https://pypi.org/simple" }
|
|
||||||
sdist = { url = "https://files.pythonhosted.org/packages/9f/bb/18dc3062d37db6c491392007dfd1a7f524bb95886eb956569ac38a23a784/PyPDF2-3.0.1.tar.gz", hash = "sha256:a74408f69ba6271f71b9352ef4ed03dc53a31aa404d29b5d31f53bfecfee1440", size = 227419, upload-time = "2022-12-31T10:36:13.13Z" }
|
|
||||||
wheels = [
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/8e/5e/c86a5643653825d3c913719e788e41386bee415c2b87b4f955432f2de6b2/pypdf2-3.0.1-py3-none-any.whl", hash = "sha256:d16e4205cfee272fbdc0568b68d82be796540b1537508cef59388f839c191928", size = 232572, upload-time = "2022-12-31T10:36:10.327Z" },
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "pypinyin"
|
name = "pypinyin"
|
||||||
version = "0.55.0"
|
version = "0.55.0"
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue