diff --git a/env.example b/env.example index 534bd22a..a95ff9bf 100644 --- a/env.example +++ b/env.example @@ -29,7 +29,7 @@ WEBUI_DESCRIPTION="Simple and Fast Graph Based RAG System" # OLLAMA_EMULATING_MODEL_NAME=lightrag OLLAMA_EMULATING_MODEL_TAG=latest -### Max nodes return from graph retrieval in webui +### Max nodes for graph retrieval (Ensure WebUI local settings are also updated, which is limited to this value) # MAX_GRAPH_NODES=1000 ### Logging level @@ -172,6 +172,8 @@ MAX_PARALLEL_INSERT=2 ### LLM Configuration ### LLM_BINDING type: openai, ollama, lollms, azure_openai, aws_bedrock, gemini ### LLM_BINDING_HOST: host only for Ollama, endpoint for other LLM service +### If LightRAG deployed in Docker: +### uses host.docker.internal instead of localhost in LLM_BINDING_HOST ########################################################################### ### LLM request timeout setting for all llm (0 means no timeout for Ollma) # LLM_TIMEOUT=180 @@ -181,7 +183,7 @@ LLM_MODEL=gpt-4o LLM_BINDING_HOST=https://api.openai.com/v1 LLM_BINDING_API_KEY=your_api_key -### Optional for Azure +### Env vars for Azure openai # AZURE_OPENAI_API_VERSION=2024-08-01-preview # AZURE_OPENAI_DEPLOYMENT=gpt-4o @@ -196,22 +198,16 @@ LLM_BINDING_API_KEY=your_api_key # LLM_MODEL=gemini-flash-latest # LLM_BINDING_API_KEY=your_gemini_api_key # LLM_BINDING_HOST=https://generativelanguage.googleapis.com -GEMINI_LLM_THINKING_CONFIG='{"thinking_budget": 0, "include_thoughts": false}' + +### use the following command to see all support options for OpenAI, azure_openai or OpenRouter +### lightrag-server --llm-binding gemini --help +### Gemini Specific Parameters # GEMINI_LLM_MAX_OUTPUT_TOKENS=9000 # GEMINI_LLM_TEMPERATURE=0.7 - -### OpenAI Compatible API Specific Parameters -### Increased temperature values may mitigate infinite inference loops in certain LLM, such as Qwen3-30B. -# OPENAI_LLM_TEMPERATURE=0.9 -### Set the max_tokens to mitigate endless output of some LLM (less than LLM_TIMEOUT * llm_output_tokens/second, i.e. 9000 = 180s * 50 tokens/s) -### Typically, max_tokens does not include prompt content, though some models, such as Gemini Models, are exceptions -### For vLLM/SGLang deployed models, or most of OpenAI compatible API provider -# OPENAI_LLM_MAX_TOKENS=9000 -### For OpenAI o1-mini or newer modles -OPENAI_LLM_MAX_COMPLETION_TOKENS=9000 - -#### OpenAI's new API utilizes max_completion_tokens instead of max_tokens -# OPENAI_LLM_MAX_COMPLETION_TOKENS=9000 +### Enable Thinking +# GEMINI_LLM_THINKING_CONFIG='{"thinking_budget": -1, "include_thoughts": true}' +### Disable Thinking +# GEMINI_LLM_THINKING_CONFIG='{"thinking_budget": 0, "include_thoughts": false}' ### use the following command to see all support options for OpenAI, azure_openai or OpenRouter ### lightrag-server --llm-binding openai --help @@ -222,8 +218,17 @@ OPENAI_LLM_MAX_COMPLETION_TOKENS=9000 ### Qwen3 Specific Parameters deploy by vLLM # OPENAI_LLM_EXTRA_BODY='{"chat_template_kwargs": {"enable_thinking": false}}' +### OpenAI Compatible API Specific Parameters +### Increased temperature values may mitigate infinite inference loops in certain LLM, such as Qwen3-30B. +# OPENAI_LLM_TEMPERATURE=0.9 +### Set the max_tokens to mitigate endless output of some LLM (less than LLM_TIMEOUT * llm_output_tokens/second, i.e. 
9000 = 180s * 50 tokens/s) +### Typically, max_tokens does not include prompt content +### For vLLM/SGLang deployed models, or most of OpenAI compatible API provider +# OPENAI_LLM_MAX_TOKENS=9000 +### For OpenAI o1-mini or newer modles utilizes max_completion_tokens instead of max_tokens +OPENAI_LLM_MAX_COMPLETION_TOKENS=9000 + ### use the following command to see all support options for Ollama LLM -### If LightRAG deployed in Docker uses host.docker.internal instead of localhost in LLM_BINDING_HOST ### lightrag-server --llm-binding ollama --help ### Ollama Server Specific Parameters ### OLLAMA_LLM_NUM_CTX must be provided, and should at least larger than MAX_TOTAL_TOKENS + 2000 @@ -240,6 +245,8 @@ OLLAMA_LLM_NUM_CTX=32768 ### Embedding Configuration (Should not be changed after the first file processed) ### EMBEDDING_BINDING: ollama, openai, azure_openai, jina, lollms, aws_bedrock ### EMBEDDING_BINDING_HOST: host only for Ollama, endpoint for other Embedding service +### If LightRAG deployed in Docker: +### uses host.docker.internal instead of localhost in EMBEDDING_BINDING_HOST ####################################################################################### # EMBEDDING_TIMEOUT=30 diff --git a/lightrag/api/__init__.py b/lightrag/api/__init__.py index 02b01dcc..8fdc9fae 100644 --- a/lightrag/api/__init__.py +++ b/lightrag/api/__init__.py @@ -1 +1 @@ -__api_version__ = "0252" +__api_version__ = "0254" diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index 3e479a53..f86bcea8 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -1081,11 +1081,11 @@ async def pipeline_enqueue_file( result = converter.convert(file_path) content = result.document.export_to_markdown() else: - if not pm.is_installed("pypdf2"): # type: ignore - pm.install("pypdf2") + if not pm.is_installed("pypdf"): # type: ignore + pm.install("pypdf") if not pm.is_installed("pycryptodome"): # type: ignore pm.install("pycryptodome") - from PyPDF2 import PdfReader # type: ignore + from pypdf import PdfReader # type: ignore from io import BytesIO pdf_file = BytesIO(file) diff --git a/lightrag/kg/json_doc_status_impl.py b/lightrag/kg/json_doc_status_impl.py index 014499f2..bf6e7b17 100644 --- a/lightrag/kg/json_doc_status_impl.py +++ b/lightrag/kg/json_doc_status_impl.py @@ -161,7 +161,20 @@ class JsonDocStatusStorage(DocStatusStorage): logger.debug( f"[{self.workspace}] Process {os.getpid()} doc status writting {len(data_dict)} records to {self.namespace}" ) - write_json(data_dict, self._file_name) + + # Write JSON and check if sanitization was applied + needs_reload = write_json(data_dict, self._file_name) + + # If data was sanitized, reload cleaned data to update shared memory + if needs_reload: + logger.info( + f"[{self.workspace}] Reloading sanitized data into shared memory for {self.namespace}" + ) + cleaned_data = load_json(self._file_name) + if cleaned_data is not None: + self._data.clear() + self._data.update(cleaned_data) + await clear_all_update_flags(self.final_namespace) async def upsert(self, data: dict[str, dict[str, Any]]) -> None: diff --git a/lightrag/kg/json_kv_impl.py b/lightrag/kg/json_kv_impl.py index fd016b14..f9adb20f 100644 --- a/lightrag/kg/json_kv_impl.py +++ b/lightrag/kg/json_kv_impl.py @@ -81,7 +81,20 @@ class JsonKVStorage(BaseKVStorage): logger.debug( f"[{self.workspace}] Process {os.getpid()} KV writting {data_count} records to {self.namespace}" ) - write_json(data_dict, self._file_name) + + # Write 
JSON and check if sanitization was applied + needs_reload = write_json(data_dict, self._file_name) + + # If data was sanitized, reload cleaned data to update shared memory + if needs_reload: + logger.info( + f"[{self.workspace}] Reloading sanitized data into shared memory for {self.namespace}" + ) + cleaned_data = load_json(self._file_name) + if cleaned_data is not None: + self._data.clear() + self._data.update(cleaned_data) + await clear_all_update_flags(self.final_namespace) async def get_by_id(self, id: str) -> dict[str, Any] | None: @@ -224,7 +237,7 @@ class JsonKVStorage(BaseKVStorage): data: Original data dictionary that may contain legacy structure Returns: - Migrated data dictionary with flattened cache keys + Migrated data dictionary with flattened cache keys (sanitized if needed) """ from lightrag.utils import generate_cache_key @@ -261,8 +274,17 @@ class JsonKVStorage(BaseKVStorage): logger.info( f"[{self.workspace}] Migrated {migration_count} legacy cache entries to flattened structure" ) - # Persist migrated data immediately - write_json(migrated_data, self._file_name) + # Persist migrated data immediately and check if sanitization was applied + needs_reload = write_json(migrated_data, self._file_name) + + # If data was sanitized during write, reload cleaned data + if needs_reload: + logger.info( + f"[{self.workspace}] Reloading sanitized migration data for {self.namespace}" + ) + cleaned_data = load_json(self._file_name) + if cleaned_data is not None: + return cleaned_data # Return cleaned data to update shared memory return migrated_data diff --git a/lightrag/tools/clean_llm_query_cache.py b/lightrag/tools/clean_llm_query_cache.py index 1b688c29..eca658c7 100644 --- a/lightrag/tools/clean_llm_query_cache.py +++ b/lightrag/tools/clean_llm_query_cache.py @@ -873,6 +873,31 @@ class CleanupTool: storage_name = STORAGE_TYPES[choice] + # Special warning for JsonKVStorage about concurrent access + if storage_name == "JsonKVStorage": + print("\n" + "=" * 60) + print(f"{BOLD_RED}⚠️ IMPORTANT WARNING - JsonKVStorage Concurrency{RESET}") + print("=" * 60) + print("\nJsonKVStorage is an in-memory database that does NOT support") + print("concurrent access to the same file by multiple programs.") + print("\nBefore proceeding, please ensure that:") + print(" • LightRAG Server is completely shut down") + print(" • No other programs are accessing the storage files") + print("\n" + "=" * 60) + + confirm = ( + input("\nHas LightRAG Server been shut down? (yes/no): ") + .strip() + .lower() + ) + if confirm != "yes": + print( + "\n✓ Operation cancelled - Please shut down LightRAG Server first" + ) + return None, None, None + + print("✓ Proceeding with JsonKVStorage cleanup...") + # Check configuration (warnings only, doesn't block) print("\nChecking configuration...") self.check_env_vars(storage_name) diff --git a/lightrag/utils.py b/lightrag/utils.py index 460ede3c..b78b7523 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -56,6 +56,9 @@ if not logger.handlers: # Set httpx logging level to WARNING logging.getLogger("httpx").setLevel(logging.WARNING) +# Precompile regex pattern for JSON sanitization (module-level, compiled once) +_SURROGATE_PATTERN = re.compile(r"[\uD800-\uDFFF\uFFFE\uFFFF]") + # Global import for pypinyin with startup-time logging try: import pypinyin @@ -927,9 +930,123 @@ def load_json(file_name): return json.load(f) +def _sanitize_string_for_json(text: str) -> str: + """Remove characters that cannot be encoded in UTF-8 for JSON serialization. 
+ + Uses regex for optimal performance with zero-copy optimization for clean strings. + Fast detection path for clean strings (99% of cases) with efficient removal for dirty strings. + + Args: + text: String to sanitize + + Returns: + Original string if clean (zero-copy), sanitized string if dirty + """ + if not text: + return text + + # Fast path: Check if sanitization is needed using C-level regex search + if not _SURROGATE_PATTERN.search(text): + return text # Zero-copy for clean strings - most common case + + # Slow path: Remove problematic characters using C-level regex substitution + return _SURROGATE_PATTERN.sub("", text) + + +class SanitizingJSONEncoder(json.JSONEncoder): + """ + Custom JSON encoder that sanitizes data during serialization. + + This encoder cleans strings during the encoding process without creating + a full copy of the data structure, making it memory-efficient for large datasets. + """ + + def encode(self, o): + """Override encode method to handle simple string cases""" + if isinstance(o, str): + return json.encoder.encode_basestring(_sanitize_string_for_json(o)) + return super().encode(o) + + def iterencode(self, o, _one_shot=False): + """ + Override iterencode to sanitize strings during serialization. + This is the core method that handles complex nested structures. + """ + # Preprocess: sanitize all strings in the object + sanitized = self._sanitize_for_encoding(o) + + # Call parent's iterencode with sanitized data + for chunk in super().iterencode(sanitized, _one_shot): + yield chunk + + def _sanitize_for_encoding(self, obj): + """ + Recursively sanitize strings in an object. + Creates new objects only when necessary to avoid deep copies. + + Args: + obj: Object to sanitize + + Returns: + Sanitized object with cleaned strings + """ + if isinstance(obj, str): + return _sanitize_string_for_json(obj) + + elif isinstance(obj, dict): + # Create new dict with sanitized keys and values + new_dict = {} + for k, v in obj.items(): + clean_k = _sanitize_string_for_json(k) if isinstance(k, str) else k + clean_v = self._sanitize_for_encoding(v) + new_dict[clean_k] = clean_v + return new_dict + + elif isinstance(obj, (list, tuple)): + # Sanitize list/tuple elements + cleaned = [self._sanitize_for_encoding(item) for item in obj] + return type(obj)(cleaned) if isinstance(obj, tuple) else cleaned + + else: + # Numbers, booleans, None, etc. remain unchanged + return obj + + def write_json(json_obj, file_name): + """ + Write JSON data to file with optimized sanitization strategy. + + This function uses a two-stage approach: + 1. Fast path: Try direct serialization (works for clean data ~99% of time) + 2. Slow path: Use custom encoder that sanitizes during serialization + + The custom encoder approach avoids creating a deep copy of the data, + making it memory-efficient. When sanitization occurs, the caller should + reload the cleaned data from the file to update shared memory. 
+ + Args: + json_obj: Object to serialize (may be a shallow copy from shared memory) + file_name: Output file path + + Returns: + bool: True if sanitization was applied (caller should reload data), + False if direct write succeeded (no reload needed) + """ + try: + # Strategy 1: Fast path - try direct serialization + with open(file_name, "w", encoding="utf-8") as f: + json.dump(json_obj, f, indent=2, ensure_ascii=False) + return False # No sanitization needed, no reload required + + except (UnicodeEncodeError, UnicodeDecodeError) as e: + logger.debug(f"Direct JSON write failed, using sanitizing encoder: {e}") + + # Strategy 2: Use custom encoder (sanitizes during serialization, zero memory copy) with open(file_name, "w", encoding="utf-8") as f: - json.dump(json_obj, f, indent=2, ensure_ascii=False) + json.dump(json_obj, f, indent=2, ensure_ascii=False, cls=SanitizingJSONEncoder) + + logger.info(f"JSON sanitization applied during write: {file_name}") + return True # Sanitization applied, reload recommended class TokenizerInterface(Protocol): diff --git a/lightrag_webui/src/components/retrieval/QuerySettings.tsx b/lightrag_webui/src/components/retrieval/QuerySettings.tsx index 4ffebbb1..0b0096c0 100644 --- a/lightrag_webui/src/components/retrieval/QuerySettings.tsx +++ b/lightrag_webui/src/components/retrieval/QuerySettings.tsx @@ -40,7 +40,6 @@ export default function QuerySettings() { // Default values for reset functionality const defaultValues = useMemo(() => ({ mode: 'mix' as QueryMode, - response_type: 'Multiple Paragraphs', top_k: 40, chunk_top_k: 20, max_entity_tokens: 6000, @@ -153,46 +152,6 @@ export default function QuerySettings() { - {/* Response Format */} - <> - - - - - - -

-              {t('retrievePanel.querySettings.responseFormatTooltip')}
-
-
-
-
-
-              handleReset('response_type')}
-              title="Reset to default (Multiple Paragraphs)"
-            />
- - {/* Top K */} <> diff --git a/lightrag_webui/src/features/RetrievalTesting.tsx b/lightrag_webui/src/features/RetrievalTesting.tsx index 3fe8ec13..d7a475f6 100644 --- a/lightrag_webui/src/features/RetrievalTesting.tsx +++ b/lightrag_webui/src/features/RetrievalTesting.tsx @@ -357,6 +357,7 @@ export default function RetrievalTesting() { const queryParams = { ...state.querySettings, query: actualQuery, + response_type: 'Multiple Paragraphs', conversation_history: effectiveHistoryTurns > 0 ? prevMessages .filter((m) => m.isError !== true) diff --git a/lightrag_webui/src/stores/settings.ts b/lightrag_webui/src/stores/settings.ts index 966a2504..ac300af8 100644 --- a/lightrag_webui/src/stores/settings.ts +++ b/lightrag_webui/src/stores/settings.ts @@ -123,7 +123,6 @@ const useSettingsStoreBase = create()( querySettings: { mode: 'global', - response_type: 'Multiple Paragraphs', top_k: 40, chunk_top_k: 20, max_entity_tokens: 6000, @@ -239,7 +238,7 @@ const useSettingsStoreBase = create()( { name: 'settings-storage', storage: createJSONStorage(() => localStorage), - version: 18, + version: 19, migrate: (state: any, version: number) => { if (version < 2) { state.showEdgeLabel = false @@ -336,6 +335,12 @@ const useSettingsStoreBase = create()( // Add userPromptHistory field for older versions state.userPromptHistory = [] } + if (version < 19) { + // Remove deprecated response_type parameter + if (state.querySettings) { + delete state.querySettings.response_type + } + } return state } } diff --git a/pyproject.toml b/pyproject.toml index d0052b74..81e44aff 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -86,7 +86,7 @@ offline-docs = [ # Document processing dependencies "openpyxl>=3.0.0,<4.0.0", "pycryptodome>=3.0.0,<4.0.0", - "pypdf2>=3.0.0", + "pypdf>=6.1.0", "python-docx>=0.8.11,<2.0.0", "python-pptx>=0.6.21,<2.0.0", ] @@ -98,7 +98,7 @@ offline-storage = [ "pymilvus>=2.6.2,<3.0.0", "pymongo>=4.0.0,<5.0.0", "asyncpg>=0.29.0,<1.0.0", - "qdrant-client>=1.7.0,<2.0.0", + "qdrant-client>=1.11.0,<2.0.0", ] offline-llm = [ diff --git a/requirements-offline-docs.txt b/requirements-offline-docs.txt index 14d782fd..12f02080 100644 --- a/requirements-offline-docs.txt +++ b/requirements-offline-docs.txt @@ -10,6 +10,6 @@ # Document processing dependencies (with version constraints matching pyproject.toml) openpyxl>=3.0.0,<4.0.0 pycryptodome>=3.0.0,<4.0.0 -pypdf2>=3.0.0 +pypdf>=6.1.0 python-docx>=0.8.11,<2.0.0 python-pptx>=0.6.21,<2.0.0 diff --git a/requirements-offline-storage.txt b/requirements-offline-storage.txt index 6c960623..13a9c0e2 100644 --- a/requirements-offline-storage.txt +++ b/requirements-offline-storage.txt @@ -12,5 +12,5 @@ asyncpg>=0.29.0,<1.0.0 neo4j>=5.0.0,<7.0.0 pymilvus>=2.6.2,<3.0.0 pymongo>=4.0.0,<5.0.0 -qdrant-client>=1.7.0,<2.0.0 +qdrant-client>=1.11.0,<2.0.0 redis>=5.0.0,<8.0.0 diff --git a/requirements-offline.txt b/requirements-offline.txt index 8dfb1b01..50848093 100644 --- a/requirements-offline.txt +++ b/requirements-offline.txt @@ -24,10 +24,10 @@ openpyxl>=3.0.0,<4.0.0 pycryptodome>=3.0.0,<4.0.0 pymilvus>=2.6.2,<3.0.0 pymongo>=4.0.0,<5.0.0 -pypdf2>=3.0.0 +pypdf>=6.1.0 python-docx>=0.8.11,<2.0.0 python-pptx>=0.6.21,<2.0.0 -qdrant-client>=1.7.0,<2.0.0 +qdrant-client>=1.11.0,<2.0.0 redis>=5.0.0,<8.0.0 voyageai>=0.2.0,<1.0.0 zhipuai>=2.0.0,<3.0.0 diff --git a/tests/test_write_json_optimization.py b/tests/test_write_json_optimization.py new file mode 100644 index 00000000..0a92904f --- /dev/null +++ b/tests/test_write_json_optimization.py @@ -0,0 +1,387 @@ +""" +Test suite for 
write_json optimization + +This test verifies: +1. Fast path works for clean data (no sanitization) +2. Slow path applies sanitization for dirty data +3. Sanitization is done during encoding (memory-efficient) +4. Reloading updates shared memory with cleaned data +""" + +import os +import json +import tempfile +from lightrag.utils import write_json, load_json, SanitizingJSONEncoder + + +class TestWriteJsonOptimization: + """Test write_json optimization with two-stage approach""" + + def test_fast_path_clean_data(self): + """Test that clean data takes the fast path without sanitization""" + clean_data = { + "name": "John Doe", + "age": 30, + "items": ["apple", "banana", "cherry"], + "nested": {"key": "value", "number": 42}, + } + + with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json") as f: + temp_file = f.name + + try: + # Write clean data - should return False (no sanitization) + needs_reload = write_json(clean_data, temp_file) + assert not needs_reload, "Clean data should not require sanitization" + + # Verify data was written correctly + loaded_data = load_json(temp_file) + assert loaded_data == clean_data, "Loaded data should match original" + finally: + os.unlink(temp_file) + + def test_slow_path_dirty_data(self): + """Test that dirty data triggers sanitization""" + # Create data with surrogate characters (U+D800 to U+DFFF) + dirty_string = "Hello\ud800World" # Contains surrogate character + dirty_data = {"text": dirty_string, "number": 123} + + with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json") as f: + temp_file = f.name + + try: + # Write dirty data - should return True (sanitization applied) + needs_reload = write_json(dirty_data, temp_file) + assert needs_reload, "Dirty data should trigger sanitization" + + # Verify data was written and sanitized + loaded_data = load_json(temp_file) + assert loaded_data is not None, "Data should be written" + assert loaded_data["number"] == 123, "Clean fields should remain unchanged" + # Surrogate character should be removed + assert ( + "\ud800" not in loaded_data["text"] + ), "Surrogate character should be removed" + finally: + os.unlink(temp_file) + + def test_sanitizing_encoder_removes_surrogates(self): + """Test that SanitizingJSONEncoder removes surrogate characters""" + data_with_surrogates = { + "text": "Hello\ud800\udc00World", # Contains surrogate pair + "clean": "Clean text", + "nested": {"dirty_key\ud801": "value", "clean_key": "clean\ud802value"}, + } + + # Encode using custom encoder + encoded = json.dumps( + data_with_surrogates, cls=SanitizingJSONEncoder, ensure_ascii=False + ) + + # Verify no surrogate characters in output + assert "\ud800" not in encoded, "Surrogate U+D800 should be removed" + assert "\udc00" not in encoded, "Surrogate U+DC00 should be removed" + assert "\ud801" not in encoded, "Surrogate U+D801 should be removed" + assert "\ud802" not in encoded, "Surrogate U+D802 should be removed" + + # Verify clean parts remain + assert "Clean text" in encoded, "Clean text should remain" + assert "clean_key" in encoded, "Clean keys should remain" + + def test_nested_structure_sanitization(self): + """Test sanitization of deeply nested structures""" + nested_data = { + "level1": { + "level2": { + "level3": {"dirty": "text\ud800here", "clean": "normal text"}, + "list": ["item1", "item\ud801dirty", "item3"], + } + } + } + + with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json") as f: + temp_file = f.name + + try: + needs_reload = write_json(nested_data, temp_file) + 
assert needs_reload, "Nested dirty data should trigger sanitization" + + # Verify nested structure is preserved + loaded_data = load_json(temp_file) + assert "level1" in loaded_data + assert "level2" in loaded_data["level1"] + assert "level3" in loaded_data["level1"]["level2"] + + # Verify surrogates are removed + dirty_text = loaded_data["level1"]["level2"]["level3"]["dirty"] + assert "\ud800" not in dirty_text, "Nested surrogate should be removed" + + # Verify list items are sanitized + list_items = loaded_data["level1"]["level2"]["list"] + assert ( + "\ud801" not in list_items[1] + ), "List item surrogates should be removed" + finally: + os.unlink(temp_file) + + def test_unicode_non_characters_removed(self): + """Test that Unicode non-characters (U+FFFE, U+FFFF) don't cause encoding errors + + Note: U+FFFE and U+FFFF are valid UTF-8 characters (though discouraged), + so they don't trigger sanitization. They only get removed when explicitly + using the SanitizingJSONEncoder. + """ + data_with_nonchars = {"text1": "Hello\ufffeWorld", "text2": "Test\uffffString"} + + with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json") as f: + temp_file = f.name + + try: + # These characters are valid UTF-8, so they take the fast path + needs_reload = write_json(data_with_nonchars, temp_file) + assert not needs_reload, "U+FFFE/U+FFFF are valid UTF-8 characters" + + loaded_data = load_json(temp_file) + # They're written as-is in the fast path + assert loaded_data == data_with_nonchars + finally: + os.unlink(temp_file) + + def test_mixed_clean_dirty_data(self): + """Test data with both clean and dirty fields""" + mixed_data = { + "clean_field": "This is perfectly fine", + "dirty_field": "This has\ud800issues", + "number": 42, + "boolean": True, + "null_value": None, + "clean_list": [1, 2, 3], + "dirty_list": ["clean", "dirty\ud801item"], + } + + with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json") as f: + temp_file = f.name + + try: + needs_reload = write_json(mixed_data, temp_file) + assert ( + needs_reload + ), "Mixed data with dirty fields should trigger sanitization" + + loaded_data = load_json(temp_file) + + # Clean fields should remain unchanged + assert loaded_data["clean_field"] == "This is perfectly fine" + assert loaded_data["number"] == 42 + assert loaded_data["boolean"] + assert loaded_data["null_value"] is None + assert loaded_data["clean_list"] == [1, 2, 3] + + # Dirty fields should be sanitized + assert "\ud800" not in loaded_data["dirty_field"] + assert "\ud801" not in loaded_data["dirty_list"][1] + finally: + os.unlink(temp_file) + + def test_empty_and_none_strings(self): + """Test handling of empty and None values""" + data = { + "empty": "", + "none": None, + "zero": 0, + "false": False, + "empty_list": [], + "empty_dict": {}, + } + + with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json") as f: + temp_file = f.name + + try: + needs_reload = write_json(data, temp_file) + assert ( + not needs_reload + ), "Clean empty values should not trigger sanitization" + + loaded_data = load_json(temp_file) + assert loaded_data == data, "Empty/None values should be preserved" + finally: + os.unlink(temp_file) + + def test_specific_surrogate_udc9a(self): + """Test specific surrogate character \\udc9a mentioned in the issue""" + # Test the exact surrogate character from the error message: + # UnicodeEncodeError: 'utf-8' codec can't encode character '\\udc9a' + data_with_udc9a = { + "text": "Some text with surrogate\udc9acharacter", + "position": 
201, # As mentioned in the error + "clean_field": "Normal text", + } + + with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json") as f: + temp_file = f.name + + try: + # Write data - should trigger sanitization + needs_reload = write_json(data_with_udc9a, temp_file) + assert needs_reload, "Data with \\udc9a should trigger sanitization" + + # Verify surrogate was removed + loaded_data = load_json(temp_file) + assert loaded_data is not None + assert "\udc9a" not in loaded_data["text"], "\\udc9a should be removed" + assert ( + loaded_data["clean_field"] == "Normal text" + ), "Clean fields should remain" + finally: + os.unlink(temp_file) + + def test_migration_with_surrogate_sanitization(self): + """Test that migration process handles surrogate characters correctly + + This test simulates the scenario where legacy cache contains surrogate + characters and ensures they are cleaned during migration. + """ + # Simulate legacy cache data with surrogate characters + legacy_data_with_surrogates = { + "cache_entry_1": { + "return": "Result with\ud800surrogate", + "cache_type": "extract", + "original_prompt": "Some\udc9aprompt", + }, + "cache_entry_2": { + "return": "Clean result", + "cache_type": "query", + "original_prompt": "Clean prompt", + }, + } + + with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json") as f: + temp_file = f.name + + try: + # First write the dirty data directly (simulating legacy cache file) + # Use custom encoder to force write even with surrogates + with open(temp_file, "w", encoding="utf-8") as f: + json.dump( + legacy_data_with_surrogates, + f, + cls=SanitizingJSONEncoder, + ensure_ascii=False, + ) + + # Load and verify surrogates were cleaned during initial write + loaded_data = load_json(temp_file) + assert loaded_data is not None + + # The data should be sanitized + assert ( + "\ud800" not in loaded_data["cache_entry_1"]["return"] + ), "Surrogate in return should be removed" + assert ( + "\udc9a" not in loaded_data["cache_entry_1"]["original_prompt"] + ), "Surrogate in prompt should be removed" + + # Clean data should remain unchanged + assert ( + loaded_data["cache_entry_2"]["return"] == "Clean result" + ), "Clean data should remain" + + finally: + os.unlink(temp_file) + + def test_empty_values_after_sanitization(self): + """Test that data with empty values after sanitization is properly handled + + Critical edge case: When sanitization results in data with empty string values, + we must use 'if cleaned_data is not None' instead of 'if cleaned_data' to ensure + proper reload, since truthy check on dict depends on content, not just existence. 
+ """ + # Create data where ALL values are only surrogate characters + all_dirty_data = { + "key1": "\ud800\udc00\ud801", + "key2": "\ud802\ud803", + } + + with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json") as f: + temp_file = f.name + + try: + # Write dirty data - should trigger sanitization + needs_reload = write_json(all_dirty_data, temp_file) + assert needs_reload, "All-dirty data should trigger sanitization" + + # Load the sanitized data + cleaned_data = load_json(temp_file) + + # Critical assertions for the edge case + assert cleaned_data is not None, "Cleaned data should not be None" + # Sanitization removes surrogates but preserves keys with empty values + assert cleaned_data == { + "key1": "", + "key2": "", + }, "Surrogates should be removed, keys preserved" + # This dict is truthy because it has keys (even with empty values) + assert cleaned_data, "Dict with keys is truthy" + + # Test the actual edge case: empty dict + empty_data = {} + needs_reload2 = write_json(empty_data, temp_file) + assert not needs_reload2, "Empty dict is clean" + + reloaded_empty = load_json(temp_file) + assert reloaded_empty is not None, "Empty dict should not be None" + assert reloaded_empty == {}, "Empty dict should remain empty" + assert ( + not reloaded_empty + ), "Empty dict evaluates to False (the critical check)" + + finally: + os.unlink(temp_file) + + +if __name__ == "__main__": + # Run tests + test = TestWriteJsonOptimization() + + print("Running test_fast_path_clean_data...") + test.test_fast_path_clean_data() + print("✓ Passed") + + print("Running test_slow_path_dirty_data...") + test.test_slow_path_dirty_data() + print("✓ Passed") + + print("Running test_sanitizing_encoder_removes_surrogates...") + test.test_sanitizing_encoder_removes_surrogates() + print("✓ Passed") + + print("Running test_nested_structure_sanitization...") + test.test_nested_structure_sanitization() + print("✓ Passed") + + print("Running test_unicode_non_characters_removed...") + test.test_unicode_non_characters_removed() + print("✓ Passed") + + print("Running test_mixed_clean_dirty_data...") + test.test_mixed_clean_dirty_data() + print("✓ Passed") + + print("Running test_empty_and_none_strings...") + test.test_empty_and_none_strings() + print("✓ Passed") + + print("Running test_specific_surrogate_udc9a...") + test.test_specific_surrogate_udc9a() + print("✓ Passed") + + print("Running test_migration_with_surrogate_sanitization...") + test.test_migration_with_surrogate_sanitization() + print("✓ Passed") + + print("Running test_empty_values_after_sanitization...") + test.test_empty_values_after_sanitization() + print("✓ Passed") + + print("\n✅ All tests passed!") diff --git a/uv.lock b/uv.lock index 003d86c4..b942632f 100644 --- a/uv.lock +++ b/uv.lock @@ -1981,7 +1981,7 @@ offline = [ { name = "pycryptodome" }, { name = "pymilvus" }, { name = "pymongo" }, - { name = "pypdf2" }, + { name = "pypdf" }, { name = "python-docx" }, { name = "python-pptx" }, { name = "qdrant-client" }, @@ -1992,7 +1992,7 @@ offline = [ offline-docs = [ { name = "openpyxl" }, { name = "pycryptodome" }, - { name = "pypdf2" }, + { name = "pypdf" }, { name = "python-docx" }, { name = "python-pptx" }, ] @@ -2071,7 +2071,7 @@ requires-dist = [ { name = "pyjwt", marker = "extra == 'api'", specifier = ">=2.8.0,<3.0.0" }, { name = "pymilvus", marker = "extra == 'offline-storage'", specifier = ">=2.6.2,<3.0.0" }, { name = "pymongo", marker = "extra == 'offline-storage'", specifier = ">=4.0.0,<5.0.0" }, - { name = "pypdf2", marker = "extra 
== 'offline-docs'", specifier = ">=3.0.0" }, + { name = "pypdf", marker = "extra == 'offline-docs'", specifier = ">=6.1.0" }, { name = "pypinyin" }, { name = "pypinyin", marker = "extra == 'api'" }, { name = "pytest", marker = "extra == 'evaluation'", specifier = ">=8.4.2" }, @@ -2083,7 +2083,7 @@ requires-dist = [ { name = "python-multipart", marker = "extra == 'api'" }, { name = "python-pptx", marker = "extra == 'offline-docs'", specifier = ">=0.6.21,<2.0.0" }, { name = "pytz", marker = "extra == 'api'" }, - { name = "qdrant-client", marker = "extra == 'offline-storage'", specifier = ">=1.7.0,<2.0.0" }, + { name = "qdrant-client", marker = "extra == 'offline-storage'", specifier = ">=1.11.0,<2.0.0" }, { name = "ragas", marker = "extra == 'evaluation'", specifier = ">=0.3.7" }, { name = "redis", marker = "extra == 'offline-storage'", specifier = ">=5.0.0,<8.0.0" }, { name = "setuptools" }, @@ -3977,15 +3977,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fa/ed/494fd0cc1190a7c335e6958eeaee6f373a281869830255c2ed4785dac135/pypdf-6.1.3-py3-none-any.whl", hash = "sha256:eb049195e46f014fc155f566fa20e09d70d4646a9891164ac25fa0cbcfcdbcb5", size = 323863, upload-time = "2025-10-22T16:13:44.174Z" }, ] -[[package]] -name = "pypdf2" -version = "3.0.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/9f/bb/18dc3062d37db6c491392007dfd1a7f524bb95886eb956569ac38a23a784/PyPDF2-3.0.1.tar.gz", hash = "sha256:a74408f69ba6271f71b9352ef4ed03dc53a31aa404d29b5d31f53bfecfee1440", size = 227419, upload-time = "2022-12-31T10:36:13.13Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/8e/5e/c86a5643653825d3c913719e788e41386bee415c2b87b4f955432f2de6b2/pypdf2-3.0.1-py3-none-any.whl", hash = "sha256:d16e4205cfee272fbdc0568b68d82be796540b1537508cef59388f839c191928", size = 232572, upload-time = "2022-12-31T10:36:10.327Z" }, -] - [[package]] name = "pypinyin" version = "0.55.0"
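
Note on the OPENAI_LLM_MAX_TOKENS guidance relocated in env.example: the comment derives the 9000-token output budget from LLM_TIMEOUT and decode speed (9000 = 180 s * 50 tokens/s). A back-of-the-envelope sketch of that sizing rule, assuming the 50 tokens/s throughput figure used in the comment; substitute the measured decode speed of the deployed model:

    llm_timeout_s = 180          # LLM_TIMEOUT from env.example
    decode_tokens_per_s = 50     # assumed decode throughput; measure your own model
    # Keep the output budget below what the model can emit before the request times out.
    openai_llm_max_tokens = llm_timeout_s * decode_tokens_per_s  # 9000
    print(openai_llm_max_tokens)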
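
On the PyPDF2 -> pypdf switch in document_routes.py and the offline requirements: pypdf is the maintained successor and keeps the PdfReader / pages / extract_text API used here, so effectively only the import and dependency pin change. A minimal standalone sketch of the same call pattern (the helper name is illustrative, not code from the pipeline):

    from io import BytesIO
    from pypdf import PdfReader  # previously: from PyPDF2 import PdfReader

    def pdf_bytes_to_text(file_bytes: bytes) -> str:
        # Same pattern as pipeline_enqueue_file: wrap the raw bytes and read page by page.
        reader = PdfReader(BytesIO(file_bytes))
        return "\n".join(page.extract_text() or "" for page in reader.pages)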
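
On the new write_json contract in lightrag/utils.py: the function now returns True when the sanitizing encoder had to strip unpaired surrogates, and the JSON storage backends respond by reloading the cleaned file into shared memory. A minimal usage sketch of that round trip, assuming only the write_json/load_json signatures in this patch; the file path is illustrative:

    from lightrag.utils import write_json, load_json

    data = {"text": "Hello\ud800World"}         # lone surrogate -> UnicodeEncodeError on the fast path
    needs_reload = write_json(data, "kv.json")  # falls back to SanitizingJSONEncoder, returns True
    if needs_reload:
        cleaned = load_json("kv.json")          # refresh in-memory state from the sanitized file
        if cleaned is not None:
            data = cleaned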