Merge branch 'main' into tongda/main

2025-11-13 12:37:37 +08:00 · 2025-11-13 12:37:37 +08:00 · 297e460740
commit 297e460740
parent 940bec0b31 343d30727a
16 changed files with 616 additions and 89 deletions
--- a/env.example
+++ b/env.example
@ -29,7 +29,7 @@ WEBUI_DESCRIPTION="Simple and Fast Graph Based RAG System"
 # OLLAMA_EMULATING_MODEL_NAME=lightrag
 OLLAMA_EMULATING_MODEL_TAG=latest
-### Max nodes return from graph retrieval in webui
+### Max nodes for graph retrieval (Ensure WebUI local settings are also updated, which is limited to this value)
 # MAX_GRAPH_NODES=1000
 ### Logging level
@ -172,6 +172,8 @@ MAX_PARALLEL_INSERT=2
 ### LLM Configuration
 ### LLM_BINDING type: openai, ollama, lollms, azure_openai, aws_bedrock, gemini
 ### LLM_BINDING_HOST: host only for Ollama, endpoint for other LLM service
 ### If LightRAG deployed in Docker:
 ###    uses host.docker.internal instead of localhost in LLM_BINDING_HOST
 ###########################################################################
 ### LLM request timeout setting for all llm (0 means no timeout for Ollama)
 # LLM_TIMEOUT=180
@ -181,7 +183,7 @@ LLM_MODEL=gpt-4o
 LLM_BINDING_HOST=https://api.openai.com/v1
 LLM_BINDING_API_KEY=your_api_key
-### Optional for Azure
+### Env vars for Azure openai
 # AZURE_OPENAI_API_VERSION=2024-08-01-preview
 # AZURE_OPENAI_DEPLOYMENT=gpt-4o
@ -196,22 +198,16 @@ LLM_BINDING_API_KEY=your_api_key
 # LLM_MODEL=gemini-flash-latest
 # LLM_BINDING_API_KEY=your_gemini_api_key
 # LLM_BINDING_HOST=https://generativelanguage.googleapis.com
-GEMINI_LLM_THINKING_CONFIG='{"thinking_budget": 0, "include_thoughts": false}'
+
 ### use the following command to see all supported options for Gemini
 ### lightrag-server --llm-binding gemini --help
 ### Gemini Specific Parameters
 # GEMINI_LLM_MAX_OUTPUT_TOKENS=9000
 # GEMINI_LLM_TEMPERATURE=0.7
-
+### Enable Thinking
-### OpenAI Compatible API Specific Parameters
+# GEMINI_LLM_THINKING_CONFIG='{"thinking_budget": -1, "include_thoughts": true}'
-### Increased temperature values may mitigate infinite inference loops in certain LLM, such as Qwen3-30B.
+### Disable Thinking
-# OPENAI_LLM_TEMPERATURE=0.9
+# GEMINI_LLM_THINKING_CONFIG='{"thinking_budget": 0, "include_thoughts": false}'
 ### Set the max_tokens to mitigate endless output of some LLM (less than LLM_TIMEOUT * llm_output_tokens/second, i.e. 9000 = 180s * 50 tokens/s)
 ### Typically, max_tokens does not include prompt content, though some models, such as Gemini Models, are exceptions
 ### For vLLM/SGLang deployed models, or most of OpenAI compatible API provider
 # OPENAI_LLM_MAX_TOKENS=9000
 ### For OpenAI o1-mini or newer models
 OPENAI_LLM_MAX_COMPLETION_TOKENS=9000
 #### OpenAI's new API utilizes max_completion_tokens instead of max_tokens
 # OPENAI_LLM_MAX_COMPLETION_TOKENS=9000
 ### use the following command to see all support options for OpenAI, azure_openai or OpenRouter
 ### lightrag-server --llm-binding openai --help
@ -222,8 +218,17 @@ OPENAI_LLM_MAX_COMPLETION_TOKENS=9000
 ### Qwen3 Specific Parameters deploy by vLLM
 # OPENAI_LLM_EXTRA_BODY='{"chat_template_kwargs": {"enable_thinking": false}}'
 ### OpenAI Compatible API Specific Parameters
 ### Increased temperature values may mitigate infinite inference loops in certain LLM, such as Qwen3-30B.
 # OPENAI_LLM_TEMPERATURE=0.9
 ### Set the max_tokens to mitigate endless output of some LLM (less than LLM_TIMEOUT * llm_output_tokens/second, i.e. 9000 = 180s * 50 tokens/s)
 ### Typically, max_tokens does not include prompt content
 ### For vLLM/SGLang deployed models, or most of OpenAI compatible API provider
 # OPENAI_LLM_MAX_TOKENS=9000
 ### OpenAI o1-mini and newer models use max_completion_tokens instead of max_tokens
 OPENAI_LLM_MAX_COMPLETION_TOKENS=9000
 ### use the following command to see all support options for Ollama LLM
 ### If LightRAG deployed in Docker uses host.docker.internal instead of localhost in LLM_BINDING_HOST
 ### lightrag-server --llm-binding ollama --help
 ### Ollama Server Specific Parameters
 ### OLLAMA_LLM_NUM_CTX must be provided, and should be larger than MAX_TOTAL_TOKENS + 2000
@ -240,6 +245,8 @@ OLLAMA_LLM_NUM_CTX=32768
 ### Embedding Configuration (Should not be changed after the first file processed)
 ### EMBEDDING_BINDING: ollama, openai, azure_openai, jina, lollms, aws_bedrock
 ### EMBEDDING_BINDING_HOST: host only for Ollama, endpoint for other Embedding service
 ### If LightRAG deployed in Docker:
 ###    uses host.docker.internal instead of localhost in EMBEDDING_BINDING_HOST
 #######################################################################################
 # EMBEDDING_TIMEOUT=30
--- a/lightrag/api/init.py
+++ b/lightrag/api/init.py
@ -1 +1 @@
-__api_version__ = "0252"
+__api_version__ = "0254"
--- a/lightrag/api/routers/document_routes.py
+++ b/lightrag/api/routers/document_routes.py
@ -1081,11 +1081,11 @@ async def pipeline_enqueue_file(
                            result = converter.convert(file_path)
                            content = result.document.export_to_markdown()
                        else:
-                            if not pm.is_installed("pypdf2"):  # type: ignore
+                            if not pm.is_installed("pypdf"):  # type: ignore
-                                pm.install("pypdf2")
+                                pm.install("pypdf")
                            if not pm.is_installed("pycryptodome"):  # type: ignore
                                pm.install("pycryptodome")
-                            from PyPDF2 import PdfReader  # type: ignore
+                            from pypdf import PdfReader  # type: ignore
                            from io import BytesIO
                            pdf_file = BytesIO(file)
--- a/lightrag/kg/json_doc_status_impl.py
+++ b/lightrag/kg/json_doc_status_impl.py
@ -161,7 +161,20 @@ class JsonDocStatusStorage(DocStatusStorage):
                logger.debug(
                    f"[{self.workspace}] Process {os.getpid()} doc status writting {len(data_dict)} records to {self.namespace}"
                )
-                write_json(data_dict, self._file_name)
+
                # Write JSON and check if sanitization was applied
                needs_reload = write_json(data_dict, self._file_name)
                # If data was sanitized, reload cleaned data to update shared memory
                if needs_reload:
                    logger.info(
                        f"[{self.workspace}] Reloading sanitized data into shared memory for {self.namespace}"
                    )
                    cleaned_data = load_json(self._file_name)
                    if cleaned_data is not None:
                        self._data.clear()
                        self._data.update(cleaned_data)
                await clear_all_update_flags(self.final_namespace)
    async def upsert(self, data: dict[str, dict[str, Any]]) -> None:
--- a/lightrag/kg/json_kv_impl.py
+++ b/lightrag/kg/json_kv_impl.py
@ -81,7 +81,20 @@ class JsonKVStorage(BaseKVStorage):
                logger.debug(
                    f"[{self.workspace}] Process {os.getpid()} KV writting {data_count} records to {self.namespace}"
                )
-                write_json(data_dict, self._file_name)
+
                # Write JSON and check if sanitization was applied
                needs_reload = write_json(data_dict, self._file_name)
                # If data was sanitized, reload cleaned data to update shared memory
                if needs_reload:
                    logger.info(
                        f"[{self.workspace}] Reloading sanitized data into shared memory for {self.namespace}"
                    )
                    cleaned_data = load_json(self._file_name)
                    if cleaned_data is not None:
                        self._data.clear()
                        self._data.update(cleaned_data)
                await clear_all_update_flags(self.final_namespace)
    async def get_by_id(self, id: str) -> dict[str, Any] | None:
@ -224,7 +237,7 @@ class JsonKVStorage(BaseKVStorage):
            data: Original data dictionary that may contain legacy structure
        Returns:
-            Migrated data dictionary with flattened cache keys
+            Migrated data dictionary with flattened cache keys (sanitized if needed)
        """
        from lightrag.utils import generate_cache_key
@ -261,8 +274,17 @@ class JsonKVStorage(BaseKVStorage):
            logger.info(
                f"[{self.workspace}] Migrated {migration_count} legacy cache entries to flattened structure"
            )
-            # Persist migrated data immediately
+            # Persist migrated data immediately and check if sanitization was applied
-            write_json(migrated_data, self._file_name)
+            needs_reload = write_json(migrated_data, self._file_name)
            # If data was sanitized during write, reload cleaned data
            if needs_reload:
                logger.info(
                    f"[{self.workspace}] Reloading sanitized migration data for {self.namespace}"
                )
                cleaned_data = load_json(self._file_name)
                if cleaned_data is not None:
                    return cleaned_data  # Return cleaned data to update shared memory
        return migrated_data
--- a/lightrag/tools/clean_llm_query_cache.py
+++ b/lightrag/tools/clean_llm_query_cache.py
@ -873,6 +873,31 @@ class CleanupTool:
        storage_name = STORAGE_TYPES[choice]
        # Special warning for JsonKVStorage about concurrent access
        if storage_name == "JsonKVStorage":
            print("\n" + "=" * 60)
            print(f"{BOLD_RED}⚠️  IMPORTANT WARNING - JsonKVStorage Concurrency{RESET}")
            print("=" * 60)
            print("\nJsonKVStorage is an in-memory database that does NOT support")
            print("concurrent access to the same file by multiple programs.")
            print("\nBefore proceeding, please ensure that:")
            print("  • LightRAG Server is completely shut down")
            print("  • No other programs are accessing the storage files")
            print("\n" + "=" * 60)
            confirm = (
                input("\nHas LightRAG Server been shut down? (yes/no): ")
                .strip()
                .lower()
            )
            if confirm != "yes":
                print(
                    "\n✓ Operation cancelled - Please shut down LightRAG Server first"
                )
                return None, None, None
            print("✓ Proceeding with JsonKVStorage cleanup...")
        # Check configuration (warnings only, doesn't block)
        print("\nChecking configuration...")
        self.check_env_vars(storage_name)
--- a/lightrag/utils.py
+++ b/lightrag/utils.py
@ -56,6 +56,9 @@ if not logger.handlers:
 # Set httpx logging level to WARNING
 logging.getLogger("httpx").setLevel(logging.WARNING)
 # Precompile regex pattern for JSON sanitization (module-level, compiled once)
 _SURROGATE_PATTERN = re.compile(r"[\uD800-\uDFFF\uFFFE\uFFFF]")
 # Global import for pypinyin with startup-time logging
 try:
    import pypinyin
@ -927,9 +930,123 @@ def load_json(file_name):
        return json.load(f)
 def _sanitize_string_for_json(text: str) -> str:
    """Strip every character matched by ``_SURROGATE_PATTERN`` from *text*.
    The precompiled pattern matches surrogate code points (U+D800-U+DFFF) and the
    non-characters U+FFFE / U+FFFF. A single regex search decides whether any work
    is needed, so strings without such characters are handed back as the very same
    object and only offending strings pay for a substitution.
    Args:
        text: String to sanitize (falsy values are returned untouched)
    Returns:
        The original object when nothing matched, otherwise a new string with
        all matched characters removed
    """
    # Short-circuit keeps falsy inputs away from the regex entirely
    needs_cleaning = bool(text) and _SURROGATE_PATTERN.search(text) is not None
    if needs_cleaning:
        return _SURROGATE_PATTERN.sub("", text)
    return text
 class SanitizingJSONEncoder(json.JSONEncoder):
    """
    JSON encoder that sanitizes every string (values and dict keys) it serializes.
    Strings are passed through ``_sanitize_string_for_json`` before encoding.
    ``iterencode`` does this by building a sanitized copy of the container
    structure: dicts and lists are rebuilt, while strings that are already clean
    are reused as-is. The input object is never modified.
    """
    def encode(self, o):
        """Return the JSON text for *o*, sanitizing a bare top-level string first.
        The parent ``encode`` serializes a top-level ``str`` directly instead of
        going through ``iterencode``, so that case has to be cleaned here.
        Delegating to the parent afterwards keeps ``ensure_ascii`` honoured for
        top-level strings, exactly as it is for nested ones.
        """
        if isinstance(o, str):
            o = _sanitize_string_for_json(o)
        return super().encode(o)
    def iterencode(self, o, _one_shot=False):
        """
        Yield JSON chunks for *o* after sanitizing every string it contains.
        This is the method ``json.dump`` drives, so it covers nested structures.
        """
        yield from super().iterencode(self._sanitize_for_encoding(o), _one_shot)
    def _sanitize_for_encoding(self, obj):
        """
        Recursively build a sanitized counterpart of *obj*.
        Args:
            obj: Object to sanitize
        Returns:
            A cleaned string for ``str`` input, a new dict for dicts, a new list
            for lists and tuples (including namedtuples), and *obj* itself for
            anything else
        """
        if isinstance(obj, str):
            return _sanitize_string_for_json(obj)
        if isinstance(obj, dict):
            # NOTE: keys that differ only by removed characters collapse into a
            # single entry (the later one wins).
            return {
                (
                    _sanitize_string_for_json(key) if isinstance(key, str) else key
                ): self._sanitize_for_encoding(value)
                for key, value in obj.items()
            }
        if isinstance(obj, (list, tuple)):
            # JSON has one array type, so tuples serialize exactly like lists.
            # Rebuilding through type(obj)(...) would raise TypeError for
            # namedtuples, whose constructors take one argument per field.
            return [self._sanitize_for_encoding(item) for item in obj]
        # Numbers, booleans, None, etc. remain unchanged
        return obj
 def write_json(json_obj, file_name):
    """
    Write ``json_obj`` to ``file_name`` as UTF-8 JSON, sanitizing only when required.
    Two attempts are made:
    1. Fast path: plain ``json.dump``. If it succeeds the function returns False.
    2. Slow path: taken only when the first attempt raises UnicodeEncodeError or
       UnicodeDecodeError. The file is reopened in "w" mode, which discards any
       partial output of the first attempt, and is rewritten through
       SanitizingJSONEncoder.
    ``json_obj`` itself is never modified: the encoder serializes a sanitized copy
    of the container structure. After a sanitized write the in-memory data
    therefore still contains the offending characters, and the caller should
    reload the cleaned data from the file to update shared memory.
    Args:
        json_obj: Object to serialize (may be a shallow copy from shared memory)
        file_name: Output file path
    Returns:
        bool: True if sanitization was applied (caller should reload data),
              False if direct write succeeded (no reload needed)
    """
    try:
        # Strategy 1: Fast path - try direct serialization
        with open(file_name, "w", encoding="utf-8") as f:
            json.dump(json_obj, f, indent=2, ensure_ascii=False)
        return False  # No sanitization needed, no reload required
    except (UnicodeEncodeError, UnicodeDecodeError) as e:
        # Not fatal: fall through to the sanitizing rewrite below
        logger.debug(f"Direct JSON write failed, using sanitizing encoder: {e}")
    # Strategy 2: Rewrite the whole file with the sanitizing encoder
    # (strings are cleaned on a rebuilt copy; json_obj is left untouched)
    with open(file_name, "w", encoding="utf-8") as f:
-        json.dump(json_obj, f, indent=2, ensure_ascii=False)
+        json.dump(json_obj, f, indent=2, ensure_ascii=False, cls=SanitizingJSONEncoder)
    logger.info(f"JSON sanitization applied during write: {file_name}")
    return True  # Sanitization applied, reload recommended
 class TokenizerInterface(Protocol):
--- a/lightrag_webui/src/components/retrieval/QuerySettings.tsx
+++ b/lightrag_webui/src/components/retrieval/QuerySettings.tsx
@ -40,7 +40,6 @@ export default function QuerySettings() {
  // Default values for reset functionality
  const defaultValues = useMemo(() => ({
    mode: 'mix' as QueryMode,
    response_type: 'Multiple Paragraphs',
    top_k: 40,
    chunk_top_k: 20,
    max_entity_tokens: 6000,
@ -153,46 +152,6 @@ export default function QuerySettings() {
              </div>
            </>
            {/* Response Format */}
            <>
              <TooltipProvider>
                <Tooltip>
                  <TooltipTrigger asChild>
                    <label htmlFor="response_format_select" className="ml-1 cursor-help">
                      {t('retrievePanel.querySettings.responseFormat')}
                    </label>
                  </TooltipTrigger>
                  <TooltipContent side="left">
                    <p>{t('retrievePanel.querySettings.responseFormatTooltip')}</p>
                  </TooltipContent>
                </Tooltip>
              </TooltipProvider>
              <div className="flex items-center gap-1">
                <Select
                  value={querySettings.response_type}
                  onValueChange={(v) => handleChange('response_type', v)}
                >
                  <SelectTrigger
                    id="response_format_select"
                    className="hover:bg-primary/5 h-9 cursor-pointer focus:ring-0 focus:ring-offset-0 focus:outline-0 active:right-0 flex-1 text-left [&>span]:break-all [&>span]:line-clamp-1"
                  >
                    <SelectValue />
                  </SelectTrigger>
                  <SelectContent>
                    <SelectGroup>
                      <SelectItem value="Multiple Paragraphs">{t('retrievePanel.querySettings.responseFormatOptions.multipleParagraphs')}</SelectItem>
                      <SelectItem value="Single Paragraph">{t('retrievePanel.querySettings.responseFormatOptions.singleParagraph')}</SelectItem>
                      <SelectItem value="Bullet Points">{t('retrievePanel.querySettings.responseFormatOptions.bulletPoints')}</SelectItem>
                    </SelectGroup>
                  </SelectContent>
                </Select>
                <ResetButton
                  onClick={() => handleReset('response_type')}
                  title="Reset to default (Multiple Paragraphs)"
                />
              </div>
            </>
            {/* Top K */}
            <>
              <TooltipProvider>
--- a/lightrag_webui/src/features/RetrievalTesting.tsx
+++ b/lightrag_webui/src/features/RetrievalTesting.tsx
@ -357,6 +357,7 @@ export default function RetrievalTesting() {
      const queryParams = {
        ...state.querySettings,
        query: actualQuery,
        response_type: 'Multiple Paragraphs',
        conversation_history: effectiveHistoryTurns > 0
          ? prevMessages
            .filter((m) => m.isError !== true)
--- a/lightrag_webui/src/stores/settings.ts
+++ b/lightrag_webui/src/stores/settings.ts
@ -123,7 +123,6 @@ const useSettingsStoreBase = create<SettingsState>()(
      querySettings: {
        mode: 'global',
        response_type: 'Multiple Paragraphs',
        top_k: 40,
        chunk_top_k: 20,
        max_entity_tokens: 6000,
@ -239,7 +238,7 @@ const useSettingsStoreBase = create<SettingsState>()(
    {
      name: 'settings-storage',
      storage: createJSONStorage(() => localStorage),
-      version: 18,
+      version: 19,
      migrate: (state: any, version: number) => {
        if (version < 2) {
          state.showEdgeLabel = false
@ -336,6 +335,12 @@ const useSettingsStoreBase = create<SettingsState>()(
          // Add userPromptHistory field for older versions
          state.userPromptHistory = []
        }
        if (version < 19) {
          // Remove deprecated response_type parameter
          if (state.querySettings) {
            delete state.querySettings.response_type
          }
        }
        return state
      }
    }
--- a/pyproject.toml
+++ b/pyproject.toml
@ -86,7 +86,7 @@ offline-docs = [
    # Document processing dependencies
    "openpyxl>=3.0.0,<4.0.0",
    "pycryptodome>=3.0.0,<4.0.0",
-    "pypdf2>=3.0.0",
+    "pypdf>=6.1.0",
    "python-docx>=0.8.11,<2.0.0",
    "python-pptx>=0.6.21,<2.0.0",
 ]
@ -98,7 +98,7 @@ offline-storage = [
    "pymilvus>=2.6.2,<3.0.0",
    "pymongo>=4.0.0,<5.0.0",
    "asyncpg>=0.29.0,<1.0.0",
-    "qdrant-client>=1.7.0,<2.0.0",
+    "qdrant-client>=1.11.0,<2.0.0",
 ]
 offline-llm = [
--- a/requirements-offline-docs.txt
+++ b/requirements-offline-docs.txt
@ -10,6 +10,6 @@
 # Document processing dependencies (with version constraints matching pyproject.toml)
 openpyxl>=3.0.0,<4.0.0
 pycryptodome>=3.0.0,<4.0.0
-pypdf2>=3.0.0
+pypdf>=6.1.0
 python-docx>=0.8.11,<2.0.0
 python-pptx>=0.6.21,<2.0.0
--- a/requirements-offline-storage.txt
+++ b/requirements-offline-storage.txt
@ -12,5 +12,5 @@ asyncpg>=0.29.0,<1.0.0
 neo4j>=5.0.0,<7.0.0
 pymilvus>=2.6.2,<3.0.0
 pymongo>=4.0.0,<5.0.0
-qdrant-client>=1.7.0,<2.0.0
+qdrant-client>=1.11.0,<2.0.0
 redis>=5.0.0,<8.0.0
--- a/requirements-offline.txt
+++ b/requirements-offline.txt
@ -24,10 +24,10 @@ openpyxl>=3.0.0,<4.0.0
 pycryptodome>=3.0.0,<4.0.0
 pymilvus>=2.6.2,<3.0.0
 pymongo>=4.0.0,<5.0.0
-pypdf2>=3.0.0
+pypdf>=6.1.0
 python-docx>=0.8.11,<2.0.0
 python-pptx>=0.6.21,<2.0.0
-qdrant-client>=1.7.0,<2.0.0
+qdrant-client>=1.11.0,<2.0.0
 redis>=5.0.0,<8.0.0
 voyageai>=0.2.0,<1.0.0
 zhipuai>=2.0.0,<3.0.0
--- a/tests/test_write_json_optimization.py
+++ b/tests/test_write_json_optimization.py
@ -0,0 +1,387 @@
 """
 Test suite for write_json optimization
 This test verifies:
 1. Fast path works for clean data (no sanitization)
 2. Slow path applies sanitization for dirty data
 3. Sanitization is done during encoding (memory-efficient)
 4. Reloading updates shared memory with cleaned data
 """
 import os
 import json
 import tempfile
 from lightrag.utils import write_json, load_json, SanitizingJSONEncoder
 class TestWriteJsonOptimization:
    """Test write_json optimization with two-stage approach"""
    def test_fast_path_clean_data(self):
        """Clean input is written directly and reports that no reload is needed"""
        payload = dict(
            name="John Doe",
            age=30,
            items=["apple", "banana", "cherry"],
            nested={"key": "value", "number": 42},
        )
        scratch = tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json")
        scratch.close()
        path = scratch.name
        try:
            # A falsy return means the sanitizing fallback was never used
            sanitized = write_json(payload, path)
            assert not sanitized, "Clean data should not require sanitization"
            # The round trip must be lossless
            round_tripped = load_json(path)
            assert round_tripped == payload, "Loaded data should match original"
        finally:
            os.unlink(path)
    def test_slow_path_dirty_data(self):
        """A surrogate in the data forces the sanitizing fallback and is dropped"""
        # U+D800 sits in the surrogate range (U+D800 to U+DFFF)
        payload = {"text": "Hello\ud800World", "number": 123}
        scratch = tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json")
        scratch.close()
        path = scratch.name
        try:
            # A truthy return signals that sanitization was applied
            sanitized = write_json(payload, path)
            assert sanitized, "Dirty data should trigger sanitization"
            # The file must exist, keep clean fields, and be free of the surrogate
            reloaded = load_json(path)
            assert reloaded is not None, "Data should be written"
            assert reloaded["number"] == 123, "Clean fields should remain unchanged"
            cleaned_text = reloaded["text"]
            assert (
                "\ud800" not in cleaned_text
            ), "Surrogate character should be removed"
        finally:
            os.unlink(path)
    def test_sanitizing_encoder_removes_surrogates(self):
        """SanitizingJSONEncoder drops surrogates from values and from dict keys"""
        payload = {
            "text": "Hello\ud800\udc00World",  # two adjacent surrogate code points
            "clean": "Clean text",
            "nested": {"dirty_key\ud801": "value", "clean_key": "clean\ud802value"},
        }
        # Serialize straight through the custom encoder
        rendered = json.dumps(payload, cls=SanitizingJSONEncoder, ensure_ascii=False)
        # None of the surrogates may survive in the output
        forbidden = (
            ("\ud800", "Surrogate U+D800 should be removed"),
            ("\udc00", "Surrogate U+DC00 should be removed"),
            ("\ud801", "Surrogate U+D801 should be removed"),
            ("\ud802", "Surrogate U+D802 should be removed"),
        )
        for char, message in forbidden:
            assert char not in rendered, message
        # Untouched text must still be present
        required = (
            ("Clean text", "Clean text should remain"),
            ("clean_key", "Clean keys should remain"),
        )
        for fragment, message in required:
            assert fragment in rendered, message
    def test_nested_structure_sanitization(self):
        """Surrogates buried in nested dicts and lists are removed, structure is kept"""
        innermost = {"dirty": "text\ud800here", "clean": "normal text"}
        payload = {
            "level1": {
                "level2": {
                    "level3": innermost,
                    "list": ["item1", "item\ud801dirty", "item3"],
                }
            }
        }
        scratch = tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json")
        scratch.close()
        path = scratch.name
        try:
            sanitized = write_json(payload, path)
            assert sanitized, "Nested dirty data should trigger sanitization"
            # Walk down the reloaded tree, checking each level exists
            reloaded = load_json(path)
            assert "level1" in reloaded
            level1 = reloaded["level1"]
            assert "level2" in level1
            level2 = level1["level2"]
            assert "level3" in level2
            # Deeply nested value is clean
            assert (
                "\ud800" not in level2["level3"]["dirty"]
            ), "Nested surrogate should be removed"
            # List elements are clean as well
            second_item = level2["list"][1]
            assert (
                "\ud801" not in second_item
            ), "List item surrogates should be removed"
        finally:
            os.unlink(path)
    def test_unicode_non_characters_removed(self):
        """U+FFFE / U+FFFF do not cause encoding errors, so write_json keeps them
        Both code points can be encoded as UTF-8 (although their use is discouraged),
        so the direct write succeeds and no sanitization is reported. They are only
        stripped when data is serialized with SanitizingJSONEncoder explicitly.
        """
        payload = {"text1": "Hello\ufffeWorld", "text2": "Test\uffffString"}
        scratch = tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json")
        scratch.close()
        path = scratch.name
        try:
            # Encodable characters never leave the fast path
            sanitized = write_json(payload, path)
            assert not sanitized, "U+FFFE/U+FFFF are valid UTF-8 characters"
            # The fast path writes them verbatim
            reloaded = load_json(path)
            assert reloaded == payload
        finally:
            os.unlink(path)
    def test_mixed_clean_dirty_data(self):
        """Dirty fields are cleaned while clean fields of every type pass through"""
        payload = {
            "clean_field": "This is perfectly fine",
            "dirty_field": "This has\ud800issues",
            "number": 42,
            "boolean": True,
            "null_value": None,
            "clean_list": [1, 2, 3],
            "dirty_list": ["clean", "dirty\ud801item"],
        }
        scratch = tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json")
        scratch.close()
        path = scratch.name
        try:
            sanitized = write_json(payload, path)
            assert (
                sanitized
            ), "Mixed data with dirty fields should trigger sanitization"
            reloaded = load_json(path)
            # Values that needed no cleaning come back as written
            assert reloaded["clean_field"] == "This is perfectly fine"
            assert reloaded["number"] == 42
            assert reloaded["boolean"]
            assert reloaded["null_value"] is None
            assert reloaded["clean_list"] == [1, 2, 3]
            # Values that contained surrogates come back without them
            dirty_field = reloaded["dirty_field"]
            dirty_item = reloaded["dirty_list"][1]
            assert "\ud800" not in dirty_field
            assert "\ud801" not in dirty_item
        finally:
            os.unlink(path)
    def test_empty_and_none_strings(self):
        """Empty and falsy values take the fast path and survive a round trip"""
        payload = {
            "empty": "",
            "none": None,
            "zero": 0,
            "false": False,
            "empty_list": [],
            "empty_dict": {},
        }
        scratch = tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json")
        scratch.close()
        path = scratch.name
        try:
            sanitized = write_json(payload, path)
            assert (
                not sanitized
            ), "Clean empty values should not trigger sanitization"
            round_tripped = load_json(path)
            assert round_tripped == payload, "Empty/None values should be preserved"
        finally:
            os.unlink(path)
    def test_specific_surrogate_udc9a(self):
        """Regression check for the surrogate U+DC9A reported in the issue"""
        # Reproduces the character from the reported error message:
        # UnicodeEncodeError: 'utf-8' codec can't encode character '\\udc9a'
        payload = {
            "text": "Some text with surrogate\udc9acharacter",
            "position": 201,  # As mentioned in the error
            "clean_field": "Normal text",
        }
        scratch = tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json")
        scratch.close()
        path = scratch.name
        try:
            # The write must fall back to the sanitizing encoder
            sanitized = write_json(payload, path)
            assert sanitized, "Data with \\udc9a should trigger sanitization"
            # The surrogate is gone and the clean field is intact
            reloaded = load_json(path)
            assert reloaded is not None
            cleaned_text = reloaded["text"]
            assert "\udc9a" not in cleaned_text, "\\udc9a should be removed"
            assert (
                reloaded["clean_field"] == "Normal text"
            ), "Clean fields should remain"
        finally:
            os.unlink(path)
    def test_migration_with_surrogate_sanitization(self):
        """Legacy-cache-shaped data with surrogates is cleaned by the encoder
        This test writes a dict laid out like legacy cache entries through
        SanitizingJSONEncoder and checks the reloaded file: surrogates must be
        gone and clean entries must be unchanged. It stands in for the migration
        scenario; the migration routine itself is not called here.
        """
        dirty_entry = {
            "return": "Result with\ud800surrogate",
            "cache_type": "extract",
            "original_prompt": "Some\udc9aprompt",
        }
        clean_entry = {
            "return": "Clean result",
            "cache_type": "query",
            "original_prompt": "Clean prompt",
        }
        legacy_cache = {"cache_entry_1": dirty_entry, "cache_entry_2": clean_entry}
        scratch = tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json")
        scratch.close()
        path = scratch.name
        try:
            # Write with the custom encoder so the dump succeeds despite surrogates
            with open(path, "w", encoding="utf-8") as out:
                json.dump(
                    legacy_cache,
                    out,
                    cls=SanitizingJSONEncoder,
                    ensure_ascii=False,
                )
            # Reload and confirm the surrogates were dropped during that write
            reloaded = load_json(path)
            assert reloaded is not None
            first_entry = reloaded["cache_entry_1"]
            assert (
                "\ud800" not in first_entry["return"]
            ), "Surrogate in return should be removed"
            assert (
                "\udc9a" not in first_entry["original_prompt"]
            ), "Surrogate in prompt should be removed"
            # The entry that was clean to begin with is untouched
            assert (
                reloaded["cache_entry_2"]["return"] == "Clean result"
            ), "Clean data should remain"
        finally:
            os.unlink(path)
    def test_empty_values_after_sanitization(self):
        """Check the reload contract when sanitized data ends up "empty".

        Two situations are covered:

        * every value consists solely of surrogate characters, so the test
          expects sanitization to leave the keys in place with empty-string
          values (a dict that is still truthy);
        * the data is an empty dict, which the test expects to be reported as
          clean and to round-trip as ``{}`` (a falsy value).

        The second case is why callers must test ``cleaned_data is not None``
        rather than relying on the truthiness of ``cleaned_data``.
        """
        # Every value is made up exclusively of lone surrogate code points.
        surrogate_only = {
            "key1": "\ud800\udc00\ud801",
            "key2": "\ud802\ud803",
        }

        # Reserve a path; the handle itself is not used for writing.
        with tempfile.NamedTemporaryFile(
            mode="w", delete=False, suffix=".json"
        ) as placeholder:
            json_path = placeholder.name

        try:
            # Case 1: all-dirty payload must be flagged as sanitized.
            was_sanitized = write_json(surrogate_only, json_path)
            assert was_sanitized, "All-dirty data should trigger sanitization"

            after_cleanup = load_json(json_path)
            assert after_cleanup is not None, "Cleaned data should not be None"

            # Keys survive, values collapse to empty strings.
            expected_shell = {"key1": "", "key2": ""}
            assert (
                after_cleanup == expected_shell
            ), "Surrogates should be removed, keys preserved"

            # A dict with keys is truthy even if every value is empty.
            assert after_cleanup, "Dict with keys is truthy"

            # Case 2: the genuinely falsy payload, an empty dict.
            empty_flag = write_json({}, json_path)
            assert not empty_flag, "Empty dict is clean"

            roundtripped = load_json(json_path)
            assert roundtripped is not None, "Empty dict should not be None"
            assert roundtripped == {}, "Empty dict should remain empty"
            assert (
                not roundtripped
            ), "Empty dict evaluates to False (the critical check)"
        finally:
            # Clean up the temporary file regardless of the test outcome.
            os.unlink(json_path)
 if __name__ == "__main__":
    # Manual runner: executes every test method in a fixed order without
    # pytest. A failing assertion propagates and aborts the run, so the final
    # success banner is printed only when all tests passed.
    test = TestWriteJsonOptimization()
    # Listing the method names once keeps the printed name and the method that
    # is actually invoked in sync; add new tests here.
    test_names = (
        "test_fast_path_clean_data",
        "test_slow_path_dirty_data",
        "test_sanitizing_encoder_removes_surrogates",
        "test_nested_structure_sanitization",
        "test_unicode_non_characters_removed",
        "test_mixed_clean_dirty_data",
        "test_empty_and_none_strings",
        "test_specific_surrogate_udc9a",
        "test_migration_with_surrogate_sanitization",
        "test_empty_values_after_sanitization",
    )
    for test_name in test_names:
        print(f"Running {test_name}...")
        # Look the bound method up by name and call it.
        getattr(test, test_name)()
        print("✓ Passed")
    print("\n✅ All tests passed!")
--- a/uv.lock
+++ b/uv.lock
@ -1981,7 +1981,7 @@ offline = [
    { name = "pycryptodome" },
    { name = "pymilvus" },
    { name = "pymongo" },
-    { name = "pypdf2" },
+    { name = "pypdf" },
    { name = "python-docx" },
    { name = "python-pptx" },
    { name = "qdrant-client" },
@ -1992,7 +1992,7 @@ offline = [
 offline-docs = [
    { name = "openpyxl" },
    { name = "pycryptodome" },
-    { name = "pypdf2" },
+    { name = "pypdf" },
    { name = "python-docx" },
    { name = "python-pptx" },
 ]
@ -2071,7 +2071,7 @@ requires-dist = [
    { name = "pyjwt", marker = "extra == 'api'", specifier = ">=2.8.0,<3.0.0" },
    { name = "pymilvus", marker = "extra == 'offline-storage'", specifier = ">=2.6.2,<3.0.0" },
    { name = "pymongo", marker = "extra == 'offline-storage'", specifier = ">=4.0.0,<5.0.0" },
-    { name = "pypdf2", marker = "extra == 'offline-docs'", specifier = ">=3.0.0" },
+    { name = "pypdf", marker = "extra == 'offline-docs'", specifier = ">=6.1.0" },
    { name = "pypinyin" },
    { name = "pypinyin", marker = "extra == 'api'" },
    { name = "pytest", marker = "extra == 'evaluation'", specifier = ">=8.4.2" },
@ -2083,7 +2083,7 @@ requires-dist = [
    { name = "python-multipart", marker = "extra == 'api'" },
    { name = "python-pptx", marker = "extra == 'offline-docs'", specifier = ">=0.6.21,<2.0.0" },
    { name = "pytz", marker = "extra == 'api'" },
-    { name = "qdrant-client", marker = "extra == 'offline-storage'", specifier = ">=1.7.0,<2.0.0" },
+    { name = "qdrant-client", marker = "extra == 'offline-storage'", specifier = ">=1.11.0,<2.0.0" },
    { name = "ragas", marker = "extra == 'evaluation'", specifier = ">=0.3.7" },
    { name = "redis", marker = "extra == 'offline-storage'", specifier = ">=5.0.0,<8.0.0" },
    { name = "setuptools" },
@ -3977,15 +3977,6 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/fa/ed/494fd0cc1190a7c335e6958eeaee6f373a281869830255c2ed4785dac135/pypdf-6.1.3-py3-none-any.whl", hash = "sha256:eb049195e46f014fc155f566fa20e09d70d4646a9891164ac25fa0cbcfcdbcb5", size = 323863, upload-time = "2025-10-22T16:13:44.174Z" },
 ]
 [[package]]
 name = "pypdf2"
 version = "3.0.1"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/9f/bb/18dc3062d37db6c491392007dfd1a7f524bb95886eb956569ac38a23a784/PyPDF2-3.0.1.tar.gz", hash = "sha256:a74408f69ba6271f71b9352ef4ed03dc53a31aa404d29b5d31f53bfecfee1440", size = 227419, upload-time = "2022-12-31T10:36:13.13Z" }
 wheels = [
    { url = "https://files.pythonhosted.org/packages/8e/5e/c86a5643653825d3c913719e788e41386bee415c2b87b4f955432f2de6b2/pypdf2-3.0.1-py3-none-any.whl", hash = "sha256:d16e4205cfee272fbdc0568b68d82be796540b1537508cef59388f839c191928", size = 232572, upload-time = "2022-12-31T10:36:10.327Z" },
 ]
 [[package]]
 name = "pypinyin"
 version = "0.55.0"
`@ -1 +1 @@`
	`__api_version__ = "0252"`	`__api_version__ = "0254"`