Merge branch 'main' into chore/support-voyageai-embed-directly
commit 9afc59956f
41 changed files with 2414 additions and 785 deletions
.github/dependabot.yml (vendored, new file, 206 lines added)
@@ -0,0 +1,206 @@
# Keep GitHub Actions up to date with GitHub's Dependabot...
# https://docs.github.com/en/code-security/dependabot/working-with-dependabot/keeping-your-actions-up-to-date-with-dependabot
# https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file#package-ecosystem
version: 2
updates:
  # ============================================================
  # GitHub Actions
  # PR Strategy:
  # - All updates (major/minor/patch): Grouped into a single PR
  # ============================================================
  - package-ecosystem: github-actions
    directory: /
    groups:
      github-actions:
        patterns:
          - "*" # Group all Actions updates into a single larger pull request
    schedule:
      interval: weekly
      day: monday
      time: "02:00"
      timezone: "Asia/Shanghai"
    labels:
      - "dependencies"
      - "github-actions"
    open-pull-requests-limit: 2

  # ============================================================
  # Python (pip) Dependencies
  # PR Strategy:
  # - Major updates: Individual PR per package (except numpy which is ignored)
  # - Minor updates: Grouped by category (llm-providers, storage, etc.)
  # - Patch updates: Grouped by category
  # ============================================================
  - package-ecosystem: "pip"
    directory: "/"
    schedule:
      interval: "weekly"
      day: "wednesday"
      time: "02:00"
      timezone: "Asia/Shanghai"
    cooldown:
      default-days: 5
      semver-major-days: 30
      semver-minor-days: 7
      semver-patch-days: 3
    groups:
      # Core dependencies - LLM providers and embeddings
      llm-providers:
        patterns:
          - "openai"
          - "anthropic"
          - "google-*"
          - "boto3"
          - "botocore"
          - "ollama"
        update-types:
          - "minor"
          - "patch"
      # Storage backends
      storage:
        patterns:
          - "neo4j"
          - "pymongo"
          - "redis"
          - "psycopg*"
          - "asyncpg"
          - "milvus*"
          - "qdrant*"
        update-types:
          - "minor"
          - "patch"
      # Data processing and ML
      data-processing:
        patterns:
          - "numpy"
          - "scipy"
          - "pandas"
          - "tiktoken"
          - "transformers"
          - "torch*"
        update-types:
          - "minor"
          - "patch"
      # Web framework and API
      web-framework:
        patterns:
          - "fastapi"
          - "uvicorn"
          - "gunicorn"
          - "starlette"
          - "pydantic*"
        update-types:
          - "minor"
          - "patch"
      # Development and testing tools
      dev-tools:
        patterns:
          - "pytest*"
          - "ruff"
          - "pre-commit"
          - "black"
          - "mypy"
        update-types:
          - "minor"
          - "patch"
      # Minor and patch updates for everything else
      python-minor-patch:
        patterns:
          - "*"
        update-types:
          - "minor"
          - "patch"
    ignore:
      - dependency-name: "numpy"
        update-types:
          - "version-update:semver-major"
    labels:
      - "dependencies"
      - "python"
    open-pull-requests-limit: 5

  # ============================================================
  # Frontend (bun) Dependencies
  # PR Strategy:
  # - Major updates: Individual PR per package
  # - Minor updates: Grouped by category (react, ui-components, etc.)
  # - Patch updates: Grouped by category
  # ============================================================
  - package-ecosystem: "bun"
    directory: "/lightrag_webui"
    schedule:
      interval: "weekly"
      day: "friday"
      time: "02:00"
      timezone: "Asia/Shanghai"
    cooldown:
      default-days: 5
      semver-major-days: 30
      semver-minor-days: 7
      semver-patch-days: 3
    groups:
      # React ecosystem
      react:
        patterns:
          - "react"
          - "react-dom"
          - "react-router*"
          - "@types/react*"
        update-types:
          - "minor"
          - "patch"
      # UI components and styling
      ui-components:
        patterns:
          - "@radix-ui/*"
          - "tailwind*"
          - "@tailwindcss/*"
          - "lucide-react"
          - "class-variance-authority"
          - "clsx"
        update-types:
          - "minor"
          - "patch"
      # Graph visualization
      graph-viz:
        patterns:
          - "sigma"
          - "@sigma/*"
          - "graphology*"
        update-types:
          - "minor"
          - "patch"
      # Build tools and dev dependencies
      build-tools:
        patterns:
          - "vite"
          - "@vitejs/*"
          - "typescript"
          - "eslint*"
          - "@eslint/*"
          - "typescript-eslint"
          - "prettier"
          - "prettier-*"
          - "@types/bun"
        update-types:
          - "minor"
          - "patch"
      # Content rendering libraries (math, diagrams, etc.)
      content-rendering:
        patterns:
          - "katex"
          - "mermaid"
        update-types:
          - "minor"
          - "patch"
      # All other minor and patch updates
      frontend-minor-patch:
        patterns:
          - "*"
        update-types:
          - "minor"
          - "patch"
    labels:
      - "dependencies"
      - "frontend"
    open-pull-requests-limit: 5
.github/workflows/copilot-setup-steps.yml (vendored, new file, 58 lines added)
@@ -0,0 +1,58 @@
name: "Copilot Setup Steps"

# Automatically run the setup steps when they are changed to allow for easy validation, and
# allow manual testing through the repository's "Actions" tab
on:
  workflow_dispatch:
  push:
    paths:
      - .github/workflows/copilot-setup-steps.yml
  pull_request:
    paths:
      - .github/workflows/copilot-setup-steps.yml

jobs:
  # The job MUST be called `copilot-setup-steps` or it will not be picked up by Copilot.
  copilot-setup-steps:
    runs-on: ubuntu-latest

    # Timeout after 30 minutes (maximum is 59)
    timeout-minutes: 30

    # You can define any steps you want, and they will run before the agent starts.
    # If you do not check out your code, Copilot will do this for you.
    steps:
      - name: Checkout code
        uses: actions/checkout@v6

      - name: Set up Python 3.11
        uses: actions/setup-python@v6
        with:
          python-version: '3.11'

      - name: Cache pip packages
        uses: actions/cache@v4
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-pip-copilot-${{ hashFiles('**/pyproject.toml') }}
          restore-keys: |
            ${{ runner.os }}-pip-copilot-
            ${{ runner.os }}-pip-

      - name: Install Python dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -e ".[api]"
          pip install pytest pytest-asyncio httpx

      - name: Create minimal frontend stub for Copilot agent
        run: |
          mkdir -p lightrag/api/webui
          echo '<!DOCTYPE html><html><head><title>LightRAG - Copilot Agent</title></head><body><h1>Copilot Agent Mode</h1></body></html>' > lightrag/api/webui/index.html
          echo "Created minimal frontend stub for Copilot agent environment"

      - name: Verify installation
        run: |
          python --version
          pip list | grep lightrag
          lightrag-server --help || echo "Note: Server requires .env configuration to run"
.github/workflows/docker-build-lite.yml (vendored, 4 changed lines)
@@ -18,7 +18,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
-       uses: actions/checkout@v4
+       uses: actions/checkout@v6
        with:
          fetch-depth: 0

@@ -66,7 +66,7 @@ jobs:
          type=raw,value=lite

      - name: Build and push lite Docker image
-       uses: docker/build-push-action@v5
+       uses: docker/build-push-action@v6
        with:
          context: .
          file: ./Dockerfile.lite
.github/workflows/docker-build-manual.yml (vendored, 4 changed lines)
@@ -18,7 +18,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
-       uses: actions/checkout@v4
+       uses: actions/checkout@v6
        with:
          fetch-depth: 0 # Fetch all history for tags

@@ -61,7 +61,7 @@ jobs:
          type=raw,value=${{ steps.get_tag.outputs.tag }}

      - name: Build and push Docker image
-       uses: docker/build-push-action@v5
+       uses: docker/build-push-action@v6
        with:
          context: .
          file: ./Dockerfile
.github/workflows/docker-publish.yml (vendored, 4 changed lines)
@@ -14,7 +14,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
-       uses: actions/checkout@v4
+       uses: actions/checkout@v6
        with:
          fetch-depth: 0 # Fetch all history for tags

@@ -63,7 +63,7 @@ jobs:
          type=raw,value=latest,enable=${{ steps.check_prerelease.outputs.is_prerelease == 'false' }}

      - name: Build and push Docker image
-       uses: docker/build-push-action@v5
+       uses: docker/build-push-action@v6
        with:
          context: .
          file: ./Dockerfile
.github/workflows/linting.yaml (vendored, 5 changed lines)
@@ -10,14 +10,15 @@ on:
  jobs:
    lint-and-format:
      name: Linting and Formatting
      runs-on: ubuntu-latest

      steps:
        - name: Checkout code
-         uses: actions/checkout@v2
+         uses: actions/checkout@v6

        - name: Set up Python
-         uses: actions/setup-python@v2
+         uses: actions/setup-python@v6
          with:
            python-version: '3.x'
.github/workflows/pypi-publish.yml (vendored, 10 changed lines)
@@ -13,13 +13,13 @@ jobs:
    runs-on: ubuntu-latest

    steps:
-     - uses: actions/checkout@v4
+     - uses: actions/checkout@v6
        with:
          fetch-depth: 0 # Fetch all history for tags

      # Build frontend WebUI
      - name: Setup Bun
-       uses: oven-sh/setup-bun@v1
+       uses: oven-sh/setup-bun@v2
        with:
          bun-version: latest

@@ -40,7 +40,7 @@ jobs:
          echo "Frontend files:"
          ls -lh lightrag/api/webui/ | head -10

-     - uses: actions/setup-python@v5
+     - uses: actions/setup-python@v6
        with:
          python-version: "3.x"

@@ -64,7 +64,7 @@ jobs:
          python -m build

      - name: Upload distributions
-       uses: actions/upload-artifact@v4
+       uses: actions/upload-artifact@v5
        with:
          name: release-dists
          path: dist/

@@ -81,7 +81,7 @@ jobs:
    steps:
      - name: Retrieve release distributions
-       uses: actions/download-artifact@v4
+       uses: actions/download-artifact@v6
        with:
          name: release-dists
          path: dist/
.github/workflows/stale.yaml (vendored, 2 changed lines)
@@ -13,7 +13,7 @@ jobs:
  stale:
    runs-on: ubuntu-latest
    steps:
-     - uses: actions/stale@v9
+     - uses: actions/stale@v10
        with:
          days-before-stale: 90 # 90 days
          days-before-close: 7 # 7 days after marked as stale
.github/workflows/tests.yml (vendored, 8 changed lines)
@@ -13,13 +13,13 @@ jobs:
    strategy:
      matrix:
-       python-version: ['3.10', '3.11', '3.12']
+       python-version: ['3.12', '3.13', '3.14']

    steps:
-     - uses: actions/checkout@v4
+     - uses: actions/checkout@v6

      - name: Set up Python ${{ matrix.python-version }}
-       uses: actions/setup-python@v5
+       uses: actions/setup-python@v6
        with:
          python-version: ${{ matrix.python-version }}

@@ -45,7 +45,7 @@ jobs:
      - name: Upload test results
        if: always()
-       uses: actions/upload-artifact@v4
+       uses: actions/upload-artifact@v5
        with:
          name: test-results-py${{ matrix.python-version }}
          path: |
README-zh.md (55 changed lines)
@@ -407,6 +407,11 @@ LightRAG 需要利用LLM和Embeding模型来完成文档索引和知识库查询
* LightRAG还支持类OpenAI的聊天/嵌入API:

```python
import os
import numpy as np
from lightrag.utils import wrap_embedding_func_with_attrs
from lightrag.llm.openai import openai_complete_if_cache, openai_embed

async def llm_model_func(
    prompt, system_prompt=None, history_messages=[], keyword_extraction=False, **kwargs
) -> str:

@@ -420,8 +425,9 @@ async def llm_model_func(
        **kwargs
    )

@wrap_embedding_func_with_attrs(embedding_dim=4096, max_token_size=8192)
async def embedding_func(texts: list[str]) -> np.ndarray:
-   return await openai_embed(
+   return await openai_embed.func(
        texts,
        model="solar-embedding-1-large-query",
        api_key=os.getenv("UPSTAGE_API_KEY"),

@@ -432,16 +438,17 @@ async def initialize_rag():
    rag = LightRAG(
        working_dir=WORKING_DIR,
        llm_model_func=llm_model_func,
-       embedding_func=EmbeddingFunc(
-           embedding_dim=4096,
-           func=embedding_func
-       )
+       embedding_func=embedding_func  # 直接传入装饰后的函数
    )

    await rag.initialize_storages()
    return rag
```

> **关于嵌入函数封装的重要说明:**
>
> `EmbeddingFunc` 不能嵌套封装。已经被 `@wrap_embedding_func_with_attrs` 装饰过的嵌入函数(如 `openai_embed`、`ollama_embed` 等)不能再次使用 `EmbeddingFunc()` 封装。这就是为什么在创建自定义嵌入函数时,我们调用 `xxx_embed.func`(底层未封装的函数)而不是直接调用 `xxx_embed`。

</details>

<details>

@@ -478,19 +485,20 @@ rag = LightRAG(
然后您只需要按如下方式设置LightRAG:

```python
import numpy as np
from lightrag.utils import wrap_embedding_func_with_attrs
from lightrag.llm.ollama import ollama_model_complete, ollama_embed

@wrap_embedding_func_with_attrs(embedding_dim=768, max_token_size=8192)
async def embedding_func(texts: list[str]) -> np.ndarray:
    return await ollama_embed.func(texts, embed_model="nomic-embed-text")

# 使用Ollama模型初始化LightRAG
rag = LightRAG(
    working_dir=WORKING_DIR,
    llm_model_func=ollama_model_complete,  # 使用Ollama模型进行文本生成
    llm_model_name='your_model_name',  # 您的模型名称
    # 使用Ollama嵌入函数
-   embedding_func=EmbeddingFunc(
-       embedding_dim=768,
-       func=lambda texts: ollama_embed(
-           texts,
-           embed_model="nomic-embed-text"
-       )
-   ),
+   embedding_func=embedding_func,  # 直接传入装饰后的函数
)
```

@@ -529,22 +537,27 @@ ollama create -f Modelfile qwen2m
您可以使用`llm_model_kwargs`参数配置ollama:

```python
import numpy as np
from lightrag.utils import wrap_embedding_func_with_attrs
from lightrag.llm.ollama import ollama_model_complete, ollama_embed

@wrap_embedding_func_with_attrs(embedding_dim=768, max_token_size=8192)
async def embedding_func(texts: list[str]) -> np.ndarray:
    return await ollama_embed.func(texts, embed_model="nomic-embed-text")

rag = LightRAG(
    working_dir=WORKING_DIR,
    llm_model_func=ollama_model_complete,  # 使用Ollama模型进行文本生成
    llm_model_name='your_model_name',  # 您的模型名称
    llm_model_kwargs={"options": {"num_ctx": 32768}},
    # 使用Ollama嵌入函数
-   embedding_func=EmbeddingFunc(
-       embedding_dim=768,
-       func=lambda texts: ollama_embed(
-           texts,
-           embed_model="nomic-embed-text"
-       )
-   ),
+   embedding_func=embedding_func,  # 直接传入装饰后的函数
)
```

> **关于嵌入函数封装的重要说明:**
>
> `EmbeddingFunc` 不能嵌套封装。已经被 `@wrap_embedding_func_with_attrs` 装饰过的嵌入函数(如 `openai_embed`、`ollama_embed` 等)不能再次使用 `EmbeddingFunc()` 封装。这就是为什么在创建自定义嵌入函数时,我们调用 `xxx_embed.func`(底层未封装的函数)而不是直接调用 `xxx_embed`。

* **低RAM GPU**

    为了在低RAM GPU上运行此实验,您应该选择小型模型并调整上下文窗口(增加上下文会增加内存消耗)。例如,在6Gb RAM的改装挖矿GPU上运行这个ollama示例需要将上下文大小设置为26k,同时使用`gemma2:2b`。它能够在`book.txt`中找到197个实体和19个关系。
README.md (99 changed lines)
@@ -51,24 +51,24 @@

---
## 🎉 News
- - [2025.11.05]🎯Add **RAGAS-based** Evaluation Framework and **Langfuse** observability for LightRAG (API can return retrieved contexts with query results).
- - [2025.10.22]🎯Eliminate bottlenecks in processing **large-scale datasets**.
- - [2025.09.15]🎯Significantly enhances KG extraction accuracy for **small LLMs** like Qwen3-30B-A3B.
- - [2025.08.29]🎯**Reranker** is supported now , significantly boosting performance for mixed queries(Set as default query mode now).
- - [2025.08.04]🎯**Document deletion** with KG regeneration to ensure query performance.
- - [2025.06.16]🎯Our team has released [RAG-Anything](https://github.com/HKUDS/RAG-Anything) an All-in-One Multimodal RAG System for seamless text, image, table, and equation processing.
- - [2025.06.05]🎯LightRAG now supports comprehensive multimodal data handling through [RAG-Anything](https://github.com/HKUDS/RAG-Anything) integration, enabling seamless document parsing and RAG capabilities across diverse formats including PDFs, images, Office documents, tables, and formulas. Please refer to the new [multimodal section](https://github.com/HKUDS/LightRAG/?tab=readme-ov-file#multimodal-document-processing-rag-anything-integration) for details.
- - [2025.03.18]🎯LightRAG now supports citation functionality, enabling proper source attribution.
- - [2025.02.12]🎯You can now use MongoDB as all in-one Storage.
- - [2025.02.05]🎯Our team has released [VideoRAG](https://github.com/HKUDS/VideoRAG) understanding extremely long-context videos.
- - [2025.01.13]🎯Our team has released [MiniRAG](https://github.com/HKUDS/MiniRAG) making RAG simpler with small models.
- - [2025.01.06]🎯You can now use PostgreSQL as all in-one Storage.
- - [2024.11.19]🎯A comprehensive guide to LightRAG is now available on [LearnOpenCV](https://learnopencv.com/lightrag). Many thanks to the blog author.
- - [2024.11.09]🎯Introducing the LightRAG Webui, which allows you to insert, query, visualize LightRAG knowledge.
- - [2024.11.04]🎯You can now [use Neo4J for Storage](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#using-neo4j-for-storage).
- - [2024.10.18]🎯We've added a link to a [LightRAG Introduction Video](https://youtu.be/oageL-1I0GE). Thanks to the author!
- - [2024.10.17]🎯We have created a [Discord channel](https://discord.gg/yF2MmDJyGJ)! Welcome to join for sharing and discussions! 🎉🎉
- - [2024.10.16]🎯LightRAG now supports [Ollama models](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#quick-start)!
+ - [2025.11]🎯[New Feature]: Integrated **RAGAS for Evaluation** and **Langfuse for Tracing**. Updated the API to return retrieved contexts alongside query results to support context precision metrics.
+ - [2025.10]🎯[Scalability Enhancement]: Eliminated processing bottlenecks to support **Large-Scale Datasets Efficiently**.
+ - [2025.09]🎯[New Feature] Enhances knowledge graph extraction accuracy for **Open-Sourced LLMs** such as Qwen3-30B-A3B.
+ - [2025.08]🎯[New Feature] **Reranker** is now supported, significantly boosting performance for mixed queries (set as default query mode).
+ - [2025.08]🎯[New Feature] Added **Document Deletion** with automatic KG regeneration to ensure optimal query performance.
+ - [2025.06]🎯[New Release] Our team has released [RAG-Anything](https://github.com/HKUDS/RAG-Anything) — an **All-in-One Multimodal RAG** system for seamless processing of text, images, tables, and equations.
+ - [2025.06]🎯[New Feature] LightRAG now supports comprehensive multimodal data handling through [RAG-Anything](https://github.com/HKUDS/RAG-Anything) integration, enabling seamless document parsing and RAG capabilities across diverse formats including PDFs, images, Office documents, tables, and formulas. Please refer to the new [multimodal section](https://github.com/HKUDS/LightRAG/?tab=readme-ov-file#multimodal-document-processing-rag-anything-integration) for details.
+ - [2025.03]🎯[New Feature] LightRAG now supports citation functionality, enabling proper source attribution and enhanced document traceability.
+ - [2025.02]🎯[New Feature] You can now use MongoDB as an all-in-one storage solution for unified data management.
+ - [2025.02]🎯[New Release] Our team has released [VideoRAG](https://github.com/HKUDS/VideoRAG)-a RAG system for understanding extremely long-context videos
+ - [2025.01]🎯[New Release] Our team has released [MiniRAG](https://github.com/HKUDS/MiniRAG) making RAG simpler with small models.
+ - [2025.01]🎯You can now use PostgreSQL as an all-in-one storage solution for data management.
+ - [2024.11]🎯[New Resource] A comprehensive guide to LightRAG is now available on [LearnOpenCV](https://learnopencv.com/lightrag). — explore in-depth tutorials and best practices. Many thanks to the blog author for this excellent contribution!
+ - [2024.11]🎯[New Feature] Introducing the LightRAG WebUI — an interface that allows you to insert, query, and visualize LightRAG knowledge through an intuitive web-based dashboard.
+ - [2024.11]🎯[New Feature] You can now [use Neo4J for Storage](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#using-neo4j-for-storage)-enabling graph database support.
+ - [2024.10]🎯[New Feature] We've added a link to a [LightRAG Introduction Video](https://youtu.be/oageL-1I0GE). — a walkthrough of LightRAG's capabilities. Thanks to the author for this excellent contribution!
+ - [2024.10]🎯[New Channel] We have created a [Discord channel](https://discord.gg/yF2MmDJyGJ)!💬 Welcome to join our community for sharing, discussions, and collaboration! 🎉🎉
+ - [2024.10]🎯[New Feature] LightRAG now supports [Ollama models](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#quick-start)!

<details>
<summary style="font-size: 1.4em; font-weight: bold; cursor: pointer; display: list-item;">

@@ -214,7 +214,7 @@ For a streaming response implementation example, please see `examples/lightrag_o
**Note 2**: Only `lightrag_openai_demo.py` and `lightrag_openai_compatible_demo.py` are officially supported sample codes. Other sample files are community contributions that haven't undergone full testing and optimization.

- ## Programing with LightRAG Core
+ ## Programming with LightRAG Core

> ⚠️ **If you would like to integrate LightRAG into your project, we recommend utilizing the REST API provided by the LightRAG Server**. LightRAG Core is typically intended for embedded applications or for researchers who wish to conduct studies and evaluations.

@@ -313,7 +313,7 @@ A full list of LightRAG init parameters:
| **vector_db_storage_cls_kwargs** | `dict` | Additional parameters for vector database, like setting the threshold for nodes and relations retrieval | cosine_better_than_threshold: 0.2(default value changed by env var COSINE_THRESHOLD) |
| **enable_llm_cache** | `bool` | If `TRUE`, stores LLM results in cache; repeated prompts return cached responses | `TRUE` |
| **enable_llm_cache_for_entity_extract** | `bool` | If `TRUE`, stores LLM results in cache for entity extraction; Good for beginners to debug your application | `TRUE` |
- | **addon_params** | `dict` | Additional parameters, e.g., `{"language": "Simplified Chinese", "entity_types": ["organization", "person", "location", "event"]}`: sets example limit, entiy/relation extraction output language | language: English` |
+ | **addon_params** | `dict` | Additional parameters, e.g., `{"language": "Simplified Chinese", "entity_types": ["organization", "person", "location", "event"]}`: sets example limit, entity/relation extraction output language | language: English` |
| **embedding_cache_config** | `dict` | Configuration for question-answer caching. Contains three parameters: `enabled`: Boolean value to enable/disable cache lookup functionality. When enabled, the system will check cached responses before generating new answers. `similarity_threshold`: Float value (0-1), similarity threshold. When a new question's similarity with a cached question exceeds this threshold, the cached answer will be returned directly without calling the LLM. `use_llm_check`: Boolean value to enable/disable LLM similarity verification. When enabled, LLM will be used as a secondary check to verify the similarity between questions before returning cached answers. | Default: `{"enabled": False, "similarity_threshold": 0.95, "use_llm_check": False}` |

</details>

@@ -364,7 +364,7 @@ class QueryParam:
    max_total_tokens: int = int(os.getenv("MAX_TOTAL_TOKENS", "30000"))
    """Maximum total tokens budget for the entire query context (entities + relations + chunks + system prompt)."""

-   # History mesages is only send to LLM for context, not used for retrieval
+   # History messages are only sent to LLM for context, not used for retrieval
    conversation_history: list[dict[str, str]] = field(default_factory=list)
    """Stores past conversation history to maintain context.
    Format: [{"role": "user/assistant", "content": "message"}].

@@ -403,6 +403,11 @@ LightRAG requires the utilization of LLM and Embedding models to accomplish docu
* LightRAG also supports Open AI-like chat/embeddings APIs:

```python
import os
import numpy as np
from lightrag.utils import wrap_embedding_func_with_attrs
from lightrag.llm.openai import openai_complete_if_cache, openai_embed

async def llm_model_func(
    prompt, system_prompt=None, history_messages=[], keyword_extraction=False, **kwargs
) -> str:

@@ -416,8 +421,9 @@ async def llm_model_func(
        **kwargs
    )

@wrap_embedding_func_with_attrs(embedding_dim=4096, max_token_size=8192)
async def embedding_func(texts: list[str]) -> np.ndarray:
-   return await openai_embed(
+   return await openai_embed.func(
        texts,
        model="solar-embedding-1-large-query",
        api_key=os.getenv("UPSTAGE_API_KEY"),

@@ -428,16 +434,17 @@ async def initialize_rag():
    rag = LightRAG(
        working_dir=WORKING_DIR,
        llm_model_func=llm_model_func,
-       embedding_func=EmbeddingFunc(
-           embedding_dim=4096,
-           func=embedding_func
-       )
+       embedding_func=embedding_func  # Pass the decorated function directly
    )

    await rag.initialize_storages()
    return rag
```

> **Important Note on Embedding Function Wrapping:**
>
> `EmbeddingFunc` cannot be nested. Functions that have been decorated with `@wrap_embedding_func_with_attrs` (such as `openai_embed`, `ollama_embed`, etc.) cannot be wrapped again using `EmbeddingFunc()`. This is why we call `xxx_embed.func` (the underlying unwrapped function) instead of `xxx_embed` directly when creating custom embedding functions.

</details>

<details>

@@ -476,19 +483,20 @@ If you want to use Ollama models, you need to pull model you plan to use and emb
Then you only need to set LightRAG as follows:

```python
import numpy as np
from lightrag.utils import wrap_embedding_func_with_attrs
from lightrag.llm.ollama import ollama_model_complete, ollama_embed

@wrap_embedding_func_with_attrs(embedding_dim=768, max_token_size=8192)
async def embedding_func(texts: list[str]) -> np.ndarray:
    return await ollama_embed.func(texts, embed_model="nomic-embed-text")

# Initialize LightRAG with Ollama model
rag = LightRAG(
    working_dir=WORKING_DIR,
    llm_model_func=ollama_model_complete,  # Use Ollama model for text generation
    llm_model_name='your_model_name',  # Your model name
    # Use Ollama embedding function
-   embedding_func=EmbeddingFunc(
-       embedding_dim=768,
-       func=lambda texts: ollama_embed(
-           texts,
-           embed_model="nomic-embed-text"
-       )
-   ),
+   embedding_func=embedding_func,  # Pass the decorated function directly
)
```

@@ -527,22 +535,27 @@ ollama create -f Modelfile qwen2m
Tiy can use `llm_model_kwargs` param to configure ollama:

```python
import numpy as np
from lightrag.utils import wrap_embedding_func_with_attrs
from lightrag.llm.ollama import ollama_model_complete, ollama_embed

@wrap_embedding_func_with_attrs(embedding_dim=768, max_token_size=8192)
async def embedding_func(texts: list[str]) -> np.ndarray:
    return await ollama_embed.func(texts, embed_model="nomic-embed-text")

rag = LightRAG(
    working_dir=WORKING_DIR,
    llm_model_func=ollama_model_complete,  # Use Ollama model for text generation
    llm_model_name='your_model_name',  # Your model name
    llm_model_kwargs={"options": {"num_ctx": 32768}},
    # Use Ollama embedding function
-   embedding_func=EmbeddingFunc(
-       embedding_dim=768,
-       func=lambda texts: ollama_embed(
-           texts,
-           embed_model="nomic-embed-text"
-       )
-   ),
+   embedding_func=embedding_func,  # Pass the decorated function directly
)
```

> **Important Note on Embedding Function Wrapping:**
>
> `EmbeddingFunc` cannot be nested. Functions that have been decorated with `@wrap_embedding_func_with_attrs` (such as `openai_embed`, `ollama_embed`, etc.) cannot be wrapped again using `EmbeddingFunc()`. This is why we call `xxx_embed.func` (the underlying unwrapped function) instead of `xxx_embed` directly when creating custom embedding functions.

* **Low RAM GPUs**

    In order to run this experiment on low RAM GPU you should select small model and tune context window (increasing context increase memory consumption). For example, running this ollama example on repurposed mining GPU with 6Gb of RAM required to set context size to 26k while using `gemma2:2b`. It was able to find 197 entities and 19 relations on `book.txt`.

@@ -1555,7 +1568,7 @@ Langfuse provides a drop-in replacement for the OpenAI client that automatically
pip install lightrag-hku
pip install lightrag-hku[observability]

- # Or install from souce code with debug mode enabled
+ # Or install from source code with debug mode enabled
pip install -e .
pip install -e ".[observability]"
```
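The embedding-wrapping note added to both READMEs above is easiest to see in a small standalone sketch. The block below is illustrative and not part of the diff: it assumes lightrag-hku is installed, reuses the names shown in the README snippets (openai_embed, wrap_embedding_func_with_attrs, EmbeddingFunc), and treats 1536 dimensions (text-embedding-3-small) as an assumed provider default.

```python
# Minimal sketch of the wrapping rule described in the README note above.
# Assumption: lightrag-hku is installed; no API call is made here.
import numpy as np
from lightrag.utils import EmbeddingFunc, wrap_embedding_func_with_attrs
from lightrag.llm.openai import openai_embed

# openai_embed already ships decorated with @wrap_embedding_func_with_attrs,
# so it is an EmbeddingFunc carrying its provider defaults and can be passed
# to LightRAG(embedding_func=openai_embed) as-is.
assert isinstance(openai_embed, EmbeddingFunc)
print("provider default dim:", openai_embed.embedding_dim)

# Wrong: wrapping an already-wrapped function a second time is the nesting
# the note warns against.
# nested = EmbeddingFunc(embedding_dim=1536, func=openai_embed)  # don't do this

# Right: apply the decorator once and call the underlying provider function
# through `.func`, exactly as the updated README examples do.
@wrap_embedding_func_with_attrs(embedding_dim=1536, max_token_size=8192)
async def embedding_func(texts: list[str]) -> np.ndarray:
    return await openai_embed.func(texts, model="text-embedding-3-small")
```

The decorated embedding_func can then be handed straight to LightRAG(embedding_func=embedding_func), as in the README snippets above.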
env.example (35 changed lines)
@@ -102,6 +102,9 @@ RERANK_BINDING=null
# RERANK_MODEL=rerank-v3.5
# RERANK_BINDING_HOST=https://api.cohere.com/v2/rerank
# RERANK_BINDING_API_KEY=your_rerank_api_key_here
+ ### Cohere rerank chunking configuration (useful for models with token limits like ColBERT)
+ # RERANK_ENABLE_CHUNKING=true
+ # RERANK_MAX_TOKENS_PER_DOC=480

### Default value for Jina AI
# RERANK_MODEL=jina-reranker-v2-base-multilingual

@@ -183,9 +186,13 @@ LLM_MODEL=gpt-4o
LLM_BINDING_HOST=https://api.openai.com/v1
LLM_BINDING_API_KEY=your_api_key

- ### Env vars for Azure openai
+ ### Azure OpenAI example
### Use deployment name as model name or set AZURE_OPENAI_DEPLOYMENT instead
# AZURE_OPENAI_API_VERSION=2024-08-01-preview
# AZURE_OPENAI_DEPLOYMENT=gpt-4o
# LLM_BINDING=azure_openai
# LLM_BINDING_HOST=https://xxxx.openai.azure.com/
# LLM_BINDING_API_KEY=your_api_key
# LLM_MODEL=my-gpt-mini-deployment

### Openrouter example
# LLM_MODEL=google/gemini-2.5-flash

@@ -273,11 +280,14 @@ EMBEDDING_TOKEN_LIMIT=8192
EMBEDDING_BINDING_HOST=https://api.openai.com/v1
EMBEDDING_BINDING_API_KEY=your_api_key

- ### Optional for Azure
- # AZURE_EMBEDDING_DEPLOYMENT=text-embedding-3-large
- # AZURE_EMBEDDING_API_VERSION=2023-05-15
- # AZURE_EMBEDDING_ENDPOINT=your_endpoint
- # AZURE_EMBEDDING_API_KEY=your_api_key
+ ### Optional for Azure embedding
+ ### Use deployment name as model name or set AZURE_EMBEDDING_DEPLOYMENT instead
+ # AZURE_EMBEDDING_API_VERSION=2024-08-01-preview
+ # EMBEDDING_BINDING=azure_openai
+ # EMBEDDING_BINDING_HOST=https://xxxx.openai.azure.com/
+ # EMBEDDING_API_KEY=your_api_key
+ # EMBEDDING_MODEL==my-text-embedding-3-large-deployment
+ # EMBEDDING_DIM=3072

### Gemini embedding
# EMBEDDING_BINDING=gemini

@@ -440,6 +450,17 @@ MEMGRAPH_DATABASE=memgraph
### DB specific workspace should not be set, keep for compatible only
### MEMGRAPH_WORKSPACE=forced_workspace_name

+ ###########################################################
+ ### Langfuse Observability Configuration
+ ### Only works with LLM provided by OpenAI compatible API
+ ### Install with: pip install lightrag-hku[observability]
+ ### Sign up at: https://cloud.langfuse.com or self-host
+ ###########################################################
+ # LANGFUSE_SECRET_KEY=""
+ # LANGFUSE_PUBLIC_KEY=""
+ # LANGFUSE_HOST="https://cloud.langfuse.com" # 或您的自托管实例地址
+ # LANGFUSE_ENABLE_TRACE=true
+
############################
### Evaluation Configuration
############################
@@ -15,9 +15,12 @@ Configuration Required:
       EMBEDDING_BINDING_HOST
       EMBEDDING_BINDING_API_KEY
    3. Set your vLLM deployed AI rerank model setting with env vars:
-      RERANK_MODEL
-      RERANK_BINDING_HOST
+      RERANK_BINDING=cohere
+      RERANK_MODEL (e.g., answerai-colbert-small-v1 or rerank-v3.5)
+      RERANK_BINDING_HOST (e.g., https://api.cohere.com/v2/rerank or LiteLLM proxy)
       RERANK_BINDING_API_KEY
+      RERANK_ENABLE_CHUNKING=true (optional, for models with token limits)
+      RERANK_MAX_TOKENS_PER_DOC=480 (optional, default 4096)

    Note: Rerank is controlled per query via the 'enable_rerank' parameter (default: True)
    """

@@ -66,9 +69,11 @@ async def embedding_func(texts: list[str]) -> np.ndarray:
rerank_model_func = partial(
    cohere_rerank,
-   model=os.getenv("RERANK_MODEL"),
+   model=os.getenv("RERANK_MODEL", "rerank-v3.5"),
    api_key=os.getenv("RERANK_BINDING_API_KEY"),
-   base_url=os.getenv("RERANK_BINDING_HOST"),
+   base_url=os.getenv("RERANK_BINDING_HOST", "https://api.cohere.com/v2/rerank"),
+   enable_chunking=os.getenv("RERANK_ENABLE_CHUNKING", "false").lower() == "true",
+   max_tokens_per_doc=int(os.getenv("RERANK_MAX_TOKENS_PER_DOC", "4096")),
)
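As the updated docstring notes, reranking is switched on or off per query through the enable_rerank parameter. The sketch below is an illustration rather than code from this diff: it presumes a LightRAG instance configured with the rerank_model_func built above, and it assumes QueryParam exposes an enable_rerank field as the docstring implies; the query text is a placeholder.

```python
# Hedged usage sketch for the per-query rerank toggle described above.
# Assumptions: `rag` is an initialized LightRAG instance wired with the
# cohere_rerank-based rerank_model_func from the example; QueryParam carries
# an enable_rerank flag (default True per the docstring).
from lightrag import LightRAG, QueryParam

async def query_with_rerank(rag: LightRAG) -> str:
    param = QueryParam(mode="mix", enable_rerank=True)  # shown explicitly for clarity
    return await rag.aquery("What entities are related to ColBERT?", param=param)
```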
@@ -1 +1 @@
- __api_version__ = "0256"
+ __api_version__ = "0259"
@@ -365,8 +365,12 @@ def parse_args() -> argparse.Namespace:
    # Inject model configuration
    args.llm_model = get_env_value("LLM_MODEL", "mistral-nemo:latest")
-   args.embedding_model = get_env_value("EMBEDDING_MODEL", "bge-m3:latest")
-   args.embedding_dim = get_env_value("EMBEDDING_DIM", 1024, int)
+   # EMBEDDING_MODEL defaults to None - each binding will use its own default model
+   # e.g., OpenAI uses "text-embedding-3-small", Jina uses "jina-embeddings-v4"
+   args.embedding_model = get_env_value("EMBEDDING_MODEL", None, special_none=True)
+   # EMBEDDING_DIM defaults to None - each binding will use its own default dimension
+   # Value is inherited from provider defaults via wrap_embedding_func_with_attrs decorator
+   args.embedding_dim = get_env_value("EMBEDDING_DIM", None, int, special_none=True)
    args.embedding_send_dim = get_env_value("EMBEDDING_SEND_DIM", False, bool)

    # Inject chunk configuration
@@ -159,19 +159,22 @@ def check_frontend_build():
    """Check if frontend is built and optionally check if source is up-to-date

    Returns:
-       bool: True if frontend is outdated, False if up-to-date or production environment
+       tuple: (assets_exist: bool, is_outdated: bool)
+           - assets_exist: True if WebUI build files exist
+           - is_outdated: True if source is newer than build (only in dev environment)
    """
    webui_dir = Path(__file__).parent / "webui"
    index_html = webui_dir / "index.html"

-   # 1. Check if build files exist (required)
+   # 1. Check if build files exist
    if not index_html.exists():
-       ASCIIColors.red("\n" + "=" * 80)
-       ASCIIColors.red("ERROR: Frontend Not Built")
-       ASCIIColors.red("=" * 80)
+       ASCIIColors.yellow("\n" + "=" * 80)
+       ASCIIColors.yellow("WARNING: Frontend Not Built")
+       ASCIIColors.yellow("=" * 80)
        ASCIIColors.yellow("The WebUI frontend has not been built yet.")
+       ASCIIColors.yellow("The API server will start without the WebUI interface.")
        ASCIIColors.yellow(
-           "Please build the frontend code first using the following commands:\n"
+           "\nTo enable WebUI, build the frontend using these commands:\n"
        )
        ASCIIColors.cyan(" cd lightrag_webui")
        ASCIIColors.cyan(" bun install --frozen-lockfile")

@@ -181,8 +184,8 @@ def check_frontend_build():
        ASCIIColors.cyan(
            "Note: Make sure you have Bun installed. Visit https://bun.sh for installation."
        )
-       ASCIIColors.red("=" * 80 + "\n")
-       sys.exit(1)  # Exit immediately
+       ASCIIColors.yellow("=" * 80 + "\n")
+       return (False, False)  # Assets don't exist, not outdated

    # 2. Check if this is a development environment (source directory exists)
    try:

@@ -195,7 +198,7 @@ def check_frontend_build():
            logger.debug(
                "Production environment detected, skipping source freshness check"
            )
-           return False
+           return (True, False)  # Assets exist, not outdated (prod environment)

        # Development environment, perform source code timestamp check
        logger.debug("Development environment detected, checking source freshness")

@@ -270,20 +273,20 @@ def check_frontend_build():
            ASCIIColors.cyan(" cd ..")
            ASCIIColors.yellow("\nThe server will continue with the current build.")
            ASCIIColors.yellow("=" * 80 + "\n")
-           return True  # Frontend is outdated
+           return (True, True)  # Assets exist, outdated
        else:
            logger.info("Frontend build is up-to-date")
-           return False  # Frontend is up-to-date
+           return (True, False)  # Assets exist, up-to-date

    except Exception as e:
        # If check fails, log warning but don't affect startup
        logger.warning(f"Failed to check frontend source freshness: {e}")
-       return False  # Assume up-to-date on error
+       return (True, False)  # Assume assets exist and up-to-date on error


def create_app(args):
-   # Check frontend build first and get outdated status
-   is_frontend_outdated = check_frontend_build()
+   # Check frontend build first and get status
+   webui_assets_exist, is_frontend_outdated = check_frontend_build()

    # Create unified API version display with warning symbol if frontend is outdated
    api_version_display = (

@@ -652,6 +655,17 @@ def create_app(args):
        2. Extracts max_token_size and embedding_dim from provider if it's an EmbeddingFunc
        3. Creates an optimized wrapper that calls the underlying function directly (avoiding double-wrapping)
        4. Returns a properly configured EmbeddingFunc instance

+       Configuration Rules:
+       - When EMBEDDING_MODEL is not set: Uses provider's default model and dimension
+         (e.g., jina-embeddings-v4 with 2048 dims, text-embedding-3-small with 1536 dims)
+       - When EMBEDDING_MODEL is set to a custom model: User MUST also set EMBEDDING_DIM
+         to match the custom model's dimension (e.g., for jina-embeddings-v3, set EMBEDDING_DIM=1024)
+
+       Note: The embedding_dim parameter is automatically injected by EmbeddingFunc wrapper
+       when send_dimensions=True (enabled for Jina and Gemini bindings). This wrapper calls
+       the underlying provider function directly (.func) to avoid double-wrapping, so we must
+       explicitly pass embedding_dim to the provider's underlying function.
        """

        # Step 1: Import provider function and extract default attributes

@@ -714,6 +728,7 @@ def create_app(args):
        )

        # Step 3: Create optimized embedding function (calls underlying function directly)
+       # Note: When model is None, each binding will use its own default model
        async def optimized_embedding_function(texts, embedding_dim=None):
            try:
                if binding == "lollms":

@@ -725,9 +740,9 @@ def create_app(args):
                    if isinstance(lollms_embed, EmbeddingFunc)
                    else lollms_embed
                )
-               return await actual_func(
-                   texts, embed_model=model, host=host, api_key=api_key
-               )
+               # lollms embed_model is not used (server uses configured vectorizer)
+               # Only pass base_url and api_key
+               return await actual_func(texts, base_url=host, api_key=api_key)
            elif binding == "ollama":
                from lightrag.llm.ollama import ollama_embed

@@ -746,13 +761,16 @@ def create_app(args):
                ollama_options = OllamaEmbeddingOptions.options_dict(args)

-               return await actual_func(
-                   texts,
-                   embed_model=model,
-                   host=host,
-                   api_key=api_key,
-                   options=ollama_options,
-               )
+               # Pass embed_model only if provided, let function use its default (bge-m3:latest)
+               kwargs = {
+                   "texts": texts,
+                   "host": host,
+                   "api_key": api_key,
+                   "options": ollama_options,
+               }
+               if model:
+                   kwargs["embed_model"] = model
+               return await actual_func(**kwargs)
            elif binding == "azure_openai":
                from lightrag.llm.azure_openai import azure_openai_embed

@@ -761,7 +779,11 @@ def create_app(args):
                    if isinstance(azure_openai_embed, EmbeddingFunc)
                    else azure_openai_embed
                )
-               return await actual_func(texts, model=model, api_key=api_key)
+               # Pass model only if provided, let function use its default otherwise
+               kwargs = {"texts": texts, "api_key": api_key}
+               if model:
+                   kwargs["model"] = model
+               return await actual_func(**kwargs)
            elif binding == "aws_bedrock":
                from lightrag.llm.bedrock import bedrock_embed

@@ -770,7 +792,11 @@ def create_app(args):
                    if isinstance(bedrock_embed, EmbeddingFunc)
                    else bedrock_embed
                )
-               return await actual_func(texts, model=model)
+               # Pass model only if provided, let function use its default otherwise
+               kwargs = {"texts": texts}
+               if model:
+                   kwargs["model"] = model
+               return await actual_func(**kwargs)
            elif binding == "jina":
                from lightrag.llm.jina import jina_embed

@@ -779,12 +805,16 @@ def create_app(args):
                    if isinstance(jina_embed, EmbeddingFunc)
                    else jina_embed
                )
-               return await actual_func(
-                   texts,
-                   embedding_dim=embedding_dim,
-                   base_url=host,
-                   api_key=api_key,
-               )
+               # Pass model only if provided, let function use its default (jina-embeddings-v4)
+               kwargs = {
+                   "texts": texts,
+                   "embedding_dim": embedding_dim,
+                   "base_url": host,
+                   "api_key": api_key,
+               }
+               if model:
+                   kwargs["model"] = model
+               return await actual_func(**kwargs)
            elif binding == "gemini":
                from lightrag.llm.gemini import gemini_embed

@@ -801,15 +831,19 @@ def create_app(args):
                from lightrag.llm.binding_options import GeminiEmbeddingOptions

                gemini_options = GeminiEmbeddingOptions.options_dict(args)

-               return await actual_func(
-                   texts,
-                   model=model,
-                   base_url=host,
-                   api_key=api_key,
-                   embedding_dim=embedding_dim,
-                   task_type=gemini_options.get("task_type", "RETRIEVAL_DOCUMENT"),
-               )
+               # Pass model only if provided, let function use its default (gemini-embedding-001)
+               kwargs = {
+                   "texts": texts,
+                   "base_url": host,
+                   "api_key": api_key,
+                   "embedding_dim": embedding_dim,
+                   "task_type": gemini_options.get(
+                       "task_type", "RETRIEVAL_DOCUMENT"
+                   ),
+               }
+               if model:
+                   kwargs["model"] = model
+               return await actual_func(**kwargs)
            elif binding == "voyageai":
                from lightrag.llm.voyageai import voyageai_embed

@@ -832,12 +866,16 @@ def create_app(args):
                    if isinstance(openai_embed, EmbeddingFunc)
                    else openai_embed
                )
-               return await actual_func(
-                   texts,
-                   model=model,
-                   api_key=api_key,
-                   embedding_dim=embedding_dim,
-               )
+               # Pass model only if provided, let function use its default (text-embedding-3-small)
+               kwargs = {
+                   "texts": texts,
+                   "base_url": host,
+                   "api_key": api_key,
+                   "embedding_dim": embedding_dim,
+               }
+               if model:
+                   kwargs["model"] = model
+               return await actual_func(**kwargs)
        except ImportError as e:
            raise Exception(f"Failed to import {binding} embedding: {e}")

@@ -984,15 +1022,27 @@ def create_app(args):
            query: str, documents: list, top_n: int = None, extra_body: dict = None
        ):
            """Server rerank function with configuration from environment variables"""
-           return await selected_rerank_func(
-               query=query,
-               documents=documents,
-               top_n=top_n,
-               api_key=args.rerank_binding_api_key,
-               model=args.rerank_model,
-               base_url=args.rerank_binding_host,
-               extra_body=extra_body,
-           )
+           # Prepare kwargs for rerank function
+           kwargs = {
+               "query": query,
+               "documents": documents,
+               "top_n": top_n,
+               "api_key": args.rerank_binding_api_key,
+               "model": args.rerank_model,
+               "base_url": args.rerank_binding_host,
+           }
+
+           # Add Cohere-specific parameters if using cohere binding
+           if args.rerank_binding == "cohere":
+               # Enable chunking if configured (useful for models with token limits like ColBERT)
+               kwargs["enable_chunking"] = (
+                   os.getenv("RERANK_ENABLE_CHUNKING", "false").lower() == "true"
+               )
+               kwargs["max_tokens_per_doc"] = int(
+                   os.getenv("RERANK_MAX_TOKENS_PER_DOC", "4096")
+               )
+
+           return await selected_rerank_func(**kwargs, extra_body=extra_body)

        rerank_model_func = server_rerank_func
        logger.info(

@@ -1084,8 +1134,11 @@ def create_app(args):
    @app.get("/")
    async def redirect_to_webui():
-       """Redirect root path to /webui"""
-       return RedirectResponse(url="/webui")
+       """Redirect root path based on WebUI availability"""
+       if webui_assets_exist:
+           return RedirectResponse(url="/webui")
+       else:
+           return RedirectResponse(url="/docs")

    @app.get("/auth-status")
    async def get_auth_status():

@@ -1152,9 +1205,41 @@ def create_app(args):
        "webui_description": webui_description,
    }

-   @app.get("/health", dependencies=[Depends(combined_auth)])
+   @app.get(
+       "/health",
+       dependencies=[Depends(combined_auth)],
+       summary="Get system health and configuration status",
+       description="Returns comprehensive system status including WebUI availability, configuration, and operational metrics",
+       response_description="System health status with configuration details",
+       responses={
+           200: {
+               "description": "Successful response with system status",
+               "content": {
+                   "application/json": {
+                       "example": {
+                           "status": "healthy",
+                           "webui_available": True,
+                           "working_directory": "/path/to/working/dir",
+                           "input_directory": "/path/to/input/dir",
+                           "configuration": {
+                               "llm_binding": "openai",
+                               "llm_model": "gpt-4",
+                               "embedding_binding": "openai",
+                               "embedding_model": "text-embedding-ada-002",
+                               "workspace": "default",
+                           },
+                           "auth_mode": "enabled",
+                           "pipeline_busy": False,
+                           "core_version": "0.0.1",
+                           "api_version": "0.0.1",
+                       }
+                   }
+               },
+           }
+       },
+   )
    async def get_status(request: Request):
-       """Get current system status"""
+       """Get current system status including WebUI availability"""
        try:
            workspace = get_workspace_from_request(request)
            default_workspace = get_default_workspace()

@@ -1174,6 +1259,7 @@ def create_app(args):
            return {
                "status": "healthy",
+               "webui_available": webui_assets_exist,
                "working_directory": str(args.working_dir),
                "input_directory": str(args.input_dir),
                "configuration": {

@@ -1263,16 +1349,27 @@ def create_app(args):
        name="swagger-ui-static",
    )

-   # Webui mount webui/index.html
-   static_dir = Path(__file__).parent / "webui"
-   static_dir.mkdir(exist_ok=True)
-   app.mount(
-       "/webui",
-       SmartStaticFiles(
-           directory=static_dir, html=True, check_dir=True
-       ),  # Use SmartStaticFiles
-       name="webui",
-   )
+   # Conditionally mount WebUI only if assets exist
+   if webui_assets_exist:
+       static_dir = Path(__file__).parent / "webui"
+       static_dir.mkdir(exist_ok=True)
+       app.mount(
+           "/webui",
+           SmartStaticFiles(
+               directory=static_dir, html=True, check_dir=True
+           ),  # Use SmartStaticFiles
+           name="webui",
+       )
+       logger.info("WebUI assets mounted at /webui")
+   else:
+       logger.info("WebUI assets not available, /webui route not mounted")
+
+       # Add redirect for /webui when assets are not available
+       @app.get("/webui")
+       @app.get("/webui/")
+       async def webui_redirect_to_docs():
+           """Redirect /webui to /docs when WebUI is not available"""
+           return RedirectResponse(url="/docs")

    return app
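Since /health now reports webui_available, a client can use it to decide whether to open the WebUI or fall back to /docs. The snippet below is a hypothetical client-side check, not code from the diff; the base URL (a common local default) and the unauthenticated call are assumptions about a development setup.

```python
# Hedged sketch: polling the /health endpoint extended above.
# Assumptions: the server is reachable at the given base URL and any
# configured authentication is handled separately.
import httpx

def check_health(base_url: str = "http://localhost:9621") -> None:
    resp = httpx.get(f"{base_url}/health", timeout=10)
    resp.raise_for_status()
    data = resp.json()
    # webui_available mirrors the webui_assets_exist flag from check_frontend_build()
    print(data["status"], "webui_available:", data.get("webui_available"))

if __name__ == "__main__":
    check_health()
```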
|
|
@ -24,7 +24,11 @@ from pydantic import BaseModel, Field, field_validator
|
|||
|
||||
from lightrag import LightRAG
|
||||
from lightrag.base import DeletionResult, DocProcessingStatus, DocStatus
|
||||
from lightrag.utils import generate_track_id
|
||||
from lightrag.utils import (
|
||||
generate_track_id,
|
||||
compute_mdhash_id,
|
||||
sanitize_text_for_encoding,
|
||||
)
|
||||
from lightrag.api.utils_api import get_combined_auth_dependency
|
||||
from ..config import global_args
|
||||
|
||||
|
|
@ -159,7 +163,7 @@ class ReprocessResponse(BaseModel):
|
|||
Attributes:
|
||||
status: Status of the reprocessing operation
|
||||
message: Message describing the operation result
|
||||
track_id: Tracking ID for monitoring reprocessing progress
|
||||
track_id: Always empty string. Reprocessed documents retain their original track_id.
|
||||
"""
|
||||
|
||||
status: Literal["reprocessing_started"] = Field(
|
||||
|
|
@ -167,7 +171,8 @@ class ReprocessResponse(BaseModel):
|
|||
)
|
||||
message: str = Field(description="Human-readable message describing the operation")
|
||||
track_id: str = Field(
|
||||
description="Tracking ID for monitoring reprocessing progress"
|
||||
default="",
|
||||
description="Always empty string. Reprocessed documents retain their original track_id from initial upload.",
|
||||
)
|
||||
|
||||
class Config:
|
||||
|
|
@ -175,7 +180,7 @@ class ReprocessResponse(BaseModel):
|
|||
"example": {
|
||||
"status": "reprocessing_started",
|
||||
"message": "Reprocessing of failed documents has been initiated in background",
|
||||
"track_id": "retry_20250729_170612_def456",
|
||||
"track_id": "",
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -2097,12 +2102,14 @@ def create_document_routes(
|
|||
# Check if filename already exists in doc_status storage
|
||||
existing_doc_data = await rag.doc_status.get_doc_by_file_path(safe_filename)
|
||||
if existing_doc_data:
|
||||
# Get document status information for error message
|
||||
# Get document status and track_id from existing document
|
||||
status = existing_doc_data.get("status", "unknown")
|
||||
# Use `or ""` to handle both missing key and None value (e.g., legacy rows without track_id)
|
||||
existing_track_id = existing_doc_data.get("track_id") or ""
|
||||
return InsertResponse(
|
||||
status="duplicated",
|
||||
message=f"File '{safe_filename}' already exists in document storage (Status: {status}).",
|
||||
track_id="",
|
||||
track_id=existing_track_id,
|
||||
)
|
||||
|
||||
file_path = doc_manager.input_dir / safe_filename
|
||||
|
|
@ -2166,14 +2173,30 @@ def create_document_routes(
|
|||
request.file_source
|
||||
)
|
||||
if existing_doc_data:
|
||||
# Get document status information for error message
|
||||
# Get document status and track_id from existing document
|
||||
status = existing_doc_data.get("status", "unknown")
|
||||
# Use `or ""` to handle both missing key and None value (e.g., legacy rows without track_id)
|
||||
existing_track_id = existing_doc_data.get("track_id") or ""
|
||||
return InsertResponse(
|
||||
status="duplicated",
|
||||
message=f"File source '{request.file_source}' already exists in document storage (Status: {status}).",
|
||||
track_id="",
|
||||
track_id=existing_track_id,
|
||||
)
|
||||
|
||||
# Check if content already exists by computing content hash (doc_id)
|
||||
sanitized_text = sanitize_text_for_encoding(request.text)
|
||||
content_doc_id = compute_mdhash_id(sanitized_text, prefix="doc-")
|
||||
existing_doc = await rag.doc_status.get_by_id(content_doc_id)
|
||||
if existing_doc:
|
||||
# Content already exists, return duplicated with existing track_id
|
||||
status = existing_doc.get("status", "unknown")
|
||||
existing_track_id = existing_doc.get("track_id") or ""
|
||||
return InsertResponse(
|
||||
status="duplicated",
|
||||
message=f"Identical content already exists in document storage (doc_id: {content_doc_id}, Status: {status}).",
|
||||
track_id=existing_track_id,
|
||||
)
|
||||
|
||||
# Generate track_id for text insertion
|
||||
track_id = generate_track_id("insert")
|
||||
|
||||
|
|
@ -2232,14 +2255,31 @@ def create_document_routes(
|
|||
file_source
|
||||
)
|
||||
if existing_doc_data:
|
||||
# Get document status information for error message
|
||||
# Get document status and track_id from existing document
|
||||
status = existing_doc_data.get("status", "unknown")
|
||||
# Use `or ""` to handle both missing key and None value (e.g., legacy rows without track_id)
|
||||
existing_track_id = existing_doc_data.get("track_id") or ""
|
||||
return InsertResponse(
|
||||
status="duplicated",
|
||||
message=f"File source '{file_source}' already exists in document storage (Status: {status}).",
|
||||
track_id="",
|
||||
track_id=existing_track_id,
|
||||
)
|
||||
|
||||
# Check if any content already exists by computing content hash (doc_id)
|
||||
for text in request.texts:
|
||||
sanitized_text = sanitize_text_for_encoding(text)
|
||||
content_doc_id = compute_mdhash_id(sanitized_text, prefix="doc-")
|
||||
existing_doc = await rag.doc_status.get_by_id(content_doc_id)
|
||||
if existing_doc:
|
||||
# Content already exists, return duplicated with existing track_id
|
||||
status = existing_doc.get("status", "unknown")
|
||||
existing_track_id = existing_doc.get("track_id") or ""
|
||||
return InsertResponse(
|
||||
status="duplicated",
|
||||
message=f"Identical content already exists in document storage (doc_id: {content_doc_id}, Status: {status}).",
|
||||
track_id=existing_track_id,
|
||||
)
|
||||
|
||||
# Generate track_id for texts insertion
|
||||
track_id = generate_track_id("insert")
|
||||
|
||||
|
|
@ -3058,29 +3098,27 @@ def create_document_routes(
|
|||
This is useful for recovering from server crashes, network errors, LLM service
|
||||
outages, or other temporary failures that caused document processing to fail.
|
||||
|
||||
The processing happens in the background and can be monitored using the
|
||||
returned track_id or by checking the pipeline status.
|
||||
The processing happens in the background and can be monitored by checking the
pipeline status. Reprocessed documents retain the track_id from their initial
upload, so monitor progress with that original track_id.
|
||||
|
||||
Returns:
|
||||
ReprocessResponse: Response with status, message, and track_id
|
||||
ReprocessResponse: Response with status and message.
|
||||
track_id is always an empty string because reprocessed documents retain
their original track_id from the initial upload.
|
||||
|
||||
Raises:
|
||||
HTTPException: If an error occurs while initiating reprocessing (500).
|
||||
"""
|
||||
try:
|
||||
# Generate track_id with "retry" prefix for retry operation
|
||||
track_id = generate_track_id("retry")
|
||||
|
||||
# Start the reprocessing in the background
|
||||
# Note: Reprocessed documents retain their original track_id from initial upload
|
||||
background_tasks.add_task(rag.apipeline_process_enqueue_documents)
|
||||
logger.info(
|
||||
f"Reprocessing of failed documents initiated with track_id: {track_id}"
|
||||
)
|
||||
logger.info("Reprocessing of failed documents initiated")
|
||||
|
||||
return ReprocessResponse(
|
||||
status="reprocessing_started",
|
||||
message="Reprocessing of failed documents has been initiated in background",
|
||||
track_id=track_id,
|
||||
message="Reprocessing of failed documents has been initiated in background. Documents retain their original track_id.",
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
@ -100,7 +100,7 @@ def main():
|
|||
print("\nHow to fix:")
|
||||
print(" Option 1 - Set environment variable before starting (recommended):")
|
||||
print(" export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES")
|
||||
print(" lightrag-server")
|
||||
print(" lightrag-gunicorn --workers 2")
|
||||
print("\n Option 2 - Add to your shell profile (~/.zshrc or ~/.bash_profile):")
|
||||
print(" echo 'export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES' >> ~/.zshrc")
|
||||
print(" source ~/.zshrc")
|
||||
@ -44,6 +44,23 @@ config.read("config.ini", "utf-8")
|
|||
logging.getLogger("neo4j").setLevel(logging.ERROR)
|
||||
|
||||
|
||||
READ_RETRY_EXCEPTIONS = (
|
||||
neo4jExceptions.ServiceUnavailable,
|
||||
neo4jExceptions.TransientError,
|
||||
neo4jExceptions.SessionExpired,
|
||||
ConnectionResetError,
|
||||
OSError,
|
||||
AttributeError,
|
||||
)
|
||||
|
||||
READ_RETRY = retry(
|
||||
stop=stop_after_attempt(3),
|
||||
wait=wait_exponential(multiplier=1, min=4, max=10),
|
||||
retry=retry_if_exception_type(READ_RETRY_EXCEPTIONS),
|
||||
reraise=True,
|
||||
)
|
||||
|
||||
|
||||
@final
|
||||
@dataclass
|
||||
class Neo4JStorage(BaseGraphStorage):
|
||||
|
|
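For context, `READ_RETRY` defined above is a plain tenacity policy, so it can wrap any async read method on the storage class. A standalone sketch of the same pattern with a generic exception tuple (the real tuple lists Neo4j transient errors, which require the `neo4j` package):

```python
import asyncio
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential

RETRYABLE = (ConnectionResetError, OSError)  # stand-in for READ_RETRY_EXCEPTIONS

READ_RETRY = retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=4, max=10),  # note: the retry sleeps a few seconds
    retry=retry_if_exception_type(RETRYABLE),
    reraise=True,  # surface the last exception instead of a RetryError
)

class FlakyGraph:
    def __init__(self) -> None:
        self.calls = 0

    @READ_RETRY
    async def has_node(self, node_id: str) -> bool:
        # Fail once, then succeed, to show the decorator retrying a transient error.
        self.calls += 1
        if self.calls < 2:
            raise ConnectionResetError("transient read failure")
        return node_id == "alice"

print(asyncio.run(FlakyGraph().has_node("alice")))  # True after one retry
```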
@ -352,6 +369,7 @@ class Neo4JStorage(BaseGraphStorage):
|
|||
# Neo4J handles persistence automatically
|
||||
pass
|
||||
|
||||
@READ_RETRY
|
||||
async def has_node(self, node_id: str) -> bool:
|
||||
"""
|
||||
Check if a node with the given label exists in the database
|
||||
|
|
@ -385,6 +403,7 @@ class Neo4JStorage(BaseGraphStorage):
|
|||
await result.consume() # Ensure results are consumed even on error
|
||||
raise
|
||||
|
||||
@READ_RETRY
|
||||
async def has_edge(self, source_node_id: str, target_node_id: str) -> bool:
|
||||
"""
|
||||
Check if an edge exists between two nodes
|
||||
|
|
@ -426,6 +445,7 @@ class Neo4JStorage(BaseGraphStorage):
|
|||
await result.consume() # Ensure results are consumed even on error
|
||||
raise
|
||||
|
||||
@READ_RETRY
|
||||
async def get_node(self, node_id: str) -> dict[str, str] | None:
|
||||
"""Get node by its label identifier, return only node properties
|
||||
|
||||
|
|
@ -479,6 +499,7 @@ class Neo4JStorage(BaseGraphStorage):
|
|||
)
|
||||
raise
|
||||
|
||||
@READ_RETRY
|
||||
async def get_nodes_batch(self, node_ids: list[str]) -> dict[str, dict]:
|
||||
"""
|
||||
Retrieve multiple nodes in one query using UNWIND.
|
||||
|
|
@ -515,6 +536,7 @@ class Neo4JStorage(BaseGraphStorage):
|
|||
await result.consume() # Make sure to consume the result fully
|
||||
return nodes
|
||||
|
||||
@READ_RETRY
|
||||
async def node_degree(self, node_id: str) -> int:
|
||||
"""Get the degree (number of relationships) of a node with the given label.
|
||||
If multiple nodes have the same label, returns the degree of the first node.
|
||||
|
|
@ -563,6 +585,7 @@ class Neo4JStorage(BaseGraphStorage):
|
|||
)
|
||||
raise
|
||||
|
||||
@READ_RETRY
|
||||
async def node_degrees_batch(self, node_ids: list[str]) -> dict[str, int]:
|
||||
"""
|
||||
Retrieve the degree for multiple nodes in a single query using UNWIND.
|
||||
|
|
@ -621,6 +644,7 @@ class Neo4JStorage(BaseGraphStorage):
|
|||
degrees = int(src_degree) + int(trg_degree)
|
||||
return degrees
|
||||
|
||||
@READ_RETRY
|
||||
async def edge_degrees_batch(
|
||||
self, edge_pairs: list[tuple[str, str]]
|
||||
) -> dict[tuple[str, str], int]:
|
||||
|
|
@ -647,6 +671,7 @@ class Neo4JStorage(BaseGraphStorage):
|
|||
edge_degrees[(src, tgt)] = degrees.get(src, 0) + degrees.get(tgt, 0)
|
||||
return edge_degrees
|
||||
|
||||
@READ_RETRY
|
||||
async def get_edge(
|
||||
self, source_node_id: str, target_node_id: str
|
||||
) -> dict[str, str] | None:
|
||||
|
|
@ -734,6 +759,7 @@ class Neo4JStorage(BaseGraphStorage):
|
|||
)
|
||||
raise
|
||||
|
||||
@READ_RETRY
|
||||
async def get_edges_batch(
|
||||
self, pairs: list[dict[str, str]]
|
||||
) -> dict[tuple[str, str], dict]:
|
||||
|
|
@ -784,6 +810,7 @@ class Neo4JStorage(BaseGraphStorage):
|
|||
await result.consume()
|
||||
return edges_dict
|
||||
|
||||
@READ_RETRY
|
||||
async def get_node_edges(self, source_node_id: str) -> list[tuple[str, str]] | None:
|
||||
"""Retrieves all edges (relationships) for a particular node identified by its label.
|
||||
|
||||
|
|
@ -851,6 +878,7 @@ class Neo4JStorage(BaseGraphStorage):
|
|||
)
|
||||
raise
|
||||
|
||||
@READ_RETRY
|
||||
async def get_nodes_edges_batch(
|
||||
self, node_ids: list[str]
|
||||
) -> dict[str, list[tuple[str, str]]]:
|
||||
@ -383,7 +383,7 @@ class PostgreSQLDB:
|
|||
async def configure_age_extension(connection: asyncpg.Connection) -> None:
|
||||
"""Create AGE extension if it doesn't exist for graph operations."""
|
||||
try:
|
||||
await connection.execute("CREATE EXTENSION IF NOT EXISTS age") # type: ignore
|
||||
await connection.execute("CREATE EXTENSION IF NOT EXISTS AGE CASCADE") # type: ignore
|
||||
logger.info("PostgreSQL, AGE extension enabled")
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not create AGE extension: {e}")
|
||||
@ -1683,3 +1683,17 @@ def get_default_workspace() -> str:
|
|||
"""
|
||||
global _default_workspace
|
||||
return _default_workspace
|
||||
|
||||
|
||||
def get_pipeline_status_lock(
|
||||
enable_logging: bool = False, workspace: str = None
|
||||
) -> NamespaceLock:
|
||||
"""Return unified storage lock for pipeline status data consistency.
|
||||
|
||||
This function is for compatibility with legacy code only.
|
||||
"""
|
||||
global _default_workspace
|
||||
actual_workspace = workspace if workspace else _default_workspace
|
||||
return get_namespace_lock(
|
||||
"pipeline_status", workspace=actual_workspace, enable_logging=enable_logging
|
||||
)
|
||||
@ -1,177 +1,22 @@
|
|||
from collections.abc import Iterable
|
||||
import os
|
||||
import pipmaster as pm # Pipmaster for dynamic library install
|
||||
"""
|
||||
Azure OpenAI compatibility layer.
|
||||
|
||||
# install specific modules
|
||||
if not pm.is_installed("openai"):
|
||||
pm.install("openai")
|
||||
This module provides backward compatibility by re-exporting Azure OpenAI functions
|
||||
from the main openai module where the actual implementation resides.
|
||||
|
||||
from openai import (
|
||||
AsyncAzureOpenAI,
|
||||
APIConnectionError,
|
||||
RateLimitError,
|
||||
APITimeoutError,
|
||||
)
|
||||
from openai.types.chat import ChatCompletionMessageParam
|
||||
All core logic for both OpenAI and Azure OpenAI now lives in lightrag.llm.openai,
|
||||
with this module serving as a thin compatibility wrapper for existing code that
|
||||
imports from lightrag.llm.azure_openai.
|
||||
"""
|
||||
|
||||
from tenacity import (
|
||||
retry,
|
||||
stop_after_attempt,
|
||||
wait_exponential,
|
||||
retry_if_exception_type,
|
||||
from lightrag.llm.openai import (
|
||||
azure_openai_complete_if_cache,
|
||||
azure_openai_complete,
|
||||
azure_openai_embed,
|
||||
)
|
||||
|
||||
from lightrag.utils import (
|
||||
wrap_embedding_func_with_attrs,
|
||||
safe_unicode_decode,
|
||||
logger,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
@retry(
|
||||
stop=stop_after_attempt(3),
|
||||
wait=wait_exponential(multiplier=1, min=4, max=10),
|
||||
retry=retry_if_exception_type(
|
||||
(RateLimitError, APIConnectionError, APIConnectionError)
|
||||
),
|
||||
)
|
||||
async def azure_openai_complete_if_cache(
|
||||
model,
|
||||
prompt,
|
||||
system_prompt: str | None = None,
|
||||
history_messages: Iterable[ChatCompletionMessageParam] | None = None,
|
||||
enable_cot: bool = False,
|
||||
base_url: str | None = None,
|
||||
api_key: str | None = None,
|
||||
api_version: str | None = None,
|
||||
**kwargs,
|
||||
):
|
||||
if enable_cot:
|
||||
logger.debug(
|
||||
"enable_cot=True is not supported for the Azure OpenAI API and will be ignored."
|
||||
)
|
||||
deployment = os.getenv("AZURE_OPENAI_DEPLOYMENT") or model or os.getenv("LLM_MODEL")
|
||||
base_url = (
|
||||
base_url or os.getenv("AZURE_OPENAI_ENDPOINT") or os.getenv("LLM_BINDING_HOST")
|
||||
)
|
||||
api_key = (
|
||||
api_key or os.getenv("AZURE_OPENAI_API_KEY") or os.getenv("LLM_BINDING_API_KEY")
|
||||
)
|
||||
api_version = (
|
||||
api_version
|
||||
or os.getenv("AZURE_OPENAI_API_VERSION")
|
||||
or os.getenv("OPENAI_API_VERSION")
|
||||
)
|
||||
|
||||
kwargs.pop("hashing_kv", None)
|
||||
kwargs.pop("keyword_extraction", None)
|
||||
timeout = kwargs.pop("timeout", None)
|
||||
|
||||
openai_async_client = AsyncAzureOpenAI(
|
||||
azure_endpoint=base_url,
|
||||
azure_deployment=deployment,
|
||||
api_key=api_key,
|
||||
api_version=api_version,
|
||||
timeout=timeout,
|
||||
)
|
||||
messages = []
|
||||
if system_prompt:
|
||||
messages.append({"role": "system", "content": system_prompt})
|
||||
if history_messages:
|
||||
messages.extend(history_messages)
|
||||
if prompt is not None:
|
||||
messages.append({"role": "user", "content": prompt})
|
||||
|
||||
if "response_format" in kwargs:
|
||||
response = await openai_async_client.beta.chat.completions.parse(
|
||||
model=model, messages=messages, **kwargs
|
||||
)
|
||||
else:
|
||||
response = await openai_async_client.chat.completions.create(
|
||||
model=model, messages=messages, **kwargs
|
||||
)
|
||||
|
||||
if hasattr(response, "__aiter__"):
|
||||
|
||||
async def inner():
|
||||
async for chunk in response:
|
||||
if len(chunk.choices) == 0:
|
||||
continue
|
||||
content = chunk.choices[0].delta.content
|
||||
if content is None:
|
||||
continue
|
||||
if r"\u" in content:
|
||||
content = safe_unicode_decode(content.encode("utf-8"))
|
||||
yield content
|
||||
|
||||
return inner()
|
||||
else:
|
||||
content = response.choices[0].message.content
|
||||
if r"\u" in content:
|
||||
content = safe_unicode_decode(content.encode("utf-8"))
|
||||
return content
|
||||
|
||||
|
||||
async def azure_openai_complete(
|
||||
prompt, system_prompt=None, history_messages=[], keyword_extraction=False, **kwargs
|
||||
) -> str:
|
||||
kwargs.pop("keyword_extraction", None)
|
||||
result = await azure_openai_complete_if_cache(
|
||||
os.getenv("LLM_MODEL", "gpt-4o-mini"),
|
||||
prompt,
|
||||
system_prompt=system_prompt,
|
||||
history_messages=history_messages,
|
||||
**kwargs,
|
||||
)
|
||||
return result
|
||||
|
||||
|
||||
@wrap_embedding_func_with_attrs(embedding_dim=1536)
|
||||
@retry(
|
||||
stop=stop_after_attempt(3),
|
||||
wait=wait_exponential(multiplier=1, min=4, max=10),
|
||||
retry=retry_if_exception_type(
|
||||
(RateLimitError, APIConnectionError, APITimeoutError)
|
||||
),
|
||||
)
|
||||
async def azure_openai_embed(
|
||||
texts: list[str],
|
||||
model: str | None = None,
|
||||
base_url: str | None = None,
|
||||
api_key: str | None = None,
|
||||
api_version: str | None = None,
|
||||
) -> np.ndarray:
|
||||
deployment = (
|
||||
os.getenv("AZURE_EMBEDDING_DEPLOYMENT")
|
||||
or model
|
||||
or os.getenv("EMBEDDING_MODEL", "text-embedding-3-small")
|
||||
)
|
||||
base_url = (
|
||||
base_url
|
||||
or os.getenv("AZURE_EMBEDDING_ENDPOINT")
|
||||
or os.getenv("EMBEDDING_BINDING_HOST")
|
||||
)
|
||||
api_key = (
|
||||
api_key
|
||||
or os.getenv("AZURE_EMBEDDING_API_KEY")
|
||||
or os.getenv("EMBEDDING_BINDING_API_KEY")
|
||||
)
|
||||
api_version = (
|
||||
api_version
|
||||
or os.getenv("AZURE_EMBEDDING_API_VERSION")
|
||||
or os.getenv("OPENAI_API_VERSION")
|
||||
)
|
||||
|
||||
openai_async_client = AsyncAzureOpenAI(
|
||||
azure_endpoint=base_url,
|
||||
azure_deployment=deployment,
|
||||
api_key=api_key,
|
||||
api_version=api_version,
|
||||
)
|
||||
|
||||
response = await openai_async_client.embeddings.create(
|
||||
model=model or deployment, input=texts, encoding_format="float"
|
||||
)
|
||||
return np.array([dp.embedding for dp in response.data])
|
||||
__all__ = [
|
||||
"azure_openai_complete_if_cache",
|
||||
"azure_openai_complete",
|
||||
"azure_openai_embed",
|
||||
]
|
||||
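Because the module is now a thin re-export layer, existing call sites keep working with the old import path. A hedged sketch of the backward-compatible usage (running it needs the Azure environment variables the wrappers read, e.g. AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_API_KEY, and their embedding equivalents):

```python
import asyncio

# Old import path still works; the names resolve to the unified
# implementations that now live in lightrag.llm.openai.
from lightrag.llm.azure_openai import azure_openai_complete, azure_openai_embed

async def demo() -> None:
    answer = await azure_openai_complete("Summarize LightRAG in one sentence.")
    vectors = await azure_openai_embed(["a short text to embed"])
    print(answer)
    print(vectors.shape)  # (1, embedding_dim)

if __name__ == "__main__":
    asyncio.run(demo())
```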
@ -69,6 +69,7 @@ async def fetch_data(url, headers, data):
|
|||
)
|
||||
async def jina_embed(
|
||||
texts: list[str],
|
||||
model: str = "jina-embeddings-v4",
|
||||
embedding_dim: int = 2048,
|
||||
late_chunking: bool = False,
|
||||
base_url: str = None,
|
||||
|
|
@ -78,6 +79,8 @@ async def jina_embed(
|
|||
|
||||
Args:
|
||||
texts: List of texts to embed.
|
||||
model: The Jina embedding model to use (default: jina-embeddings-v4).
|
||||
Supported models: jina-embeddings-v3, jina-embeddings-v4, etc.
|
||||
embedding_dim: The embedding dimensions (default: 2048 for jina-embeddings-v4).
|
||||
**IMPORTANT**: This parameter is automatically injected by the EmbeddingFunc wrapper.
|
||||
Do NOT manually pass this parameter when calling the function directly.
|
||||
|
|
@ -107,7 +110,7 @@ async def jina_embed(
|
|||
"Authorization": f"Bearer {os.environ['JINA_API_KEY']}",
|
||||
}
|
||||
data = {
|
||||
"model": "jina-embeddings-v4",
|
||||
"model": model,
|
||||
"task": "text-matching",
|
||||
"dimensions": embedding_dim,
|
||||
"embedding_type": "base64",
|
||||
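With the payload now honoring the `model` argument instead of hard-coding `jina-embeddings-v4`, callers can pick a different Jina model. A hedged sketch (import path assumed from this diff; a valid JINA_API_KEY and network access are required):

```python
import asyncio

from lightrag.llm.jina import jina_embed  # import path assumed

async def demo() -> None:
    # The model argument is now forwarded to the request payload.
    embeddings = await jina_embed(
        ["graph based retrieval", "vector search"],
        model="jina-embeddings-v3",
    )
    print(embeddings.shape)  # (2, embedding_dim)

if __name__ == "__main__":
    asyncio.run(demo())
```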
@ -173,7 +173,9 @@ async def ollama_model_complete(
|
|||
|
||||
|
||||
@wrap_embedding_func_with_attrs(embedding_dim=1024, max_token_size=8192)
|
||||
async def ollama_embed(texts: list[str], embed_model, **kwargs) -> np.ndarray:
|
||||
async def ollama_embed(
|
||||
texts: list[str], embed_model: str = "bge-m3:latest", **kwargs
|
||||
) -> np.ndarray:
|
||||
api_key = kwargs.pop("api_key", None)
|
||||
if not api_key:
|
||||
api_key = os.getenv("OLLAMA_API_KEY")
|
||||
@ -77,46 +77,86 @@ class InvalidResponseError(Exception):
|
|||
def create_openai_async_client(
|
||||
api_key: str | None = None,
|
||||
base_url: str | None = None,
|
||||
use_azure: bool = False,
|
||||
azure_deployment: str | None = None,
|
||||
api_version: str | None = None,
|
||||
timeout: int | None = None,
|
||||
client_configs: dict[str, Any] | None = None,
|
||||
) -> AsyncOpenAI:
|
||||
"""Create an AsyncOpenAI client with the given configuration.
|
||||
"""Create an AsyncOpenAI or AsyncAzureOpenAI client with the given configuration.
|
||||
|
||||
Args:
|
||||
api_key: OpenAI API key. If None, uses the OPENAI_API_KEY environment variable.
|
||||
base_url: Base URL for the OpenAI API. If None, uses the default OpenAI API URL.
|
||||
use_azure: Whether to create an Azure OpenAI client. Default is False.
|
||||
azure_deployment: Azure OpenAI deployment name (only used when use_azure=True).
|
||||
api_version: Azure OpenAI API version (only used when use_azure=True).
|
||||
timeout: Request timeout in seconds.
|
||||
client_configs: Additional configuration options for the AsyncOpenAI client.
|
||||
These will override any default configurations but will be overridden by
|
||||
explicit parameters (api_key, base_url).
|
||||
|
||||
Returns:
|
||||
An AsyncOpenAI client instance.
|
||||
An AsyncOpenAI or AsyncAzureOpenAI client instance.
|
||||
"""
|
||||
if not api_key:
|
||||
api_key = os.environ["OPENAI_API_KEY"]
|
||||
if use_azure:
|
||||
from openai import AsyncAzureOpenAI
|
||||
|
||||
default_headers = {
|
||||
"User-Agent": f"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_8) LightRAG/{__api_version__}",
|
||||
"Content-Type": "application/json",
|
||||
}
|
||||
if not api_key:
|
||||
api_key = os.environ.get("AZURE_OPENAI_API_KEY") or os.environ.get(
|
||||
"LLM_BINDING_API_KEY"
|
||||
)
|
||||
|
||||
if client_configs is None:
|
||||
client_configs = {}
|
||||
if client_configs is None:
|
||||
client_configs = {}
|
||||
|
||||
# Create a merged config dict with precedence: explicit params > client_configs > defaults
|
||||
merged_configs = {
|
||||
**client_configs,
|
||||
"default_headers": default_headers,
|
||||
"api_key": api_key,
|
||||
}
|
||||
# Create a merged config dict with precedence: explicit params > client_configs
|
||||
merged_configs = {
|
||||
**client_configs,
|
||||
"api_key": api_key,
|
||||
}
|
||||
|
||||
if base_url is not None:
|
||||
merged_configs["base_url"] = base_url
|
||||
# Add explicit parameters (override client_configs)
|
||||
if base_url is not None:
|
||||
merged_configs["azure_endpoint"] = base_url
|
||||
if azure_deployment is not None:
|
||||
merged_configs["azure_deployment"] = azure_deployment
|
||||
if api_version is not None:
|
||||
merged_configs["api_version"] = api_version
|
||||
if timeout is not None:
|
||||
merged_configs["timeout"] = timeout
|
||||
|
||||
return AsyncAzureOpenAI(**merged_configs)
|
||||
else:
|
||||
merged_configs["base_url"] = os.environ.get(
|
||||
"OPENAI_API_BASE", "https://api.openai.com/v1"
|
||||
)
|
||||
if not api_key:
|
||||
api_key = os.environ["OPENAI_API_KEY"]
|
||||
|
||||
return AsyncOpenAI(**merged_configs)
|
||||
default_headers = {
|
||||
"User-Agent": f"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_8) LightRAG/{__api_version__}",
|
||||
"Content-Type": "application/json",
|
||||
}
|
||||
|
||||
if client_configs is None:
|
||||
client_configs = {}
|
||||
|
||||
# Create a merged config dict with precedence: explicit params > client_configs > defaults
|
||||
merged_configs = {
|
||||
**client_configs,
|
||||
"default_headers": default_headers,
|
||||
"api_key": api_key,
|
||||
}
|
||||
|
||||
if base_url is not None:
|
||||
merged_configs["base_url"] = base_url
|
||||
else:
|
||||
merged_configs["base_url"] = os.environ.get(
|
||||
"OPENAI_API_BASE", "https://api.openai.com/v1"
|
||||
)
|
||||
|
||||
if timeout is not None:
|
||||
merged_configs["timeout"] = timeout
|
||||
|
||||
return AsyncOpenAI(**merged_configs)
|
||||
|
||||
|
||||
@retry(
|
||||
|
|
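As refactored above, one factory now returns either an AsyncOpenAI or an AsyncAzureOpenAI client depending on `use_azure`. A hedged sketch of calling it both ways (endpoint, key, and deployment values are placeholders; the import path is assumed):

```python
import asyncio

from lightrag.llm.openai import create_openai_async_client  # import path assumed

async def demo() -> None:
    # Standard OpenAI client; api_key falls back to OPENAI_API_KEY when omitted.
    openai_client = create_openai_async_client(api_key="sk-placeholder")

    # Azure client: base_url is passed through as azure_endpoint, and the
    # deployment plus api_version are forwarded to AsyncAzureOpenAI.
    azure_client = create_openai_async_client(
        api_key="azure-key-placeholder",
        base_url="https://your-resource.openai.azure.com/",
        use_azure=True,
        azure_deployment="gpt-4o-mini",
        api_version="2024-08-01-preview",
    )

    await openai_client.close()
    await azure_client.close()

asyncio.run(demo())
```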
@ -141,6 +181,9 @@ async def openai_complete_if_cache(
|
|||
stream: bool | None = None,
|
||||
timeout: int | None = None,
|
||||
keyword_extraction: bool = False,
|
||||
use_azure: bool = False,
|
||||
azure_deployment: str | None = None,
|
||||
api_version: str | None = None,
|
||||
**kwargs: Any,
|
||||
) -> str:
|
||||
"""Complete a prompt using OpenAI's API with caching support and Chain of Thought (COT) integration.
|
||||
|
|
@ -162,23 +205,33 @@ async def openai_complete_if_cache(
|
|||
6. For non-streaming: COT content is prepended to regular content with <think> tags.
|
||||
|
||||
Args:
|
||||
model: The OpenAI model to use.
|
||||
model: The OpenAI model to use. For Azure, this can be the deployment name.
|
||||
prompt: The prompt to complete.
|
||||
system_prompt: Optional system prompt to include.
|
||||
history_messages: Optional list of previous messages in the conversation.
|
||||
base_url: Optional base URL for the OpenAI API.
|
||||
api_key: Optional OpenAI API key. If None, uses the OPENAI_API_KEY environment variable.
|
||||
token_tracker: Optional token usage tracker for monitoring API usage.
|
||||
enable_cot: Whether to enable Chain of Thought (COT) processing. Default is False.
|
||||
base_url: Optional base URL for the OpenAI API. For Azure, this should be the
|
||||
Azure OpenAI endpoint (e.g., https://your-resource.openai.azure.com/).
|
||||
api_key: Optional API key. For standard OpenAI, uses OPENAI_API_KEY environment
|
||||
variable if None. For Azure, uses AZURE_OPENAI_API_KEY if None.
|
||||
token_tracker: Optional token usage tracker for monitoring API usage.
|
||||
stream: Whether to stream the response. Default is False.
|
||||
timeout: Request timeout in seconds. Default is None.
|
||||
keyword_extraction: Whether to enable keyword extraction mode. When True, triggers
|
||||
special response formatting for keyword extraction. Default is False.
|
||||
use_azure: Whether to use Azure OpenAI service instead of standard OpenAI.
|
||||
When True, creates an AsyncAzureOpenAI client. Default is False.
|
||||
azure_deployment: Azure OpenAI deployment name. Only used when use_azure=True.
|
||||
If not specified, falls back to AZURE_OPENAI_DEPLOYMENT environment variable.
|
||||
api_version: Azure OpenAI API version (e.g., "2024-02-15-preview"). Only used
|
||||
when use_azure=True. If not specified, falls back to AZURE_OPENAI_API_VERSION
|
||||
environment variable.
|
||||
**kwargs: Additional keyword arguments to pass to the OpenAI API.
|
||||
Special kwargs:
|
||||
- openai_client_configs: Dict of configuration options for the AsyncOpenAI client.
|
||||
These will be passed to the client constructor but will be overridden by
|
||||
explicit parameters (api_key, base_url).
|
||||
explicit parameters (api_key, base_url). Supports proxy configuration,
|
||||
custom headers, retry policies, etc.
|
||||
|
||||
Returns:
|
||||
The completed text (with integrated COT content if available) or an async iterator
|
||||
|
|
@ -203,10 +256,18 @@ async def openai_complete_if_cache(
|
|||
# Extract client configuration options
|
||||
client_configs = kwargs.pop("openai_client_configs", {})
|
||||
|
||||
# Create the OpenAI client
|
||||
# Handle keyword extraction mode
|
||||
if keyword_extraction:
|
||||
kwargs["response_format"] = GPTKeywordExtractionFormat
|
||||
|
||||
# Create the OpenAI client (supports both OpenAI and Azure)
|
||||
openai_async_client = create_openai_async_client(
|
||||
api_key=api_key,
|
||||
base_url=base_url,
|
||||
use_azure=use_azure,
|
||||
azure_deployment=azure_deployment,
|
||||
api_version=api_version,
|
||||
timeout=timeout,
|
||||
client_configs=client_configs,
|
||||
)
|
||||
|
||||
|
|
@ -234,16 +295,24 @@ async def openai_complete_if_cache(
|
|||
if timeout is not None:
|
||||
kwargs["timeout"] = timeout
|
||||
|
||||
# Determine the correct model identifier to use
|
||||
# For Azure OpenAI, we must use the deployment name instead of the model name
|
||||
api_model = azure_deployment if use_azure and azure_deployment else model
|
||||
|
||||
try:
|
||||
# Don't use async with context manager, use client directly
|
||||
if "response_format" in kwargs:
|
||||
response = await openai_async_client.beta.chat.completions.parse(
|
||||
model=model, messages=messages, **kwargs
|
||||
response = await openai_async_client.chat.completions.parse(
|
||||
model=api_model, messages=messages, **kwargs
|
||||
)
|
||||
else:
|
||||
response = await openai_async_client.chat.completions.create(
|
||||
model=model, messages=messages, **kwargs
|
||||
model=api_model, messages=messages, **kwargs
|
||||
)
|
||||
except APITimeoutError as e:
|
||||
logger.error(f"OpenAI API Timeout Error: {e}")
|
||||
await openai_async_client.close() # Ensure client is closed
|
||||
raise
|
||||
except APIConnectionError as e:
|
||||
logger.error(f"OpenAI API Connection Error: {e}")
|
||||
await openai_async_client.close() # Ensure client is closed
|
||||
|
|
@ -252,10 +321,6 @@ async def openai_complete_if_cache(
|
|||
logger.error(f"OpenAI API Rate Limit Error: {e}")
|
||||
await openai_async_client.close() # Ensure client is closed
|
||||
raise
|
||||
except APITimeoutError as e:
|
||||
logger.error(f"OpenAI API Timeout Error: {e}")
|
||||
await openai_async_client.close() # Ensure client is closed
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"OpenAI API Call Failed,\nModel: {model},\nParams: {kwargs}, Got: {e}"
|
||||
|
|
@ -287,7 +352,10 @@ async def openai_complete_if_cache(
|
|||
|
||||
# Check if choices exists and is not empty
|
||||
if not hasattr(chunk, "choices") or not chunk.choices:
|
||||
logger.warning(f"Received chunk without choices: {chunk}")
|
||||
# Azure OpenAI sends content filter results in first chunk without choices
|
||||
logger.debug(
|
||||
f"Received chunk without choices (likely Azure content filter): {chunk}"
|
||||
)
|
||||
continue
|
||||
|
||||
# Check if delta exists
|
||||
|
|
@ -449,46 +517,57 @@ async def openai_complete_if_cache(
|
|||
raise InvalidResponseError("Invalid response from OpenAI API")
|
||||
|
||||
message = response.choices[0].message
|
||||
content = getattr(message, "content", None)
|
||||
reasoning_content = getattr(message, "reasoning_content", "")
|
||||
|
||||
# Handle COT logic for non-streaming responses (only if enabled)
|
||||
final_content = ""
|
||||
# Handle parsed responses (structured output via response_format)
|
||||
# When using beta.chat.completions.parse(), the response is in message.parsed
|
||||
if hasattr(message, "parsed") and message.parsed is not None:
|
||||
# Serialize the parsed structured response to JSON
|
||||
final_content = message.parsed.model_dump_json()
|
||||
logger.debug("Using parsed structured response from API")
|
||||
else:
|
||||
# Handle regular content responses
|
||||
content = getattr(message, "content", None)
|
||||
reasoning_content = getattr(message, "reasoning_content", "")
|
||||
|
||||
if enable_cot:
|
||||
# Check if we should include reasoning content
|
||||
should_include_reasoning = False
|
||||
if reasoning_content and reasoning_content.strip():
|
||||
if not content or content.strip() == "":
|
||||
# Case 1: Only reasoning content, should include COT
|
||||
should_include_reasoning = True
|
||||
final_content = (
|
||||
content or ""
|
||||
) # Use empty string if content is None
|
||||
# Handle COT logic for non-streaming responses (only if enabled)
|
||||
final_content = ""
|
||||
|
||||
if enable_cot:
|
||||
# Check if we should include reasoning content
|
||||
should_include_reasoning = False
|
||||
if reasoning_content and reasoning_content.strip():
|
||||
if not content or content.strip() == "":
|
||||
# Case 1: Only reasoning content, should include COT
|
||||
should_include_reasoning = True
|
||||
final_content = (
|
||||
content or ""
|
||||
) # Use empty string if content is None
|
||||
else:
|
||||
# Case 3: Both content and reasoning_content present, ignore reasoning
|
||||
should_include_reasoning = False
|
||||
final_content = content
|
||||
else:
|
||||
# Case 3: Both content and reasoning_content present, ignore reasoning
|
||||
should_include_reasoning = False
|
||||
final_content = content
|
||||
# No reasoning content, use regular content
|
||||
final_content = content or ""
|
||||
|
||||
# Apply COT wrapping if needed
|
||||
if should_include_reasoning:
|
||||
if r"\u" in reasoning_content:
|
||||
reasoning_content = safe_unicode_decode(
|
||||
reasoning_content.encode("utf-8")
|
||||
)
|
||||
final_content = (
|
||||
f"<think>{reasoning_content}</think>{final_content}"
|
||||
)
|
||||
else:
|
||||
# No reasoning content, use regular content
|
||||
# COT disabled, only use regular content
|
||||
final_content = content or ""
|
||||
|
||||
# Apply COT wrapping if needed
|
||||
if should_include_reasoning:
|
||||
if r"\u" in reasoning_content:
|
||||
reasoning_content = safe_unicode_decode(
|
||||
reasoning_content.encode("utf-8")
|
||||
)
|
||||
final_content = f"<think>{reasoning_content}</think>{final_content}"
|
||||
else:
|
||||
# COT disabled, only use regular content
|
||||
final_content = content or ""
|
||||
|
||||
# Validate final content
|
||||
if not final_content or final_content.strip() == "":
|
||||
logger.error("Received empty content from OpenAI API")
|
||||
await openai_async_client.close() # Ensure client is closed
|
||||
raise InvalidResponseError("Received empty content from OpenAI API")
|
||||
# Validate final content
|
||||
if not final_content or final_content.strip() == "":
|
||||
logger.error("Received empty content from OpenAI API")
|
||||
await openai_async_client.close() # Ensure client is closed
|
||||
raise InvalidResponseError("Received empty content from OpenAI API")
|
||||
|
||||
# Apply Unicode decoding to final content if needed
|
||||
if r"\u" in final_content:
|
||||
|
|
@ -522,8 +601,6 @@ async def openai_complete(
|
|||
) -> Union[str, AsyncIterator[str]]:
|
||||
if history_messages is None:
|
||||
history_messages = []
|
||||
if keyword_extraction:
|
||||
kwargs["response_format"] = "json"
|
||||
model_name = kwargs["hashing_kv"].global_config["llm_model_name"]
|
||||
return await openai_complete_if_cache(
|
||||
model_name,
|
||||
|
|
@ -545,8 +622,6 @@ async def gpt_4o_complete(
|
|||
) -> str:
|
||||
if history_messages is None:
|
||||
history_messages = []
|
||||
if keyword_extraction:
|
||||
kwargs["response_format"] = GPTKeywordExtractionFormat
|
||||
return await openai_complete_if_cache(
|
||||
"gpt-4o",
|
||||
prompt,
|
||||
|
|
@ -568,8 +643,6 @@ async def gpt_4o_mini_complete(
|
|||
) -> str:
|
||||
if history_messages is None:
|
||||
history_messages = []
|
||||
if keyword_extraction:
|
||||
kwargs["response_format"] = GPTKeywordExtractionFormat
|
||||
return await openai_complete_if_cache(
|
||||
"gpt-4o-mini",
|
||||
prompt,
|
||||
|
|
@ -622,24 +695,40 @@ async def openai_embed(
|
|||
embedding_dim: int | None = None,
|
||||
client_configs: dict[str, Any] | None = None,
|
||||
token_tracker: Any | None = None,
|
||||
use_azure: bool = False,
|
||||
azure_deployment: str | None = None,
|
||||
api_version: str | None = None,
|
||||
) -> np.ndarray:
|
||||
"""Generate embeddings for a list of texts using OpenAI's API.
|
||||
|
||||
This function supports both standard OpenAI and Azure OpenAI services.
|
||||
|
||||
Args:
|
||||
texts: List of texts to embed.
|
||||
model: The OpenAI embedding model to use.
|
||||
base_url: Optional base URL for the OpenAI API.
|
||||
api_key: Optional OpenAI API key. If None, uses the OPENAI_API_KEY environment variable.
|
||||
model: The embedding model to use. For standard OpenAI (e.g., "text-embedding-3-small").
|
||||
For Azure, this can be the deployment name.
|
||||
base_url: Optional base URL for the API. For standard OpenAI, uses default OpenAI endpoint.
|
||||
For Azure, this should be the Azure OpenAI endpoint (e.g., https://your-resource.openai.azure.com/).
|
||||
api_key: Optional API key. For standard OpenAI, uses OPENAI_API_KEY environment variable if None.
|
||||
For Azure, uses AZURE_EMBEDDING_API_KEY environment variable if None.
|
||||
embedding_dim: Optional embedding dimension for dynamic dimension reduction.
|
||||
**IMPORTANT**: This parameter is automatically injected by the EmbeddingFunc wrapper.
|
||||
Do NOT manually pass this parameter when calling the function directly.
|
||||
The dimension is controlled by the @wrap_embedding_func_with_attrs decorator.
|
||||
Manually passing a different value will trigger a warning and be ignored.
|
||||
When provided (by EmbeddingFunc), it will be passed to the OpenAI API for dimension reduction.
|
||||
client_configs: Additional configuration options for the AsyncOpenAI client.
|
||||
client_configs: Additional configuration options for the AsyncOpenAI/AsyncAzureOpenAI client.
|
||||
These will override any default configurations but will be overridden by
|
||||
explicit parameters (api_key, base_url).
|
||||
explicit parameters (api_key, base_url). Supports proxy configuration,
|
||||
custom headers, retry policies, etc.
|
||||
token_tracker: Optional token usage tracker for monitoring API usage.
|
||||
use_azure: Whether to use Azure OpenAI service instead of standard OpenAI.
|
||||
When True, creates an AsyncAzureOpenAI client. Default is False.
|
||||
azure_deployment: Azure OpenAI deployment name. Only used when use_azure=True.
|
||||
If not specified, falls back to AZURE_EMBEDDING_DEPLOYMENT environment variable.
|
||||
api_version: Azure OpenAI API version (e.g., "2024-02-15-preview"). Only used
|
||||
when use_azure=True. If not specified, falls back to AZURE_EMBEDDING_API_VERSION
|
||||
environment variable.
|
||||
|
||||
Returns:
|
||||
A numpy array of embeddings, one per input text.
|
||||
|
|
@ -649,15 +738,24 @@ async def openai_embed(
|
|||
RateLimitError: If the OpenAI API rate limit is exceeded.
|
||||
APITimeoutError: If the OpenAI API request times out.
|
||||
"""
|
||||
# Create the OpenAI client
|
||||
# Create the OpenAI client (supports both OpenAI and Azure)
|
||||
openai_async_client = create_openai_async_client(
|
||||
api_key=api_key, base_url=base_url, client_configs=client_configs
|
||||
api_key=api_key,
|
||||
base_url=base_url,
|
||||
use_azure=use_azure,
|
||||
azure_deployment=azure_deployment,
|
||||
api_version=api_version,
|
||||
client_configs=client_configs,
|
||||
)
|
||||
|
||||
async with openai_async_client:
|
||||
# Determine the correct model identifier to use
|
||||
# For Azure OpenAI, we must use the deployment name instead of the model name
|
||||
api_model = azure_deployment if use_azure and azure_deployment else model
|
||||
|
||||
# Prepare API call parameters
|
||||
api_params = {
|
||||
"model": model,
|
||||
"model": api_model,
|
||||
"input": texts,
|
||||
"encoding_format": "base64",
|
||||
}
|
||||
|
|
@ -684,3 +782,172 @@ async def openai_embed(
|
|||
for dp in response.data
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
# Azure OpenAI wrapper functions for backward compatibility
|
||||
async def azure_openai_complete_if_cache(
|
||||
model,
|
||||
prompt,
|
||||
system_prompt: str | None = None,
|
||||
history_messages: list[dict[str, Any]] | None = None,
|
||||
enable_cot: bool = False,
|
||||
base_url: str | None = None,
|
||||
api_key: str | None = None,
|
||||
token_tracker: Any | None = None,
|
||||
stream: bool | None = None,
|
||||
timeout: int | None = None,
|
||||
api_version: str | None = None,
|
||||
keyword_extraction: bool = False,
|
||||
**kwargs,
|
||||
):
|
||||
"""Azure OpenAI completion wrapper function.
|
||||
|
||||
This function provides backward compatibility by wrapping the unified
|
||||
openai_complete_if_cache implementation with Azure-specific parameter handling.
|
||||
|
||||
All parameters from the underlying openai_complete_if_cache are exposed to ensure
|
||||
full feature parity and API consistency.
|
||||
"""
|
||||
# Handle Azure-specific environment variables and parameters
|
||||
deployment = os.getenv("AZURE_OPENAI_DEPLOYMENT") or model or os.getenv("LLM_MODEL")
|
||||
base_url = (
|
||||
base_url or os.getenv("AZURE_OPENAI_ENDPOINT") or os.getenv("LLM_BINDING_HOST")
|
||||
)
|
||||
api_key = (
|
||||
api_key or os.getenv("AZURE_OPENAI_API_KEY") or os.getenv("LLM_BINDING_API_KEY")
|
||||
)
|
||||
api_version = (
|
||||
api_version
|
||||
or os.getenv("AZURE_OPENAI_API_VERSION")
|
||||
or os.getenv("OPENAI_API_VERSION")
|
||||
or "2024-08-01-preview"
|
||||
)
|
||||
|
||||
# Call the unified implementation with Azure-specific parameters
|
||||
return await openai_complete_if_cache(
|
||||
model=deployment,
|
||||
prompt=prompt,
|
||||
system_prompt=system_prompt,
|
||||
history_messages=history_messages,
|
||||
enable_cot=enable_cot,
|
||||
base_url=base_url,
|
||||
api_key=api_key,
|
||||
token_tracker=token_tracker,
|
||||
stream=stream,
|
||||
timeout=timeout,
|
||||
use_azure=True,
|
||||
azure_deployment=deployment,
|
||||
api_version=api_version,
|
||||
keyword_extraction=keyword_extraction,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
|
||||
async def azure_openai_complete(
|
||||
prompt,
|
||||
system_prompt=None,
|
||||
history_messages=None,
|
||||
keyword_extraction=False,
|
||||
**kwargs,
|
||||
) -> str:
|
||||
"""Azure OpenAI complete wrapper function.
|
||||
|
||||
Provides backward compatibility for azure_openai_complete calls.
|
||||
"""
|
||||
if history_messages is None:
|
||||
history_messages = []
|
||||
result = await azure_openai_complete_if_cache(
|
||||
os.getenv("LLM_MODEL", "gpt-4o-mini"),
|
||||
prompt,
|
||||
system_prompt=system_prompt,
|
||||
history_messages=history_messages,
|
||||
keyword_extraction=keyword_extraction,
|
||||
**kwargs,
|
||||
)
|
||||
return result
|
||||
|
||||
|
||||
@wrap_embedding_func_with_attrs(embedding_dim=1536, max_token_size=8192)
|
||||
async def azure_openai_embed(
|
||||
texts: list[str],
|
||||
model: str | None = None,
|
||||
base_url: str | None = None,
|
||||
api_key: str | None = None,
|
||||
token_tracker: Any | None = None,
|
||||
client_configs: dict[str, Any] | None = None,
|
||||
api_version: str | None = None,
|
||||
) -> np.ndarray:
|
||||
"""Azure OpenAI embedding wrapper function.
|
||||
|
||||
This function provides backward compatibility by wrapping the unified
|
||||
openai_embed implementation with Azure-specific parameter handling.
|
||||
|
||||
All parameters from the underlying openai_embed are exposed to ensure
|
||||
full feature parity and API consistency.
|
||||
|
||||
IMPORTANT - Decorator Usage:
|
||||
|
||||
1. This function is decorated with @wrap_embedding_func_with_attrs to provide
|
||||
the EmbeddingFunc interface for users who need to access embedding_dim
|
||||
and other attributes.
|
||||
|
||||
2. This function does NOT use @retry decorator to avoid double-wrapping,
|
||||
since the underlying openai_embed.func already has retry logic.
|
||||
|
||||
3. This function calls openai_embed.func (the unwrapped function) instead of
|
||||
openai_embed (the EmbeddingFunc instance) to avoid double decoration issues:
|
||||
|
||||
✅ Correct: await openai_embed.func(...) # Calls unwrapped function with retry
|
||||
❌ Wrong: await openai_embed(...) # Would cause double EmbeddingFunc wrapping
|
||||
|
||||
Double decoration causes:
|
||||
- Double injection of embedding_dim parameter
|
||||
- Incorrect parameter passing to the underlying implementation
|
||||
- Runtime errors due to parameter conflicts
|
||||
|
||||
The call chain with correct implementation:
|
||||
azure_openai_embed(texts)
|
||||
→ EmbeddingFunc.__call__(texts) # azure's decorator
|
||||
→ azure_openai_embed_impl(texts, embedding_dim=1536)
|
||||
→ openai_embed.func(texts, ...)
|
||||
→ @retry_wrapper(texts, ...) # openai's retry (only one layer)
|
||||
→ openai_embed_impl(texts, ...)
|
||||
→ actual embedding computation
|
||||
"""
|
||||
# Handle Azure-specific environment variables and parameters
|
||||
deployment = (
|
||||
os.getenv("AZURE_EMBEDDING_DEPLOYMENT")
|
||||
or model
|
||||
or os.getenv("EMBEDDING_MODEL", "text-embedding-3-small")
|
||||
)
|
||||
base_url = (
|
||||
base_url
|
||||
or os.getenv("AZURE_EMBEDDING_ENDPOINT")
|
||||
or os.getenv("EMBEDDING_BINDING_HOST")
|
||||
)
|
||||
api_key = (
|
||||
api_key
|
||||
or os.getenv("AZURE_EMBEDDING_API_KEY")
|
||||
or os.getenv("EMBEDDING_BINDING_API_KEY")
|
||||
)
|
||||
api_version = (
|
||||
api_version
|
||||
or os.getenv("AZURE_EMBEDDING_API_VERSION")
|
||||
or os.getenv("AZURE_OPENAI_API_VERSION")
|
||||
or os.getenv("OPENAI_API_VERSION")
|
||||
or "2024-08-01-preview"
|
||||
)
|
||||
|
||||
# CRITICAL: Call openai_embed.func (unwrapped) to avoid double decoration
|
||||
# openai_embed is an EmbeddingFunc instance, .func accesses the underlying function
|
||||
return await openai_embed.func(
|
||||
texts=texts,
|
||||
model=deployment,
|
||||
base_url=base_url,
|
||||
api_key=api_key,
|
||||
token_tracker=token_tracker,
|
||||
client_configs=client_configs,
|
||||
use_azure=True,
|
||||
azure_deployment=deployment,
|
||||
api_version=api_version,
|
||||
)
|
||||
@ -397,8 +397,8 @@ async def _handle_single_entity_extraction(
|
|||
|
||||
# Validate entity name after all cleaning steps
|
||||
if not entity_name or not entity_name.strip():
|
||||
logger.warning(
|
||||
f"Entity extraction error: entity name became empty after cleaning. Original: '{record_attributes[1]}'"
|
||||
logger.info(
|
||||
f"Empty entity name found after sanitization. Original: '{record_attributes[1]}'"
|
||||
)
|
||||
return None
|
||||
|
||||
|
|
@ -474,14 +474,14 @@ async def _handle_single_relationship_extraction(
|
|||
|
||||
# Validate entity names after all cleaning steps
|
||||
if not source:
|
||||
logger.warning(
|
||||
f"Relationship extraction error: source entity became empty after cleaning. Original: '{record_attributes[1]}'"
|
||||
logger.info(
|
||||
f"Empty source entity found after sanitization. Original: '{record_attributes[1]}'"
|
||||
)
|
||||
return None
|
||||
|
||||
if not target:
|
||||
logger.warning(
|
||||
f"Relationship extraction error: target entity became empty after cleaning. Original: '{record_attributes[2]}'"
|
||||
logger.info(
|
||||
f"Empty target entity found after sanitization. Original: '{record_attributes[2]}'"
|
||||
)
|
||||
return None
|
||||
|
||||
@ -2,7 +2,7 @@ from __future__ import annotations
|
|||
|
||||
import os
|
||||
import aiohttp
|
||||
from typing import Any, List, Dict, Optional
|
||||
from typing import Any, List, Dict, Optional, Tuple
|
||||
from tenacity import (
|
||||
retry,
|
||||
stop_after_attempt,
|
||||
|
|
@ -19,6 +19,158 @@ from dotenv import load_dotenv
|
|||
load_dotenv(dotenv_path=".env", override=False)
|
||||
|
||||
|
||||
def chunk_documents_for_rerank(
|
||||
documents: List[str],
|
||||
max_tokens: int = 480,
|
||||
overlap_tokens: int = 32,
|
||||
tokenizer_model: str = "gpt-4o-mini",
|
||||
) -> Tuple[List[str], List[int]]:
|
||||
"""
|
||||
Chunk documents that exceed token limit for reranking.
|
||||
|
||||
Args:
|
||||
documents: List of document strings to chunk
|
||||
max_tokens: Maximum tokens per chunk (default 480 to leave margin for 512 limit)
|
||||
overlap_tokens: Number of tokens to overlap between chunks
|
||||
tokenizer_model: Model name for tiktoken tokenizer
|
||||
|
||||
Returns:
|
||||
Tuple of (chunked_documents, original_doc_indices)
|
||||
- chunked_documents: List of document chunks (may be more than input)
|
||||
- original_doc_indices: Maps each chunk back to its original document index
|
||||
"""
|
||||
# Clamp overlap_tokens to ensure the loop always advances
|
||||
# If overlap_tokens >= max_tokens, the chunking loop would hang
|
||||
if overlap_tokens >= max_tokens:
|
||||
original_overlap = overlap_tokens
|
||||
# Ensure overlap is at least 1 token less than max to guarantee progress
|
||||
# For very small max_tokens (e.g., 1), set overlap to 0
|
||||
overlap_tokens = max(0, max_tokens - 1)
|
||||
logger.warning(
|
||||
f"overlap_tokens ({original_overlap}) must be less than max_tokens ({max_tokens}). "
|
||||
f"Clamping to {overlap_tokens} to prevent infinite loop."
|
||||
)
|
||||
|
||||
try:
|
||||
from .utils import TiktokenTokenizer
|
||||
|
||||
tokenizer = TiktokenTokenizer(model_name=tokenizer_model)
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
f"Failed to initialize tokenizer: {e}. Using character-based approximation."
|
||||
)
|
||||
# Fallback: approximate 1 token ≈ 4 characters
|
||||
max_chars = max_tokens * 4
|
||||
overlap_chars = overlap_tokens * 4
|
||||
|
||||
chunked_docs = []
|
||||
doc_indices = []
|
||||
|
||||
for idx, doc in enumerate(documents):
|
||||
if len(doc) <= max_chars:
|
||||
chunked_docs.append(doc)
|
||||
doc_indices.append(idx)
|
||||
else:
|
||||
# Split into overlapping chunks
|
||||
start = 0
|
||||
while start < len(doc):
|
||||
end = min(start + max_chars, len(doc))
|
||||
chunk = doc[start:end]
|
||||
chunked_docs.append(chunk)
|
||||
doc_indices.append(idx)
|
||||
|
||||
if end >= len(doc):
|
||||
break
|
||||
start = end - overlap_chars
|
||||
|
||||
return chunked_docs, doc_indices
|
||||
|
||||
# Use tokenizer for accurate chunking
|
||||
chunked_docs = []
|
||||
doc_indices = []
|
||||
|
||||
for idx, doc in enumerate(documents):
|
||||
tokens = tokenizer.encode(doc)
|
||||
|
||||
if len(tokens) <= max_tokens:
|
||||
# Document fits in one chunk
|
||||
chunked_docs.append(doc)
|
||||
doc_indices.append(idx)
|
||||
else:
|
||||
# Split into overlapping chunks
|
||||
start = 0
|
||||
while start < len(tokens):
|
||||
end = min(start + max_tokens, len(tokens))
|
||||
chunk_tokens = tokens[start:end]
|
||||
chunk_text = tokenizer.decode(chunk_tokens)
|
||||
chunked_docs.append(chunk_text)
|
||||
doc_indices.append(idx)
|
||||
|
||||
if end >= len(tokens):
|
||||
break
|
||||
start = end - overlap_tokens
|
||||
|
||||
return chunked_docs, doc_indices
|
||||
|
||||
|
||||
def aggregate_chunk_scores(
|
||||
chunk_results: List[Dict[str, Any]],
|
||||
doc_indices: List[int],
|
||||
num_original_docs: int,
|
||||
aggregation: str = "max",
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Aggregate rerank scores from document chunks back to original documents.
|
||||
|
||||
Args:
|
||||
chunk_results: Rerank results for chunks [{"index": chunk_idx, "relevance_score": score}, ...]
|
||||
doc_indices: Maps each chunk index to original document index
|
||||
num_original_docs: Total number of original documents
|
||||
aggregation: Strategy for aggregating scores ("max", "mean", "first")
|
||||
|
||||
Returns:
|
||||
List of results for original documents [{"index": doc_idx, "relevance_score": score}, ...]
|
||||
"""
|
||||
# Group scores by original document index
|
||||
doc_scores: Dict[int, List[float]] = {i: [] for i in range(num_original_docs)}
|
||||
|
||||
for result in chunk_results:
|
||||
chunk_idx = result["index"]
|
||||
score = result["relevance_score"]
|
||||
|
||||
if 0 <= chunk_idx < len(doc_indices):
|
||||
original_doc_idx = doc_indices[chunk_idx]
|
||||
doc_scores[original_doc_idx].append(score)
|
||||
|
||||
# Aggregate scores
|
||||
aggregated_results = []
|
||||
for doc_idx, scores in doc_scores.items():
|
||||
if not scores:
|
||||
continue
|
||||
|
||||
if aggregation == "max":
|
||||
final_score = max(scores)
|
||||
elif aggregation == "mean":
|
||||
final_score = sum(scores) / len(scores)
|
||||
elif aggregation == "first":
|
||||
final_score = scores[0]
|
||||
else:
|
||||
logger.warning(f"Unknown aggregation strategy: {aggregation}, using max")
|
||||
final_score = max(scores)
|
||||
|
||||
aggregated_results.append(
|
||||
{
|
||||
"index": doc_idx,
|
||||
"relevance_score": final_score,
|
||||
}
|
||||
)
|
||||
|
||||
# Sort by relevance score (descending)
|
||||
aggregated_results.sort(key=lambda x: x["relevance_score"], reverse=True)
|
||||
|
||||
return aggregated_results
|
||||
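To see how the two helpers fit together: chunk over-long documents, score each chunk (normally via the rerank API), then fold the chunk scores back to document level. A hedged end-to-end sketch with made-up scores (import path assumed from this diff):

```python
from lightrag.rerank import aggregate_chunk_scores, chunk_documents_for_rerank  # path assumed

documents = [
    "short doc about graph databases",
    "a very long document about vector search " * 200,  # long enough to be split
]

# 1) Chunk: each produced chunk remembers its original document index.
chunks, doc_indices = chunk_documents_for_rerank(documents, max_tokens=64, overlap_tokens=8)
print(len(chunks), doc_indices)

# 2) Pretend per-chunk rerank scores (the API would normally return these).
chunk_results = [
    {"index": i, "relevance_score": 0.9 if doc_indices[i] == 0 else 0.4 + 0.01 * i}
    for i in range(len(chunks))
]

# 3) Aggregate back to documents; "max" keeps the best chunk score per document.
doc_results = aggregate_chunk_scores(
    chunk_results, doc_indices, num_original_docs=len(documents), aggregation="max"
)
print(doc_results)  # sorted by relevance_score, one entry per original document
```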
|
||||
|
||||
@retry(
|
||||
stop=stop_after_attempt(3),
|
||||
wait=wait_exponential(multiplier=1, min=4, max=60),
|
||||
|
|
@ -38,6 +190,8 @@ async def generic_rerank_api(
|
|||
extra_body: Optional[Dict[str, Any]] = None,
|
||||
response_format: str = "standard", # "standard" (Jina/Cohere) or "aliyun"
|
||||
request_format: str = "standard", # "standard" (Jina/Cohere) or "aliyun"
|
||||
enable_chunking: bool = False,
|
||||
max_tokens_per_doc: int = 480,
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Generic rerank API call for Jina/Cohere/Aliyun models.
|
||||
|
|
@ -52,6 +206,9 @@ async def generic_rerank_api(
|
|||
return_documents: Whether to return document text (Jina only)
|
||||
extra_body: Additional body parameters
|
||||
response_format: Response format type ("standard" for Jina/Cohere, "aliyun" for Aliyun)
|
||||
request_format: Request format type ("standard" for Jina/Cohere, "aliyun" for Aliyun)
|
||||
enable_chunking: Whether to chunk documents exceeding token limit
|
||||
max_tokens_per_doc: Maximum tokens per document for chunking
|
||||
|
||||
Returns:
|
||||
List of dictionary of ["index": int, "relevance_score": float]
|
||||
|
|
@ -63,6 +220,27 @@ async def generic_rerank_api(
|
|||
if api_key is not None:
|
||||
headers["Authorization"] = f"Bearer {api_key}"
|
||||
|
||||
# Handle document chunking if enabled
|
||||
original_documents = documents
|
||||
doc_indices = None
|
||||
original_top_n = top_n # Save original top_n for post-aggregation limiting
|
||||
|
||||
if enable_chunking:
|
||||
documents, doc_indices = chunk_documents_for_rerank(
|
||||
documents, max_tokens=max_tokens_per_doc
|
||||
)
|
||||
logger.debug(
|
||||
f"Chunked {len(original_documents)} documents into {len(documents)} chunks"
|
||||
)
|
||||
# When chunking is enabled, disable top_n at API level to get all chunk scores
|
||||
# This ensures proper document-level coverage after aggregation
|
||||
# We'll apply top_n to aggregated document results instead
|
||||
if top_n is not None:
|
||||
logger.debug(
|
||||
f"Chunking enabled: disabled API-level top_n={top_n} to ensure complete document coverage"
|
||||
)
|
||||
top_n = None
|
||||
|
||||
# Build request payload based on request format
|
||||
if request_format == "aliyun":
|
||||
# Aliyun format: nested input/parameters structure
|
||||
|
|
@ -86,7 +264,7 @@ async def generic_rerank_api(
|
|||
if extra_body:
|
||||
payload["parameters"].update(extra_body)
|
||||
else:
|
||||
# Standard format for Jina/Cohere
|
||||
# Standard format for Jina/Cohere/OpenAI
|
||||
payload = {
|
||||
"model": model,
|
||||
"query": query,
|
||||
|
|
@ -98,7 +276,7 @@ async def generic_rerank_api(
|
|||
payload["top_n"] = top_n
|
||||
|
||||
# Only Jina API supports return_documents parameter
|
||||
if return_documents is not None:
|
||||
if return_documents is not None and response_format in ("standard",):
|
||||
payload["return_documents"] = return_documents
|
||||
|
||||
# Add extra parameters
|
||||
|
|
@ -147,7 +325,6 @@ async def generic_rerank_api(
|
|||
f"Expected 'output.results' to be list, got {type(results)}: {results}"
|
||||
)
|
||||
results = []
|
||||
|
||||
elif response_format == "standard":
|
||||
# Standard format: {"results": [...]}
|
||||
results = response_json.get("results", [])
|
||||
|
|
@ -158,16 +335,35 @@ async def generic_rerank_api(
|
|||
results = []
|
||||
else:
|
||||
raise ValueError(f"Unsupported response format: {response_format}")
|
||||
|
||||
if not results:
|
||||
logger.warning("Rerank API returned empty results")
|
||||
return []
|
||||
|
||||
# Standardize return format
|
||||
return [
|
||||
standardized_results = [
|
||||
{"index": result["index"], "relevance_score": result["relevance_score"]}
|
||||
for result in results
|
||||
]
|
||||
|
||||
# Aggregate chunk scores back to original documents if chunking was enabled
|
||||
if enable_chunking and doc_indices:
|
||||
standardized_results = aggregate_chunk_scores(
|
||||
standardized_results,
|
||||
doc_indices,
|
||||
len(original_documents),
|
||||
aggregation="max",
|
||||
)
|
||||
# Apply original top_n limit at document level (post-aggregation)
|
||||
# This preserves document-level semantics: top_n limits documents, not chunks
|
||||
if (
|
||||
original_top_n is not None
|
||||
and len(standardized_results) > original_top_n
|
||||
):
|
||||
standardized_results = standardized_results[:original_top_n]
|
||||
|
||||
return standardized_results
|
||||
|
||||
|
||||
async def cohere_rerank(
|
||||
query: str,
|
||||
|
|
@ -177,21 +373,46 @@ async def cohere_rerank(
|
|||
model: str = "rerank-v3.5",
|
||||
base_url: str = "https://api.cohere.com/v2/rerank",
|
||||
extra_body: Optional[Dict[str, Any]] = None,
|
||||
enable_chunking: bool = False,
|
||||
max_tokens_per_doc: int = 4096,
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Rerank documents using Cohere API.
|
||||
|
||||
Supports both the standard Cohere API and Cohere-compatible proxies.
|
||||
|
||||
Args:
|
||||
query: The search query
|
||||
documents: List of strings to rerank
|
||||
top_n: Number of top results to return
|
||||
api_key: API key
|
||||
model: rerank model name
|
||||
api_key: API key for authentication
|
||||
model: rerank model name (default: rerank-v3.5)
|
||||
base_url: API endpoint
|
||||
extra_body: Additional body for the HTTP request (reserved for extra params)
|
||||
enable_chunking: Whether to chunk documents exceeding max_tokens_per_doc
|
||||
max_tokens_per_doc: Maximum tokens per document (default: 4096 for Cohere v3.5)
|
||||
|
||||
Returns:
|
||||
List of dictionary of ["index": int, "relevance_score": float]
|
||||
|
||||
Example:
|
||||
>>> # Standard Cohere API
|
||||
>>> results = await cohere_rerank(
|
||||
... query="What is the meaning of life?",
|
||||
... documents=["Doc1", "Doc2"],
|
||||
... api_key="your-cohere-key"
|
||||
... )
|
||||
|
||||
>>> # LiteLLM proxy with user authentication
|
||||
>>> results = await cohere_rerank(
|
||||
... query="What is vector search?",
|
||||
... documents=["Doc1", "Doc2"],
|
||||
... model="answerai-colbert-small-v1",
|
||||
... base_url="https://llm-proxy.example.com/v2/rerank",
|
||||
... api_key="your-proxy-key",
|
||||
... enable_chunking=True,
|
||||
... max_tokens_per_doc=480
|
||||
... )
|
||||
"""
|
||||
if api_key is None:
|
||||
api_key = os.getenv("COHERE_API_KEY") or os.getenv("RERANK_BINDING_API_KEY")
|
||||
|
|
@ -206,6 +427,8 @@ async def cohere_rerank(
|
|||
return_documents=None, # Cohere doesn't support this parameter
|
||||
extra_body=extra_body,
|
||||
response_format="standard",
|
||||
enable_chunking=enable_chunking,
|
||||
max_tokens_per_doc=max_tokens_per_doc,
|
||||
)
|
||||
|
||||
|
||||
@ -1129,11 +1129,16 @@ class CleanupTool:
|
|||
pass
|
||||
|
||||
|
||||
async def main():
|
||||
"""Main entry point"""
|
||||
async def async_main():
|
||||
"""Async main entry point"""
|
||||
tool = CleanupTool()
|
||||
await tool.run()
|
||||
|
||||
|
||||
def main():
|
||||
"""Synchronous entry point for CLI command"""
|
||||
asyncio.run(async_main())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
main()
|
||||
@ -1005,7 +1005,76 @@ def priority_limit_async_func_call(
|
|||
|
||||
|
||||
def wrap_embedding_func_with_attrs(**kwargs):
|
||||
"""Wrap a function with attributes"""
|
||||
"""Decorator to add embedding dimension and token limit attributes to embedding functions.
|
||||
|
||||
This decorator wraps an async embedding function and returns an EmbeddingFunc instance
|
||||
that automatically handles dimension parameter injection and attribute management.
|
||||
|
||||
WARNING: DO NOT apply this decorator to wrapper functions that call other
|
||||
decorated embedding functions. This will cause double decoration and parameter
|
||||
injection conflicts.
|
||||
|
||||
Correct usage patterns:
|
||||
|
||||
1. Direct implementation (decorated):
|
||||
```python
|
||||
@wrap_embedding_func_with_attrs(embedding_dim=1536)
|
||||
async def my_embed(texts, embedding_dim=None):
|
||||
# Direct implementation
|
||||
return embeddings
|
||||
```
|
||||
|
||||
2. Wrapper calling decorated function (DO NOT decorate wrapper):
|
||||
```python
|
||||
# my_embed is already decorated above
|
||||
|
||||
async def my_wrapper(texts, **kwargs): # ❌ DO NOT decorate this!
|
||||
# Must call .func to access unwrapped implementation
|
||||
return await my_embed.func(texts, **kwargs)
|
||||
```
|
||||
|
||||
3. Wrapper calling decorated function (properly decorated):
|
||||
```python
|
||||
@wrap_embedding_func_with_attrs(embedding_dim=1536)
|
||||
async def my_wrapper(texts, **kwargs): # ✅ Can decorate if calling .func
|
||||
# Calling .func avoids double decoration
|
||||
return await my_embed.func(texts, **kwargs)
|
||||
```
|
||||
|
||||
The decorated function becomes an EmbeddingFunc instance with:
|
||||
- embedding_dim: The embedding dimension
|
||||
- max_token_size: Maximum token limit (optional)
|
||||
- func: The original unwrapped function (access via .func)
|
||||
- __call__: Wrapper that injects embedding_dim parameter
|
||||
|
||||
Double decoration causes:
|
||||
- Double injection of embedding_dim parameter
|
||||
- Incorrect parameter passing to the underlying implementation
|
||||
- Runtime errors due to parameter conflicts
|
||||
|
||||
Args:
|
||||
embedding_dim: The dimension of embedding vectors
|
||||
max_token_size: Maximum number of tokens (optional)
|
||||
send_dimensions: Whether to inject embedding_dim as a keyword argument (optional)
|
||||
|
||||
Returns:
|
||||
A decorator that wraps the function as an EmbeddingFunc instance
|
||||
|
||||
Example of correct wrapper implementation:
|
||||
```python
|
||||
@wrap_embedding_func_with_attrs(embedding_dim=1536, max_token_size=8192)
|
||||
@retry(...)
|
||||
async def openai_embed(texts, ...):
|
||||
# Base implementation
|
||||
pass
|
||||
|
||||
@wrap_embedding_func_with_attrs(embedding_dim=1536) # Note: No @retry here!
|
||||
async def azure_openai_embed(texts, ...):
|
||||
# CRITICAL: Call .func to access unwrapped function
|
||||
return await openai_embed.func(texts, ...) # ✅ Correct
|
||||
# return await openai_embed(texts, ...) # ❌ Wrong - double decoration!
|
||||
```
|
||||
"""
|
||||
|
||||
def final_decro(func) -> EmbeddingFunc:
|
||||
new_func = EmbeddingFunc(**kwargs, func=func)
|
||||
|
|
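For orientation, here is a minimal sketch of the pattern the docstring above describes: an object that stores `embedding_dim`/`max_token_size`, keeps the original callable on `.func`, and injects the dimension when called. The names below (`EmbeddingFuncSketch`, the `send_dimensions` handling) are illustrative assumptions, not LightRAG's actual `EmbeddingFunc` implementation.

```python
from dataclasses import dataclass
from typing import Any, Awaitable, Callable, Optional


@dataclass
class EmbeddingFuncSketch:
    # Sketch only: attribute names mirror the docstring, not the real class.
    embedding_dim: int
    func: Callable[..., Awaitable[Any]]
    max_token_size: Optional[int] = None
    send_dimensions: bool = False

    async def __call__(self, *args, **kwargs):
        # Inject the dimension once here; wrappers that call `.func` directly
        # bypass this injection and therefore avoid double decoration.
        if self.send_dimensions:
            kwargs.setdefault("embedding_dim", self.embedding_dim)
        return await self.func(*args, **kwargs)


def wrap_embedding_func_with_attrs_sketch(**attrs):
    """Return a decorator that wraps an async function into the sketch class."""
    def decorator(func) -> EmbeddingFuncSketch:
        return EmbeddingFuncSketch(func=func, **attrs)
    return decorator
```

Under these assumptions, `@wrap_embedding_func_with_attrs_sketch(embedding_dim=1536)` behaves like the examples in the docstring, and a wrapper can reach the undecorated callable via `.func`.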
|
|||
File diff suppressed because it is too large
|
|
@ -16,16 +16,16 @@
|
|||
"preview-no-bun": "vite preview"
|
||||
},
|
||||
"dependencies": {
|
||||
"@faker-js/faker": "^9.9.0",
|
||||
"@faker-js/faker": "^10.1.0",
|
||||
"@radix-ui/react-alert-dialog": "^1.1.15",
|
||||
"@radix-ui/react-checkbox": "^1.3.3",
|
||||
"@radix-ui/react-dialog": "^1.1.15",
|
||||
"@radix-ui/react-popover": "^1.1.15",
|
||||
"@radix-ui/react-progress": "^1.1.7",
|
||||
"@radix-ui/react-progress": "^1.1.8",
|
||||
"@radix-ui/react-scroll-area": "^1.2.10",
|
||||
"@radix-ui/react-select": "^2.2.6",
|
||||
"@radix-ui/react-separator": "^1.1.7",
|
||||
"@radix-ui/react-slot": "^1.2.3",
|
||||
"@radix-ui/react-separator": "^1.1.8",
|
||||
"@radix-ui/react-slot": "^1.2.4",
|
||||
"@radix-ui/react-tabs": "^1.1.13",
|
||||
"@radix-ui/react-tooltip": "^1.2.8",
|
||||
"@radix-ui/react-use-controllable-state": "^1.2.2",
|
||||
|
|
@ -41,7 +41,7 @@
|
|||
"@sigma/edge-curve": "^3.1.0",
|
||||
"@sigma/node-border": "^3.0.0",
|
||||
"@tanstack/react-table": "^8.21.3",
|
||||
"axios": "^1.12.2",
|
||||
"axios": "^1.13.2",
|
||||
"class-variance-authority": "^0.7.1",
|
||||
"clsx": "^2.1.1",
|
||||
"cmdk": "^1.1.1",
|
||||
|
|
@ -51,21 +51,21 @@
|
|||
"graphology-layout-force": "^0.2.4",
|
||||
"graphology-layout-forceatlas2": "^0.10.1",
|
||||
"graphology-layout-noverlap": "^0.4.2",
|
||||
"i18next": "^24.2.3",
|
||||
"katex": "^0.16.23",
|
||||
"lucide-react": "^0.475.0",
|
||||
"mermaid": "^11.12.0",
|
||||
"i18next": "^25.6.3",
|
||||
"katex": "^0.16.25",
|
||||
"mermaid": "^11.12.1",
|
||||
"lucide-react": "^0.554.0",
|
||||
"minisearch": "^7.2.0",
|
||||
"react": "^19.2.0",
|
||||
"react-dom": "^19.2.0",
|
||||
"react-dropzone": "^14.3.8",
|
||||
"react-error-boundary": "^5.0.0",
|
||||
"react-i18next": "^15.7.4",
|
||||
"react-markdown": "^9.1.0",
|
||||
"react-error-boundary": "^6.0.0",
|
||||
"react-i18next": "^16.3.5",
|
||||
"react-markdown": "^10.1.0",
|
||||
"react-number-format": "^5.4.4",
|
||||
"react-router-dom": "^7.9.4",
|
||||
"react-router-dom": "^7.9.6",
|
||||
"react-select": "^5.10.2",
|
||||
"react-syntax-highlighter": "^15.6.6",
|
||||
"react-syntax-highlighter": "^16.1.0",
|
||||
"rehype-katex": "^7.0.1",
|
||||
"rehype-raw": "^7.0.0",
|
||||
"rehype-react": "^8.0.0",
|
||||
|
|
@ -73,40 +73,40 @@
|
|||
"remark-math": "^6.0.0",
|
||||
"seedrandom": "^3.0.5",
|
||||
"sigma": "^3.0.2",
|
||||
"sonner": "^1.7.4",
|
||||
"tailwind-merge": "^3.3.1",
|
||||
"sonner": "^2.0.7",
|
||||
"tailwind-merge": "^3.4.0",
|
||||
"tailwind-scrollbar": "^4.0.2",
|
||||
"typography": "^0.16.24",
|
||||
"unist-util-visit": "^5.0.0",
|
||||
"zustand": "^5.0.8"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@eslint/js": "^9.37.0",
|
||||
"@stylistic/eslint-plugin-js": "^3.1.0",
|
||||
"@tailwindcss/vite": "^4.1.14",
|
||||
"@types/bun": "^1.2.23",
|
||||
"@eslint/js": "^9.39.1",
|
||||
"@stylistic/eslint-plugin-js": "^4.4.1",
|
||||
"@types/bun": "^1.3.3",
|
||||
"@tailwindcss/vite": "^4.1.17",
|
||||
"@types/katex": "^0.16.7",
|
||||
"@types/node": "^22.18.9",
|
||||
"@types/node": "^24.10.1",
|
||||
"@tailwindcss/typography": "^0.5.15",
|
||||
"@types/react": "^19.2.2",
|
||||
"@types/react-dom": "^19.2.1",
|
||||
"@types/react": "^19.2.7",
|
||||
"@types/react-dom": "^19.2.3",
|
||||
"@types/react-i18next": "^8.1.0",
|
||||
"@types/react-syntax-highlighter": "^15.5.13",
|
||||
"@types/seedrandom": "^3.0.8",
|
||||
"@vitejs/plugin-react-swc": "^3.11.0",
|
||||
"eslint": "^9.37.0",
|
||||
"@vitejs/plugin-react-swc": "^4.2.2",
|
||||
"eslint": "^9.39.1",
|
||||
"eslint-config-prettier": "^10.1.8",
|
||||
"eslint-plugin-react": "^7.37.5",
|
||||
"eslint-plugin-react-hooks": "^5.2.0",
|
||||
"eslint-plugin-react-refresh": "^0.4.23",
|
||||
"globals": "^15.15.0",
|
||||
"eslint-plugin-react-hooks": "^7.0.1",
|
||||
"eslint-plugin-react-refresh": "^0.4.24",
|
||||
"globals": "^16.5.0",
|
||||
"graphology-types": "^0.24.8",
|
||||
"prettier": "^3.6.2",
|
||||
"prettier-plugin-tailwindcss": "^0.6.14",
|
||||
"tailwindcss": "^4.1.14",
|
||||
"prettier-plugin-tailwindcss": "^0.7.1",
|
||||
"typescript-eslint": "^8.48.0",
|
||||
"tailwindcss": "^4.1.17",
|
||||
"tailwindcss-animate": "^1.0.7",
|
||||
"typescript": "~5.7.3",
|
||||
"typescript-eslint": "^8.46.0",
|
||||
"vite": "^6.3.6"
|
||||
"typescript": "~5.9.3",
|
||||
"vite": "^7.2.4"
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -76,7 +76,8 @@ export const ChatMessage = ({
|
|||
? message.content
|
||||
: (displayContent !== undefined ? displayContent : (message.content || ''))
|
||||
|
||||
// Load KaTeX dynamically
|
||||
// Load KaTeX rehype plugin dynamically
|
||||
// Note: KaTeX extensions (mhchem, copy-tex) are imported statically in main.tsx
|
||||
useEffect(() => {
|
||||
const loadKaTeX = async () => {
|
||||
try {
|
||||
|
|
@ -84,7 +85,6 @@ export const ChatMessage = ({
|
|||
setKatexPlugin(() => rehypeKatex);
|
||||
} catch (error) {
|
||||
console.error('Failed to load KaTeX plugin:', error);
|
||||
// Set to null to ensure we don't try to use a failed plugin
|
||||
setKatexPlugin(null);
|
||||
}
|
||||
};
|
||||
|
|
|
|||
|
|
@ -4,6 +4,9 @@ import './index.css'
|
|||
import AppRouter from './AppRouter'
|
||||
import './i18n.ts';
|
||||
import 'katex/dist/katex.min.css';
|
||||
// Import KaTeX extensions at app startup to ensure they are registered before any rendering
|
||||
import 'katex/contrib/mhchem'; // Chemistry formulas: \ce{} and \pu{}
|
||||
import 'katex/contrib/copy-tex'; // Allow copying rendered formulas as LaTeX source
|
||||
|
||||
|
||||
|
||||
|
|
|
|||
1
lightrag_webui/src/types/katex.d.ts
vendored
|
|
@ -1 +1,2 @@
|
|||
declare module 'katex/contrib/mhchem';
|
||||
declare module 'katex/contrib/copy-tex';
|
||||
|
|
|
|||
|
|
@ -10,7 +10,10 @@ export default defineConfig({
|
|||
resolve: {
|
||||
alias: {
|
||||
'@': path.resolve(__dirname, './src')
|
||||
}
|
||||
},
|
||||
// Force all modules to use the same katex instance
|
||||
// This ensures mhchem extension registered in main.tsx is available to rehype-katex
|
||||
dedupe: ['katex']
|
||||
},
|
||||
// base: import.meta.env.VITE_BASE_URL || '/webui/',
|
||||
base: webuiPrefix,
|
||||
|
|
|
|||
|
|
@ -23,7 +23,6 @@ classifiers = [
|
|||
dependencies = [
|
||||
"aiohttp",
|
||||
"configparser",
|
||||
"future",
|
||||
"google-api-core>=2.0.0,<3.0.0",
|
||||
"google-genai>=1.0.0,<2.0.0",
|
||||
"json_repair",
|
||||
|
|
@ -47,18 +46,18 @@ pytest = [
|
|||
"pytest>=8.4.2",
|
||||
"pytest-asyncio>=1.2.0",
|
||||
"pre-commit",
|
||||
"ruff",
|
||||
]
|
||||
|
||||
api = [
|
||||
# Core dependencies
|
||||
"aiohttp",
|
||||
"configparser",
|
||||
"future",
|
||||
"json_repair",
|
||||
"nano-vectordb",
|
||||
"networkx",
|
||||
"numpy>=1.24.0,<2.0.0",
|
||||
"openai>=1.0.0,<3.0.0",
|
||||
"openai>=2.0.0,<3.0.0",
|
||||
"pandas>=2.0.0,<2.4.0",
|
||||
"pipmaster",
|
||||
"pydantic",
|
||||
|
|
@ -77,9 +76,9 @@ api = [
|
|||
"distro",
|
||||
"fastapi",
|
||||
"httpcore",
|
||||
"httpx",
|
||||
"httpx>=0.28.1",
|
||||
"jiter",
|
||||
"passlib[bcrypt]",
|
||||
"bcrypt>=4.0.0",
|
||||
"psutil",
|
||||
"PyJWT>=2.8.0,<3.0.0",
|
||||
"python-jose[cryptography]",
|
||||
|
|
@ -115,7 +114,7 @@ offline-storage = [
|
|||
|
||||
offline-llm = [
|
||||
# LLM provider dependencies
|
||||
"openai>=1.0.0,<3.0.0",
|
||||
"openai>=2.0.0,<3.0.0",
|
||||
"anthropic>=0.18.0,<1.0.0",
|
||||
"ollama>=0.1.0,<1.0.0",
|
||||
"zhipuai>=2.0.0,<3.0.0",
|
||||
|
|
@ -131,15 +130,18 @@ offline = [
|
|||
"lightrag-hku[api,offline-storage,offline-llm]",
|
||||
]
|
||||
|
||||
evaluation = [
|
||||
# Test framework dependencies (for evaluation)
|
||||
test = [
|
||||
"lightrag-hku[api]",
|
||||
"pytest>=8.4.2",
|
||||
"pytest-asyncio>=1.2.0",
|
||||
"pre-commit",
|
||||
# RAG evaluation dependencies (RAGAS framework)
|
||||
"ruff",
|
||||
]
|
||||
|
||||
evaluation = [
|
||||
"lightrag-hku[api]",
|
||||
"ragas>=0.3.7",
|
||||
"datasets>=4.3.0",
|
||||
"httpx>=0.28.1",
|
||||
]
|
||||
|
||||
observability = [
|
||||
|
|
@ -151,6 +153,7 @@ observability = [
|
|||
lightrag-server = "lightrag.api.lightrag_server:main"
|
||||
lightrag-gunicorn = "lightrag.api.run_with_gunicorn:main"
|
||||
lightrag-download-cache = "lightrag.tools.download_cache:main"
|
||||
lightrag-clean-llmqc = "lightrag.tools.clean_llm_query_cache:main"
|
||||
|
||||
[project.urls]
|
||||
Homepage = "https://github.com/HKUDS/LightRAG"
|
||||
|
|
|
|||
|
|
@ -14,6 +14,6 @@ google-api-core>=2.0.0,<3.0.0
|
|||
google-genai>=1.0.0,<2.0.0
|
||||
llama-index>=0.9.0,<1.0.0
|
||||
ollama>=0.1.0,<1.0.0
|
||||
openai>=1.0.0,<3.0.0
|
||||
openai>=2.0.0,<3.0.0
|
||||
voyageai>=0.2.0,<1.0.0
|
||||
zhipuai>=2.0.0,<3.0.0
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@ google-genai>=1.0.0,<2.0.0
|
|||
llama-index>=0.9.0,<1.0.0
|
||||
neo4j>=5.0.0,<7.0.0
|
||||
ollama>=0.1.0,<1.0.0
|
||||
openai>=1.0.0,<3.0.0
|
||||
openai>=2.0.0,<3.0.0
|
||||
openpyxl>=3.0.0,<4.0.0
|
||||
pycryptodome>=3.0.0,<4.0.0
|
||||
pymilvus>=2.6.2,<3.0.0
|
||||
|
|
|
|||
113
tests/test_overlap_validation.py
Normal file
|
|
@ -0,0 +1,113 @@
|
|||
"""
|
||||
Test for overlap_tokens validation to prevent infinite loop.
|
||||
|
||||
This test validates the fix for the bug where overlap_tokens >= max_tokens
|
||||
causes an infinite loop in the chunking function.
|
||||
"""
|
||||
|
||||
from lightrag.rerank import chunk_documents_for_rerank
|
||||
|
||||
|
||||
class TestOverlapValidation:
|
||||
"""Test suite for overlap_tokens validation"""
|
||||
|
||||
def test_overlap_greater_than_max_tokens(self):
|
||||
"""Test that overlap_tokens > max_tokens is clamped and doesn't hang"""
|
||||
documents = [" ".join([f"word{i}" for i in range(100)])]
|
||||
|
||||
# This should clamp overlap_tokens to 29 (max_tokens - 1)
|
||||
chunked_docs, doc_indices = chunk_documents_for_rerank(
|
||||
documents, max_tokens=30, overlap_tokens=32
|
||||
)
|
||||
|
||||
# Should complete without hanging
|
||||
assert len(chunked_docs) > 0
|
||||
assert all(idx == 0 for idx in doc_indices)
|
||||
|
||||
def test_overlap_equal_to_max_tokens(self):
|
||||
"""Test that overlap_tokens == max_tokens is clamped and doesn't hang"""
|
||||
documents = [" ".join([f"word{i}" for i in range(100)])]
|
||||
|
||||
# This should clamp overlap_tokens to 29 (max_tokens - 1)
|
||||
chunked_docs, doc_indices = chunk_documents_for_rerank(
|
||||
documents, max_tokens=30, overlap_tokens=30
|
||||
)
|
||||
|
||||
# Should complete without hanging
|
||||
assert len(chunked_docs) > 0
|
||||
assert all(idx == 0 for idx in doc_indices)
|
||||
|
||||
def test_overlap_slightly_less_than_max_tokens(self):
|
||||
"""Test that overlap_tokens < max_tokens works normally"""
|
||||
documents = [" ".join([f"word{i}" for i in range(100)])]
|
||||
|
||||
# This should work without clamping
|
||||
chunked_docs, doc_indices = chunk_documents_for_rerank(
|
||||
documents, max_tokens=30, overlap_tokens=29
|
||||
)
|
||||
|
||||
# Should complete successfully
|
||||
assert len(chunked_docs) > 0
|
||||
assert all(idx == 0 for idx in doc_indices)
|
||||
|
||||
def test_small_max_tokens_with_large_overlap(self):
|
||||
"""Test edge case with very small max_tokens"""
|
||||
documents = [" ".join([f"word{i}" for i in range(50)])]
|
||||
|
||||
# max_tokens=5, overlap_tokens=10 should clamp to 4
|
||||
chunked_docs, doc_indices = chunk_documents_for_rerank(
|
||||
documents, max_tokens=5, overlap_tokens=10
|
||||
)
|
||||
|
||||
# Should complete without hanging
|
||||
assert len(chunked_docs) > 0
|
||||
assert all(idx == 0 for idx in doc_indices)
|
||||
|
||||
def test_multiple_documents_with_invalid_overlap(self):
|
||||
"""Test multiple documents with overlap_tokens >= max_tokens"""
|
||||
documents = [
|
||||
" ".join([f"word{i}" for i in range(50)]),
|
||||
"short document",
|
||||
" ".join([f"word{i}" for i in range(75)]),
|
||||
]
|
||||
|
||||
# overlap_tokens > max_tokens
|
||||
chunked_docs, doc_indices = chunk_documents_for_rerank(
|
||||
documents, max_tokens=25, overlap_tokens=30
|
||||
)
|
||||
|
||||
# Should complete successfully and chunk the long documents
|
||||
assert len(chunked_docs) >= len(documents)
|
||||
# Short document should not be chunked
|
||||
assert "short document" in chunked_docs
|
||||
|
||||
def test_normal_operation_unaffected(self):
|
||||
"""Test that normal cases continue to work correctly"""
|
||||
documents = [
|
||||
" ".join([f"word{i}" for i in range(100)]),
|
||||
"short doc",
|
||||
]
|
||||
|
||||
# Normal case: overlap_tokens (10) < max_tokens (50)
|
||||
chunked_docs, doc_indices = chunk_documents_for_rerank(
|
||||
documents, max_tokens=50, overlap_tokens=10
|
||||
)
|
||||
|
||||
# Long document should be chunked, short one should not
|
||||
assert len(chunked_docs) > 2 # At least 3 chunks (2 from long doc + 1 short)
|
||||
assert "short doc" in chunked_docs
|
||||
# Verify doc_indices maps correctly
|
||||
assert doc_indices[-1] == 1 # Last chunk is from second document
|
||||
|
||||
def test_edge_case_max_tokens_one(self):
|
||||
"""Test edge case where max_tokens=1"""
|
||||
documents = [" ".join([f"word{i}" for i in range(20)])]
|
||||
|
||||
# max_tokens=1, overlap_tokens=5 should clamp to 0
|
||||
chunked_docs, doc_indices = chunk_documents_for_rerank(
|
||||
documents, max_tokens=1, overlap_tokens=5
|
||||
)
|
||||
|
||||
# Should complete without hanging
|
||||
assert len(chunked_docs) > 0
|
||||
assert all(idx == 0 for idx in doc_indices)
|
||||
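The suite above pins down one clamping rule: whenever `overlap_tokens >= max_tokens`, the overlap is reduced to `max_tokens - 1` (and to 0 when `max_tokens` is 1), so the sliding window always advances by at least one token. A minimal sketch of that rule, assuming the real `chunk_documents_for_rerank` clamps rather than raising:

```python
def clamp_overlap(max_tokens: int, overlap_tokens: int) -> int:
    """Clamp overlap so each chunking step advances by at least one token.

    Sketch of the behaviour asserted in the tests above; the actual logic
    lives in lightrag.rerank.chunk_documents_for_rerank and may differ.
    """
    if overlap_tokens >= max_tokens:
        # e.g. max_tokens=30, overlap_tokens=32 -> 29; max_tokens=1 -> 0
        return max(max_tokens - 1, 0)
    return overlap_tokens
```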
564
tests/test_rerank_chunking.py
Normal file
|
|
@ -0,0 +1,564 @@
|
|||
"""
|
||||
Unit tests for rerank document chunking functionality.
|
||||
|
||||
Tests the chunk_documents_for_rerank and aggregate_chunk_scores functions
|
||||
in lightrag/rerank.py to ensure proper document splitting and score aggregation.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from unittest.mock import Mock, patch, AsyncMock
|
||||
from lightrag.rerank import (
|
||||
chunk_documents_for_rerank,
|
||||
aggregate_chunk_scores,
|
||||
cohere_rerank,
|
||||
)
|
||||
|
||||
|
||||
class TestChunkDocumentsForRerank:
|
||||
"""Test suite for chunk_documents_for_rerank function"""
|
||||
|
||||
def test_no_chunking_needed_for_short_docs(self):
|
||||
"""Documents shorter than max_tokens should not be chunked"""
|
||||
documents = [
|
||||
"Short doc 1",
|
||||
"Short doc 2",
|
||||
"Short doc 3",
|
||||
]
|
||||
|
||||
chunked_docs, doc_indices = chunk_documents_for_rerank(
|
||||
documents, max_tokens=100, overlap_tokens=10
|
||||
)
|
||||
|
||||
# No chunking should occur
|
||||
assert len(chunked_docs) == 3
|
||||
assert chunked_docs == documents
|
||||
assert doc_indices == [0, 1, 2]
|
||||
|
||||
def test_chunking_with_character_fallback(self):
|
||||
"""Test chunking falls back to character-based when tokenizer unavailable"""
|
||||
# Create a very long document that exceeds character limit
|
||||
long_doc = "a" * 2000 # 2000 characters
|
||||
documents = [long_doc, "short doc"]
|
||||
|
||||
with patch("lightrag.utils.TiktokenTokenizer", side_effect=ImportError):
|
||||
chunked_docs, doc_indices = chunk_documents_for_rerank(
|
||||
documents,
|
||||
max_tokens=100, # 100 tokens = ~400 chars
|
||||
overlap_tokens=10, # 10 tokens = ~40 chars
|
||||
)
|
||||
|
||||
# First doc should be split into chunks, second doc stays whole
|
||||
assert len(chunked_docs) > 2 # At least one chunk from first doc + second doc
|
||||
assert chunked_docs[-1] == "short doc" # Last chunk is the short doc
|
||||
# Verify doc_indices maps chunks to correct original document
|
||||
assert doc_indices[-1] == 1 # Last chunk maps to document 1
|
||||
|
||||
def test_chunking_with_tiktoken_tokenizer(self):
|
||||
"""Test chunking with actual tokenizer"""
|
||||
# Create document with known token count
|
||||
# Approximate: "word " = ~1 token, so 200 words ~ 200 tokens
|
||||
long_doc = " ".join([f"word{i}" for i in range(200)])
|
||||
documents = [long_doc, "short"]
|
||||
|
||||
chunked_docs, doc_indices = chunk_documents_for_rerank(
|
||||
documents, max_tokens=50, overlap_tokens=10
|
||||
)
|
||||
|
||||
# Long doc should be split, short doc should remain
|
||||
assert len(chunked_docs) > 2
|
||||
assert doc_indices[-1] == 1 # Last chunk is from second document
|
||||
|
||||
# Verify overlapping chunks contain overlapping content
|
||||
if len(chunked_docs) > 2:
|
||||
# Check that consecutive chunks from same doc have some overlap
|
||||
for i in range(len(doc_indices) - 1):
|
||||
if doc_indices[i] == doc_indices[i + 1] == 0:
|
||||
# Both chunks from first doc, should have overlap
|
||||
chunk1_words = chunked_docs[i].split()
|
||||
chunk2_words = chunked_docs[i + 1].split()
|
||||
# At least one word should be common due to overlap
|
||||
assert any(word in chunk2_words for word in chunk1_words[-5:])
|
||||
|
||||
def test_empty_documents(self):
|
||||
"""Test handling of empty document list"""
|
||||
documents = []
|
||||
chunked_docs, doc_indices = chunk_documents_for_rerank(documents)
|
||||
|
||||
assert chunked_docs == []
|
||||
assert doc_indices == []
|
||||
|
||||
def test_single_document_chunking(self):
|
||||
"""Test chunking of a single long document"""
|
||||
# Create document with ~100 tokens
|
||||
long_doc = " ".join([f"token{i}" for i in range(100)])
|
||||
documents = [long_doc]
|
||||
|
||||
chunked_docs, doc_indices = chunk_documents_for_rerank(
|
||||
documents, max_tokens=30, overlap_tokens=5
|
||||
)
|
||||
|
||||
# Should create multiple chunks
|
||||
assert len(chunked_docs) > 1
|
||||
# All chunks should map to document 0
|
||||
assert all(idx == 0 for idx in doc_indices)
|
||||
|
||||
|
||||
class TestAggregateChunkScores:
|
||||
"""Test suite for aggregate_chunk_scores function"""
|
||||
|
||||
def test_no_chunking_simple_aggregation(self):
|
||||
"""Test aggregation when no chunking occurred (1:1 mapping)"""
|
||||
chunk_results = [
|
||||
{"index": 0, "relevance_score": 0.9},
|
||||
{"index": 1, "relevance_score": 0.7},
|
||||
{"index": 2, "relevance_score": 0.5},
|
||||
]
|
||||
doc_indices = [0, 1, 2] # 1:1 mapping
|
||||
num_original_docs = 3
|
||||
|
||||
aggregated = aggregate_chunk_scores(
|
||||
chunk_results, doc_indices, num_original_docs, aggregation="max"
|
||||
)
|
||||
|
||||
# Results should be sorted by score
|
||||
assert len(aggregated) == 3
|
||||
assert aggregated[0]["index"] == 0
|
||||
assert aggregated[0]["relevance_score"] == 0.9
|
||||
assert aggregated[1]["index"] == 1
|
||||
assert aggregated[1]["relevance_score"] == 0.7
|
||||
assert aggregated[2]["index"] == 2
|
||||
assert aggregated[2]["relevance_score"] == 0.5
|
||||
|
||||
def test_max_aggregation_with_chunks(self):
|
||||
"""Test max aggregation strategy with multiple chunks per document"""
|
||||
# 5 chunks: first 3 from doc 0, last 2 from doc 1
|
||||
chunk_results = [
|
||||
{"index": 0, "relevance_score": 0.5},
|
||||
{"index": 1, "relevance_score": 0.8},
|
||||
{"index": 2, "relevance_score": 0.6},
|
||||
{"index": 3, "relevance_score": 0.7},
|
||||
{"index": 4, "relevance_score": 0.4},
|
||||
]
|
||||
doc_indices = [0, 0, 0, 1, 1]
|
||||
num_original_docs = 2
|
||||
|
||||
aggregated = aggregate_chunk_scores(
|
||||
chunk_results, doc_indices, num_original_docs, aggregation="max"
|
||||
)
|
||||
|
||||
# Should take max score for each document
|
||||
assert len(aggregated) == 2
|
||||
assert aggregated[0]["index"] == 0
|
||||
assert aggregated[0]["relevance_score"] == 0.8 # max of 0.5, 0.8, 0.6
|
||||
assert aggregated[1]["index"] == 1
|
||||
assert aggregated[1]["relevance_score"] == 0.7 # max of 0.7, 0.4
|
||||
|
||||
def test_mean_aggregation_with_chunks(self):
|
||||
"""Test mean aggregation strategy"""
|
||||
chunk_results = [
|
||||
{"index": 0, "relevance_score": 0.6},
|
||||
{"index": 1, "relevance_score": 0.8},
|
||||
{"index": 2, "relevance_score": 0.4},
|
||||
]
|
||||
doc_indices = [0, 0, 1] # First two chunks from doc 0, last from doc 1
|
||||
num_original_docs = 2
|
||||
|
||||
aggregated = aggregate_chunk_scores(
|
||||
chunk_results, doc_indices, num_original_docs, aggregation="mean"
|
||||
)
|
||||
|
||||
assert len(aggregated) == 2
|
||||
assert aggregated[0]["index"] == 0
|
||||
assert aggregated[0]["relevance_score"] == pytest.approx(0.7) # (0.6 + 0.8) / 2
|
||||
assert aggregated[1]["index"] == 1
|
||||
assert aggregated[1]["relevance_score"] == 0.4
|
||||
|
||||
def test_first_aggregation_with_chunks(self):
|
||||
"""Test first aggregation strategy"""
|
||||
chunk_results = [
|
||||
{"index": 0, "relevance_score": 0.6},
|
||||
{"index": 1, "relevance_score": 0.8},
|
||||
{"index": 2, "relevance_score": 0.4},
|
||||
]
|
||||
doc_indices = [0, 0, 1]
|
||||
num_original_docs = 2
|
||||
|
||||
aggregated = aggregate_chunk_scores(
|
||||
chunk_results, doc_indices, num_original_docs, aggregation="first"
|
||||
)
|
||||
|
||||
assert len(aggregated) == 2
|
||||
# First should use first score seen for each doc
|
||||
assert aggregated[0]["index"] == 0
|
||||
assert aggregated[0]["relevance_score"] == 0.6 # First score for doc 0
|
||||
assert aggregated[1]["index"] == 1
|
||||
assert aggregated[1]["relevance_score"] == 0.4
|
||||
|
||||
def test_empty_chunk_results(self):
|
||||
"""Test handling of empty results"""
|
||||
aggregated = aggregate_chunk_scores([], [], 3, aggregation="max")
|
||||
assert aggregated == []
|
||||
|
||||
def test_documents_with_no_scores(self):
|
||||
"""Test when some documents have no chunks/scores"""
|
||||
chunk_results = [
|
||||
{"index": 0, "relevance_score": 0.9},
|
||||
{"index": 1, "relevance_score": 0.7},
|
||||
]
|
||||
doc_indices = [0, 0] # Both chunks from document 0
|
||||
num_original_docs = 3 # But we have 3 documents total
|
||||
|
||||
aggregated = aggregate_chunk_scores(
|
||||
chunk_results, doc_indices, num_original_docs, aggregation="max"
|
||||
)
|
||||
|
||||
# Only doc 0 should appear in results
|
||||
assert len(aggregated) == 1
|
||||
assert aggregated[0]["index"] == 0
|
||||
|
||||
def test_unknown_aggregation_strategy(self):
|
||||
"""Test that unknown strategy falls back to max"""
|
||||
chunk_results = [
|
||||
{"index": 0, "relevance_score": 0.6},
|
||||
{"index": 1, "relevance_score": 0.8},
|
||||
]
|
||||
doc_indices = [0, 0]
|
||||
num_original_docs = 1
|
||||
|
||||
# Use invalid strategy
|
||||
aggregated = aggregate_chunk_scores(
|
||||
chunk_results, doc_indices, num_original_docs, aggregation="invalid"
|
||||
)
|
||||
|
||||
# Should fall back to max
|
||||
assert aggregated[0]["relevance_score"] == 0.8
|
||||
|
||||
|
||||
@pytest.mark.offline
|
||||
class TestTopNWithChunking:
|
||||
"""Tests for top_n behavior when chunking is enabled (Bug fix verification)"""
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_top_n_limits_documents_not_chunks(self):
|
||||
"""
|
||||
Test that top_n correctly limits documents (not chunks) when chunking is enabled.
|
||||
|
||||
Bug scenario: 10 docs expand to 50 chunks. With old behavior, top_n=5 would
|
||||
return scores for only 5 chunks (possibly all from 1-2 docs). After aggregation,
|
||||
fewer than 5 documents would be returned.
|
||||
|
||||
Fixed behavior: top_n=5 should return exactly 5 documents after aggregation.
|
||||
"""
|
||||
# Setup: 5 documents, each producing multiple chunks when chunked
|
||||
# Using small max_tokens to force chunking
|
||||
long_docs = [" ".join([f"doc{i}_word{j}" for j in range(50)]) for i in range(5)]
|
||||
query = "test query"
|
||||
|
||||
# First, determine how many chunks will be created by actual chunking
|
||||
_, doc_indices = chunk_documents_for_rerank(
|
||||
long_docs, max_tokens=50, overlap_tokens=10
|
||||
)
|
||||
num_chunks = len(doc_indices)
|
||||
|
||||
# Mock API returns scores for ALL chunks (simulating disabled API-level top_n)
|
||||
# Give different scores to ensure doc 0 gets highest, doc 1 second, etc.
|
||||
# Assign scores based on original document index (lower doc index = higher score)
|
||||
mock_chunk_scores = []
|
||||
for i in range(num_chunks):
|
||||
original_doc = doc_indices[i]
|
||||
# Higher score for lower doc index, with small variation per chunk
|
||||
base_score = 0.9 - (original_doc * 0.1)
|
||||
mock_chunk_scores.append({"index": i, "relevance_score": base_score})
|
||||
|
||||
mock_response = Mock()
|
||||
mock_response.status = 200
|
||||
mock_response.json = AsyncMock(return_value={"results": mock_chunk_scores})
|
||||
mock_response.request_info = None
|
||||
mock_response.history = None
|
||||
mock_response.headers = {}
|
||||
mock_response.__aenter__ = AsyncMock(return_value=mock_response)
|
||||
mock_response.__aexit__ = AsyncMock(return_value=None)
|
||||
|
||||
mock_session = Mock()
|
||||
mock_session.post = Mock(return_value=mock_response)
|
||||
mock_session.__aenter__ = AsyncMock(return_value=mock_session)
|
||||
mock_session.__aexit__ = AsyncMock(return_value=None)
|
||||
|
||||
with patch("lightrag.rerank.aiohttp.ClientSession", return_value=mock_session):
|
||||
result = await cohere_rerank(
|
||||
query=query,
|
||||
documents=long_docs,
|
||||
api_key="test-key",
|
||||
base_url="http://test.com/rerank",
|
||||
enable_chunking=True,
|
||||
max_tokens_per_doc=50, # Match chunking above
|
||||
top_n=3, # Request top 3 documents
|
||||
)
|
||||
|
||||
# Verify: should get exactly 3 documents (not unlimited chunks)
|
||||
assert len(result) == 3
|
||||
# All results should have valid document indices (0-4)
|
||||
assert all(0 <= r["index"] < 5 for r in result)
|
||||
# Results should be sorted by score (descending)
|
||||
assert all(
|
||||
result[i]["relevance_score"] >= result[i + 1]["relevance_score"]
|
||||
for i in range(len(result) - 1)
|
||||
)
|
||||
# The top 3 docs should be 0, 1, 2 (highest scores)
|
||||
result_indices = [r["index"] for r in result]
|
||||
assert set(result_indices) == {0, 1, 2}
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_api_receives_no_top_n_when_chunking_enabled(self):
|
||||
"""
|
||||
Test that the API request does NOT include top_n when chunking is enabled.
|
||||
|
||||
This ensures all chunk scores are retrieved for proper aggregation.
|
||||
"""
|
||||
documents = [" ".join([f"word{i}" for i in range(100)]), "short doc"]
|
||||
query = "test query"
|
||||
|
||||
captured_payload = {}
|
||||
|
||||
mock_response = Mock()
|
||||
mock_response.status = 200
|
||||
mock_response.json = AsyncMock(
|
||||
return_value={
|
||||
"results": [
|
||||
{"index": 0, "relevance_score": 0.9},
|
||||
{"index": 1, "relevance_score": 0.8},
|
||||
{"index": 2, "relevance_score": 0.7},
|
||||
]
|
||||
}
|
||||
)
|
||||
mock_response.request_info = None
|
||||
mock_response.history = None
|
||||
mock_response.headers = {}
|
||||
mock_response.__aenter__ = AsyncMock(return_value=mock_response)
|
||||
mock_response.__aexit__ = AsyncMock(return_value=None)
|
||||
|
||||
def capture_post(*args, **kwargs):
|
||||
captured_payload.update(kwargs.get("json", {}))
|
||||
return mock_response
|
||||
|
||||
mock_session = Mock()
|
||||
mock_session.post = Mock(side_effect=capture_post)
|
||||
mock_session.__aenter__ = AsyncMock(return_value=mock_session)
|
||||
mock_session.__aexit__ = AsyncMock(return_value=None)
|
||||
|
||||
with patch("lightrag.rerank.aiohttp.ClientSession", return_value=mock_session):
|
||||
await cohere_rerank(
|
||||
query=query,
|
||||
documents=documents,
|
||||
api_key="test-key",
|
||||
base_url="http://test.com/rerank",
|
||||
enable_chunking=True,
|
||||
max_tokens_per_doc=30,
|
||||
top_n=1, # User wants top 1 document
|
||||
)
|
||||
|
||||
# Verify: API payload should NOT have top_n (disabled for chunking)
|
||||
assert "top_n" not in captured_payload
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_top_n_not_modified_when_chunking_disabled(self):
|
||||
"""
|
||||
Test that top_n is passed through to API when chunking is disabled.
|
||||
"""
|
||||
documents = ["doc1", "doc2"]
|
||||
query = "test query"
|
||||
|
||||
captured_payload = {}
|
||||
|
||||
mock_response = Mock()
|
||||
mock_response.status = 200
|
||||
mock_response.json = AsyncMock(
|
||||
return_value={
|
||||
"results": [
|
||||
{"index": 0, "relevance_score": 0.9},
|
||||
]
|
||||
}
|
||||
)
|
||||
mock_response.request_info = None
|
||||
mock_response.history = None
|
||||
mock_response.headers = {}
|
||||
mock_response.__aenter__ = AsyncMock(return_value=mock_response)
|
||||
mock_response.__aexit__ = AsyncMock(return_value=None)
|
||||
|
||||
def capture_post(*args, **kwargs):
|
||||
captured_payload.update(kwargs.get("json", {}))
|
||||
return mock_response
|
||||
|
||||
mock_session = Mock()
|
||||
mock_session.post = Mock(side_effect=capture_post)
|
||||
mock_session.__aenter__ = AsyncMock(return_value=mock_session)
|
||||
mock_session.__aexit__ = AsyncMock(return_value=None)
|
||||
|
||||
with patch("lightrag.rerank.aiohttp.ClientSession", return_value=mock_session):
|
||||
await cohere_rerank(
|
||||
query=query,
|
||||
documents=documents,
|
||||
api_key="test-key",
|
||||
base_url="http://test.com/rerank",
|
||||
enable_chunking=False, # Chunking disabled
|
||||
top_n=1,
|
||||
)
|
||||
|
||||
# Verify: API payload should have top_n when chunking is disabled
|
||||
assert captured_payload.get("top_n") == 1
|
||||
|
||||
|
||||
@pytest.mark.offline
|
||||
class TestCohereRerankChunking:
|
||||
"""Integration tests for cohere_rerank with chunking enabled"""
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_cohere_rerank_with_chunking_disabled(self):
|
||||
"""Test that chunking can be disabled"""
|
||||
documents = ["doc1", "doc2"]
|
||||
query = "test query"
|
||||
|
||||
# Mock the generic_rerank_api
|
||||
with patch(
|
||||
"lightrag.rerank.generic_rerank_api", new_callable=AsyncMock
|
||||
) as mock_api:
|
||||
mock_api.return_value = [
|
||||
{"index": 0, "relevance_score": 0.9},
|
||||
{"index": 1, "relevance_score": 0.7},
|
||||
]
|
||||
|
||||
result = await cohere_rerank(
|
||||
query=query,
|
||||
documents=documents,
|
||||
api_key="test-key",
|
||||
enable_chunking=False,
|
||||
max_tokens_per_doc=100,
|
||||
)
|
||||
|
||||
# Verify generic_rerank_api was called with correct parameters
|
||||
mock_api.assert_called_once()
|
||||
call_kwargs = mock_api.call_args[1]
|
||||
assert call_kwargs["enable_chunking"] is False
|
||||
assert call_kwargs["max_tokens_per_doc"] == 100
|
||||
# Result should mirror mocked scores
|
||||
assert len(result) == 2
|
||||
assert result[0]["index"] == 0
|
||||
assert result[0]["relevance_score"] == 0.9
|
||||
assert result[1]["index"] == 1
|
||||
assert result[1]["relevance_score"] == 0.7
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_cohere_rerank_with_chunking_enabled(self):
|
||||
"""Test that chunking parameters are passed through"""
|
||||
documents = ["doc1", "doc2"]
|
||||
query = "test query"
|
||||
|
||||
with patch(
|
||||
"lightrag.rerank.generic_rerank_api", new_callable=AsyncMock
|
||||
) as mock_api:
|
||||
mock_api.return_value = [
|
||||
{"index": 0, "relevance_score": 0.9},
|
||||
{"index": 1, "relevance_score": 0.7},
|
||||
]
|
||||
|
||||
result = await cohere_rerank(
|
||||
query=query,
|
||||
documents=documents,
|
||||
api_key="test-key",
|
||||
enable_chunking=True,
|
||||
max_tokens_per_doc=480,
|
||||
)
|
||||
|
||||
# Verify parameters were passed
|
||||
call_kwargs = mock_api.call_args[1]
|
||||
assert call_kwargs["enable_chunking"] is True
|
||||
assert call_kwargs["max_tokens_per_doc"] == 480
|
||||
# Result should mirror mocked scores
|
||||
assert len(result) == 2
|
||||
assert result[0]["index"] == 0
|
||||
assert result[0]["relevance_score"] == 0.9
|
||||
assert result[1]["index"] == 1
|
||||
assert result[1]["relevance_score"] == 0.7
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_cohere_rerank_default_parameters(self):
|
||||
"""Test default parameter values for cohere_rerank"""
|
||||
documents = ["doc1"]
|
||||
query = "test"
|
||||
|
||||
with patch(
|
||||
"lightrag.rerank.generic_rerank_api", new_callable=AsyncMock
|
||||
) as mock_api:
|
||||
mock_api.return_value = [{"index": 0, "relevance_score": 0.9}]
|
||||
|
||||
result = await cohere_rerank(
|
||||
query=query, documents=documents, api_key="test-key"
|
||||
)
|
||||
|
||||
# Verify default values
|
||||
call_kwargs = mock_api.call_args[1]
|
||||
assert call_kwargs["enable_chunking"] is False
|
||||
assert call_kwargs["max_tokens_per_doc"] == 4096
|
||||
assert call_kwargs["model"] == "rerank-v3.5"
|
||||
# Result should mirror mocked scores
|
||||
assert len(result) == 1
|
||||
assert result[0]["index"] == 0
|
||||
assert result[0]["relevance_score"] == 0.9
|
||||
|
||||
|
||||
@pytest.mark.offline
|
||||
class TestEndToEndChunking:
|
||||
"""End-to-end tests for chunking workflow"""
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_end_to_end_chunking_workflow(self):
|
||||
"""Test complete chunking workflow from documents to aggregated results"""
|
||||
# Create documents where first one needs chunking
|
||||
long_doc = " ".join([f"word{i}" for i in range(100)])
|
||||
documents = [long_doc, "short doc"]
|
||||
query = "test query"
|
||||
|
||||
# Mock the HTTP call inside generic_rerank_api
|
||||
mock_response = Mock()
|
||||
mock_response.status = 200
|
||||
mock_response.json = AsyncMock(
|
||||
return_value={
|
||||
"results": [
|
||||
{"index": 0, "relevance_score": 0.5}, # chunk 0 from doc 0
|
||||
{"index": 1, "relevance_score": 0.8}, # chunk 1 from doc 0
|
||||
{"index": 2, "relevance_score": 0.6}, # chunk 2 from doc 0
|
||||
{"index": 3, "relevance_score": 0.7}, # doc 1 (short)
|
||||
]
|
||||
}
|
||||
)
|
||||
mock_response.request_info = None
|
||||
mock_response.history = None
|
||||
mock_response.headers = {}
|
||||
# Make mock_response an async context manager (for `async with session.post() as response`)
|
||||
mock_response.__aenter__ = AsyncMock(return_value=mock_response)
|
||||
mock_response.__aexit__ = AsyncMock(return_value=None)
|
||||
|
||||
mock_session = Mock()
|
||||
# session.post() must return an async context manager, so return mock_response (configured as one above)
|
||||
mock_session.post = Mock(return_value=mock_response)
|
||||
mock_session.__aenter__ = AsyncMock(return_value=mock_session)
|
||||
mock_session.__aexit__ = AsyncMock(return_value=None)
|
||||
|
||||
with patch("lightrag.rerank.aiohttp.ClientSession", return_value=mock_session):
|
||||
result = await cohere_rerank(
|
||||
query=query,
|
||||
documents=documents,
|
||||
api_key="test-key",
|
||||
base_url="http://test.com/rerank",
|
||||
enable_chunking=True,
|
||||
max_tokens_per_doc=30, # Force chunking of long doc
|
||||
)
|
||||
|
||||
# Should get 2 results (one per original document)
|
||||
# The long doc's chunks should be aggregated
|
||||
assert len(result) <= len(documents)
|
||||
# Results should be sorted by score
|
||||
assert all(
|
||||
result[i]["relevance_score"] >= result[i + 1]["relevance_score"]
|
||||
for i in range(len(result) - 1)
|
||||
)
|
||||
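Taken together, these tests describe the chunked-rerank contract: with chunking enabled, `cohere_rerank` omits `top_n` from the API payload so every chunk is scored, groups chunk scores back to their original documents via `doc_indices`, reduces them with max, mean, or first (unknown strategies fall back to max), drops documents that got no scored chunks, sorts by descending score, and only then applies `top_n` to whole documents. A hedged sketch of the aggregation step, not the repository's actual `aggregate_chunk_scores` code:

```python
from collections import defaultdict


def aggregate_chunk_scores_sketch(chunk_results, doc_indices,
                                  num_original_docs, aggregation="max"):
    # Group chunk scores by the original document each chunk came from.
    by_doc = defaultdict(list)
    for item in chunk_results:
        by_doc[doc_indices[item["index"]]].append(item["relevance_score"])

    aggregated = []
    for doc_idx in range(num_original_docs):
        scores = by_doc.get(doc_idx)
        if not scores:
            continue  # documents with no scored chunks are omitted
        if aggregation == "mean":
            score = sum(scores) / len(scores)
        elif aggregation == "first":
            score = scores[0]
        else:  # "max", and the fallback for unknown strategies
            score = max(scores)
        aggregated.append({"index": doc_idx, "relevance_score": score})

    return sorted(aggregated, key=lambda r: r["relevance_score"], reverse=True)
```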
72
uv.lock
generated
|
|
@ -1334,15 +1334,6 @@ http = [
|
|||
{ name = "aiohttp" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "future"
|
||||
version = "1.0.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/a7/b2/4140c69c6a66432916b26158687e821ba631a4c9273c474343badf84d3ba/future-1.0.0.tar.gz", hash = "sha256:bd2968309307861edae1458a4f8a4f3598c03be43b97521076aebf5d94c07b05", size = 1228490, upload-time = "2024-02-21T11:52:38.461Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/da/71/ae30dadffc90b9006d77af76b393cb9dfbfc9629f339fc1574a1c52e6806/future-1.0.0-py3-none-any.whl", hash = "sha256:929292d34f5872e70396626ef385ec22355a1fae8ad29e1a734c3e43f9fbc216", size = 491326, upload-time = "2024-02-21T11:52:35.956Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "gitdb"
|
||||
version = "4.0.12"
|
||||
|
|
@ -2542,7 +2533,6 @@ source = { editable = "." }
|
|||
dependencies = [
|
||||
{ name = "aiohttp" },
|
||||
{ name = "configparser" },
|
||||
{ name = "future" },
|
||||
{ name = "google-api-core" },
|
||||
{ name = "google-genai" },
|
||||
{ name = "json-repair" },
|
||||
|
|
@ -2567,10 +2557,10 @@ api = [
|
|||
{ name = "aiohttp" },
|
||||
{ name = "ascii-colors" },
|
||||
{ name = "asyncpg" },
|
||||
{ name = "bcrypt" },
|
||||
{ name = "configparser" },
|
||||
{ name = "distro" },
|
||||
{ name = "fastapi" },
|
||||
{ name = "future" },
|
||||
{ name = "google-api-core" },
|
||||
{ name = "google-genai" },
|
||||
{ name = "gunicorn" },
|
||||
|
|
@ -2585,7 +2575,6 @@ api = [
|
|||
{ name = "openai" },
|
||||
{ name = "openpyxl" },
|
||||
{ name = "pandas" },
|
||||
{ name = "passlib", extra = ["bcrypt"] },
|
||||
{ name = "pipmaster" },
|
||||
{ name = "psutil" },
|
||||
{ name = "pycryptodome" },
|
||||
|
|
@ -2615,6 +2604,7 @@ evaluation = [
|
|||
{ name = "pytest" },
|
||||
{ name = "pytest-asyncio" },
|
||||
{ name = "ragas" },
|
||||
{ name = "ruff" },
|
||||
]
|
||||
observability = [
|
||||
{ name = "langfuse" },
|
||||
|
|
@ -2626,10 +2616,10 @@ offline = [
|
|||
{ name = "anthropic" },
|
||||
{ name = "ascii-colors" },
|
||||
{ name = "asyncpg" },
|
||||
{ name = "bcrypt" },
|
||||
{ name = "configparser" },
|
||||
{ name = "distro" },
|
||||
{ name = "fastapi" },
|
||||
{ name = "future" },
|
||||
{ name = "google-api-core" },
|
||||
{ name = "google-genai" },
|
||||
{ name = "gunicorn" },
|
||||
|
|
@ -2647,7 +2637,6 @@ offline = [
|
|||
{ name = "openai" },
|
||||
{ name = "openpyxl" },
|
||||
{ name = "pandas" },
|
||||
{ name = "passlib", extra = ["bcrypt"] },
|
||||
{ name = "pipmaster" },
|
||||
{ name = "psutil" },
|
||||
{ name = "pycryptodome" },
|
||||
|
|
@ -2700,6 +2689,7 @@ pytest = [
|
|||
{ name = "pre-commit" },
|
||||
{ name = "pytest" },
|
||||
{ name = "pytest-asyncio" },
|
||||
{ name = "ruff" },
|
||||
]
|
||||
|
||||
[package.metadata]
|
||||
|
|
@ -2712,14 +2702,13 @@ requires-dist = [
|
|||
{ name = "ascii-colors", marker = "extra == 'api'" },
|
||||
{ name = "asyncpg", marker = "extra == 'api'" },
|
||||
{ name = "asyncpg", marker = "extra == 'offline-storage'", specifier = ">=0.29.0,<1.0.0" },
|
||||
{ name = "bcrypt", marker = "extra == 'api'", specifier = ">=4.0.0" },
|
||||
{ name = "configparser" },
|
||||
{ name = "configparser", marker = "extra == 'api'" },
|
||||
{ name = "datasets", marker = "extra == 'evaluation'", specifier = ">=4.3.0" },
|
||||
{ name = "distro", marker = "extra == 'api'" },
|
||||
{ name = "docling", marker = "sys_platform != 'darwin' and extra == 'docling'", specifier = ">=2.0.0,<3.0.0" },
|
||||
{ name = "fastapi", marker = "extra == 'api'" },
|
||||
{ name = "future" },
|
||||
{ name = "future", marker = "extra == 'api'" },
|
||||
{ name = "google-api-core", specifier = ">=2.0.0,<3.0.0" },
|
||||
{ name = "google-api-core", marker = "extra == 'api'", specifier = ">=2.0.0,<3.0.0" },
|
||||
{ name = "google-api-core", marker = "extra == 'offline-llm'", specifier = ">=2.0.0,<3.0.0" },
|
||||
|
|
@ -2735,7 +2724,6 @@ requires-dist = [
|
|||
{ name = "json-repair", marker = "extra == 'api'" },
|
||||
{ name = "langfuse", marker = "extra == 'observability'", specifier = ">=3.8.1" },
|
||||
{ name = "lightrag-hku", extras = ["api", "offline-llm", "offline-storage"], marker = "extra == 'offline'" },
|
||||
{ name = "lightrag-hku", extras = ["pytest"], marker = "extra == 'evaluation'" },
|
||||
{ name = "llama-index", marker = "extra == 'offline-llm'", specifier = ">=0.9.0,<1.0.0" },
|
||||
{ name = "nano-vectordb" },
|
||||
{ name = "nano-vectordb", marker = "extra == 'api'" },
|
||||
|
|
@ -2745,14 +2733,14 @@ requires-dist = [
|
|||
{ name = "numpy", specifier = ">=1.24.0,<2.0.0" },
|
||||
{ name = "numpy", marker = "extra == 'api'", specifier = ">=1.24.0,<2.0.0" },
|
||||
{ name = "ollama", marker = "extra == 'offline-llm'", specifier = ">=0.1.0,<1.0.0" },
|
||||
{ name = "openai", marker = "extra == 'api'", specifier = ">=1.0.0,<3.0.0" },
|
||||
{ name = "openai", marker = "extra == 'offline-llm'", specifier = ">=1.0.0,<3.0.0" },
|
||||
{ name = "openai", marker = "extra == 'api'", specifier = ">=2.0.0,<3.0.0" },
|
||||
{ name = "openai", marker = "extra == 'offline-llm'", specifier = ">=2.0.0,<3.0.0" },
|
||||
{ name = "openpyxl", marker = "extra == 'api'", specifier = ">=3.0.0,<4.0.0" },
|
||||
{ name = "pandas", specifier = ">=2.0.0,<2.4.0" },
|
||||
{ name = "pandas", marker = "extra == 'api'", specifier = ">=2.0.0,<2.4.0" },
|
||||
{ name = "passlib", extras = ["bcrypt"], marker = "extra == 'api'" },
|
||||
{ name = "pipmaster" },
|
||||
{ name = "pipmaster", marker = "extra == 'api'" },
|
||||
{ name = "pre-commit", marker = "extra == 'evaluation'" },
|
||||
{ name = "pre-commit", marker = "extra == 'pytest'" },
|
||||
{ name = "psutil", marker = "extra == 'api'" },
|
||||
{ name = "pycryptodome", marker = "extra == 'api'", specifier = ">=3.0.0,<4.0.0" },
|
||||
|
|
@ -2764,7 +2752,9 @@ requires-dist = [
|
|||
{ name = "pypdf", marker = "extra == 'api'", specifier = ">=6.1.0" },
|
||||
{ name = "pypinyin" },
|
||||
{ name = "pypinyin", marker = "extra == 'api'" },
|
||||
{ name = "pytest", marker = "extra == 'evaluation'", specifier = ">=8.4.2" },
|
||||
{ name = "pytest", marker = "extra == 'pytest'", specifier = ">=8.4.2" },
|
||||
{ name = "pytest-asyncio", marker = "extra == 'evaluation'", specifier = ">=1.2.0" },
|
||||
{ name = "pytest-asyncio", marker = "extra == 'pytest'", specifier = ">=1.2.0" },
|
||||
{ name = "python-docx", marker = "extra == 'api'", specifier = ">=0.8.11,<2.0.0" },
|
||||
{ name = "python-dotenv" },
|
||||
|
|
@ -2776,6 +2766,8 @@ requires-dist = [
|
|||
{ name = "qdrant-client", marker = "extra == 'offline-storage'", specifier = ">=1.11.0,<2.0.0" },
|
||||
{ name = "ragas", marker = "extra == 'evaluation'", specifier = ">=0.3.7" },
|
||||
{ name = "redis", marker = "extra == 'offline-storage'", specifier = ">=5.0.0,<8.0.0" },
|
||||
{ name = "ruff", marker = "extra == 'evaluation'" },
|
||||
{ name = "ruff", marker = "extra == 'pytest'" },
|
||||
{ name = "setuptools" },
|
||||
{ name = "setuptools", marker = "extra == 'api'" },
|
||||
{ name = "tenacity" },
|
||||
|
|
@ -4104,20 +4096,6 @@ wheels = [
|
|||
{ url = "https://files.pythonhosted.org/packages/ab/5f/b38085618b950b79d2d9164a711c52b10aefc0ae6833b96f626b7021b2ed/pandas-2.2.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ad5b65698ab28ed8d7f18790a0dc58005c7629f227be9ecc1072aa74c0c1d43a", size = 13098436, upload-time = "2024-09-20T13:09:48.112Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "passlib"
|
||||
version = "1.7.4"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/b6/06/9da9ee59a67fae7761aab3ccc84fa4f3f33f125b370f1ccdb915bf967c11/passlib-1.7.4.tar.gz", hash = "sha256:defd50f72b65c5402ab2c573830a6978e5f202ad0d984793c8dde2c4152ebe04", size = 689844, upload-time = "2020-10-08T19:00:52.121Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/3b/a4/ab6b7589382ca3df236e03faa71deac88cae040af60c071a78d254a62172/passlib-1.7.4-py2.py3-none-any.whl", hash = "sha256:aa6bca462b8d8bda89c70b382f0c298a20b5560af6cbfa2dce410c0a2fb669f1", size = 525554, upload-time = "2020-10-08T19:00:49.856Z" },
|
||||
]
|
||||
|
||||
[package.optional-dependencies]
|
||||
bcrypt = [
|
||||
{ name = "bcrypt" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pillow"
|
||||
version = "11.3.0"
|
||||
|
|
@ -5635,6 +5613,32 @@ wheels = [
|
|||
{ url = "https://files.pythonhosted.org/packages/3f/50/0a9e7e7afe7339bd5e36911f0ceb15fed51945836ed803ae5afd661057fd/rtree-1.4.1-py3-none-win_arm64.whl", hash = "sha256:3d46f55729b28138e897ffef32f7ce93ac335cb67f9120125ad3742a220800f0", size = 355253, upload-time = "2025-08-13T19:32:00.296Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ruff"
|
||||
version = "0.14.6"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/52/f0/62b5a1a723fe183650109407fa56abb433b00aa1c0b9ba555f9c4efec2c6/ruff-0.14.6.tar.gz", hash = "sha256:6f0c742ca6a7783a736b867a263b9a7a80a45ce9bee391eeda296895f1b4e1cc", size = 5669501, upload-time = "2025-11-21T14:26:17.903Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/67/d2/7dd544116d107fffb24a0064d41a5d2ed1c9d6372d142f9ba108c8e39207/ruff-0.14.6-py3-none-linux_armv6l.whl", hash = "sha256:d724ac2f1c240dbd01a2ae98db5d1d9a5e1d9e96eba999d1c48e30062df578a3", size = 13326119, upload-time = "2025-11-21T14:25:24.2Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/36/6a/ad66d0a3315d6327ed6b01f759d83df3c4d5f86c30462121024361137b6a/ruff-0.14.6-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:9f7539ea257aa4d07b7ce87aed580e485c40143f2473ff2f2b75aee003186004", size = 13526007, upload-time = "2025-11-21T14:25:26.906Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/a3/9d/dae6db96df28e0a15dea8e986ee393af70fc97fd57669808728080529c37/ruff-0.14.6-py3-none-macosx_11_0_arm64.whl", hash = "sha256:7f6007e55b90a2a7e93083ba48a9f23c3158c433591c33ee2e99a49b889c6332", size = 12676572, upload-time = "2025-11-21T14:25:29.826Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/76/a4/f319e87759949062cfee1b26245048e92e2acce900ad3a909285f9db1859/ruff-0.14.6-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0a8e7b9d73d8728b68f632aa8e824ef041d068d231d8dbc7808532d3629a6bef", size = 13140745, upload-time = "2025-11-21T14:25:32.788Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/95/d3/248c1efc71a0a8ed4e8e10b4b2266845d7dfc7a0ab64354afe049eaa1310/ruff-0.14.6-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d50d45d4553a3ebcbd33e7c5e0fe6ca4aafd9a9122492de357205c2c48f00775", size = 13076486, upload-time = "2025-11-21T14:25:35.601Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/a5/19/b68d4563fe50eba4b8c92aa842149bb56dd24d198389c0ed12e7faff4f7d/ruff-0.14.6-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:118548dd121f8a21bfa8ab2c5b80e5b4aed67ead4b7567790962554f38e598ce", size = 13727563, upload-time = "2025-11-21T14:25:38.514Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/47/ac/943169436832d4b0e867235abbdb57ce3a82367b47e0280fa7b4eabb7593/ruff-0.14.6-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:57256efafbfefcb8748df9d1d766062f62b20150691021f8ab79e2d919f7c11f", size = 15199755, upload-time = "2025-11-21T14:25:41.516Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/c9/b9/288bb2399860a36d4bb0541cb66cce3c0f4156aaff009dc8499be0c24bf2/ruff-0.14.6-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ff18134841e5c68f8e5df1999a64429a02d5549036b394fafbe410f886e1989d", size = 14850608, upload-time = "2025-11-21T14:25:44.428Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ee/b1/a0d549dd4364e240f37e7d2907e97ee80587480d98c7799d2d8dc7a2f605/ruff-0.14.6-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:29c4b7ec1e66a105d5c27bd57fa93203637d66a26d10ca9809dc7fc18ec58440", size = 14118754, upload-time = "2025-11-21T14:25:47.214Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/13/ac/9b9fe63716af8bdfddfacd0882bc1586f29985d3b988b3c62ddce2e202c3/ruff-0.14.6-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:167843a6f78680746d7e226f255d920aeed5e4ad9c03258094a2d49d3028b105", size = 13949214, upload-time = "2025-11-21T14:25:50.002Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/12/27/4dad6c6a77fede9560b7df6802b1b697e97e49ceabe1f12baf3ea20862e9/ruff-0.14.6-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:16a33af621c9c523b1ae006b1b99b159bf5ac7e4b1f20b85b2572455018e0821", size = 14106112, upload-time = "2025-11-21T14:25:52.841Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/6a/db/23e322d7177873eaedea59a7932ca5084ec5b7e20cb30f341ab594130a71/ruff-0.14.6-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:1432ab6e1ae2dc565a7eea707d3b03a0c234ef401482a6f1621bc1f427c2ff55", size = 13035010, upload-time = "2025-11-21T14:25:55.536Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/a8/9c/20e21d4d69dbb35e6a1df7691e02f363423658a20a2afacf2a2c011800dc/ruff-0.14.6-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:4c55cfbbe7abb61eb914bfd20683d14cdfb38a6d56c6c66efa55ec6570ee4e71", size = 13054082, upload-time = "2025-11-21T14:25:58.625Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/66/25/906ee6a0464c3125c8d673c589771a974965c2be1a1e28b5c3b96cb6ef88/ruff-0.14.6-py3-none-musllinux_1_2_i686.whl", hash = "sha256:efea3c0f21901a685fff4befda6d61a1bf4cb43de16da87e8226a281d614350b", size = 13303354, upload-time = "2025-11-21T14:26:01.816Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/4c/58/60577569e198d56922b7ead07b465f559002b7b11d53f40937e95067ca1c/ruff-0.14.6-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:344d97172576d75dc6afc0e9243376dbe1668559c72de1864439c4fc95f78185", size = 14054487, upload-time = "2025-11-21T14:26:05.058Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/67/0b/8e4e0639e4cc12547f41cb771b0b44ec8225b6b6a93393176d75fe6f7d40/ruff-0.14.6-py3-none-win32.whl", hash = "sha256:00169c0c8b85396516fdd9ce3446c7ca20c2a8f90a77aa945ba6b8f2bfe99e85", size = 13013361, upload-time = "2025-11-21T14:26:08.152Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/fb/02/82240553b77fd1341f80ebb3eaae43ba011c7a91b4224a9f317d8e6591af/ruff-0.14.6-py3-none-win_amd64.whl", hash = "sha256:390e6480c5e3659f8a4c8d6a0373027820419ac14fa0d2713bd8e6c3e125b8b9", size = 14432087, upload-time = "2025-11-21T14:26:10.891Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/a5/1f/93f9b0fad9470e4c829a5bb678da4012f0c710d09331b860ee555216f4ea/ruff-0.14.6-py3-none-win_arm64.whl", hash = "sha256:d43c81fbeae52cfa8728d8766bbf46ee4298c888072105815b392da70ca836b2", size = 13520930, upload-time = "2025-11-21T14:26:13.951Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "s3transfer"
|
||||
version = "0.14.0"
|
||||
|
|
|
|||