From c5f230a30c3763cbc17a56eea108a9181ad16e5e Mon Sep 17 00:00:00 2001
From: clssck <foldvarszki@gmail.com>
Date: Wed, 3 Dec 2025 15:02:11 +0100
Subject: [PATCH] test: fix env handling, add type hints, improve docs

Improve code quality and test robustness:

- Refactor environment variable parsing in the rerank config to use the
  centralized get_env_value helper
- Add return type hints to all test methods for better type safety
- Fix the patch path in the rerank chunking test from lightrag.utils to
  lightrag.rerank so it targets the correct import location
- Clarify batch insert endpoint behavior regarding duplicate content
  rejection
- Expand .dockerignore to comprehensively exclude node_modules (200MB+),
  Python cache files, and venv directories
- Update dependency groups: move pytest, pytest-asyncio, pre-commit and
  ruff out of the evaluation extra into a new test extra, and have both
  extras depend on the api extra

---
 .dockerignore                           |   8 ++
 lightrag/api/lightrag_server.py         |  13 ++-
 lightrag/api/routers/document_routes.py |   4 +
 tests/test_overlap_validation.py        |  19 +++--
 tests/test_rerank_chunking.py           |   2 +-
 uv.lock                                 | 104 +++++++++++++++++++++---
 6 files changed, 121 insertions(+), 29 deletions(-)

diff --git a/.dockerignore b/.dockerignore
index f738d586..c1e07cd4 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,9 +1,17 @@
+# Node.js dependencies - largest contributor to build context size (200MB+)
+**/node_modules
+
 # Python-related files and directories
 __pycache__
+**/__pycache__
+*.pyc
+*.pyo
 .cache
 
 # Virtual environment directories
 *.venv
+.venv
+venv/
 
 # Env
 env/
diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py
index 296783c7..73a82293 100644
--- a/lightrag/api/lightrag_server.py
+++ b/lightrag/api/lightrag_server.py
@@ -1036,15 +1036,12 @@ def create_app(args):
             # Add Cohere-specific parameters if using cohere binding
             if args.rerank_binding == "cohere":
                 # Enable chunking if configured (useful for models with token limits like ColBERT)
-                kwargs["enable_chunking"] = (
-                    os.getenv("RERANK_ENABLE_CHUNKING", "false").lower() == "true"
+                kwargs["enable_chunking"] = get_env_value(
+                    "RERANK_ENABLE_CHUNKING", False, bool
+                )
+                kwargs["max_tokens_per_doc"] = get_env_value(
+                    "RERANK_MAX_TOKENS_PER_DOC", 4096, int
                 )
-                try:
-                    kwargs["max_tokens_per_doc"] = int(
-                        os.getenv("RERANK_MAX_TOKENS_PER_DOC", "4096")
-                    )
-                except ValueError:
-                    kwargs["max_tokens_per_doc"] = 4096
 
             return await selected_rerank_func(**kwargs, extra_body=extra_body)
 
diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py
index 0a534bbe..d072277c 100644
--- a/lightrag/api/routers/document_routes.py
+++ b/lightrag/api/routers/document_routes.py
@@ -2232,6 +2232,10 @@ def create_document_routes(
         This endpoint allows you to insert multiple text entries into the RAG system
         in a single request.
 
+        Note:
+            If any text content or file_source already exists in the system,
+            the entire batch will be rejected with status "duplicated".
+
         Args:
             request (InsertTextsRequest): The request body containing the list of texts.
             background_tasks: FastAPI BackgroundTasks for async processing
diff --git a/tests/test_overlap_validation.py b/tests/test_overlap_validation.py
index 3ad627d7..265818fd 100644
--- a/tests/test_overlap_validation.py
+++ b/tests/test_overlap_validation.py
@@ -14,7 +14,7 @@ from lightrag.rerank import chunk_documents_for_rerank
 class TestOverlapValidation:
     """Test suite for overlap_tokens validation"""
 
-    def test_overlap_greater_than_max_tokens(self):
+    def test_overlap_greater_than_max_tokens(self) -> None:
         """Test that overlap_tokens > max_tokens is clamped and doesn't hang"""
         documents = [" ".join([f"word{i}" for i in range(100)])]
 
@@ -27,7 +27,7 @@ class TestOverlapValidation:
         assert len(chunked_docs) > 0
         assert all(idx == 0 for idx in doc_indices)
 
-    def test_overlap_equal_to_max_tokens(self):
+    def test_overlap_equal_to_max_tokens(self) -> None:
         """Test that overlap_tokens == max_tokens is clamped and doesn't hang"""
         documents = [" ".join([f"word{i}" for i in range(100)])]
 
@@ -40,7 +40,7 @@ class TestOverlapValidation:
         assert len(chunked_docs) > 0
         assert all(idx == 0 for idx in doc_indices)
 
-    def test_overlap_slightly_less_than_max_tokens(self):
+    def test_overlap_slightly_less_than_max_tokens(self) -> None:
         """Test that overlap_tokens < max_tokens works normally"""
         documents = [" ".join([f"word{i}" for i in range(100)])]
 
@@ -53,7 +53,7 @@ class TestOverlapValidation:
         assert len(chunked_docs) > 0
         assert all(idx == 0 for idx in doc_indices)
 
-    def test_small_max_tokens_with_large_overlap(self):
+    def test_small_max_tokens_with_large_overlap(self) -> None:
         """Test edge case with very small max_tokens"""
         documents = [" ".join([f"word{i}" for i in range(50)])]
 
@@ -66,7 +66,7 @@ class TestOverlapValidation:
         assert len(chunked_docs) > 0
         assert all(idx == 0 for idx in doc_indices)
 
-    def test_multiple_documents_with_invalid_overlap(self):
+    def test_multiple_documents_with_invalid_overlap(self) -> None:
         """Test multiple documents with overlap_tokens >= max_tokens"""
         documents = [
             " ".join([f"word{i}" for i in range(50)]),
@@ -84,7 +84,7 @@ class TestOverlapValidation:
         # Short document should not be chunked
         assert "short document" in chunked_docs
 
-    def test_normal_operation_unaffected(self):
+    def test_normal_operation_unaffected(self) -> None:
         """Test that normal cases continue to work correctly"""
         documents = [
             " ".join([f"word{i}" for i in range(100)]),
@@ -99,10 +99,11 @@ class TestOverlapValidation:
         # Long document should be chunked, short one should not
         assert len(chunked_docs) > 2  # At least 3 chunks (2 from long doc + 1 short)
         assert "short doc" in chunked_docs
-        # Verify doc_indices maps correctly
-        assert doc_indices[-1] == 1  # Last chunk is from second document
+        # Verify doc_indices maps "short doc" to document index 1
+        short_doc_idx = chunked_docs.index("short doc")
+        assert doc_indices[short_doc_idx] == 1
 
-    def test_edge_case_max_tokens_one(self):
+    def test_edge_case_max_tokens_one(self) -> None:
         """Test edge case where max_tokens=1"""
         documents = [" ".join([f"word{i}" for i in range(20)])]
 
diff --git a/tests/test_rerank_chunking.py b/tests/test_rerank_chunking.py
index 09f1816b..14608747 100644
--- a/tests/test_rerank_chunking.py
+++ b/tests/test_rerank_chunking.py
@@ -40,7 +40,7 @@ class TestChunkDocumentsForRerank:
         long_doc = "a" * 2000  # 2000 characters
         documents = [long_doc, "short doc"]
 
-        with patch("lightrag.utils.TiktokenTokenizer", side_effect=ImportError):
+        with patch("lightrag.rerank.TiktokenTokenizer", side_effect=ImportError):
             chunked_docs, doc_indices = chunk_documents_for_rerank(
                 documents,
                 max_tokens=100,  # 100 tokens = ~400 chars
diff --git a/uv.lock b/uv.lock
index b07a8ea1..2f8f3313 100644
--- a/uv.lock
+++ b/uv.lock
@@ -2598,13 +2598,48 @@ docling = [
     { name = "docling", marker = "sys_platform != 'darwin'" },
 ]
 evaluation = [
+    { name = "aiofiles" },
+    { name = "aiohttp" },
+    { name = "ascii-colors" },
+    { name = "asyncpg" },
+    { name = "bcrypt" },
+    { name = "configparser" },
     { name = "datasets" },
+    { name = "distro" },
+    { name = "fastapi" },
+    { name = "google-api-core" },
+    { name = "google-genai" },
+    { name = "gunicorn" },
+    { name = "httpcore" },
     { name = "httpx" },
-    { name = "pre-commit" },
-    { name = "pytest" },
-    { name = "pytest-asyncio" },
+    { name = "jiter" },
+    { name = "json-repair" },
+    { name = "nano-vectordb" },
+    { name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
+    { name = "networkx", version = "3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+    { name = "numpy" },
+    { name = "openai" },
+    { name = "openpyxl" },
+    { name = "pandas" },
+    { name = "pipmaster" },
+    { name = "psutil" },
+    { name = "pycryptodome" },
+    { name = "pydantic" },
+    { name = "pyjwt" },
+    { name = "pypdf" },
+    { name = "pypinyin" },
+    { name = "python-docx" },
+    { name = "python-dotenv" },
+    { name = "python-jose", extra = ["cryptography"] },
+    { name = "python-multipart" },
+    { name = "python-pptx" },
+    { name = "pytz" },
     { name = "ragas" },
-    { name = "ruff" },
+    { name = "setuptools" },
+    { name = "tenacity" },
+    { name = "tiktoken" },
+    { name = "uvicorn" },
+    { name = "xlsxwriter" },
 ]
 observability = [
     { name = "langfuse" },
@@ -2691,6 +2726,52 @@ pytest = [
     { name = "pytest-asyncio" },
     { name = "ruff" },
 ]
+test = [
+    { name = "aiofiles" },
+    { name = "aiohttp" },
+    { name = "ascii-colors" },
+    { name = "asyncpg" },
+    { name = "bcrypt" },
+    { name = "configparser" },
+    { name = "distro" },
+    { name = "fastapi" },
+    { name = "google-api-core" },
+    { name = "google-genai" },
+    { name = "gunicorn" },
+    { name = "httpcore" },
+    { name = "httpx" },
+    { name = "jiter" },
+    { name = "json-repair" },
+    { name = "nano-vectordb" },
+    { name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
+    { name = "networkx", version = "3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+    { name = "numpy" },
+    { name = "openai" },
+    { name = "openpyxl" },
+    { name = "pandas" },
+    { name = "pipmaster" },
+    { name = "pre-commit" },
+    { name = "psutil" },
+    { name = "pycryptodome" },
+    { name = "pydantic" },
+    { name = "pyjwt" },
+    { name = "pypdf" },
+    { name = "pypinyin" },
+    { name = "pytest" },
+    { name = "pytest-asyncio" },
+    { name = "python-docx" },
+    { name = "python-dotenv" },
+    { name = "python-jose", extra = ["cryptography"] },
+    { name = "python-multipart" },
+    { name = "python-pptx" },
+    { name = "pytz" },
+    { name = "ruff" },
+    { name = "setuptools" },
+    { name = "tenacity" },
+    { name = "tiktoken" },
+    { name = "uvicorn" },
+    { name = "xlsxwriter" },
+]
 
 [package.metadata]
 requires-dist = [
@@ -2717,12 +2798,13 @@ requires-dist = [
     { name = "google-genai", marker = "extra == 'offline-llm'", specifier = ">=1.0.0,<2.0.0" },
     { name = "gunicorn", marker = "extra == 'api'" },
     { name = "httpcore", marker = "extra == 'api'" },
-    { name = "httpx", marker = "extra == 'api'" },
-    { name = "httpx", marker = "extra == 'evaluation'", specifier = ">=0.28.1" },
+    { name = "httpx", marker = "extra == 'api'", specifier = ">=0.28.1" },
     { name = "jiter", marker = "extra == 'api'" },
     { name = "json-repair" },
     { name = "json-repair", marker = "extra == 'api'" },
     { name = "langfuse", marker = "extra == 'observability'", specifier = ">=3.8.1" },
+    { name = "lightrag-hku", extras = ["api"], marker = "extra == 'evaluation'" },
+    { name = "lightrag-hku", extras = ["api"], marker = "extra == 'test'" },
     { name = "lightrag-hku", extras = ["api", "offline-llm", "offline-storage"], marker = "extra == 'offline'" },
     { name = "llama-index", marker = "extra == 'offline-llm'", specifier = ">=0.9.0,<1.0.0" },
     { name = "nano-vectordb" },
@@ -2740,8 +2822,8 @@ requires-dist = [
     { name = "pandas", marker = "extra == 'api'", specifier = ">=2.0.0,<2.4.0" },
     { name = "pipmaster" },
     { name = "pipmaster", marker = "extra == 'api'" },
-    { name = "pre-commit", marker = "extra == 'evaluation'" },
     { name = "pre-commit", marker = "extra == 'pytest'" },
+    { name = "pre-commit", marker = "extra == 'test'" },
     { name = "psutil", marker = "extra == 'api'" },
     { name = "pycryptodome", marker = "extra == 'api'", specifier = ">=3.0.0,<4.0.0" },
     { name = "pydantic" },
@@ -2752,10 +2834,10 @@ requires-dist = [
     { name = "pypdf", marker = "extra == 'api'", specifier = ">=6.1.0" },
     { name = "pypinyin" },
     { name = "pypinyin", marker = "extra == 'api'" },
-    { name = "pytest", marker = "extra == 'evaluation'", specifier = ">=8.4.2" },
     { name = "pytest", marker = "extra == 'pytest'", specifier = ">=8.4.2" },
-    { name = "pytest-asyncio", marker = "extra == 'evaluation'", specifier = ">=1.2.0" },
+    { name = "pytest", marker = "extra == 'test'", specifier = ">=8.4.2" },
     { name = "pytest-asyncio", marker = "extra == 'pytest'", specifier = ">=1.2.0" },
+    { name = "pytest-asyncio", marker = "extra == 'test'", specifier = ">=1.2.0" },
     { name = "python-docx", marker = "extra == 'api'", specifier = ">=0.8.11,<2.0.0" },
     { name = "python-dotenv" },
     { name = "python-dotenv", marker = "extra == 'api'" },
@@ -2766,8 +2848,8 @@ requires-dist = [
     { name = "qdrant-client", marker = "extra == 'offline-storage'", specifier = ">=1.11.0,<2.0.0" },
     { name = "ragas", marker = "extra == 'evaluation'", specifier = ">=0.3.7" },
     { name = "redis", marker = "extra == 'offline-storage'", specifier = ">=5.0.0,<8.0.0" },
-    { name = "ruff", marker = "extra == 'evaluation'" },
     { name = "ruff", marker = "extra == 'pytest'" },
+    { name = "ruff", marker = "extra == 'test'" },
     { name = "setuptools" },
     { name = "setuptools", marker = "extra == 'api'" },
     { name = "tenacity" },
@@ -2780,7 +2862,7 @@ requires-dist = [
     { name = "xlsxwriter", marker = "extra == 'api'", specifier = ">=3.1.0" },
     { name = "zhipuai", marker = "extra == 'offline-llm'", specifier = ">=2.0.0,<3.0.0" },
 ]
-provides-extras = ["pytest", "api", "docling", "offline-storage", "offline-llm", "offline", "evaluation", "observability"]
+provides-extras = ["pytest", "api", "docling", "offline-storage", "offline-llm", "offline", "test", "evaluation", "observability"]
 
 [[package]]
 name = "llama-cloud"