test: fix env handling, add type hints, improve docs

Improve code quality and test robustness:
- Refactor environment variable parsing in rerank config using centralized get_env_value helper
- Add return type hints to all test methods for better type safety
- Fix patch path in test from lightrag.utils to lightrag.rerank for correct import location
- Clarify batch insert endpoint behavior regarding duplicate content rejection
- Expand .dockerignore to comprehensively exclude node_modules (200MB+), Python cache files, and venv directories
- Update dependency groups: align evaluation and test extras with pytest/pre-commit/ruff tools
This commit is contained in:
clssck 2025-12-03 15:02:11 +01:00
parent 9bae6267f6
commit c5f230a30c
6 changed files with 121 additions and 29 deletions

View file

@@ -1,9 +1,17 @@
# Node.js dependencies — the largest exclusion (200MB+)
**/node_modules
# Python-related files and directories
__pycache__
**/__pycache__
*.pyc
*.pyo
.cache
# Virtual environment directories
*.venv
.venv
venv/
# Plain environment directory
env/

View file

@@ -1036,15 +1036,12 @@ def create_app(args):
# Add Cohere-specific parameters if using cohere binding
if args.rerank_binding == "cohere":
# Enable chunking if configured (useful for models with token limits like ColBERT)
kwargs["enable_chunking"] = (
os.getenv("RERANK_ENABLE_CHUNKING", "false").lower() == "true"
kwargs["enable_chunking"] = get_env_value(
"RERANK_ENABLE_CHUNKING", False, bool
)
kwargs["max_tokens_per_doc"] = get_env_value(
"RERANK_MAX_TOKENS_PER_DOC", 4096, int
)
try:
kwargs["max_tokens_per_doc"] = int(
os.getenv("RERANK_MAX_TOKENS_PER_DOC", "4096")
)
except ValueError:
kwargs["max_tokens_per_doc"] = 4096
return await selected_rerank_func(**kwargs, extra_body=extra_body)

View file

@@ -2232,6 +2232,10 @@ def create_document_routes(
This endpoint allows you to insert multiple text entries into the RAG system
in a single request.
Note:
If any text content or file_source already exists in the system,
the entire batch will be rejected with status "duplicated".
Args:
request (InsertTextsRequest): The request body containing the list of texts.
background_tasks: FastAPI BackgroundTasks for async processing

View file

@@ -14,7 +14,7 @@ from lightrag.rerank import chunk_documents_for_rerank
class TestOverlapValidation:
"""Test suite for overlap_tokens validation"""
def test_overlap_greater_than_max_tokens(self):
def test_overlap_greater_than_max_tokens(self) -> None:
"""Test that overlap_tokens > max_tokens is clamped and doesn't hang"""
documents = [" ".join([f"word{i}" for i in range(100)])]
@@ -27,7 +27,7 @@ class TestOverlapValidation:
assert len(chunked_docs) > 0
assert all(idx == 0 for idx in doc_indices)
def test_overlap_equal_to_max_tokens(self):
def test_overlap_equal_to_max_tokens(self) -> None:
"""Test that overlap_tokens == max_tokens is clamped and doesn't hang"""
documents = [" ".join([f"word{i}" for i in range(100)])]
@@ -40,7 +40,7 @@ class TestOverlapValidation:
assert len(chunked_docs) > 0
assert all(idx == 0 for idx in doc_indices)
def test_overlap_slightly_less_than_max_tokens(self):
def test_overlap_slightly_less_than_max_tokens(self) -> None:
"""Test that overlap_tokens < max_tokens works normally"""
documents = [" ".join([f"word{i}" for i in range(100)])]
@@ -53,7 +53,7 @@ class TestOverlapValidation:
assert len(chunked_docs) > 0
assert all(idx == 0 for idx in doc_indices)
def test_small_max_tokens_with_large_overlap(self):
def test_small_max_tokens_with_large_overlap(self) -> None:
"""Test edge case with very small max_tokens"""
documents = [" ".join([f"word{i}" for i in range(50)])]
@@ -66,7 +66,7 @@ class TestOverlapValidation:
assert len(chunked_docs) > 0
assert all(idx == 0 for idx in doc_indices)
def test_multiple_documents_with_invalid_overlap(self):
def test_multiple_documents_with_invalid_overlap(self) -> None:
"""Test multiple documents with overlap_tokens >= max_tokens"""
documents = [
" ".join([f"word{i}" for i in range(50)]),
@@ -84,7 +84,7 @@ class TestOverlapValidation:
# Short document should not be chunked
assert "short document" in chunked_docs
def test_normal_operation_unaffected(self):
def test_normal_operation_unaffected(self) -> None:
"""Test that normal cases continue to work correctly"""
documents = [
" ".join([f"word{i}" for i in range(100)]),
@@ -99,10 +99,11 @@ class TestOverlapValidation:
# Long document should be chunked, short one should not
assert len(chunked_docs) > 2 # At least 3 chunks (2 from long doc + 1 short)
assert "short doc" in chunked_docs
# Verify doc_indices maps correctly
assert doc_indices[-1] == 1 # Last chunk is from second document
# Verify doc_indices maps "short doc" to document index 1
short_doc_idx = chunked_docs.index("short doc")
assert doc_indices[short_doc_idx] == 1
def test_edge_case_max_tokens_one(self):
def test_edge_case_max_tokens_one(self) -> None:
"""Test edge case where max_tokens=1"""
documents = [" ".join([f"word{i}" for i in range(20)])]

View file

@@ -40,7 +40,7 @@ class TestChunkDocumentsForRerank:
long_doc = "a" * 2000 # 2000 characters
documents = [long_doc, "short doc"]
with patch("lightrag.utils.TiktokenTokenizer", side_effect=ImportError):
with patch("lightrag.rerank.TiktokenTokenizer", side_effect=ImportError):
chunked_docs, doc_indices = chunk_documents_for_rerank(
documents,
max_tokens=100, # 100 tokens = ~400 chars

104
uv.lock generated
View file

@@ -2598,13 +2598,48 @@ docling = [
{ name = "docling", marker = "sys_platform != 'darwin'" },
]
evaluation = [
{ name = "aiofiles" },
{ name = "aiohttp" },
{ name = "ascii-colors" },
{ name = "asyncpg" },
{ name = "bcrypt" },
{ name = "configparser" },
{ name = "datasets" },
{ name = "distro" },
{ name = "fastapi" },
{ name = "google-api-core" },
{ name = "google-genai" },
{ name = "gunicorn" },
{ name = "httpcore" },
{ name = "httpx" },
{ name = "pre-commit" },
{ name = "pytest" },
{ name = "pytest-asyncio" },
{ name = "jiter" },
{ name = "json-repair" },
{ name = "nano-vectordb" },
{ name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
{ name = "networkx", version = "3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
{ name = "numpy" },
{ name = "openai" },
{ name = "openpyxl" },
{ name = "pandas" },
{ name = "pipmaster" },
{ name = "psutil" },
{ name = "pycryptodome" },
{ name = "pydantic" },
{ name = "pyjwt" },
{ name = "pypdf" },
{ name = "pypinyin" },
{ name = "python-docx" },
{ name = "python-dotenv" },
{ name = "python-jose", extra = ["cryptography"] },
{ name = "python-multipart" },
{ name = "python-pptx" },
{ name = "pytz" },
{ name = "ragas" },
{ name = "ruff" },
{ name = "setuptools" },
{ name = "tenacity" },
{ name = "tiktoken" },
{ name = "uvicorn" },
{ name = "xlsxwriter" },
]
observability = [
{ name = "langfuse" },
@@ -2691,6 +2726,52 @@ pytest = [
{ name = "pytest-asyncio" },
{ name = "ruff" },
]
test = [
{ name = "aiofiles" },
{ name = "aiohttp" },
{ name = "ascii-colors" },
{ name = "asyncpg" },
{ name = "bcrypt" },
{ name = "configparser" },
{ name = "distro" },
{ name = "fastapi" },
{ name = "google-api-core" },
{ name = "google-genai" },
{ name = "gunicorn" },
{ name = "httpcore" },
{ name = "httpx" },
{ name = "jiter" },
{ name = "json-repair" },
{ name = "nano-vectordb" },
{ name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
{ name = "networkx", version = "3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
{ name = "numpy" },
{ name = "openai" },
{ name = "openpyxl" },
{ name = "pandas" },
{ name = "pipmaster" },
{ name = "pre-commit" },
{ name = "psutil" },
{ name = "pycryptodome" },
{ name = "pydantic" },
{ name = "pyjwt" },
{ name = "pypdf" },
{ name = "pypinyin" },
{ name = "pytest" },
{ name = "pytest-asyncio" },
{ name = "python-docx" },
{ name = "python-dotenv" },
{ name = "python-jose", extra = ["cryptography"] },
{ name = "python-multipart" },
{ name = "python-pptx" },
{ name = "pytz" },
{ name = "ruff" },
{ name = "setuptools" },
{ name = "tenacity" },
{ name = "tiktoken" },
{ name = "uvicorn" },
{ name = "xlsxwriter" },
]
[package.metadata]
requires-dist = [
@@ -2717,12 +2798,13 @@ requires-dist = [
{ name = "google-genai", marker = "extra == 'offline-llm'", specifier = ">=1.0.0,<2.0.0" },
{ name = "gunicorn", marker = "extra == 'api'" },
{ name = "httpcore", marker = "extra == 'api'" },
{ name = "httpx", marker = "extra == 'api'" },
{ name = "httpx", marker = "extra == 'evaluation'", specifier = ">=0.28.1" },
{ name = "httpx", marker = "extra == 'api'", specifier = ">=0.28.1" },
{ name = "jiter", marker = "extra == 'api'" },
{ name = "json-repair" },
{ name = "json-repair", marker = "extra == 'api'" },
{ name = "langfuse", marker = "extra == 'observability'", specifier = ">=3.8.1" },
{ name = "lightrag-hku", extras = ["api"], marker = "extra == 'evaluation'" },
{ name = "lightrag-hku", extras = ["api"], marker = "extra == 'test'" },
{ name = "lightrag-hku", extras = ["api", "offline-llm", "offline-storage"], marker = "extra == 'offline'" },
{ name = "llama-index", marker = "extra == 'offline-llm'", specifier = ">=0.9.0,<1.0.0" },
{ name = "nano-vectordb" },
@@ -2740,8 +2822,8 @@ requires-dist = [
{ name = "pandas", marker = "extra == 'api'", specifier = ">=2.0.0,<2.4.0" },
{ name = "pipmaster" },
{ name = "pipmaster", marker = "extra == 'api'" },
{ name = "pre-commit", marker = "extra == 'evaluation'" },
{ name = "pre-commit", marker = "extra == 'pytest'" },
{ name = "pre-commit", marker = "extra == 'test'" },
{ name = "psutil", marker = "extra == 'api'" },
{ name = "pycryptodome", marker = "extra == 'api'", specifier = ">=3.0.0,<4.0.0" },
{ name = "pydantic" },
@@ -2752,10 +2834,10 @@ requires-dist = [
{ name = "pypdf", marker = "extra == 'api'", specifier = ">=6.1.0" },
{ name = "pypinyin" },
{ name = "pypinyin", marker = "extra == 'api'" },
{ name = "pytest", marker = "extra == 'evaluation'", specifier = ">=8.4.2" },
{ name = "pytest", marker = "extra == 'pytest'", specifier = ">=8.4.2" },
{ name = "pytest-asyncio", marker = "extra == 'evaluation'", specifier = ">=1.2.0" },
{ name = "pytest", marker = "extra == 'test'", specifier = ">=8.4.2" },
{ name = "pytest-asyncio", marker = "extra == 'pytest'", specifier = ">=1.2.0" },
{ name = "pytest-asyncio", marker = "extra == 'test'", specifier = ">=1.2.0" },
{ name = "python-docx", marker = "extra == 'api'", specifier = ">=0.8.11,<2.0.0" },
{ name = "python-dotenv" },
{ name = "python-dotenv", marker = "extra == 'api'" },
@@ -2766,8 +2848,8 @@ requires-dist = [
{ name = "qdrant-client", marker = "extra == 'offline-storage'", specifier = ">=1.11.0,<2.0.0" },
{ name = "ragas", marker = "extra == 'evaluation'", specifier = ">=0.3.7" },
{ name = "redis", marker = "extra == 'offline-storage'", specifier = ">=5.0.0,<8.0.0" },
{ name = "ruff", marker = "extra == 'evaluation'" },
{ name = "ruff", marker = "extra == 'pytest'" },
{ name = "ruff", marker = "extra == 'test'" },
{ name = "setuptools" },
{ name = "setuptools", marker = "extra == 'api'" },
{ name = "tenacity" },
@@ -2780,7 +2862,7 @@ requires-dist = [
{ name = "xlsxwriter", marker = "extra == 'api'", specifier = ">=3.1.0" },
{ name = "zhipuai", marker = "extra == 'offline-llm'", specifier = ">=2.0.0,<3.0.0" },
]
provides-extras = ["pytest", "api", "docling", "offline-storage", "offline-llm", "offline", "evaluation", "observability"]
provides-extras = ["pytest", "api", "docling", "offline-storage", "offline-llm", "offline", "test", "evaluation", "observability"]
[[package]]
name = "llama-cloud"