From c5f230a30c3763cbc17a56eea108a9181ad16e5e Mon Sep 17 00:00:00 2001 From: clssck Date: Wed, 3 Dec 2025 15:02:11 +0100 Subject: [PATCH] test: fix env handling, add type hints, improve docs Improve code quality and test robustness: - Refactor environment variable parsing in rerank config using centralized get_env_value helper - Add return type hints to all test methods for better type safety - Fix patch path in test from lightrag.utils to lightrag.rerank for correct import location - Clarify batch insert endpoint behavior regarding duplicate content rejection - Expand .dockerignore to comprehensively exclude node_modules (200MB+), Python cache files, and venv directories - Update dependency groups: align evaluation and test extras with pytest/pre-commit/ruff tools --- .dockerignore | 8 ++ lightrag/api/lightrag_server.py | 13 ++- lightrag/api/routers/document_routes.py | 4 + tests/test_overlap_validation.py | 19 +++-- tests/test_rerank_chunking.py | 2 +- uv.lock | 104 +++++++++++++++++++++--- 6 files changed, 121 insertions(+), 29 deletions(-) diff --git a/.dockerignore b/.dockerignore index f738d586..c1e07cd4 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,9 +1,17 @@ +# Node.js - THE BIG ONE (200MB+) +**/node_modules + # Python-related files and directories __pycache__ +**/__pycache__ +*.pyc +*.pyo .cache # Virtual environment directories *.venv +.venv +venv/ # Env env/ diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py index 296783c7..73a82293 100644 --- a/lightrag/api/lightrag_server.py +++ b/lightrag/api/lightrag_server.py @@ -1036,15 +1036,12 @@ def create_app(args): # Add Cohere-specific parameters if using cohere binding if args.rerank_binding == "cohere": # Enable chunking if configured (useful for models with token limits like ColBERT) - kwargs["enable_chunking"] = ( - os.getenv("RERANK_ENABLE_CHUNKING", "false").lower() == "true" + kwargs["enable_chunking"] = get_env_value( + "RERANK_ENABLE_CHUNKING", False, bool + ) + kwargs["max_tokens_per_doc"] = get_env_value( + "RERANK_MAX_TOKENS_PER_DOC", 4096, int ) - try: - kwargs["max_tokens_per_doc"] = int( - os.getenv("RERANK_MAX_TOKENS_PER_DOC", "4096") - ) - except ValueError: - kwargs["max_tokens_per_doc"] = 4096 return await selected_rerank_func(**kwargs, extra_body=extra_body) diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index 0a534bbe..d072277c 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -2232,6 +2232,10 @@ def create_document_routes( This endpoint allows you to insert multiple text entries into the RAG system in a single request. + Note: + If any text content or file_source already exists in the system, + the entire batch will be rejected with status "duplicated". + Args: request (InsertTextsRequest): The request body containing the list of texts. background_tasks: FastAPI BackgroundTasks for async processing diff --git a/tests/test_overlap_validation.py b/tests/test_overlap_validation.py index 3ad627d7..265818fd 100644 --- a/tests/test_overlap_validation.py +++ b/tests/test_overlap_validation.py @@ -14,7 +14,7 @@ from lightrag.rerank import chunk_documents_for_rerank class TestOverlapValidation: """Test suite for overlap_tokens validation""" - def test_overlap_greater_than_max_tokens(self): + def test_overlap_greater_than_max_tokens(self) -> None: """Test that overlap_tokens > max_tokens is clamped and doesn't hang""" documents = [" ".join([f"word{i}" for i in range(100)])] @@ -27,7 +27,7 @@ class TestOverlapValidation: assert len(chunked_docs) > 0 assert all(idx == 0 for idx in doc_indices) - def test_overlap_equal_to_max_tokens(self): + def test_overlap_equal_to_max_tokens(self) -> None: """Test that overlap_tokens == max_tokens is clamped and doesn't hang""" documents = [" ".join([f"word{i}" for i in range(100)])] @@ -40,7 +40,7 @@ class TestOverlapValidation: assert len(chunked_docs) > 0 assert all(idx == 0 for idx in doc_indices) - def test_overlap_slightly_less_than_max_tokens(self): + def test_overlap_slightly_less_than_max_tokens(self) -> None: """Test that overlap_tokens < max_tokens works normally""" documents = [" ".join([f"word{i}" for i in range(100)])] @@ -53,7 +53,7 @@ class TestOverlapValidation: assert len(chunked_docs) > 0 assert all(idx == 0 for idx in doc_indices) - def test_small_max_tokens_with_large_overlap(self): + def test_small_max_tokens_with_large_overlap(self) -> None: """Test edge case with very small max_tokens""" documents = [" ".join([f"word{i}" for i in range(50)])] @@ -66,7 +66,7 @@ class TestOverlapValidation: assert len(chunked_docs) > 0 assert all(idx == 0 for idx in doc_indices) - def test_multiple_documents_with_invalid_overlap(self): + def test_multiple_documents_with_invalid_overlap(self) -> None: """Test multiple documents with overlap_tokens >= max_tokens""" documents = [ " ".join([f"word{i}" for i in range(50)]), @@ -84,7 +84,7 @@ class TestOverlapValidation: # Short document should not be chunked assert "short document" in chunked_docs - def test_normal_operation_unaffected(self): + def test_normal_operation_unaffected(self) -> None: """Test that normal cases continue to work correctly""" documents = [ " ".join([f"word{i}" for i in range(100)]), @@ -99,10 +99,11 @@ class TestOverlapValidation: # Long document should be chunked, short one should not assert len(chunked_docs) > 2 # At least 3 chunks (2 from long doc + 1 short) assert "short doc" in chunked_docs - # Verify doc_indices maps correctly - assert doc_indices[-1] == 1 # Last chunk is from second document + # Verify doc_indices maps "short doc" to document index 1 + short_doc_idx = chunked_docs.index("short doc") + assert doc_indices[short_doc_idx] == 1 - def test_edge_case_max_tokens_one(self): + def test_edge_case_max_tokens_one(self) -> None: """Test edge case where max_tokens=1""" documents = [" ".join([f"word{i}" for i in range(20)])] diff --git a/tests/test_rerank_chunking.py b/tests/test_rerank_chunking.py index 09f1816b..14608747 100644 --- a/tests/test_rerank_chunking.py +++ b/tests/test_rerank_chunking.py @@ -40,7 +40,7 @@ class TestChunkDocumentsForRerank: long_doc = "a" * 2000 # 2000 characters documents = [long_doc, "short doc"] - with patch("lightrag.utils.TiktokenTokenizer", side_effect=ImportError): + with patch("lightrag.rerank.TiktokenTokenizer", side_effect=ImportError): chunked_docs, doc_indices = chunk_documents_for_rerank( documents, max_tokens=100, # 100 tokens = ~400 chars diff --git a/uv.lock b/uv.lock index b07a8ea1..2f8f3313 100644 --- a/uv.lock +++ b/uv.lock @@ -2598,13 +2598,48 @@ docling = [ { name = "docling", marker = "sys_platform != 'darwin'" }, ] evaluation = [ + { name = "aiofiles" }, + { name = "aiohttp" }, + { name = "ascii-colors" }, + { name = "asyncpg" }, + { name = "bcrypt" }, + { name = "configparser" }, { name = "datasets" }, + { name = "distro" }, + { name = "fastapi" }, + { name = "google-api-core" }, + { name = "google-genai" }, + { name = "gunicorn" }, + { name = "httpcore" }, { name = "httpx" }, - { name = "pre-commit" }, - { name = "pytest" }, - { name = "pytest-asyncio" }, + { name = "jiter" }, + { name = "json-repair" }, + { name = "nano-vectordb" }, + { name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "networkx", version = "3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "numpy" }, + { name = "openai" }, + { name = "openpyxl" }, + { name = "pandas" }, + { name = "pipmaster" }, + { name = "psutil" }, + { name = "pycryptodome" }, + { name = "pydantic" }, + { name = "pyjwt" }, + { name = "pypdf" }, + { name = "pypinyin" }, + { name = "python-docx" }, + { name = "python-dotenv" }, + { name = "python-jose", extra = ["cryptography"] }, + { name = "python-multipart" }, + { name = "python-pptx" }, + { name = "pytz" }, { name = "ragas" }, - { name = "ruff" }, + { name = "setuptools" }, + { name = "tenacity" }, + { name = "tiktoken" }, + { name = "uvicorn" }, + { name = "xlsxwriter" }, ] observability = [ { name = "langfuse" }, @@ -2691,6 +2726,52 @@ pytest = [ { name = "pytest-asyncio" }, { name = "ruff" }, ] +test = [ + { name = "aiofiles" }, + { name = "aiohttp" }, + { name = "ascii-colors" }, + { name = "asyncpg" }, + { name = "bcrypt" }, + { name = "configparser" }, + { name = "distro" }, + { name = "fastapi" }, + { name = "google-api-core" }, + { name = "google-genai" }, + { name = "gunicorn" }, + { name = "httpcore" }, + { name = "httpx" }, + { name = "jiter" }, + { name = "json-repair" }, + { name = "nano-vectordb" }, + { name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "networkx", version = "3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "numpy" }, + { name = "openai" }, + { name = "openpyxl" }, + { name = "pandas" }, + { name = "pipmaster" }, + { name = "pre-commit" }, + { name = "psutil" }, + { name = "pycryptodome" }, + { name = "pydantic" }, + { name = "pyjwt" }, + { name = "pypdf" }, + { name = "pypinyin" }, + { name = "pytest" }, + { name = "pytest-asyncio" }, + { name = "python-docx" }, + { name = "python-dotenv" }, + { name = "python-jose", extra = ["cryptography"] }, + { name = "python-multipart" }, + { name = "python-pptx" }, + { name = "pytz" }, + { name = "ruff" }, + { name = "setuptools" }, + { name = "tenacity" }, + { name = "tiktoken" }, + { name = "uvicorn" }, + { name = "xlsxwriter" }, +] [package.metadata] requires-dist = [ @@ -2717,12 +2798,13 @@ requires-dist = [ { name = "google-genai", marker = "extra == 'offline-llm'", specifier = ">=1.0.0,<2.0.0" }, { name = "gunicorn", marker = "extra == 'api'" }, { name = "httpcore", marker = "extra == 'api'" }, - { name = "httpx", marker = "extra == 'api'" }, - { name = "httpx", marker = "extra == 'evaluation'", specifier = ">=0.28.1" }, + { name = "httpx", marker = "extra == 'api'", specifier = ">=0.28.1" }, { name = "jiter", marker = "extra == 'api'" }, { name = "json-repair" }, { name = "json-repair", marker = "extra == 'api'" }, { name = "langfuse", marker = "extra == 'observability'", specifier = ">=3.8.1" }, + { name = "lightrag-hku", extras = ["api"], marker = "extra == 'evaluation'" }, + { name = "lightrag-hku", extras = ["api"], marker = "extra == 'test'" }, { name = "lightrag-hku", extras = ["api", "offline-llm", "offline-storage"], marker = "extra == 'offline'" }, { name = "llama-index", marker = "extra == 'offline-llm'", specifier = ">=0.9.0,<1.0.0" }, { name = "nano-vectordb" }, @@ -2740,8 +2822,8 @@ requires-dist = [ { name = "pandas", marker = "extra == 'api'", specifier = ">=2.0.0,<2.4.0" }, { name = "pipmaster" }, { name = "pipmaster", marker = "extra == 'api'" }, - { name = "pre-commit", marker = "extra == 'evaluation'" }, { name = "pre-commit", marker = "extra == 'pytest'" }, + { name = "pre-commit", marker = "extra == 'test'" }, { name = "psutil", marker = "extra == 'api'" }, { name = "pycryptodome", marker = "extra == 'api'", specifier = ">=3.0.0,<4.0.0" }, { name = "pydantic" }, @@ -2752,10 +2834,10 @@ requires-dist = [ { name = "pypdf", marker = "extra == 'api'", specifier = ">=6.1.0" }, { name = "pypinyin" }, { name = "pypinyin", marker = "extra == 'api'" }, - { name = "pytest", marker = "extra == 'evaluation'", specifier = ">=8.4.2" }, { name = "pytest", marker = "extra == 'pytest'", specifier = ">=8.4.2" }, - { name = "pytest-asyncio", marker = "extra == 'evaluation'", specifier = ">=1.2.0" }, + { name = "pytest", marker = "extra == 'test'", specifier = ">=8.4.2" }, { name = "pytest-asyncio", marker = "extra == 'pytest'", specifier = ">=1.2.0" }, + { name = "pytest-asyncio", marker = "extra == 'test'", specifier = ">=1.2.0" }, { name = "python-docx", marker = "extra == 'api'", specifier = ">=0.8.11,<2.0.0" }, { name = "python-dotenv" }, { name = "python-dotenv", marker = "extra == 'api'" }, @@ -2766,8 +2848,8 @@ requires-dist = [ { name = "qdrant-client", marker = "extra == 'offline-storage'", specifier = ">=1.11.0,<2.0.0" }, { name = "ragas", marker = "extra == 'evaluation'", specifier = ">=0.3.7" }, { name = "redis", marker = "extra == 'offline-storage'", specifier = ">=5.0.0,<8.0.0" }, - { name = "ruff", marker = "extra == 'evaluation'" }, { name = "ruff", marker = "extra == 'pytest'" }, + { name = "ruff", marker = "extra == 'test'" }, { name = "setuptools" }, { name = "setuptools", marker = "extra == 'api'" }, { name = "tenacity" }, @@ -2780,7 +2862,7 @@ requires-dist = [ { name = "xlsxwriter", marker = "extra == 'api'", specifier = ">=3.1.0" }, { name = "zhipuai", marker = "extra == 'offline-llm'", specifier = ">=2.0.0,<3.0.0" }, ] -provides-extras = ["pytest", "api", "docling", "offline-storage", "offline-llm", "offline", "evaluation", "observability"] +provides-extras = ["pytest", "api", "docling", "offline-storage", "offline-llm", "offline", "test", "evaluation", "observability"] [[package]] name = "llama-cloud"