test: fix env handling, add type hints, improve docs

Improve code quality and test robustness:
- Refactor environment variable parsing in rerank config using centralized get_env_value helper
- Add return type hints to all test methods for better type safety
- Fix patch path in test from lightrag.utils to lightrag.rerank for correct import location
- Clarify batch insert endpoint behavior regarding duplicate content rejection
- Expand .dockerignore to comprehensively exclude node_modules (200MB+), Python cache files, and venv directories
- Update dependency groups: align evaluation and test extras with pytest/pre-commit/ruff tools
This commit is contained in:
clssck 2025-12-03 15:02:11 +01:00
parent 9bae6267f6
commit c5f230a30c
6 changed files with 121 additions and 29 deletions

View file

@@ -1,9 +1,17 @@
# Node.js dependencies — the largest exclusion (200MB+)
**/node_modules
# Python-related files and directories
__pycache__
**/__pycache__
*.pyc
*.pyo
.cache
# Virtual environment directories
*.venv
.venv
venv/
# Plain environment directory
env/

View file

@@ -1036,15 +1036,12 @@ def create_app(args):
# Add Cohere-specific parameters if using cohere binding
if args.rerank_binding == "cohere":
# Enable chunking if configured (useful for models with token limits like ColBERT)
kwargs["enable_chunking"] = (
os.getenv("RERANK_ENABLE_CHUNKING", "false").lower() == "true"
kwargs["enable_chunking"] = get_env_value(
"RERANK_ENABLE_CHUNKING", False, bool
)
kwargs["max_tokens_per_doc"] = get_env_value(
"RERANK_MAX_TOKENS_PER_DOC", 4096, int
)
try:
kwargs["max_tokens_per_doc"] = int(
os.getenv("RERANK_MAX_TOKENS_PER_DOC", "4096")
)
except ValueError:
kwargs["max_tokens_per_doc"] = 4096
return await selected_rerank_func(**kwargs, extra_body=extra_body)

View file

@@ -2232,6 +2232,10 @@ def create_document_routes(
This endpoint allows you to insert multiple text entries into the RAG system
in a single request.
Note:
If any text content or file_source already exists in the system,
the entire batch will be rejected with status "duplicated".
Args:
request (InsertTextsRequest): The request body containing the list of texts.
background_tasks: FastAPI BackgroundTasks for async processing

View file

@@ -14,7 +14,7 @@ from lightrag.rerank import chunk_documents_for_rerank
class TestOverlapValidation:
"""Test suite for overlap_tokens validation"""
def test_overlap_greater_than_max_tokens(self):
def test_overlap_greater_than_max_tokens(self) -> None:
"""Test that overlap_tokens > max_tokens is clamped and doesn't hang"""
documents = [" ".join([f"word{i}" for i in range(100)])]
@@ -27,7 +27,7 @@ class TestOverlapValidation:
assert len(chunked_docs) > 0
assert all(idx == 0 for idx in doc_indices)
def test_overlap_equal_to_max_tokens(self):
def test_overlap_equal_to_max_tokens(self) -> None:
"""Test that overlap_tokens == max_tokens is clamped and doesn't hang"""
documents = [" ".join([f"word{i}" for i in range(100)])]
@@ -40,7 +40,7 @@ class TestOverlapValidation:
assert len(chunked_docs) > 0
assert all(idx == 0 for idx in doc_indices)
def test_overlap_slightly_less_than_max_tokens(self):
def test_overlap_slightly_less_than_max_tokens(self) -> None:
"""Test that overlap_tokens < max_tokens works normally"""
documents = [" ".join([f"word{i}" for i in range(100)])]
@@ -53,7 +53,7 @@ class TestOverlapValidation:
assert len(chunked_docs) > 0
assert all(idx == 0 for idx in doc_indices)
def test_small_max_tokens_with_large_overlap(self):
def test_small_max_tokens_with_large_overlap(self) -> None:
"""Test edge case with very small max_tokens"""
documents = [" ".join([f"word{i}" for i in range(50)])]
@@ -66,7 +66,7 @@ class TestOverlapValidation:
assert len(chunked_docs) > 0
assert all(idx == 0 for idx in doc_indices)
def test_multiple_documents_with_invalid_overlap(self):
def test_multiple_documents_with_invalid_overlap(self) -> None:
"""Test multiple documents with overlap_tokens >= max_tokens"""
documents = [
" ".join([f"word{i}" for i in range(50)]),
@@ -84,7 +84,7 @@ class TestOverlapValidation:
# Short document should not be chunked
assert "short document" in chunked_docs
def test_normal_operation_unaffected(self):
def test_normal_operation_unaffected(self) -> None:
"""Test that normal cases continue to work correctly"""
documents = [
" ".join([f"word{i}" for i in range(100)]),
@@ -99,10 +99,11 @@ class TestOverlapValidation:
# Long document should be chunked, short one should not
assert len(chunked_docs) > 2 # At least 3 chunks (2 from long doc + 1 short)
assert "short doc" in chunked_docs
# Verify doc_indices maps correctly
assert doc_indices[-1] == 1 # Last chunk is from second document
# Verify doc_indices maps "short doc" to document index 1
short_doc_idx = chunked_docs.index("short doc")
assert doc_indices[short_doc_idx] == 1
def test_edge_case_max_tokens_one(self):
def test_edge_case_max_tokens_one(self) -> None:
"""Test edge case where max_tokens=1"""
documents = [" ".join([f"word{i}" for i in range(20)])]

View file

@@ -40,7 +40,7 @@ class TestChunkDocumentsForRerank:
long_doc = "a" * 2000 # 2000 characters
documents = [long_doc, "short doc"]
with patch("lightrag.utils.TiktokenTokenizer", side_effect=ImportError):
with patch("lightrag.rerank.TiktokenTokenizer", side_effect=ImportError):
chunked_docs, doc_indices = chunk_documents_for_rerank(
documents,
max_tokens=100, # 100 tokens = ~400 chars

104
uv.lock generated
View file

@@ -2598,13 +2598,48 @@ docling = [
{ name = "docling", marker = "sys_platform != 'darwin'" },
]
evaluation = [
{ name = "aiofiles" },
{ name = "aiohttp" },
{ name = "ascii-colors" },
{ name = "asyncpg" },
{ name = "bcrypt" },
{ name = "configparser" },
{ name = "datasets" },
{ name = "distro" },
{ name = "fastapi" },
{ name = "google-api-core" },
{ name = "google-genai" },
{ name = "gunicorn" },
{ name = "httpcore" },
{ name = "httpx" },
{ name = "pre-commit" },
{ name = "pytest" },
{ name = "pytest-asyncio" },
{ name = "jiter" },
{ name = "json-repair" },
{ name = "nano-vectordb" },
{ name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
{ name = "networkx", version = "3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
{ name = "numpy" },
{ name = "openai" },
{ name = "openpyxl" },
{ name = "pandas" },
{ name = "pipmaster" },
{ name = "psutil" },
{ name = "pycryptodome" },
{ name = "pydantic" },
{ name = "pyjwt" },
{ name = "pypdf" },
{ name = "pypinyin" },
{ name = "python-docx" },
{ name = "python-dotenv" },
{ name = "python-jose", extra = ["cryptography"] },
{ name = "python-multipart" },
{ name = "python-pptx" },
{ name = "pytz" },
{ name = "ragas" },
{ name = "ruff" },
{ name = "setuptools" },
{ name = "tenacity" },
{ name = "tiktoken" },
{ name = "uvicorn" },
{ name = "xlsxwriter" },
]
observability = [
{ name = "langfuse" },
@@ -2691,6 +2726,52 @@ pytest = [
{ name = "pytest-asyncio" },
{ name = "ruff" },
]
test = [
{ name = "aiofiles" },
{ name = "aiohttp" },
{ name = "ascii-colors" },
{ name = "asyncpg" },
{ name = "bcrypt" },
{ name = "configparser" },
{ name = "distro" },
{ name = "fastapi" },
{ name = "google-api-core" },
{ name = "google-genai" },
{ name = "gunicorn" },
{ name = "httpcore" },
{ name = "httpx" },
{ name = "jiter" },
{ name = "json-repair" },
{ name = "nano-vectordb" },
{ name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
{ name = "networkx", version = "3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
{ name = "numpy" },
{ name = "openai" },
{ name = "openpyxl" },
{ name = "pandas" },
{ name = "pipmaster" },
{ name = "pre-commit" },
{ name = "psutil" },
{ name = "pycryptodome" },
{ name = "pydantic" },
{ name = "pyjwt" },
{ name = "pypdf" },
{ name = "pypinyin" },
{ name = "pytest" },
{ name = "pytest-asyncio" },
{ name = "python-docx" },
{ name = "python-dotenv" },
{ name = "python-jose", extra = ["cryptography"] },
{ name = "python-multipart" },
{ name = "python-pptx" },
{ name = "pytz" },
{ name = "ruff" },
{ name = "setuptools" },
{ name = "tenacity" },
{ name = "tiktoken" },
{ name = "uvicorn" },
{ name = "xlsxwriter" },
]
[package.metadata]
requires-dist = [
@@ -2717,12 +2798,13 @@ requires-dist = [
{ name = "google-genai", marker = "extra == 'offline-llm'", specifier = ">=1.0.0,<2.0.0" },
{ name = "gunicorn", marker = "extra == 'api'" },
{ name = "httpcore", marker = "extra == 'api'" },
{ name = "httpx", marker = "extra == 'api'" },
{ name = "httpx", marker = "extra == 'evaluation'", specifier = ">=0.28.1" },
{ name = "httpx", marker = "extra == 'api'", specifier = ">=0.28.1" },
{ name = "jiter", marker = "extra == 'api'" },
{ name = "json-repair" },
{ name = "json-repair", marker = "extra == 'api'" },
{ name = "langfuse", marker = "extra == 'observability'", specifier = ">=3.8.1" },
{ name = "lightrag-hku", extras = ["api"], marker = "extra == 'evaluation'" },
{ name = "lightrag-hku", extras = ["api"], marker = "extra == 'test'" },
{ name = "lightrag-hku", extras = ["api", "offline-llm", "offline-storage"], marker = "extra == 'offline'" },
{ name = "llama-index", marker = "extra == 'offline-llm'", specifier = ">=0.9.0,<1.0.0" },
{ name = "nano-vectordb" },
@@ -2740,8 +2822,8 @@ requires-dist = [
{ name = "pandas", marker = "extra == 'api'", specifier = ">=2.0.0,<2.4.0" },
{ name = "pipmaster" },
{ name = "pipmaster", marker = "extra == 'api'" },
{ name = "pre-commit", marker = "extra == 'evaluation'" },
{ name = "pre-commit", marker = "extra == 'pytest'" },
{ name = "pre-commit", marker = "extra == 'test'" },
{ name = "psutil", marker = "extra == 'api'" },
{ name = "pycryptodome", marker = "extra == 'api'", specifier = ">=3.0.0,<4.0.0" },
{ name = "pydantic" },
@@ -2752,10 +2834,10 @@ requires-dist = [
{ name = "pypdf", marker = "extra == 'api'", specifier = ">=6.1.0" },
{ name = "pypinyin" },
{ name = "pypinyin", marker = "extra == 'api'" },
{ name = "pytest", marker = "extra == 'evaluation'", specifier = ">=8.4.2" },
{ name = "pytest", marker = "extra == 'pytest'", specifier = ">=8.4.2" },
{ name = "pytest-asyncio", marker = "extra == 'evaluation'", specifier = ">=1.2.0" },
{ name = "pytest", marker = "extra == 'test'", specifier = ">=8.4.2" },
{ name = "pytest-asyncio", marker = "extra == 'pytest'", specifier = ">=1.2.0" },
{ name = "pytest-asyncio", marker = "extra == 'test'", specifier = ">=1.2.0" },
{ name = "python-docx", marker = "extra == 'api'", specifier = ">=0.8.11,<2.0.0" },
{ name = "python-dotenv" },
{ name = "python-dotenv", marker = "extra == 'api'" },
@@ -2766,8 +2848,8 @@ requires-dist = [
{ name = "qdrant-client", marker = "extra == 'offline-storage'", specifier = ">=1.11.0,<2.0.0" },
{ name = "ragas", marker = "extra == 'evaluation'", specifier = ">=0.3.7" },
{ name = "redis", marker = "extra == 'offline-storage'", specifier = ">=5.0.0,<8.0.0" },
{ name = "ruff", marker = "extra == 'evaluation'" },
{ name = "ruff", marker = "extra == 'pytest'" },
{ name = "ruff", marker = "extra == 'test'" },
{ name = "setuptools" },
{ name = "setuptools", marker = "extra == 'api'" },
{ name = "tenacity" },
@@ -2780,7 +2862,7 @@ requires-dist = [
{ name = "xlsxwriter", marker = "extra == 'api'", specifier = ">=3.1.0" },
{ name = "zhipuai", marker = "extra == 'offline-llm'", specifier = ">=2.0.0,<3.0.0" },
]
provides-extras = ["pytest", "api", "docling", "offline-storage", "offline-llm", "offline", "evaluation", "observability"]
provides-extras = ["pytest", "api", "docling", "offline-storage", "offline-llm", "offline", "test", "evaluation", "observability"]
[[package]]
name = "llama-cloud"