test: fix env handling, add type hints, improve docs
Improve code quality and test robustness:

- Refactor environment variable parsing in rerank config using centralized get_env_value helper
- Add return type hints to all test methods for better type safety
- Fix patch path in test from lightrag.utils to lightrag.rerank for correct import location
- Clarify batch insert endpoint behavior regarding duplicate content rejection
- Expand .dockerignore to comprehensively exclude node_modules (200MB+), Python cache files, and venv directories
- Update dependency groups: align evaluation and test extras with pytest/pre-commit/ruff tools

parent 9bae6267f6
commit c5f230a30c
6 changed files with 121 additions and 29 deletions
.dockerignore
@@ -1,9 +1,17 @@
# Node.js - THE BIG ONE (200MB+)
**/node_modules

# Python-related files and directories
__pycache__
**/__pycache__
*.pyc
*.pyo
.cache

# Virtual environment directories
*.venv
.venv
venv/

# Env
env/
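A note on the patterns above: in a .dockerignore file, a bare `node_modules` entry only matches at the root of the build context, while `**/node_modules` (and likewise `**/__pycache__`) matches the directory at any depth, which is what makes the exclusion of nested packages comprehensive.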
@@ -1036,15 +1036,12 @@ def create_app(args):
        # Add Cohere-specific parameters if using cohere binding
        if args.rerank_binding == "cohere":
            # Enable chunking if configured (useful for models with token limits like ColBERT)
-            kwargs["enable_chunking"] = (
-                os.getenv("RERANK_ENABLE_CHUNKING", "false").lower() == "true"
-            )
-            try:
-                kwargs["max_tokens_per_doc"] = int(
-                    os.getenv("RERANK_MAX_TOKENS_PER_DOC", "4096")
-                )
-            except ValueError:
-                kwargs["max_tokens_per_doc"] = 4096
+            kwargs["enable_chunking"] = get_env_value(
+                "RERANK_ENABLE_CHUNKING", False, bool
+            )
+            kwargs["max_tokens_per_doc"] = get_env_value(
+                "RERANK_MAX_TOKENS_PER_DOC", 4096, int
+            )

        return await selected_rerank_func(**kwargs, extra_body=extra_body)
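For context on the refactor above: the call sites now delegate parsing and fallback to one helper. Below is a minimal sketch of what a get_env_value-style helper could look like, written to match the `(name, default, type)` call signature used in the diff; the actual implementation in lightrag is not shown in this commit and may differ, e.g. in which strings it accepts as true.

```python
import os
from typing import Callable, TypeVar

T = TypeVar("T")


def get_env_value(name: str, default: T, value_type: Callable[[str], T] = str) -> T:
    """Sketch only: read an env var and coerce it to value_type.

    Returns `default` when the variable is unset or fails to parse,
    centralizing the old inline try/except ValueError fallback.
    """
    raw = os.environ.get(name)
    if raw is None:
        return default
    if value_type is bool:
        # bool("false") is truthy, so booleans need explicit string matching
        return raw.strip().lower() in ("true", "1", "yes", "on")  # type: ignore[return-value]
    try:
        return value_type(raw)
    except (TypeError, ValueError):
        return default
```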
@@ -2232,6 +2232,10 @@ def create_document_routes(
        This endpoint allows you to insert multiple text entries into the RAG system
        in a single request.

+        Note:
+            If any text content or file_source already exists in the system,
+            the entire batch will be rejected with status "duplicated".
+
        Args:
            request (InsertTextsRequest): The request body containing the list of texts.
            background_tasks: FastAPI BackgroundTasks for async processing
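To make the documented all-or-nothing behavior concrete, here is a hypothetical client call; the endpoint path, port, and payload field are assumptions inferred from the docstring and InsertTextsRequest, not verified against the running server.

```python
import requests

# Hypothetical payload: one new text plus one text inserted previously.
resp = requests.post(
    "http://localhost:9621/documents/texts",  # assumed default host/port/route
    json={"texts": ["a brand new document", "a text that already exists"]},
)
# Per the Note above: the duplicate causes the WHOLE batch to be rejected
# with status "duplicated"; the new text is not inserted either.
print(resp.json())
```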
@@ -14,7 +14,7 @@ from lightrag.rerank import chunk_documents_for_rerank
class TestOverlapValidation:
    """Test suite for overlap_tokens validation"""

-    def test_overlap_greater_than_max_tokens(self):
+    def test_overlap_greater_than_max_tokens(self) -> None:
        """Test that overlap_tokens > max_tokens is clamped and doesn't hang"""
        documents = [" ".join([f"word{i}" for i in range(100)])]
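One reason the `-> None` annotations in this file are more than cosmetic: by default mypy skips the bodies of completely unannotated functions, so adding a return type is what actually turns type checking on inside each test. A minimal illustration (default mypy settings, without --check-untyped-defs):

```python
def untyped_test():
    count: int = "oops"  # body not checked: mypy reports nothing here


def typed_test() -> None:
    count: int = "oops"  # error: Incompatible types in assignment
```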
@@ -27,7 +27,7 @@ class TestOverlapValidation:
        assert len(chunked_docs) > 0
        assert all(idx == 0 for idx in doc_indices)

-    def test_overlap_equal_to_max_tokens(self):
+    def test_overlap_equal_to_max_tokens(self) -> None:
        """Test that overlap_tokens == max_tokens is clamped and doesn't hang"""
        documents = [" ".join([f"word{i}" for i in range(100)])]
@@ -40,7 +40,7 @@ class TestOverlapValidation:
        assert len(chunked_docs) > 0
        assert all(idx == 0 for idx in doc_indices)

-    def test_overlap_slightly_less_than_max_tokens(self):
+    def test_overlap_slightly_less_than_max_tokens(self) -> None:
        """Test that overlap_tokens < max_tokens works normally"""
        documents = [" ".join([f"word{i}" for i in range(100)])]
@@ -53,7 +53,7 @@ class TestOverlapValidation:
        assert len(chunked_docs) > 0
        assert all(idx == 0 for idx in doc_indices)

-    def test_small_max_tokens_with_large_overlap(self):
+    def test_small_max_tokens_with_large_overlap(self) -> None:
        """Test edge case with very small max_tokens"""
        documents = [" ".join([f"word{i}" for i in range(50)])]
@@ -66,7 +66,7 @@ class TestOverlapValidation:
        assert len(chunked_docs) > 0
        assert all(idx == 0 for idx in doc_indices)

-    def test_multiple_documents_with_invalid_overlap(self):
+    def test_multiple_documents_with_invalid_overlap(self) -> None:
        """Test multiple documents with overlap_tokens >= max_tokens"""
        documents = [
            " ".join([f"word{i}" for i in range(50)]),
@@ -84,7 +84,7 @@ class TestOverlapValidation:
        # Short document should not be chunked
        assert "short document" in chunked_docs

-    def test_normal_operation_unaffected(self):
+    def test_normal_operation_unaffected(self) -> None:
        """Test that normal cases continue to work correctly"""
        documents = [
            " ".join([f"word{i}" for i in range(100)]),
@@ -99,10 +99,11 @@ class TestOverlapValidation:
        # Long document should be chunked, short one should not
        assert len(chunked_docs) > 2  # At least 3 chunks (2 from long doc + 1 short)
        assert "short doc" in chunked_docs
-        # Verify doc_indices maps correctly
-        assert doc_indices[-1] == 1  # Last chunk is from second document
+        # Verify doc_indices maps "short doc" to document index 1
+        short_doc_idx = chunked_docs.index("short doc")
+        assert doc_indices[short_doc_idx] == 1

-    def test_edge_case_max_tokens_one(self):
+    def test_edge_case_max_tokens_one(self) -> None:
        """Test edge case where max_tokens=1"""
        documents = [" ".join([f"word{i}" for i in range(20)])]
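A brief note on the assertion rewrite above: `assert doc_indices[-1] == 1` relied on the short document's chunk happening to come last, while `chunked_docs.index("short doc")` looks the chunk up by content, so the mapping check stays valid even if chunk ordering changes.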
@@ -40,7 +40,7 @@ class TestChunkDocumentsForRerank:
        long_doc = "a" * 2000  # 2000 characters
        documents = [long_doc, "short doc"]

-        with patch("lightrag.utils.TiktokenTokenizer", side_effect=ImportError):
+        with patch("lightrag.rerank.TiktokenTokenizer", side_effect=ImportError):
            chunked_docs, doc_indices = chunk_documents_for_rerank(
                documents,
                max_tokens=100,  # 100 tokens = ~400 chars
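The corrected patch target follows unittest.mock's rule that you patch a name where it is looked up, not where it is defined. A minimal sketch of the principle, assuming (as the fix implies) that lightrag/rerank.py imports TiktokenTokenizer from lightrag.utils:

```python
from unittest.mock import patch

# lightrag/rerank.py presumably does:
#     from lightrag.utils import TiktokenTokenizer
# which binds the name into the lightrag.rerank namespace at import time.
# Patching "lightrag.utils.TiktokenTokenizer" rebinds only the attribute on
# lightrag.utils; code inside lightrag.rerank keeps its own reference and
# never sees the mock. Patching the consumer's namespace does work:
with patch("lightrag.rerank.TiktokenTokenizer", side_effect=ImportError):
    pass  # chunk_documents_for_rerank now hits ImportError and falls back
          # to the character-based estimate (~4 chars per token) in the test
```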
uv.lock (generated file, 104 changed lines)
@@ -2598,13 +2598,48 @@ docling = [
    { name = "docling", marker = "sys_platform != 'darwin'" },
]
evaluation = [
    { name = "aiofiles" },
    { name = "aiohttp" },
    { name = "ascii-colors" },
    { name = "asyncpg" },
    { name = "bcrypt" },
    { name = "configparser" },
    { name = "datasets" },
    { name = "distro" },
    { name = "fastapi" },
    { name = "google-api-core" },
    { name = "google-genai" },
    { name = "gunicorn" },
    { name = "httpcore" },
    { name = "httpx" },
    { name = "pre-commit" },
    { name = "pytest" },
    { name = "pytest-asyncio" },
    { name = "jiter" },
    { name = "json-repair" },
    { name = "nano-vectordb" },
    { name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
    { name = "networkx", version = "3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
    { name = "numpy" },
    { name = "openai" },
    { name = "openpyxl" },
    { name = "pandas" },
    { name = "pipmaster" },
    { name = "psutil" },
    { name = "pycryptodome" },
    { name = "pydantic" },
    { name = "pyjwt" },
    { name = "pypdf" },
    { name = "pypinyin" },
    { name = "python-docx" },
    { name = "python-dotenv" },
    { name = "python-jose", extra = ["cryptography"] },
    { name = "python-multipart" },
    { name = "python-pptx" },
    { name = "pytz" },
    { name = "ragas" },
    { name = "ruff" },
    { name = "setuptools" },
    { name = "tenacity" },
    { name = "tiktoken" },
    { name = "uvicorn" },
    { name = "xlsxwriter" },
]
observability = [
    { name = "langfuse" },
@@ -2691,6 +2726,52 @@ pytest = [
    { name = "pytest-asyncio" },
    { name = "ruff" },
]
+test = [
+    { name = "aiofiles" },
+    { name = "aiohttp" },
+    { name = "ascii-colors" },
+    { name = "asyncpg" },
+    { name = "bcrypt" },
+    { name = "configparser" },
+    { name = "distro" },
+    { name = "fastapi" },
+    { name = "google-api-core" },
+    { name = "google-genai" },
+    { name = "gunicorn" },
+    { name = "httpcore" },
+    { name = "httpx" },
+    { name = "jiter" },
+    { name = "json-repair" },
+    { name = "nano-vectordb" },
+    { name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
+    { name = "networkx", version = "3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+    { name = "numpy" },
+    { name = "openai" },
+    { name = "openpyxl" },
+    { name = "pandas" },
+    { name = "pipmaster" },
+    { name = "pre-commit" },
+    { name = "psutil" },
+    { name = "pycryptodome" },
+    { name = "pydantic" },
+    { name = "pyjwt" },
+    { name = "pypdf" },
+    { name = "pypinyin" },
+    { name = "pytest" },
+    { name = "pytest-asyncio" },
+    { name = "python-docx" },
+    { name = "python-dotenv" },
+    { name = "python-jose", extra = ["cryptography"] },
+    { name = "python-multipart" },
+    { name = "python-pptx" },
+    { name = "pytz" },
+    { name = "ruff" },
+    { name = "setuptools" },
+    { name = "tenacity" },
+    { name = "tiktoken" },
+    { name = "uvicorn" },
+    { name = "xlsxwriter" },
+]

[package.metadata]
requires-dist = [
@@ -2717,12 +2798,13 @@ requires-dist = [
    { name = "google-genai", marker = "extra == 'offline-llm'", specifier = ">=1.0.0,<2.0.0" },
    { name = "gunicorn", marker = "extra == 'api'" },
    { name = "httpcore", marker = "extra == 'api'" },
    { name = "httpx", marker = "extra == 'api'" },
-    { name = "httpx", marker = "extra == 'evaluation'", specifier = ">=0.28.1" },
+    { name = "httpx", marker = "extra == 'api'", specifier = ">=0.28.1" },
    { name = "jiter", marker = "extra == 'api'" },
    { name = "json-repair" },
    { name = "json-repair", marker = "extra == 'api'" },
    { name = "langfuse", marker = "extra == 'observability'", specifier = ">=3.8.1" },
    { name = "lightrag-hku", extras = ["api"], marker = "extra == 'evaluation'" },
+    { name = "lightrag-hku", extras = ["api"], marker = "extra == 'test'" },
    { name = "lightrag-hku", extras = ["api", "offline-llm", "offline-storage"], marker = "extra == 'offline'" },
    { name = "llama-index", marker = "extra == 'offline-llm'", specifier = ">=0.9.0,<1.0.0" },
    { name = "nano-vectordb" },
@@ -2740,8 +2822,8 @@ requires-dist = [
    { name = "pandas", marker = "extra == 'api'", specifier = ">=2.0.0,<2.4.0" },
    { name = "pipmaster" },
    { name = "pipmaster", marker = "extra == 'api'" },
    { name = "pre-commit", marker = "extra == 'evaluation'" },
    { name = "pre-commit", marker = "extra == 'pytest'" },
+    { name = "pre-commit", marker = "extra == 'test'" },
    { name = "psutil", marker = "extra == 'api'" },
    { name = "pycryptodome", marker = "extra == 'api'", specifier = ">=3.0.0,<4.0.0" },
    { name = "pydantic" },
@@ -2752,10 +2834,10 @@ requires-dist = [
    { name = "pypdf", marker = "extra == 'api'", specifier = ">=6.1.0" },
    { name = "pypinyin" },
    { name = "pypinyin", marker = "extra == 'api'" },
    { name = "pytest", marker = "extra == 'evaluation'", specifier = ">=8.4.2" },
    { name = "pytest", marker = "extra == 'pytest'", specifier = ">=8.4.2" },
+    { name = "pytest", marker = "extra == 'test'", specifier = ">=8.4.2" },
    { name = "pytest-asyncio", marker = "extra == 'evaluation'", specifier = ">=1.2.0" },
    { name = "pytest-asyncio", marker = "extra == 'pytest'", specifier = ">=1.2.0" },
+    { name = "pytest-asyncio", marker = "extra == 'test'", specifier = ">=1.2.0" },
    { name = "python-docx", marker = "extra == 'api'", specifier = ">=0.8.11,<2.0.0" },
    { name = "python-dotenv" },
    { name = "python-dotenv", marker = "extra == 'api'" },
|
|||
{ name = "qdrant-client", marker = "extra == 'offline-storage'", specifier = ">=1.11.0,<2.0.0" },
|
||||
{ name = "ragas", marker = "extra == 'evaluation'", specifier = ">=0.3.7" },
|
||||
{ name = "redis", marker = "extra == 'offline-storage'", specifier = ">=5.0.0,<8.0.0" },
|
||||
{ name = "ruff", marker = "extra == 'evaluation'" },
|
||||
{ name = "ruff", marker = "extra == 'pytest'" },
|
||||
{ name = "ruff", marker = "extra == 'test'" },
|
||||
{ name = "setuptools" },
|
||||
{ name = "setuptools", marker = "extra == 'api'" },
|
||||
{ name = "tenacity" },
|
||||
|
|
@@ -2780,7 +2862,7 @@ requires-dist = [
    { name = "xlsxwriter", marker = "extra == 'api'", specifier = ">=3.1.0" },
    { name = "zhipuai", marker = "extra == 'offline-llm'", specifier = ">=2.0.0,<3.0.0" },
]
-provides-extras = ["pytest", "api", "docling", "offline-storage", "offline-llm", "offline", "evaluation", "observability"]
+provides-extras = ["pytest", "api", "docling", "offline-storage", "offline-llm", "offline", "test", "evaluation", "observability"]

[[package]]
name = "llama-cloud"
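With the new test extra now listed in provides-extras, the test toolchain (pytest, pytest-asyncio, pre-commit, ruff) can be pulled in alongside the API dependencies, e.g. `pip install "lightrag-hku[test]"` or `uv sync --extra test`; the exact command depends on how the project is consumed, so treat these as illustrative.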