Add extensive test suites for API routes and utilities:
- Implement test_search_routes.py (406 lines) for search endpoint validation
- Implement test_upload_routes.py (724 lines) for document upload workflows
- Implement test_s3_client.py (618 lines) for S3 storage operations
- Implement test_citation_utils.py (352 lines) for citation extraction
- Implement test_chunking.py (216 lines) for text chunking validation

Add S3 storage client implementation:
- Create lightrag/storage/s3_client.py with S3 operations
- Add storage module initialization with exports
- Integrate S3 client with document upload handling

Enhance API routes and core functionality:
- Add search_routes.py with full-text and graph search endpoints
- Add upload_routes.py with multipart document upload support
- Update operate.py with bulk operations and health checks
- Enhance postgres_impl.py with bulk upsert and parameterized queries
- Update lightrag_server.py to register new API routes
- Improve utils.py with citation and formatting utilities

Update dependencies and configuration:
- Add S3 and test dependencies to pyproject.toml
- Update docker-compose.test.yml for testing environment
- Sync uv.lock with new dependencies

Apply code quality improvements across all modified files:
- Add type hints to function signatures
- Update imports and router initialization
- Fix logging and error handling
"""Tests for S3 client functionality in lightrag/storage/s3_client.py.
|
|
|
|
This module tests S3 operations by mocking the aioboto3 session layer,
|
|
avoiding the moto/aiobotocore async incompatibility issue.
|
|
"""
|
|
|
|
from contextlib import asynccontextmanager
|
|
from io import BytesIO
|
|
from unittest.mock import AsyncMock, MagicMock, patch
|
|
|
|
import pytest
|
|
|
|
# Note: The S3Client in lightrag uses aioboto3 which requires proper async mocking
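#
# For reference, the production client is assumed to follow the standard
# aioboto3 pattern (a sketch of the assumed call shape, not a verbatim copy
# of lightrag/storage/s3_client.py):
#
#     session = aioboto3.Session()
#     async with session.client('s3', endpoint_url=..., region_name=...) as s3:
#         await s3.put_object(Bucket=..., Key=..., Body=...)
#
# The fixtures below therefore mock the session layer: session.client(...)
# returns an async context manager yielding a MagicMock whose operations are
# AsyncMocks backed by a small in-memory object store.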


@pytest.fixture
def aws_credentials(monkeypatch):
    """Set mock AWS credentials."""
    monkeypatch.setenv('AWS_ACCESS_KEY_ID', 'testing')
    monkeypatch.setenv('AWS_SECRET_ACCESS_KEY', 'testing')
    monkeypatch.setenv('AWS_DEFAULT_REGION', 'us-east-1')
    monkeypatch.setenv('S3_ACCESS_KEY_ID', 'testing')
    monkeypatch.setenv('S3_SECRET_ACCESS_KEY', 'testing')
    monkeypatch.setenv('S3_BUCKET_NAME', 'test-bucket')
    monkeypatch.setenv('S3_REGION', 'us-east-1')
    monkeypatch.setenv('S3_ENDPOINT_URL', '')


@pytest.fixture
def s3_config(aws_credentials):
    """Create S3Config for testing."""
    from lightrag.storage.s3_client import S3Config

    return S3Config(
        endpoint_url='',
        access_key_id='testing',
        secret_access_key='testing',
        bucket_name='test-bucket',
        region='us-east-1',
    )


def create_mock_s3_client():
    """Create a mock S3 client with common operations."""
    mock_client = MagicMock()

    # Storage for mock objects
    mock_client._objects = {}

    # head_bucket - succeeds (bucket exists)
    mock_client.head_bucket = AsyncMock(return_value={})

    # put_object
    async def mock_put_object(**kwargs):
        key = kwargs['Key']
        body = kwargs['Body']
        metadata = kwargs.get('Metadata', {})
        content_type = kwargs.get('ContentType', 'application/octet-stream')

        # Read body if it's a file-like object
        if hasattr(body, 'read'):
            content = body.read()
        else:
            content = body

        mock_client._objects[key] = {
            'Body': content,
            'Metadata': metadata,
            'ContentType': content_type,
        }
        return {'ETag': '"mock-etag"'}

    mock_client.put_object = AsyncMock(side_effect=mock_put_object)

    # get_object
    async def mock_get_object(**kwargs):
        key = kwargs['Key']
        if key not in mock_client._objects:
            from botocore.exceptions import ClientError
            raise ClientError(
                {'Error': {'Code': 'NoSuchKey', 'Message': 'Not found'}},
                'GetObject'
            )

        obj = mock_client._objects[key]
        body_mock = MagicMock()
        body_mock.read = AsyncMock(return_value=obj['Body'])

        return {
            'Body': body_mock,
            'Metadata': obj['Metadata'],
            'ContentType': obj['ContentType'],
        }

    mock_client.get_object = AsyncMock(side_effect=mock_get_object)

    # head_object (for object_exists)
    async def mock_head_object(**kwargs):
        key = kwargs['Key']
        if key not in mock_client._objects:
            from botocore.exceptions import ClientError
            raise ClientError(
                {'Error': {'Code': '404', 'Message': 'Not found'}},
                'HeadObject'
            )
        return {'ContentLength': len(mock_client._objects[key]['Body'])}

    mock_client.head_object = AsyncMock(side_effect=mock_head_object)

    # delete_object
    async def mock_delete_object(**kwargs):
        key = kwargs['Key']
        if key in mock_client._objects:
            del mock_client._objects[key]
        return {}

    mock_client.delete_object = AsyncMock(side_effect=mock_delete_object)

    # copy_object
    async def mock_copy_object(**kwargs):
        source = kwargs['CopySource']
        dest_key = kwargs['Key']

        # CopySource is like {'Bucket': 'bucket', 'Key': 'key'}
        source_key = source['Key']

        if source_key not in mock_client._objects:
            from botocore.exceptions import ClientError
            raise ClientError(
                {'Error': {'Code': 'NoSuchKey', 'Message': 'Not found'}},
                'CopyObject'
            )

        mock_client._objects[dest_key] = mock_client._objects[source_key].copy()
        return {}

    mock_client.copy_object = AsyncMock(side_effect=mock_copy_object)

    # list_objects_v2
    async def mock_list_objects_v2(**kwargs):
        prefix = kwargs.get('Prefix', '')
        contents = []

        for key, obj in mock_client._objects.items():
            if key.startswith(prefix):
                contents.append({
                    'Key': key,
                    'Size': len(obj['Body']),
                    'LastModified': '2024-01-01T00:00:00Z',
                })

        return {'Contents': contents} if contents else {}

    mock_client.list_objects_v2 = AsyncMock(side_effect=mock_list_objects_v2)

    # get_paginator for list_staging - returns async paginator
    class MockPaginator:
        def __init__(self, objects_dict):
            self._objects = objects_dict

        def paginate(self, **kwargs):
            return MockPaginatorIterator(self._objects, kwargs.get('Prefix', ''))

    class MockPaginatorIterator:
        def __init__(self, objects_dict, prefix):
            self._objects = objects_dict
            self._prefix = prefix
            self._done = False

        def __aiter__(self):
            return self

        async def __anext__(self):
            if self._done:
                raise StopAsyncIteration

            self._done = True
            from datetime import datetime
            contents = []
            for key, obj in self._objects.items():
                if key.startswith(self._prefix):
                    contents.append({
                        'Key': key,
                        'Size': len(obj['Body']),
                        'LastModified': datetime(2024, 1, 1),
                    })
            return {'Contents': contents} if contents else {}

    def mock_get_paginator(operation_name):
        return MockPaginator(mock_client._objects)

    mock_client.get_paginator = MagicMock(side_effect=mock_get_paginator)

    # generate_presigned_url - the code awaits this, so return an awaitable
    async def mock_generate_presigned_url(ClientMethod, Params, ExpiresIn=3600):
        key = Params.get('Key', 'unknown')
        bucket = Params.get('Bucket', 'bucket')
        return f'https://{bucket}.s3.amazonaws.com/{key}?signature=mock'

    mock_client.generate_presigned_url = mock_generate_presigned_url

    return mock_client


@pytest.fixture
def mock_s3_session():
    """Create a mock aioboto3 session that returns a mock S3 client."""
    mock_session = MagicMock()
    mock_client = create_mock_s3_client()

    @asynccontextmanager
    async def mock_client_context(*args, **kwargs):
        yield mock_client

    # Return a NEW context manager each time client() is called
    mock_session.client = MagicMock(side_effect=lambda *args, **kwargs: mock_client_context())

    return mock_session, mock_client
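

# Typical usage in the tests below (mirrors TestS3ClientOperations): patch
# S3ClientManager.get_session so the client under test receives the mock
# session, e.g.
#
#     mock_session, mock_client = mock_s3_session
#     with patch.object(S3ClientManager, 'get_session', return_value=mock_session):
#         client = S3Client(config=s3_config)
#         await client.initialize()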


# ============================================================================
# Unit Tests for Key Generation (no mocking needed)
# ============================================================================


class TestKeyGeneration:
    """Tests for S3 key generation methods."""

    @pytest.mark.offline
    def test_make_staging_key(self, s3_config):
        """Test staging key format."""
        from lightrag.storage.s3_client import S3Client

        client = S3Client(config=s3_config)
        key = client._make_staging_key('default', 'doc123', 'report.pdf')
        assert key == 'staging/default/doc123/report.pdf'

    @pytest.mark.offline
    def test_make_staging_key_sanitizes_slashes(self, s3_config):
        """Test that slashes in filename are sanitized."""
        from lightrag.storage.s3_client import S3Client

        client = S3Client(config=s3_config)
        key = client._make_staging_key('default', 'doc123', 'path/to/file.pdf')
        assert key == 'staging/default/doc123/path_to_file.pdf'
        assert '//' not in key

    @pytest.mark.offline
    def test_make_staging_key_sanitizes_backslashes(self, s3_config):
        """Test that backslashes in filename are sanitized."""
        from lightrag.storage.s3_client import S3Client

        client = S3Client(config=s3_config)
        key = client._make_staging_key('default', 'doc123', 'path\\to\\file.pdf')
        assert key == 'staging/default/doc123/path_to_file.pdf'

    @pytest.mark.offline
    def test_make_archive_key(self, s3_config):
        """Test archive key format."""
        from lightrag.storage.s3_client import S3Client

        client = S3Client(config=s3_config)
        key = client._make_archive_key('workspace1', 'doc456', 'data.json')
        assert key == 'archive/workspace1/doc456/data.json'

    @pytest.mark.offline
    def test_staging_to_archive_key(self, s3_config):
        """Test staging to archive key transformation."""
        from lightrag.storage.s3_client import S3Client

        client = S3Client(config=s3_config)
        staging_key = 'staging/default/doc123/report.pdf'
        archive_key = client._staging_to_archive_key(staging_key)
        assert archive_key == 'archive/default/doc123/report.pdf'

    @pytest.mark.offline
    def test_staging_to_archive_key_non_staging(self, s3_config):
        """Test that non-staging keys are returned unchanged."""
        from lightrag.storage.s3_client import S3Client

        client = S3Client(config=s3_config)
        key = 'archive/default/doc123/report.pdf'
        result = client._staging_to_archive_key(key)
        assert result == key

    @pytest.mark.offline
    def test_get_s3_url(self, s3_config):
        """Test S3 URL generation."""
        from lightrag.storage.s3_client import S3Client

        client = S3Client(config=s3_config)
        url = client.get_s3_url('archive/default/doc123/report.pdf')
        assert url == 's3://test-bucket/archive/default/doc123/report.pdf'


# ============================================================================
# Integration Tests with Mocked S3 Session
# ============================================================================


@pytest.mark.offline
class TestS3ClientOperations:
    """Tests for S3 client operations using mocked session."""

    @pytest.mark.asyncio
    async def test_initialize_creates_bucket(self, s3_config, mock_s3_session):
        """Test that initialize checks bucket exists."""
        from lightrag.storage.s3_client import S3Client, S3ClientManager

        mock_session, mock_client = mock_s3_session

        with patch.object(S3ClientManager, 'get_session', return_value=mock_session):
            client = S3Client(config=s3_config)
            await client.initialize()

            assert client._initialized is True
            mock_client.head_bucket.assert_called_once()

            await client.finalize()

    @pytest.mark.asyncio
    async def test_upload_to_staging(self, s3_config, mock_s3_session):
        """Test uploading content to staging."""
        from lightrag.storage.s3_client import S3Client, S3ClientManager

        mock_session, mock_client = mock_s3_session

        with patch.object(S3ClientManager, 'get_session', return_value=mock_session):
            client = S3Client(config=s3_config)
            await client.initialize()

            s3_key = await client.upload_to_staging(
                workspace='default',
                doc_id='doc123',
                content=b'Hello, World!',
                filename='test.txt',
                content_type='text/plain',
            )

            assert s3_key == 'staging/default/doc123/test.txt'
            mock_client.put_object.assert_called_once()

            await client.finalize()

    @pytest.mark.asyncio
    async def test_upload_string_content(self, s3_config, mock_s3_session):
        """Test uploading string content (should be encoded to bytes)."""
        from lightrag.storage.s3_client import S3Client, S3ClientManager

        mock_session, mock_client = mock_s3_session

        with patch.object(S3ClientManager, 'get_session', return_value=mock_session):
            client = S3Client(config=s3_config)
            await client.initialize()

            s3_key = await client.upload_to_staging(
                workspace='default',
                doc_id='doc123',
                content='String content',  # String, not bytes
                filename='test.txt',
            )

            # Verify we can retrieve it
            content, metadata = await client.get_object(s3_key)
            assert content == b'String content'

            await client.finalize()

    @pytest.mark.asyncio
    async def test_get_object(self, s3_config, mock_s3_session):
        """Test retrieving uploaded object."""
        from lightrag.storage.s3_client import S3Client, S3ClientManager

        mock_session, mock_client = mock_s3_session

        with patch.object(S3ClientManager, 'get_session', return_value=mock_session):
            client = S3Client(config=s3_config)
            await client.initialize()

            # Upload
            test_content = b'Test content for retrieval'
            s3_key = await client.upload_to_staging(
                workspace='default',
                doc_id='doc123',
                content=test_content,
                filename='test.txt',
            )

            # Retrieve
            content, metadata = await client.get_object(s3_key)

            assert content == test_content
            assert metadata.get('workspace') == 'default'
            assert metadata.get('doc_id') == 'doc123'
            assert 'content_hash' in metadata

            await client.finalize()

    @pytest.mark.asyncio
    async def test_move_to_archive(self, s3_config, mock_s3_session):
        """Test moving object from staging to archive."""
        from lightrag.storage.s3_client import S3Client, S3ClientManager

        mock_session, mock_client = mock_s3_session

        with patch.object(S3ClientManager, 'get_session', return_value=mock_session):
            client = S3Client(config=s3_config)
            await client.initialize()

            # Upload to staging
            test_content = b'Content to archive'
            staging_key = await client.upload_to_staging(
                workspace='default',
                doc_id='doc123',
                content=test_content,
                filename='test.txt',
            )

            # Move to archive
            archive_key = await client.move_to_archive(staging_key)

            assert archive_key == 'archive/default/doc123/test.txt'

            # Verify staging key no longer exists
            assert not await client.object_exists(staging_key)

            # Verify archive key exists and has correct content
            assert await client.object_exists(archive_key)
            content, _ = await client.get_object(archive_key)
            assert content == test_content

            await client.finalize()

    @pytest.mark.asyncio
    async def test_delete_object(self, s3_config, mock_s3_session):
        """Test deleting an object."""
        from lightrag.storage.s3_client import S3Client, S3ClientManager

        mock_session, mock_client = mock_s3_session

        with patch.object(S3ClientManager, 'get_session', return_value=mock_session):
            client = S3Client(config=s3_config)
            await client.initialize()

            # Upload
            s3_key = await client.upload_to_staging(
                workspace='default',
                doc_id='doc123',
                content=b'Content to delete',
                filename='test.txt',
            )

            assert await client.object_exists(s3_key)

            # Delete
            await client.delete_object(s3_key)

            # Verify deleted
            assert not await client.object_exists(s3_key)

            await client.finalize()

    @pytest.mark.asyncio
    async def test_list_staging(self, s3_config, mock_s3_session):
        """Test listing objects in staging."""
        from lightrag.storage.s3_client import S3Client, S3ClientManager

        mock_session, mock_client = mock_s3_session

        with patch.object(S3ClientManager, 'get_session', return_value=mock_session):
            client = S3Client(config=s3_config)
            await client.initialize()

            # Upload multiple objects
            await client.upload_to_staging('default', 'doc1', b'Content 1', 'file1.txt')
            await client.upload_to_staging('default', 'doc2', b'Content 2', 'file2.txt')
            await client.upload_to_staging('other', 'doc3', b'Content 3', 'file3.txt')

            # List only 'default' workspace
            objects = await client.list_staging('default')

            assert len(objects) == 2
            keys = [obj['key'] for obj in objects]
            assert 'staging/default/doc1/file1.txt' in keys
            assert 'staging/default/doc2/file2.txt' in keys
            assert 'staging/other/doc3/file3.txt' not in keys

            await client.finalize()

    @pytest.mark.asyncio
    async def test_object_exists_true(self, s3_config, mock_s3_session):
        """Test object_exists returns True for existing object."""
        from lightrag.storage.s3_client import S3Client, S3ClientManager

        mock_session, mock_client = mock_s3_session

        with patch.object(S3ClientManager, 'get_session', return_value=mock_session):
            client = S3Client(config=s3_config)
            await client.initialize()

            s3_key = await client.upload_to_staging(
                workspace='default',
                doc_id='doc123',
                content=b'Test',
                filename='test.txt',
            )

            assert await client.object_exists(s3_key) is True

            await client.finalize()

    @pytest.mark.asyncio
    async def test_object_exists_false(self, s3_config, mock_s3_session):
        """Test object_exists returns False for non-existing object."""
        from lightrag.storage.s3_client import S3Client, S3ClientManager

        mock_session, mock_client = mock_s3_session

        with patch.object(S3ClientManager, 'get_session', return_value=mock_session):
            client = S3Client(config=s3_config)
            await client.initialize()

            assert await client.object_exists('nonexistent/key') is False

            await client.finalize()

    @pytest.mark.asyncio
    async def test_get_presigned_url(self, s3_config, mock_s3_session):
        """Test generating presigned URL."""
        from lightrag.storage.s3_client import S3Client, S3ClientManager

        mock_session, mock_client = mock_s3_session

        with patch.object(S3ClientManager, 'get_session', return_value=mock_session):
            client = S3Client(config=s3_config)
            await client.initialize()

            s3_key = await client.upload_to_staging(
                workspace='default',
                doc_id='doc123',
                content=b'Test',
                filename='test.txt',
            )

            url = await client.get_presigned_url(s3_key)

            # URL should be a string containing the bucket
            assert isinstance(url, str)
            assert 'test-bucket' in url

            await client.finalize()

    @pytest.mark.asyncio
    async def test_upload_with_metadata(self, s3_config, mock_s3_session):
        """Test uploading with custom metadata."""
        from lightrag.storage.s3_client import S3Client, S3ClientManager

        mock_session, mock_client = mock_s3_session

        with patch.object(S3ClientManager, 'get_session', return_value=mock_session):
            client = S3Client(config=s3_config)
            await client.initialize()

            custom_metadata = {'author': 'test-user', 'version': '1.0'}

            s3_key = await client.upload_to_staging(
                workspace='default',
                doc_id='doc123',
                content=b'Test',
                filename='test.txt',
                metadata=custom_metadata,
            )

            _, metadata = await client.get_object(s3_key)

            # Custom metadata should be included
            assert metadata.get('author') == 'test-user'
            assert metadata.get('version') == '1.0'
            # Built-in metadata should also be present
            assert metadata.get('workspace') == 'default'

            await client.finalize()


# ============================================================================
# S3Config Tests
# ============================================================================


class TestS3Config:
    """Tests for S3Config validation."""

    @pytest.mark.offline
    def test_config_requires_credentials(self, monkeypatch):
        """Test that S3Config raises error without credentials."""
        from lightrag.storage.s3_client import S3Config

        monkeypatch.setenv('S3_ACCESS_KEY_ID', '')
        monkeypatch.setenv('S3_SECRET_ACCESS_KEY', '')

        with pytest.raises(ValueError, match='S3_ACCESS_KEY_ID and S3_SECRET_ACCESS_KEY must be set'):
            S3Config(
                access_key_id='',
                secret_access_key='',
            )

    @pytest.mark.offline
    def test_config_with_valid_credentials(self, aws_credentials):
        """Test that S3Config initializes with valid credentials."""
        from lightrag.storage.s3_client import S3Config

        config = S3Config(
            access_key_id='valid-key',
            secret_access_key='valid-secret',
        )

        assert config.access_key_id == 'valid-key'
        assert config.secret_access_key == 'valid-secret'