added suggestions
This commit is contained in:
parent
d0f2a377ef
commit
bf15790be8
7 changed files with 244 additions and 27 deletions
|
|
@ -1,4 +1,5 @@
|
|||
from uuid import UUID
|
||||
from pathlib import Path
|
||||
from typing import Union, BinaryIO, List, Optional
|
||||
|
||||
from cognee.modules.pipelines import Task
|
||||
|
|
@ -8,7 +9,7 @@ from cognee.tasks.ingestion import ingest_data, resolve_data_directories
|
|||
|
||||
|
||||
async def add(
|
||||
data: Union[BinaryIO, list[BinaryIO], str, list[str]],
|
||||
data: Union[BinaryIO, list[BinaryIO], str, list[str], Path, list[Path]],
|
||||
dataset_name: str = "main_dataset",
|
||||
user: User = None,
|
||||
node_set: Optional[List[str]] = None,
|
||||
|
|
@ -17,7 +18,7 @@ async def add(
|
|||
dataset_id: Optional[UUID] = None,
|
||||
preferred_loaders: Optional[List[str]] = None,
|
||||
loader_config: Optional[dict] = None,
|
||||
|
||||
incremental_loading: bool = True,
|
||||
):
|
||||
"""
|
||||
Add data to Cognee for knowledge graph processing using a plugin-based loader system.
|
||||
|
|
@ -36,8 +37,9 @@ async def add(
|
|||
|
||||
Supported Input Types:
|
||||
- **Text strings**: Direct text content (str) - any string not starting with "/" or "file://"
|
||||
- **File paths**: Local file paths as strings in these formats:
|
||||
* Absolute paths: "/path/to/document.pdf"
|
||||
- **File paths**: Local file paths in these formats:
|
||||
* Path objects: pathlib.Path("/path/to/document.pdf") - **Recommended for explicit file path handling**
|
||||
* Absolute paths as strings: "/path/to/document.pdf"
|
||||
* File URLs: "file:///path/to/document.pdf" or "file://relative/path.txt"
|
||||
* S3 paths: "s3://bucket-name/path/to/file.pdf"
|
||||
- **Binary file objects**: File handles/streams (BinaryIO)
|
||||
|
|
@ -102,6 +104,8 @@ async def add(
|
|||
If not provided, uses default loader priority.
|
||||
loader_config: Optional configuration for specific loaders. Dictionary mapping loader names
|
||||
to their configuration options (e.g., {"pypdf_loader": {"strict": False}}).
|
||||
incremental_loading: Whether to skip processing of documents already processed by the pipeline.
|
||||
Defaults to True for efficiency. Set to False to reprocess all data.
|
||||
|
||||
Returns:
|
||||
PipelineRunInfo: Information about the ingestion pipeline execution including:
|
||||
|
|
|
|||
|
|
@ -1,11 +1,13 @@
|
|||
import os
|
||||
from urllib.parse import urlparse
|
||||
from pathlib import Path
|
||||
from typing import List, Union, BinaryIO
|
||||
from cognee.infrastructure.files.storage.s3_config import get_s3_config
|
||||
|
||||
|
||||
async def resolve_data_directories(
|
||||
data: Union[BinaryIO, List[BinaryIO], str, List[str]], include_subdirectories: bool = True
|
||||
data: Union[BinaryIO, List[BinaryIO], str, List[str], Path, List[Path]],
|
||||
include_subdirectories: bool = True,
|
||||
):
|
||||
"""
|
||||
Resolves directories by replacing them with their contained files.
|
||||
|
|
@ -33,7 +35,26 @@ async def resolve_data_directories(
|
|||
)
|
||||
|
||||
for item in data:
|
||||
if isinstance(item, str): # Check if the item is a path
|
||||
if isinstance(item, Path): # Path objects explicitly indicate file paths
|
||||
# Convert Path to string for processing
|
||||
item_str = str(item)
|
||||
if item.is_dir(): # If it's a directory
|
||||
if include_subdirectories:
|
||||
# Recursively add all files in the directory and subdirectories
|
||||
for root, _, files in os.walk(item_str):
|
||||
resolved_data.extend([Path(os.path.join(root, f)) for f in files])
|
||||
else:
|
||||
# Add all files (not subdirectories) in the directory
|
||||
resolved_data.extend(
|
||||
[
|
||||
Path(os.path.join(item_str, f))
|
||||
for f in os.listdir(item_str)
|
||||
if os.path.isfile(os.path.join(item_str, f))
|
||||
]
|
||||
)
|
||||
else: # If it's a file, add it directly
|
||||
resolved_data.append(item)
|
||||
elif isinstance(item, str): # Check if the item is a path or text content
|
||||
# S3
|
||||
if urlparse(item).scheme == "s3":
|
||||
if fs is not None:
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
import os
|
||||
from urllib.parse import urlparse
|
||||
from pathlib import Path
|
||||
from typing import Union, BinaryIO, Any
|
||||
|
||||
from cognee.modules.ingestion.exceptions import IngestionError
|
||||
|
|
@ -16,7 +17,7 @@ class SaveDataSettings(BaseSettings):
|
|||
settings = SaveDataSettings()
|
||||
|
||||
|
||||
async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str:
|
||||
async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Path, Any]) -> str:
|
||||
if "llama_index" in str(type(data_item)):
|
||||
# Dynamic import is used because the llama_index module is optional.
|
||||
from .transform_data import get_data_from_llama_index
|
||||
|
|
@ -27,6 +28,18 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str
|
|||
if hasattr(data_item, "file"):
|
||||
return await save_data_to_file(data_item.file, filename=data_item.filename)
|
||||
|
||||
# data is a Path object - explicitly indicates a file path
|
||||
if isinstance(data_item, Path):
|
||||
if settings.accept_local_file_path:
|
||||
# Convert Path to file URL for consistency
|
||||
normalized_path = str(data_item.resolve())
|
||||
# Use forward slashes in file URLs for consistency
|
||||
url_path = normalized_path.replace(os.sep, "/")
|
||||
file_path = "file://" + url_path
|
||||
return file_path
|
||||
else:
|
||||
raise IngestionError(message="Local files are not accepted.")
|
||||
|
||||
if isinstance(data_item, str):
|
||||
parsed_url = urlparse(data_item)
|
||||
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
from abc import ABC, abstractmethod
|
||||
from typing import List
|
||||
from typing import List, Union
|
||||
from pathlib import Path
|
||||
from .models.LoaderResult import LoaderResult
|
||||
|
||||
|
||||
|
|
@ -45,12 +46,12 @@ class LoaderInterface(ABC):
|
|||
pass
|
||||
|
||||
@abstractmethod
|
||||
def can_handle(self, file_path: str, mime_type: str = None) -> bool:
|
||||
def can_handle(self, file_path: Union[str, Path], mime_type: str = None) -> bool:
|
||||
"""
|
||||
Check if this loader can handle the given file.
|
||||
|
||||
Args:
|
||||
file_path: Path to the file to be processed
|
||||
file_path: Path to the file to be processed (Path type recommended for explicit file path handling)
|
||||
mime_type: Optional MIME type of the file
|
||||
|
||||
Returns:
|
||||
|
|
@ -59,12 +60,12 @@ class LoaderInterface(ABC):
|
|||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def load(self, file_path: str, **kwargs) -> LoaderResult:
|
||||
async def load(self, file_path: Union[str, Path], **kwargs) -> LoaderResult:
|
||||
"""
|
||||
Load and process the file, returning standardized result.
|
||||
|
||||
Args:
|
||||
file_path: Path to the file to be processed
|
||||
file_path: Path to the file to be processed (Path type recommended for explicit file path handling)
|
||||
**kwargs: Additional loader-specific configuration
|
||||
|
||||
Returns:
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
import os
|
||||
from typing import List
|
||||
from typing import List, Union
|
||||
from pathlib import Path
|
||||
from ..LoaderInterface import LoaderInterface
|
||||
from ..models.LoaderResult import LoaderResult, ContentType
|
||||
|
||||
|
|
@ -36,19 +37,22 @@ class TextLoader(LoaderInterface):
|
|||
"""Unique identifier for this loader."""
|
||||
return "text_loader"
|
||||
|
||||
def can_handle(self, file_path: str, mime_type: str = None) -> bool:
|
||||
def can_handle(self, file_path: Union[str, Path], mime_type: str = None) -> bool:
|
||||
"""
|
||||
Check if this loader can handle the given file.
|
||||
|
||||
Args:
|
||||
file_path: Path to the file
|
||||
file_path: Path to the file (Path type recommended for explicit file path handling)
|
||||
mime_type: Optional MIME type
|
||||
|
||||
Returns:
|
||||
True if file can be handled, False otherwise
|
||||
"""
|
||||
# Convert to Path for consistent handling
|
||||
path_obj = Path(file_path) if isinstance(file_path, str) else file_path
|
||||
|
||||
# Check by extension
|
||||
ext = os.path.splitext(file_path)[1].lower()
|
||||
ext = path_obj.suffix.lower()
|
||||
if ext in self.supported_extensions:
|
||||
return True
|
||||
|
||||
|
|
@ -60,7 +64,7 @@ class TextLoader(LoaderInterface):
|
|||
# This is useful when other loaders fail
|
||||
try:
|
||||
# Quick check if file appears to be text
|
||||
with open(file_path, "rb") as f:
|
||||
with open(path_obj, "rb") as f:
|
||||
sample = f.read(512)
|
||||
# Simple heuristic: if most bytes are printable, consider it text
|
||||
if sample:
|
||||
|
|
@ -78,12 +82,14 @@ class TextLoader(LoaderInterface):
|
|||
|
||||
return False
|
||||
|
||||
async def load(self, file_path: str, encoding: str = "utf-8", **kwargs) -> LoaderResult:
|
||||
async def load(
|
||||
self, file_path: Union[str, Path], encoding: str = "utf-8", **kwargs
|
||||
) -> LoaderResult:
|
||||
"""
|
||||
Load and process the text file.
|
||||
|
||||
Args:
|
||||
file_path: Path to the file to load
|
||||
file_path: Path to the file to load (Path type recommended for explicit file path handling)
|
||||
encoding: Text encoding to use (default: utf-8)
|
||||
**kwargs: Additional configuration (unused)
|
||||
|
||||
|
|
@ -95,25 +101,28 @@ class TextLoader(LoaderInterface):
|
|||
UnicodeDecodeError: If file cannot be decoded with specified encoding
|
||||
OSError: If file cannot be read
|
||||
"""
|
||||
if not os.path.exists(file_path):
|
||||
raise FileNotFoundError(f"File not found: {file_path}")
|
||||
# Convert to Path for consistent handling
|
||||
path_obj = Path(file_path) if isinstance(file_path, str) else file_path
|
||||
|
||||
if not path_obj.exists():
|
||||
raise FileNotFoundError(f"File not found: {path_obj}")
|
||||
|
||||
try:
|
||||
with open(file_path, "r", encoding=encoding) as f:
|
||||
with open(path_obj, "r", encoding=encoding) as f:
|
||||
content = f.read()
|
||||
except UnicodeDecodeError:
|
||||
# Try with fallback encoding
|
||||
if encoding == "utf-8":
|
||||
return await self.load(file_path, encoding="latin-1", **kwargs)
|
||||
return await self.load(path_obj, encoding="latin-1", **kwargs)
|
||||
else:
|
||||
raise
|
||||
|
||||
# Extract basic metadata
|
||||
file_stat = os.stat(file_path)
|
||||
file_stat = path_obj.stat()
|
||||
metadata = {
|
||||
"name": os.path.basename(file_path),
|
||||
"name": path_obj.name,
|
||||
"size": file_stat.st_size,
|
||||
"extension": os.path.splitext(file_path)[1],
|
||||
"extension": path_obj.suffix,
|
||||
"encoding": encoding,
|
||||
"loader": self.loader_name,
|
||||
"lines": len(content.splitlines()) if content else 0,
|
||||
|
|
@ -124,5 +133,5 @@ class TextLoader(LoaderInterface):
|
|||
content=content,
|
||||
metadata=metadata,
|
||||
content_type=ContentType.TEXT,
|
||||
source_info={"file_path": file_path, "encoding": encoding},
|
||||
source_info={"file_path": str(path_obj), "encoding": encoding},
|
||||
)
|
||||
|
|
|
|||
|
|
@ -155,3 +155,43 @@ class TestTextLoader:
|
|||
"""Test that TextLoader has no external dependencies."""
|
||||
assert text_loader.get_dependencies() == []
|
||||
assert text_loader.validate_dependencies() is True
|
||||
|
||||
def test_can_handle_path_object(self, text_loader):
|
||||
"""Test that can_handle works with Path objects."""
|
||||
path_obj = Path("test.txt")
|
||||
assert text_loader.can_handle(path_obj)
|
||||
|
||||
path_obj = Path("test.pdf")
|
||||
assert not text_loader.can_handle(path_obj)
|
||||
|
||||
# Test case insensitive
|
||||
path_obj = Path("test.TXT")
|
||||
assert text_loader.can_handle(path_obj)
|
||||
|
||||
def test_can_handle_path_object_with_mime_type(self, text_loader):
|
||||
"""Test that can_handle works with Path objects and MIME type."""
|
||||
path_obj = Path("test.unknown")
|
||||
assert text_loader.can_handle(path_obj, mime_type="text/plain")
|
||||
assert not text_loader.can_handle(path_obj, mime_type="application/pdf")
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_load_path_object(self, text_loader, temp_text_file):
|
||||
"""Test loading a file using a Path object."""
|
||||
path_obj = Path(temp_text_file)
|
||||
result = await text_loader.load(path_obj)
|
||||
|
||||
assert isinstance(result.content, str)
|
||||
assert "This is a test file." in result.content
|
||||
assert result.content_type == ContentType.TEXT
|
||||
assert result.metadata["loader"] == "text_loader"
|
||||
assert result.metadata["name"] == path_obj.name
|
||||
assert result.metadata["lines"] == 2
|
||||
assert result.metadata["encoding"] == "utf-8"
|
||||
assert result.source_info["file_path"] == str(path_obj)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_load_path_object_nonexistent(self, text_loader):
|
||||
"""Test loading a nonexistent file using a Path object."""
|
||||
path_obj = Path("/nonexistent/file.txt")
|
||||
with pytest.raises(FileNotFoundError):
|
||||
await text_loader.load(path_obj)
|
||||
|
|
|
|||
129
tests/unit/tasks/ingestion/test_path_support.py
Normal file
129
tests/unit/tasks/ingestion/test_path_support.py
Normal file
|
|
@ -0,0 +1,129 @@
|
|||
import pytest
|
||||
import tempfile
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from cognee.tasks.ingestion.save_data_item_to_storage import save_data_item_to_storage
|
||||
from cognee.tasks.ingestion.resolve_data_directories import resolve_data_directories
|
||||
|
||||
|
||||
class TestPathSupport:
|
||||
"""Test Path type support in ingestion functions."""
|
||||
|
||||
@pytest.fixture
|
||||
def temp_text_file(self):
|
||||
"""Create a temporary text file for testing."""
|
||||
with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as f:
|
||||
f.write("This is a test file for Path support.\n")
|
||||
temp_path = f.name
|
||||
|
||||
yield temp_path
|
||||
|
||||
# Cleanup
|
||||
if os.path.exists(temp_path):
|
||||
os.unlink(temp_path)
|
||||
|
||||
@pytest.fixture
|
||||
def temp_directory(self):
|
||||
"""Create a temporary directory with test files."""
|
||||
import tempfile
|
||||
|
||||
temp_dir = tempfile.mkdtemp()
|
||||
|
||||
# Create some test files
|
||||
for i in range(3):
|
||||
with open(os.path.join(temp_dir, f"test_{i}.txt"), "w") as f:
|
||||
f.write(f"Test file {i} content.\n")
|
||||
|
||||
yield temp_dir
|
||||
|
||||
# Cleanup
|
||||
import shutil
|
||||
|
||||
shutil.rmtree(temp_dir, ignore_errors=True)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_save_data_item_path_object(self, temp_text_file):
|
||||
"""Test save_data_item_to_storage with Path object."""
|
||||
path_obj = Path(temp_text_file)
|
||||
result = await save_data_item_to_storage(path_obj)
|
||||
|
||||
# Should return a file:// URL
|
||||
assert result.startswith("file://")
|
||||
assert str(path_obj.resolve()) in result
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_save_data_item_string_vs_path(self, temp_text_file):
|
||||
"""Test that Path object vs string path are handled consistently."""
|
||||
path_obj = Path(temp_text_file)
|
||||
string_path = str(path_obj.resolve())
|
||||
|
||||
# Both should work and produce similar results
|
||||
result_path = await save_data_item_to_storage(path_obj)
|
||||
result_string = await save_data_item_to_storage(string_path)
|
||||
|
||||
# Both should be file:// URLs pointing to the same file
|
||||
assert result_path.startswith("file://")
|
||||
assert result_string.startswith("file://")
|
||||
|
||||
# Extract the actual file paths from the URLs
|
||||
path_from_path_obj = result_path.replace("file://", "")
|
||||
path_from_string = result_string.replace("file://", "")
|
||||
|
||||
# They should resolve to the same absolute path
|
||||
assert os.path.normpath(path_from_path_obj) == os.path.normpath(path_from_string)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_save_data_item_text_content(self):
|
||||
"""Test that plain text strings are handled as content, not paths."""
|
||||
text_content = "This is plain text content, not a file path."
|
||||
result = await save_data_item_to_storage(text_content)
|
||||
|
||||
# Should create a file and return file:// URL since this is text content
|
||||
assert result.startswith("file://")
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_resolve_data_directories_path_object(self, temp_directory):
|
||||
"""Test resolve_data_directories with Path object."""
|
||||
path_obj = Path(temp_directory)
|
||||
result = await resolve_data_directories([path_obj])
|
||||
|
||||
# Should return a list of Path objects for the files in the directory
|
||||
assert len(result) == 3 # We created 3 test files
|
||||
assert all(isinstance(item, Path) for item in result)
|
||||
assert all(item.suffix == ".txt" for item in result)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_resolve_data_directories_mixed_types(self, temp_directory, temp_text_file):
|
||||
"""Test resolve_data_directories with mixed Path and string types."""
|
||||
path_obj = Path(temp_text_file)
|
||||
string_path = str(temp_text_file)
|
||||
directory_path = Path(temp_directory)
|
||||
|
||||
# Mix of types
|
||||
mixed_data = [path_obj, string_path, directory_path]
|
||||
result = await resolve_data_directories(mixed_data)
|
||||
|
||||
# Should have:
|
||||
# - 1 Path object (original file as Path)
|
||||
# - 1 string (original file as string)
|
||||
# - 3 Path objects (from directory expansion)
|
||||
assert len(result) == 5
|
||||
|
||||
# Count types
|
||||
path_objects = [item for item in result if isinstance(item, Path)]
|
||||
string_objects = [item for item in result if isinstance(item, str)]
|
||||
|
||||
assert len(path_objects) == 4 # 1 original + 3 from directory
|
||||
assert len(string_objects) == 1 # 1 original string
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_resolve_data_directories_path_single_file(self, temp_text_file):
|
||||
"""Test resolve_data_directories with a single Path file."""
|
||||
path_obj = Path(temp_text_file)
|
||||
result = await resolve_data_directories([path_obj])
|
||||
|
||||
# Should return the same Path object
|
||||
assert len(result) == 1
|
||||
assert isinstance(result[0], Path)
|
||||
assert str(result[0]) == str(path_obj)
|
||||
Loading…
Add table
Reference in a new issue