cognee/cognee/infrastructure/files/storage/StorageManager.py
Daulet Amirkhanov f58ba86e7c
feat: add welcome tutorial notebook for new users (#1425)
<!-- .github/pull_request_template.md -->

## Description
<!-- 
Please provide a clear, human-generated description of the changes in
this PR.
DO NOT use AI-generated descriptions. We want to understand your thought
process and reasoning.
-->

Update default tutorial:
1. Use the tutorial from the [notebook_tutorial
branch](https://github.com/topoteretes/cognee/blob/notebook_tutorial/notebooks/tutorial.ipynb),
specifically its .zip version, which includes all necessary data files
2. Use Jupyter Notebook `Notebook` abstractions to read `ipynb` files and map them
into our Notebook model
3. Dynamically update starter notebook code blocks that reference
starter data files, and swap them with local paths to downloaded copies
4. Test coverage



| Before | After (storage backend = local) | After (s3) |
|--------|---------------------------------|------------|
| <img width="613" height="546" alt="Screenshot 2025-09-17 at 01 00 58"
src="https://github.com/user-attachments/assets/20b59021-96c1-4a83-977f-e064324bd758"
/> | <img width="1480" height="262" alt="Screenshot 2025-09-18 at 13 01
57"
src="https://github.com/user-attachments/assets/bd56ea78-7c6a-42e3-ae3f-4157da231b2d"
/> | <img width="1485" height="307" alt="Screenshot 2025-09-18 at 12 56
08"
src="https://github.com/user-attachments/assets/248ae720-4c78-445a-ba8b-8a2991ed3f80"
/> |



## File Replacements

### S3 Demo  

https://github.com/user-attachments/assets/bd46eec9-ef77-4f69-9ef0-e7d1612ff9b3

---

### Local FS Demo  

https://github.com/user-attachments/assets/8251cea0-81b3-4cac-a968-9576c358f334


## Type of Change
<!-- Please check the relevant option -->
- [ ] Bug fix (non-breaking change that fixes an issue)
- [x] New feature (non-breaking change that adds functionality)
- [ ] Breaking change (fix or feature that would cause existing
functionality to change)
- [ ] Documentation update
- [ ] Code refactoring
- [ ] Performance improvement
- [ ] Other (please specify):

## Changes Made
<!-- List the specific changes made in this PR -->
- 
- 
- 

## Testing
<!-- Describe how you tested your changes -->

## Screenshots/Videos (if applicable)
<!-- Add screenshots or videos to help explain your changes -->

## Pre-submission Checklist
<!-- Please check all boxes that apply before submitting your PR -->
- [ ] **I have tested my changes thoroughly before submitting this PR**
- [ ] **This PR contains minimal changes necessary to address the
issue/feature**
- [ ] My code follows the project's coding standards and style
guidelines
- [ ] I have added tests that prove my fix is effective or that my
feature works
- [ ] I have added necessary documentation (if applicable)
- [ ] All new and existing tests pass
- [ ] I have searched existing PRs to ensure this change hasn't been
submitted already
- [ ] I have linked any relevant issues in the description
- [ ] My commits have clear and descriptive messages

## Related Issues
<!-- Link any related issues using "Fixes #issue_number" or "Relates to
#issue_number" -->

## Additional Notes
<!-- Add any additional notes, concerns, or context for reviewers -->

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.
2025-09-18 18:07:05 +02:00

171 lines
5.5 KiB
Python

import inspect
from typing import BinaryIO
from contextlib import asynccontextmanager
from .storage import Storage
class StorageManager:
    """
    Manages storage operations by delegating tasks to a storage backend.

    The backend may implement each operation either as a plain method or as a
    coroutine. Every public method here is async and awaits the backend's
    result only when that result is awaitable.

    Public methods include:

    - file_exists: Check if a file exists in the storage.
    - is_file: Forward the backend's is_file check for a path.
    - get_size: Forward the backend's get_size query for a path.
    - store: Store data in the specified path.
    - open: Open a file from the specified path.
    - ensure_directory_exists: Ensure a directory exists.
    - remove: Remove the file at the specified path.
    - list_files: List the files in a directory.
    - remove_all: Remove all files under the directory tree.
    """

    # Backend that every operation is delegated to; replaced per instance in __init__.
    storage: Storage = None

    def __init__(self, storage: Storage):
        self.storage = storage

    async def _delegate(self, method_name: str, *args):
        """
        Call the named backend method and await its result when needed.

        Parameters:
        -----------

            - method_name (str): Name of the method to look up on the storage backend.
            - *args: Positional arguments forwarded to the backend method unchanged.

        Returns:
        --------

            Whatever the backend method returns (the awaited value for async backends).
        """
        result = getattr(self.storage, method_name)(*args)
        # Inspect the returned object rather than the function: this also covers
        # async backends whose methods are wrapped in a way that hides the coroutine
        # function from inspect.iscoroutinefunction.
        if inspect.isawaitable(result):
            return await result
        return result

    async def file_exists(self, file_path: str):
        """
        Check if a specified file exists in the storage.

        Parameters:
        -----------

            - file_path (str): The path of the file to check for existence.

        Returns:
        --------

            - bool: True if the file exists, otherwise False.
        """
        return await self._delegate("file_exists", file_path)

    async def is_file(self, file_path: str):
        """
        Forward an is_file check for the given path to the storage backend.

        Parameters:
        -----------

            - file_path (str): The path to check.

        Returns:
        --------

            Returns the outcome of the check, as defined by the storage implementation.
        """
        return await self._delegate("is_file", file_path)

    async def get_size(self, file_path: str) -> int:
        """
        Forward a size query for the given path to the storage backend.

        Parameters:
        -----------

            - file_path (str): The path of the file whose size is requested.

        Returns:
        --------

            - int: The size reported by the storage implementation.
        """
        return await self._delegate("get_size", file_path)

    async def store(self, file_path: str, data: BinaryIO, overwrite: bool = False) -> str:
        """
        Store data at the specified file path.

        Parameters:
        -----------

            - file_path (str): The path where the data should be stored.
            - data (BinaryIO): The data in a binary format that needs to be stored.
            - overwrite (bool): If True, overwrite the existing file.

        Returns:
        --------

            Returns the full path to the file.
        """
        return await self._delegate("store", file_path, data, overwrite)

    @asynccontextmanager
    async def open(self, file_path: str, encoding: str = None, *args, **kwargs):
        """
        Retrieve data from the specified file path.

        Works as an async context manager regardless of whether the backend's
        open() returns a synchronous or an asynchronous context manager.

        Parameters:
        -----------

            - file_path (str): The path from which to retrieve the data.
            - encoding (str): Accepted by this wrapper but not passed on to the
              backend (see the review note in the body).
            - *args, **kwargs: Forwarded unchanged to the backend's open().

        Returns:
        --------

            Returns the retrieved data, as defined by the storage implementation.
        """
        # NOTE(review): `encoding` is never forwarded to the backend, and a second
        # positional argument binds to it and is therefore dropped as well. That
        # behavior is kept as-is because existing callers may depend on it; confirm
        # whether it should be passed through.
        opened = self.storage.open(file_path, *args, **kwargs)

        # Choose the protocol from the returned object instead of the backend's class
        # name, so any backend with an async open() works, not only "S3FileStorage".
        if hasattr(opened, "__aenter__"):
            async with opened as file:
                yield file
        else:
            with opened as file:
                yield file

    async def ensure_directory_exists(self, directory_path: str = ""):
        """
        Ensure that the specified directory exists, creating it if necessary.

        If the directory already exists, no action is taken.

        Parameters:
        -----------

            - directory_path (str): The path of the directory to check or create.
        """
        return await self._delegate("ensure_directory_exists", directory_path)

    async def remove(self, file_path: str):
        """
        Remove the file at the specified path.

        Parameters:
        -----------

            - file_path (str): The path of the file to be removed.

        Returns:
        --------

            Returns the outcome of the remove operation, as defined by the storage
            implementation.
        """
        return await self._delegate("remove", file_path)

    async def list_files(self, directory_path: str, recursive: bool = False) -> list[str]:
        """
        List all files in the specified directory.

        Parameters:
        -----------

            - directory_path (str): The directory path to list files from
            - recursive (bool): If True, list files recursively in subdirectories

        Returns:
        --------

            - list[str]: List of file paths relative to the storage root
        """
        return await self._delegate("list_files", directory_path, recursive)

    async def remove_all(self, tree_path: str = None):
        """
        Remove an entire directory tree at the specified path, including all files and
        subdirectories.

        If the directory does not exist, no action is taken and no exception is raised.

        Parameters:
        -----------

            - tree_path (str): The root path of the directory tree to be removed.
        """
        return await self._delegate("remove_all", tree_path)