<!-- .github/pull_request_template.md -->

## Description
<!--
Please provide a clear, human-generated description of the changes in this PR.
DO NOT use AI-generated descriptions. We want to understand your thought process and reasoning.
-->

Update default tutorial:
1. Use the tutorial from the [notebook_tutorial branch](https://github.com/topoteretes/cognee/blob/notebook_tutorial/notebooks/tutorial.ipynb), specifically its .zip version, which contains all necessary data files
2. Use Jupyter Notebook `Notebook` abstractions to read the `ipynb` file and map it into our Notebook model
3. Dynamically update starter notebook code blocks that reference starter data files, replacing those references with local paths to the downloaded copies
4. Test coverage

| Before | After (storage backend = local) | After (s3) |
|--------|---------------------------------|------------|
| <img width="613" height="546" alt="Screenshot 2025-09-17 at 01 00 58" src="https://github.com/user-attachments/assets/20b59021-96c1-4a83-977f-e064324bd758" /> | <img width="1480" height="262" alt="Screenshot 2025-09-18 at 13 01 57" src="https://github.com/user-attachments/assets/bd56ea78-7c6a-42e3-ae3f-4157da231b2d" /> | <img width="1485" height="307" alt="Screenshot 2025-09-18 at 12 56 08" src="https://github.com/user-attachments/assets/248ae720-4c78-445a-ba8b-8a2991ed3f80" /> |

## File Replacements

### S3 Demo
https://github.com/user-attachments/assets/bd46eec9-ef77-4f69-9ef0-e7d1612ff9b3

---

### Local FS Demo
https://github.com/user-attachments/assets/8251cea0-81b3-4cac-a968-9576c358f334

## Type of Change
<!-- Please check the relevant option -->
- [ ] Bug fix (non-breaking change that fixes an issue)
- [x] New feature (non-breaking change that adds functionality)
- [ ] Breaking change (fix or feature that would cause existing functionality to change)
- [ ] Documentation update
- [ ] Code refactoring
- [ ] Performance improvement
- [ ] Other (please specify):

## Changes Made
<!-- List the specific changes made in this PR -->
-
-
-

## Testing
<!-- Describe how you tested your changes -->

## Screenshots/Videos (if applicable)
<!-- Add screenshots or videos to help explain your changes -->

## Pre-submission Checklist
<!-- Please check all boxes that apply before submitting your PR -->
- [ ] **I have tested my changes thoroughly before submitting this PR**
- [ ] **This PR contains minimal changes necessary to address the issue/feature**
- [ ] My code follows the project's coding standards and style guidelines
- [ ] I have added tests that prove my fix is effective or that my feature works
- [ ] I have added necessary documentation (if applicable)
- [ ] All new and existing tests pass
- [ ] I have searched existing PRs to ensure this change hasn't been submitted already
- [ ] I have linked any relevant issues in the description
- [ ] My commits have clear and descriptive messages

## Related Issues
<!-- Link any related issues using "Fixes #issue_number" or "Relates to #issue_number" -->

## Additional Notes
<!-- Add any additional notes, concerns, or context for reviewers -->

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.
258 lines
9.5 KiB
Python
258 lines
9.5 KiB
Python
import json
|
|
import nbformat
|
|
import asyncio
|
|
from nbformat.notebooknode import NotebookNode
|
|
from typing import List, Literal, Optional, cast, Tuple
|
|
from uuid import uuid4, UUID as UUID_t
|
|
from pydantic import BaseModel, ConfigDict
|
|
from datetime import datetime, timezone
|
|
from fastapi.encoders import jsonable_encoder
|
|
from sqlalchemy import Boolean, Column, DateTime, JSON, UUID, String, TypeDecorator
|
|
from sqlalchemy.orm import mapped_column, Mapped
|
|
from pathlib import Path
|
|
|
|
from cognee.infrastructure.databases.relational import Base
|
|
from cognee.shared.cache import (
|
|
download_and_extract_zip,
|
|
get_tutorial_data_dir,
|
|
generate_content_hash,
|
|
)
|
|
from cognee.infrastructure.files.storage.get_file_storage import get_file_storage
|
|
from cognee.base_config import get_base_config
|
|
|
|
|
|
class NotebookCell(BaseModel):
    """Pydantic model describing a single cell of a stored notebook.

    Instances are what ``Notebook.cells`` holds in memory; they are turned
    into JSON when persisted and rebuilt from JSON when loaded.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    # Unique identifier of the cell.
    id: UUID_t
    # Kind of cell; only these two literal values pass validation.
    type: Literal["markdown", "code"]
    # Human-readable label for the cell.
    name: str
    # Source text of the cell.
    content: str
|
|
|
|
|
|
class NotebookCellList(TypeDecorator):
    """SQLAlchemy column type that persists a list of notebook cells as JSON.

    Writing: every ``NotebookCell`` in the list is serialised to a JSON string;
    any other element is handed to the underlying ``JSON`` type unchanged.

    Reading: every stored element is converted back into a ``NotebookCell``.
    Because the write path lets non-``NotebookCell`` elements through as-is,
    the read path accepts both JSON strings and already-decoded mappings.
    """

    impl = JSON
    cache_ok = True

    def process_bind_param(self, notebook_cells, dialect):
        """Convert the Python-side cell list into the value stored in the column.

        Args:
            notebook_cells: Iterable of cells (or ``None``).
            dialect: Dialect in use; not consulted here.

        Returns:
            A list suitable for the JSON column; ``[]`` when the value is ``None``.
        """
        if notebook_cells is None:
            return []

        return [
            json.dumps(jsonable_encoder(cell)) if isinstance(cell, NotebookCell) else cell
            for cell in notebook_cells
        ]

    def process_result_value(self, cells_json_list, dialect):
        """Convert the stored JSON list back into ``NotebookCell`` objects.

        Args:
            cells_json_list: List loaded from the JSON column (or ``None``).
            dialect: Dialect in use; not consulted here.

        Returns:
            List of ``NotebookCell``; ``[]`` when the column value is ``None``.
        """
        if cells_json_list is None:
            return []

        return [self._load_cell(stored_cell) for stored_cell in cells_json_list]

    @staticmethod
    def _load_cell(stored_cell):
        """Build one ``NotebookCell`` from a stored element.

        JSON strings (the format written for ``NotebookCell`` items) are decoded
        first; anything else is assumed to already be a mapping of cell fields,
        which is what ``process_bind_param`` stores for dict-like input.
        """
        if isinstance(stored_cell, (str, bytes, bytearray)):
            stored_cell = json.loads(stored_cell)
        return NotebookCell(**stored_cell)
|
|
|
|
|
|
class Notebook(Base):
    """ORM model for a notebook, plus helpers to import Jupyter notebooks.

    A notebook belongs to an owner, has a name, and holds an ordered list of
    ``NotebookCell`` objects persisted through ``NotebookCellList``.
    """

    __tablename__ = "notebooks"

    # Primary key; a fresh UUID4 is generated when none is supplied.
    id: Mapped[UUID_t] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid4)

    # Owner of the notebook (indexed).
    owner_id: Mapped[UUID_t] = mapped_column(UUID(as_uuid=True), index=True)

    name: Mapped[str] = mapped_column(String, nullable=False)

    cells: Mapped[List[NotebookCell]] = mapped_column(NotebookCellList, nullable=False)

    # Whether the notebook may be deleted; defaults to True.
    deletable: Mapped[bool] = mapped_column(Boolean, default=True)

    # Timezone-aware creation timestamp; defaults to the current UTC time.
    created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))

    @classmethod
    async def from_ipynb_zip_url(
        cls,
        zip_url: str,
        owner_id: UUID_t,
        notebook_filename: str = "tutorial.ipynb",
        name: Optional[str] = None,
        deletable: bool = True,
        force: bool = False,
    ) -> "Notebook":
        """
        Create a Notebook instance from a remote zip file containing notebook + data files.

        Args:
            zip_url: Remote URL to fetch the .zip file from
            owner_id: UUID of the notebook owner
            notebook_filename: Name of the .ipynb file within the zip
            name: Optional custom name for the notebook
            deletable: Whether the notebook can be deleted
            force: If True, re-download even if already cached

        Returns:
            Notebook instance whose code cells reference the cached data files

        Raises:
            RuntimeError: If downloading/extracting the zip fails (original error chained)
            FileNotFoundError: If ``notebook_filename`` is not present after extraction
        """
        from cognee.shared.cache import cache_file_exists, read_cache_file

        # The cache key is derived from both the URL and the notebook filename.
        content_hash = generate_content_hash(zip_url, notebook_filename)

        # Download and extract the zip file to tutorial_data/{content_hash}
        try:
            extracted_cache_dir = await download_and_extract_zip(
                url=zip_url,
                cache_dir_name=f"tutorial_data/{content_hash}",
                version_or_hash=content_hash,
                force=force,
            )
        except Exception as e:
            raise RuntimeError(f"Failed to download tutorial zip from {zip_url}") from e

        # Plain string join (not pathlib) so non-filesystem locations such as
        # s3:// URLs keep their scheme intact.
        notebook_file_path = f"{extracted_cache_dir}/{notebook_filename}"

        if not await cache_file_exists(notebook_file_path):
            raise FileNotFoundError(f"Notebook file '{notebook_filename}' not found in zip")

        # Read through the cache layer; the blocking read runs in a worker thread.
        async with await read_cache_file(notebook_file_path, encoding="utf-8") as f:
            notebook_content = await asyncio.to_thread(f.read)

        notebook = cls.from_ipynb_string(notebook_content, owner_id, name, deletable)

        # Point "file://data/..." references at the actual cached data files.
        await cls._update_file_paths_in_cells(notebook, extracted_cache_dir)

        return notebook

    @staticmethod
    async def _update_file_paths_in_cells(notebook: "Notebook", cache_dir: str) -> None:
        """
        Update file paths in code cells to use actual cached data files.
        Works with both local filesystem and S3 storage.

        Cells are modified in place. If the data directory cannot be listed the
        error is logged and the notebook is left untouched.

        Args:
            notebook: Parsed Notebook instance with cells to update
            cache_dir: Path to the cached tutorial directory containing data files
        """
        from cognee.shared.cache import list_cache_files, cache_file_exists
        from cognee.shared.logging_utils import get_logger

        logger = get_logger()

        # Data files are expected in the "data" subdirectory of the cache dir.
        data_dir = f"{cache_dir}/data"

        try:
            if await cache_file_exists(data_dir):
                file_list = await list_cache_files(data_dir)
            else:
                file_list = []

            # Map bare filename -> path exactly as reported by the cache system.
            data_files = {file_path.split("/")[-1]: file_path for file_path in file_list}
        except Exception as e:
            # Best effort: if we can't list files, skip updating paths.
            logger.error(f"Error listing data files in {data_dir}: {e}")
            return

        if not data_files:
            # Nothing cached, so no reference could be rewritten.
            return

        updated_cells = 0
        for cell in notebook.cells:
            # Only code cells are rewritten; markdown is left as authored.
            if cell.type != "code":
                continue

            rewritten = Notebook._rewrite_data_file_references(cell.content, data_files)
            if rewritten != cell.content:
                cell.content = rewritten
                updated_cells += 1

        # Log summary of updates (useful for monitoring)
        if updated_cells > 0:
            logger.info(f"Updated file paths in {updated_cells} notebook cells")

    @staticmethod
    def _rewrite_data_file_references(source: str, data_files: dict) -> str:
        """
        Replace quoted ``file://data/<name>`` literals in ``source`` with cached paths.

        Args:
            source: Text of a code cell
            data_files: Mapping of bare filename -> cached file path

        Returns:
            ``source`` with every known reference rewritten; references whose
            filename is not in ``data_files`` are kept verbatim.
        """
        import re

        # Double-quoted literals are matched as before; single-quoted literals
        # are additionally supported. The single-quoted form refuses to span a
        # double quote so it can never swallow a double-quoted reference.
        double_quoted = r'"file://data/([^"]+)"'
        single_quoted = r"'file://data/([^'\"]+)'"
        file_pattern = f"{double_quoted}|{single_quoted}"

        def replace_path(match):
            literal = match.group(0)
            quote = literal[0]  # preserve the quote style used in the cell
            filename = match.group(1) or match.group(2)

            if filename not in data_files:
                return literal  # Keep original if file not found

            file_path = data_files[filename]
            if file_path.startswith("s3://"):
                # For S3, use the S3 URL as-is.
                return f"{quote}{file_path}{quote}"
            # For local filesystem, preserve the file:// prefix.
            return f"{quote}file://{file_path}{quote}"

        return re.sub(file_pattern, replace_path, source)

    @classmethod
    def from_ipynb_string(
        cls,
        notebook_content: str,
        owner_id: UUID_t,
        name: Optional[str] = None,
        deletable: bool = True,
    ) -> "Notebook":
        """
        Create a Notebook instance from Jupyter notebook string content.

        Raw cells are skipped; code cells stay code and every other cell type
        is stored as markdown. When ``name`` is not given, the kernelspec
        display name, then the kernelspec name, then "Imported Notebook" is used.

        Args:
            notebook_content: Raw Jupyter notebook content as string
            owner_id: UUID of the notebook owner
            name: Optional custom name for the notebook
            deletable: Whether the notebook can be deleted

        Returns:
            Notebook instance ready to be saved to database
        """
        # Note: nbformat.reads() has loose typing, so we cast to NotebookNode
        jupyter_nb = cast(
            NotebookNode, nbformat.reads(notebook_content, as_version=nbformat.NO_CONVERT)
        )

        cells = []
        for jupyter_cell in jupyter_nb.cells:
            # Each cell is also a NotebookNode with dynamic attributes
            cell = cast(NotebookNode, jupyter_cell)

            # Raw cells are not supported in our model
            if cell.cell_type == "raw":
                continue

            cell_type = "code" if cell.cell_type == "code" else "markdown"
            cells.append(
                NotebookCell(
                    id=uuid4(),
                    type=cell_type,
                    name=cls._generate_cell_name(cell),
                    content=cell.source,
                )
            )

        if name is None:
            # Tolerate a missing/null kernelspec and empty name fields so the
            # non-nullable ``name`` column always receives a usable value.
            kernelspec = jupyter_nb.metadata.get("kernelspec") or {}
            name = (
                kernelspec.get("display_name") or kernelspec.get("name") or "Imported Notebook"
            )

        return cls(id=uuid4(), owner_id=owner_id, name=name, cells=cells, deletable=deletable)

    @staticmethod
    def _generate_cell_name(jupyter_cell: NotebookNode) -> str:
        """
        Generate a meaningful name for a notebook cell using nbformat cell.

        Markdown cells whose first line starts with ``#`` are named after that
        header text (at most 50 characters). Other markdown cells, including
        those whose header text is empty, are named "Markdown Cell"; every
        non-markdown cell is named "Code Cell".
        """
        if jupyter_cell.cell_type != "markdown":
            return "Code Cell"

        # str.split always yields at least one element, so [0] is safe.
        first_line = jupyter_cell.source.strip().split("\n")[0]
        if first_line.startswith("#"):
            header = first_line.lstrip("#").strip()
            if header:
                return header[:50]

        return "Markdown Cell"
|