diff --git a/cognee/api/v1/update/update.py b/cognee/api/v1/update/update.py index 4de4333af..b4b1f5e5a 100644 --- a/cognee/api/v1/update/update.py +++ b/cognee/api/v1/update/update.py @@ -18,6 +18,60 @@ async def update( preferred_loaders: List[str] = None, incremental_loading: bool = True, ): + """ + Update existing data in Cognee. + + Supported Input Types: + - **Text strings**: Direct text content (str) - any string not starting with "/" or "file://" + - **File paths**: Local file paths as strings in these formats: + * Absolute paths: "/path/to/document.pdf" + * File URLs: "file:///path/to/document.pdf" or "file://relative/path.txt" + * S3 paths: "s3://bucket-name/path/to/file.pdf" + - **Binary file objects**: File handles/streams (BinaryIO) + - **Lists**: Multiple files or text strings in a single call + + Supported File Formats: + - Text files (.txt, .md, .csv) + - PDFs (.pdf) + - Images (.png, .jpg, .jpeg) - extracted via OCR/vision models + - Audio files (.mp3, .wav) - transcribed to text + - Code files (.py, .js, .ts, etc.) - parsed for structure and content + - Office documents (.docx, .pptx) + + Workflow: + 1. **Data Resolution**: Resolves file paths and validates accessibility + 2. **Content Extraction**: Extracts text content from various file formats + 3. **Dataset Storage**: Stores processed content in the specified dataset + 4. **Metadata Tracking**: Records file metadata, timestamps, and user permissions + 5. **Permission Assignment**: Grants user read/write/delete/share permissions on dataset + + Args: + data_id: UUID of existing data to update + data: The latest version of the data. Can be: + - Single text string: "Your text content here" + - Absolute file path: "/path/to/document.pdf" + - File URL: "file:///absolute/path/to/document.pdf" or "file://relative/path.txt" + - S3 path: "s3://my-bucket/documents/file.pdf" + - List of mixed types: ["text content", "/path/file.pdf", "file://doc.txt", file_handle] + - Binary file object: open("file.txt", "rb") + dataset_name: Name of the dataset to store data in. Defaults to "main_dataset". + Create separate datasets to organize different knowledge domains. + user: User object for authentication and permissions. Uses default user if None. + Default user: "default_user@example.com" (created automatically on first use). + Users can only access datasets they have permissions for. + node_set: Optional list of node identifiers for graph organization and access control. + Used for grouping related data points in the knowledge graph. + vector_db_config: Optional configuration for vector database (for custom setups). + graph_db_config: Optional configuration for graph database (for custom setups). + dataset_id: Optional specific dataset UUID to use instead of dataset_name. + + Returns: + PipelineRunInfo: Information about the ingestion pipeline execution including: + - Pipeline run ID for tracking + - Dataset ID where data was stored + - Processing status and any errors + - Execution timestamps and metadata + """ await delete( data_id=data_id, dataset_id=dataset_id,