diff --git a/cognee/tasks/ingestion/adapters/loader_to_ingestion_adapter.py b/cognee/tasks/ingestion/adapters/loader_to_ingestion_adapter.py
index 241837d76..bef3ce85f 100644
--- a/cognee/tasks/ingestion/adapters/loader_to_ingestion_adapter.py
+++ b/cognee/tasks/ingestion/adapters/loader_to_ingestion_adapter.py
@@ -66,7 +66,13 @@ class LoaderResultToIngestionData(IngestionData):
                 metadata["name"] = f"content_{content_hash}{ext}"
 
             if "content_hash" not in metadata:
-                metadata["content_hash"] = self.get_identifier()
+                # Store content hash without prefix for compatibility with deletion system
+                identifier = self.get_identifier()
+                if "_" in identifier:
+                    # Remove content type prefix (e.g., "text_abc123" -> "abc123")
+                    metadata["content_hash"] = identifier.split("_", 1)[-1]
+                else:
+                    metadata["content_hash"] = identifier
 
             if "file_path" not in metadata and self.original_file_path:
                 metadata["file_path"] = self.original_file_path
@@ -192,7 +198,7 @@ class LoaderToIngestionAdapter:
         if file_path.startswith("s3://"):
             if s3fs:
                 with s3fs.open(file_path, "rb") as file:
-                    return classify(file, s3fs=s3fs)
+                    return classify(file)
             else:
                 raise ValueError("S3 file path provided but no s3fs available")
         else:
diff --git a/cognee/tasks/ingestion/plugin_ingest_data.py b/cognee/tasks/ingestion/plugin_ingest_data.py
index 994de6f5c..9e7562da6 100644
--- a/cognee/tasks/ingestion/plugin_ingest_data.py
+++ b/cognee/tasks/ingestion/plugin_ingest_data.py
@@ -159,7 +159,7 @@ async def plugin_ingest_data(
             logger.warning(f"Plugin system failed for {file_path}, falling back: {e}")
             # Fallback to existing system for full backward compatibility
             with open_data_file(file_path) as file:
-                classified_data = ingestion.classify(file, s3fs=fs)
+                classified_data = ingestion.classify(file)
 
                 # Preserve all existing data processing logic
                 data_id = ingestion.identify(classified_data, user)
@@ -212,7 +212,12 @@ async def plugin_ingest_data(
             }
             return mime_map.get(ext.lower(), "text/plain")
 
         elif field_name == "content_hash":
-            return str(data_id)
+            # Extract the raw content hash for compatibility with deletion system
+            content_identifier = classified_data.get_identifier()
+            # Remove content type prefix if present (e.g., "text_abc123" -> "abc123")
+            if "_" in content_identifier:
+                return content_identifier.split("_", 1)[-1]
+            return content_identifier
 
         return default_value