added fixes for nltk

2025-07-19 15:31:12 +02:00 · 2025-07-19 15:31:12 +02:00 · 411e9a6205
commit 411e9a6205
parent 3429af32c2
2 changed files with 15 additions and 4 deletions
--- a/cognee/tasks/ingestion/adapters/loader_to_ingestion_adapter.py
+++ b/cognee/tasks/ingestion/adapters/loader_to_ingestion_adapter.py
@@ -66,7 +66,13 @@ class LoaderResultToIngestionData(IngestionData):
                metadata["name"] = f"content_{content_hash}{ext}"
        if "content_hash" not in metadata:
-            metadata["content_hash"] = self.get_identifier()
+            # Store content hash without prefix for compatibility with deletion system
+            identifier = self.get_identifier()
+            if "_" in identifier:
+                # Remove content type prefix (e.g., "text_abc123" -> "abc123")
+                metadata["content_hash"] = identifier.split("_", 1)[-1]
+            else:
+                metadata["content_hash"] = identifier
        if "file_path" not in metadata and self.original_file_path:
            metadata["file_path"] = self.original_file_path
@@ -192,7 +198,7 @@ class LoaderToIngestionAdapter:
        if file_path.startswith("s3://"):
            if s3fs:
                with s3fs.open(file_path, "rb") as file:
-                    return classify(file, s3fs=s3fs)
+                    return classify(file)
            else:
                raise ValueError("S3 file path provided but no s3fs available")
        else:
--- a/cognee/tasks/ingestion/plugin_ingest_data.py
+++ b/cognee/tasks/ingestion/plugin_ingest_data.py
@@ -159,7 +159,7 @@ async def plugin_ingest_data(
                logger.warning(f"Plugin system failed for {file_path}, falling back: {e}")
                # Fallback to existing system for full backward compatibility
                with open_data_file(file_path) as file:
-                    classified_data = ingestion.classify(file, s3fs=fs)
+                    classified_data = ingestion.classify(file)
            # Preserve all existing data processing logic
            data_id = ingestion.identify(classified_data, user)
@@ -212,7 +212,12 @@ async def plugin_ingest_data(
                    }
                    return mime_map.get(ext.lower(), "text/plain")
                elif field_name == "content_hash":
-                    return str(data_id)
+                    # Extract the raw content hash for compatibility with deletion system
+                    content_identifier = classified_data.get_identifier()
+                    # Remove content type prefix if present (e.g., "text_abc123" -> "abc123")
+                    if "_" in content_identifier:
+                        return content_identifier.split("_", 1)[-1]
+                    return content_identifier
                return default_value