added fixes for nltk

This commit is contained in:
vasilije 2025-07-19 15:31:12 +02:00
parent 3429af32c2
commit 411e9a6205
2 changed files with 15 additions and 4 deletions

View file

@@ -66,7 +66,13 @@ class LoaderResultToIngestionData(IngestionData):
metadata["name"] = f"content_{content_hash}{ext}"
if "content_hash" not in metadata:
metadata["content_hash"] = self.get_identifier()
# Store content hash without prefix for compatibility with deletion system
identifier = self.get_identifier()
if "_" in identifier:
# Remove content type prefix (e.g., "text_abc123" -> "abc123")
metadata["content_hash"] = identifier.split("_", 1)[-1]
else:
metadata["content_hash"] = identifier
if "file_path" not in metadata and self.original_file_path:
metadata["file_path"] = self.original_file_path
@@ -192,7 +198,7 @@ class LoaderToIngestionAdapter:
if file_path.startswith("s3://"):
if s3fs:
with s3fs.open(file_path, "rb") as file:
return classify(file, s3fs=s3fs)
return classify(file)
else:
raise ValueError("S3 file path provided but no s3fs available")
else:

View file

@@ -159,7 +159,7 @@ async def plugin_ingest_data(
logger.warning(f"Plugin system failed for {file_path}, falling back: {e}")
# Fallback to existing system for full backward compatibility
with open_data_file(file_path) as file:
classified_data = ingestion.classify(file, s3fs=fs)
classified_data = ingestion.classify(file)
# Preserve all existing data processing logic
data_id = ingestion.identify(classified_data, user)
@@ -212,7 +212,12 @@ async def plugin_ingest_data(
}
return mime_map.get(ext.lower(), "text/plain")
elif field_name == "content_hash":
return str(data_id)
# Extract the raw content hash for compatibility with deletion system
content_identifier = classified_data.get_identifier()
# Remove content type prefix if present (e.g., "text_abc123" -> "abc123")
if "_" in content_identifier:
return content_identifier.split("_", 1)[-1]
return content_identifier
return default_value