added fixes for nltk
This commit is contained in:
parent
3429af32c2
commit
411e9a6205
2 changed files with 15 additions and 4 deletions
|
|
@@ -66,7 +66,13 @@ class LoaderResultToIngestionData(IngestionData):
|
|||
metadata["name"] = f"content_{content_hash}{ext}"
|
||||
|
||||
if "content_hash" not in metadata:
|
||||
metadata["content_hash"] = self.get_identifier()
|
||||
# Store content hash without prefix for compatibility with deletion system
|
||||
identifier = self.get_identifier()
|
||||
if "_" in identifier:
|
||||
# Remove content type prefix (e.g., "text_abc123" -> "abc123")
|
||||
metadata["content_hash"] = identifier.split("_", 1)[-1]
|
||||
else:
|
||||
metadata["content_hash"] = identifier
|
||||
|
||||
if "file_path" not in metadata and self.original_file_path:
|
||||
metadata["file_path"] = self.original_file_path
|
||||
|
|
@@ -192,7 +198,7 @@ class LoaderToIngestionAdapter:
|
|||
if file_path.startswith("s3://"):
|
||||
if s3fs:
|
||||
with s3fs.open(file_path, "rb") as file:
|
||||
return classify(file, s3fs=s3fs)
|
||||
return classify(file)
|
||||
else:
|
||||
raise ValueError("S3 file path provided but no s3fs available")
|
||||
else:
|
||||
|
|
|
|||
|
|
@@ -159,7 +159,7 @@ async def plugin_ingest_data(
|
|||
logger.warning(f"Plugin system failed for {file_path}, falling back: {e}")
|
||||
# Fallback to existing system for full backward compatibility
|
||||
with open_data_file(file_path) as file:
|
||||
classified_data = ingestion.classify(file, s3fs=fs)
|
||||
classified_data = ingestion.classify(file)
|
||||
|
||||
# Preserve all existing data processing logic
|
||||
data_id = ingestion.identify(classified_data, user)
|
||||
|
|
@@ -212,7 +212,12 @@ async def plugin_ingest_data(
|
|||
}
|
||||
return mime_map.get(ext.lower(), "text/plain")
|
||||
elif field_name == "content_hash":
|
||||
return str(data_id)
|
||||
# Extract the raw content hash for compatibility with deletion system
|
||||
content_identifier = classified_data.get_identifier()
|
||||
# Remove content type prefix if present (e.g., "text_abc123" -> "abc123")
|
||||
if "_" in content_identifier:
|
||||
return content_identifier.split("_", 1)[-1]
|
||||
return content_identifier
|
||||
|
||||
return default_value
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue