Added fixes for NLTK
This commit is contained in:
parent
3429af32c2
commit
411e9a6205
2 changed files with 15 additions and 4 deletions
|
|
@ -66,7 +66,13 @@ class LoaderResultToIngestionData(IngestionData):
|
||||||
metadata["name"] = f"content_{content_hash}{ext}"
|
metadata["name"] = f"content_{content_hash}{ext}"
|
||||||
|
|
||||||
if "content_hash" not in metadata:
|
if "content_hash" not in metadata:
|
||||||
metadata["content_hash"] = self.get_identifier()
|
# Store content hash without prefix for compatibility with deletion system
|
||||||
|
identifier = self.get_identifier()
|
||||||
|
if "_" in identifier:
|
||||||
|
# Remove content type prefix (e.g., "text_abc123" -> "abc123")
|
||||||
|
metadata["content_hash"] = identifier.split("_", 1)[-1]
|
||||||
|
else:
|
||||||
|
metadata["content_hash"] = identifier
|
||||||
|
|
||||||
if "file_path" not in metadata and self.original_file_path:
|
if "file_path" not in metadata and self.original_file_path:
|
||||||
metadata["file_path"] = self.original_file_path
|
metadata["file_path"] = self.original_file_path
|
||||||
|
|
@ -192,7 +198,7 @@ class LoaderToIngestionAdapter:
|
||||||
if file_path.startswith("s3://"):
|
if file_path.startswith("s3://"):
|
||||||
if s3fs:
|
if s3fs:
|
||||||
with s3fs.open(file_path, "rb") as file:
|
with s3fs.open(file_path, "rb") as file:
|
||||||
return classify(file, s3fs=s3fs)
|
return classify(file)
|
||||||
else:
|
else:
|
||||||
raise ValueError("S3 file path provided but no s3fs available")
|
raise ValueError("S3 file path provided but no s3fs available")
|
||||||
else:
|
else:
|
||||||
|
|
|
||||||
|
|
@ -159,7 +159,7 @@ async def plugin_ingest_data(
|
||||||
logger.warning(f"Plugin system failed for {file_path}, falling back: {e}")
|
logger.warning(f"Plugin system failed for {file_path}, falling back: {e}")
|
||||||
# Fallback to existing system for full backward compatibility
|
# Fallback to existing system for full backward compatibility
|
||||||
with open_data_file(file_path) as file:
|
with open_data_file(file_path) as file:
|
||||||
classified_data = ingestion.classify(file, s3fs=fs)
|
classified_data = ingestion.classify(file)
|
||||||
|
|
||||||
# Preserve all existing data processing logic
|
# Preserve all existing data processing logic
|
||||||
data_id = ingestion.identify(classified_data, user)
|
data_id = ingestion.identify(classified_data, user)
|
||||||
|
|
@ -212,7 +212,12 @@ async def plugin_ingest_data(
|
||||||
}
|
}
|
||||||
return mime_map.get(ext.lower(), "text/plain")
|
return mime_map.get(ext.lower(), "text/plain")
|
||||||
elif field_name == "content_hash":
|
elif field_name == "content_hash":
|
||||||
return str(data_id)
|
# Extract the raw content hash for compatibility with deletion system
|
||||||
|
content_identifier = classified_data.get_identifier()
|
||||||
|
# Remove content type prefix if present (e.g., "text_abc123" -> "abc123")
|
||||||
|
if "_" in content_identifier:
|
||||||
|
return content_identifier.split("_", 1)[-1]
|
||||||
|
return content_identifier
|
||||||
|
|
||||||
return default_value
|
return default_value
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue