Feat: use the file path as the semantic identifier for files that share the same name

This commit is contained in:
Jonah879 2025-12-04 15:01:17 +00:00
parent ad03ede7cd
commit 7e3bf6beca

View file

@ -120,20 +120,27 @@ class BlobStorageConnector(LoadConnector, PollConnector):
paginator = self.s3_client.get_paginator("list_objects_v2") paginator = self.s3_client.get_paginator("list_objects_v2")
pages = paginator.paginate(Bucket=self.bucket_name, Prefix=self.prefix) pages = paginator.paginate(Bucket=self.bucket_name, Prefix=self.prefix)
batch: list[Document] = [] # Collect all objects first to count filename occurrences
all_objects = []
for page in pages: for page in pages:
if "Contents" not in page: if "Contents" not in page:
continue continue
for obj in page["Contents"]: for obj in page["Contents"]:
if obj["Key"].endswith("/"): if obj["Key"].endswith("/"):
continue continue
last_modified = obj["LastModified"].replace(tzinfo=timezone.utc) last_modified = obj["LastModified"].replace(tzinfo=timezone.utc)
if start < last_modified <= end:
all_objects.append(obj)
if not (start < last_modified <= end): # Count filename occurrences to determine which need full paths
continue filename_counts: dict[str, int] = {}
for obj in all_objects:
file_name = os.path.basename(obj["Key"])
filename_counts[file_name] = filename_counts.get(file_name, 0) + 1
batch: list[Document] = []
for obj in all_objects:
last_modified = obj["LastModified"].replace(tzinfo=timezone.utc)
file_name = os.path.basename(obj["Key"]) file_name = os.path.basename(obj["Key"])
key = obj["Key"] key = obj["Key"]
@ -147,17 +154,27 @@ class BlobStorageConnector(LoadConnector, PollConnector):
f"{file_name} exceeds size threshold of {self.size_threshold}. Skipping." f"{file_name} exceeds size threshold of {self.size_threshold}. Skipping."
) )
continue continue
try: try:
blob = download_object(self.s3_client, self.bucket_name, key, self.size_threshold) blob = download_object(self.s3_client, self.bucket_name, key, self.size_threshold)
if blob is None: if blob is None:
continue continue
# Use full path only if filename appears multiple times
if filename_counts.get(file_name, 0) > 1:
relative_path = key
if self.prefix and key.startswith(self.prefix):
relative_path = key[len(self.prefix):]
semantic_id = relative_path.replace('/', ' / ') if relative_path else file_name
else:
semantic_id = file_name
batch.append( batch.append(
Document( Document(
id=f"{self.bucket_type}:{self.bucket_name}:{key}", id=f"{self.bucket_type}:{self.bucket_name}:{key}",
blob=blob, blob=blob,
source=DocumentSource(self.bucket_type.value), source=DocumentSource(self.bucket_type.value),
semantic_identifier=file_name, semantic_identifier=semantic_id,
extension=get_file_ext(file_name), extension=get_file_ext(file_name),
doc_updated_at=last_modified, doc_updated_at=last_modified,
size_bytes=size_bytes if size_bytes else 0 size_bytes=size_bytes if size_bytes else 0