From 7e3bf6beca2f9ee5734c8a14497696afc669ce48 Mon Sep 17 00:00:00 2001 From: Jonah879 Date: Thu, 4 Dec 2025 15:01:17 +0000 Subject: [PATCH 1/6] Feat: use filepath for files with the same name --- common/data_source/blob_connector.py | 89 +++++++++++++++++----------- 1 file changed, 53 insertions(+), 36 deletions(-) diff --git a/common/data_source/blob_connector.py b/common/data_source/blob_connector.py index 4cc893649..c4b4fba11 100644 --- a/common/data_source/blob_connector.py +++ b/common/data_source/blob_connector.py @@ -120,55 +120,72 @@ class BlobStorageConnector(LoadConnector, PollConnector): paginator = self.s3_client.get_paginator("list_objects_v2") pages = paginator.paginate(Bucket=self.bucket_name, Prefix=self.prefix) - batch: list[Document] = [] + # Collect all objects first to count filename occurrences + all_objects = [] for page in pages: if "Contents" not in page: continue - for obj in page["Contents"]: if obj["Key"].endswith("/"): continue - last_modified = obj["LastModified"].replace(tzinfo=timezone.utc) + if start < last_modified <= end: + all_objects.append(obj) + + # Count filename occurrences to determine which need full paths + filename_counts: dict[str, int] = {} + for obj in all_objects: + file_name = os.path.basename(obj["Key"]) + filename_counts[file_name] = filename_counts.get(file_name, 0) + 1 - if not (start < last_modified <= end): + batch: list[Document] = [] + for obj in all_objects: + last_modified = obj["LastModified"].replace(tzinfo=timezone.utc) + file_name = os.path.basename(obj["Key"]) + key = obj["Key"] + + size_bytes = extract_size_bytes(obj) + if ( + self.size_threshold is not None + and isinstance(size_bytes, int) + and size_bytes > self.size_threshold + ): + logging.warning( + f"{file_name} exceeds size threshold of {self.size_threshold}. Skipping." + ) + continue + + try: + blob = download_object(self.s3_client, self.bucket_name, key, self.size_threshold) + if blob is None: continue - file_name = os.path.basename(obj["Key"]) - key = obj["Key"] + # Use full path only if filename appears multiple times + if filename_counts.get(file_name, 0) > 1: + relative_path = key + if self.prefix and key.startswith(self.prefix): + relative_path = key[len(self.prefix):] + semantic_id = relative_path.replace('/', ' / ') if relative_path else file_name + else: + semantic_id = file_name - size_bytes = extract_size_bytes(obj) - if ( - self.size_threshold is not None - and isinstance(size_bytes, int) - and size_bytes > self.size_threshold - ): - logging.warning( - f"{file_name} exceeds size threshold of {self.size_threshold}. Skipping." 
+ batch.append( + Document( + id=f"{self.bucket_type}:{self.bucket_name}:{key}", + blob=blob, + source=DocumentSource(self.bucket_type.value), + semantic_identifier=semantic_id, + extension=get_file_ext(file_name), + doc_updated_at=last_modified, + size_bytes=size_bytes if size_bytes else 0 ) - continue - try: - blob = download_object(self.s3_client, self.bucket_name, key, self.size_threshold) - if blob is None: - continue + ) + if len(batch) == self.batch_size: + yield batch + batch = [] - batch.append( - Document( - id=f"{self.bucket_type}:{self.bucket_name}:{key}", - blob=blob, - source=DocumentSource(self.bucket_type.value), - semantic_identifier=file_name, - extension=get_file_ext(file_name), - doc_updated_at=last_modified, - size_bytes=size_bytes if size_bytes else 0 - ) - ) - if len(batch) == self.batch_size: - yield batch - batch = [] - - except Exception: - logging.exception(f"Error decoding object {key}") + except Exception: + logging.exception(f"Error decoding object {key}") if batch: yield batch From 052772b60476c1e6f5e920aea5e7339cc6bf3e97 Mon Sep 17 00:00:00 2001 From: Jonah879 Date: Mon, 8 Dec 2025 12:54:22 +0000 Subject: [PATCH 2/6] Feat: handle duplicates for confluence, dropbox and notion --- common/data_source/confluence_connector.py | 53 +++++++++++++-- common/data_source/dropbox_connector.py | 76 +++++++++++++++++----- common/data_source/notion_connector.py | 11 ++++ 3 files changed, 116 insertions(+), 24 deletions(-) diff --git a/common/data_source/confluence_connector.py b/common/data_source/confluence_connector.py index a057d0694..256ce4a69 100644 --- a/common/data_source/confluence_connector.py +++ b/common/data_source/confluence_connector.py @@ -1494,7 +1494,7 @@ class ConfluenceConnector( return comment_string def _convert_page_to_document( - self, page: dict[str, Any] + self, page: dict[str, Any], title_counts: dict[str, int] | None = None ) -> Document | ConnectorFailure: """ Converts a Confluence page to a Document object. 
@@ -1552,11 +1552,18 @@ class ConfluenceConnector( BasicExpertInfo(display_name=display_name, email=email) ) + # Build semantic identifier - use full path only if title appears multiple times + semantic_id = page_title + if title_counts and title_counts.get(page_title, 0) > 1: + space_name = page.get("space", {}).get("name", "") + if space_name: + semantic_id = f"{space_name} / {page_title}" + # Create the document return Document( id=page_url, source=DocumentSource.CONFLUENCE, - semantic_identifier=page_title, + semantic_identifier=semantic_id, extension=".html", # Confluence pages are HTML blob=page_content.encode("utf-8"), # Encode page content as bytes size_bytes=len(page_content.encode("utf-8")), # Calculate size in bytes @@ -1593,10 +1600,21 @@ class ConfluenceConnector( attachment_docs: list[Document] = [] page_url = "" + # Collect all attachments first to count filename occurrences + all_attachments = [] for attachment in self.confluence_client.paginated_cql_retrieval( cql=attachment_query, expand=",".join(_ATTACHMENT_EXPANSION_FIELDS), ): + all_attachments.append(attachment) + + # Count attachment title occurrences + attachment_title_counts: dict[str, int] = {} + for attachment in all_attachments: + attachment_title = attachment.get("title", "") + attachment_title_counts[attachment_title] = attachment_title_counts.get(attachment_title, 0) + 1 + + for attachment in all_attachments: media_type: str = attachment.get("metadata", {}).get("mediaType", "") # TODO(rkuo): this check is partially redundant with validate_attachment_filetype @@ -1677,11 +1695,22 @@ class ConfluenceConnector( extension = Path(attachment.get("title", "")).suffix or ".unknown" + # Build semantic identifier - use full path only if title appears multiple times + attachment_title = attachment.get("title", object_url) + semantic_id = attachment_title + if attachment_title_counts.get(attachment_title, 0) > 1: + space_name = attachment.get("space", {}).get("name", "") + page_title = page.get("title", "") + if space_name and page_title: + semantic_id = f"{space_name} / {page_title} / {attachment_title}" + elif page_title: + semantic_id = f"{page_title} / {attachment_title}" + attachment_doc = Document( id=attachment_id, # sections=sections, source=DocumentSource.CONFLUENCE, - semantic_identifier=attachment.get("title", object_url), + semantic_identifier=semantic_id, extension=extension, blob=file_blob, size_bytes=len(file_blob), @@ -1739,7 +1768,9 @@ class ConfluenceConnector( ) logging.debug(f"page_query_url: {page_query_url}") - # store the next page start for confluence server, cursor for confluence cloud + # Collect all pages first to count title occurrences + all_pages = [] + def store_next_page_url(next_page_url: str) -> None: checkpoint.next_page_url = next_page_url @@ -1748,8 +1779,18 @@ class ConfluenceConnector( limit=self.batch_size, next_page_callback=store_next_page_url, ): - # Build doc from page - doc_or_failure = self._convert_page_to_document(page) + all_pages.append(page) + + # Count page title occurrences + title_counts: dict[str, int] = {} + for page in all_pages: + page_title = page.get("title", "") + title_counts[page_title] = title_counts.get(page_title, 0) + 1 + + # Process all pages + for page in all_pages: + # Build doc from page with conditional semantic_id + doc_or_failure = self._convert_page_to_document(page, title_counts) if isinstance(doc_or_failure, ConnectorFailure): yield doc_or_failure diff --git a/common/data_source/dropbox_connector.py b/common/data_source/dropbox_connector.py 
index 0a0a3c2de..0e7131d8f 100644 --- a/common/data_source/dropbox_connector.py +++ b/common/data_source/dropbox_connector.py @@ -87,15 +87,69 @@ class DropboxConnector(LoadConnector, PollConnector): if self.dropbox_client is None: raise ConnectorMissingCredentialError("Dropbox") + # Collect all files first to count filename occurrences + all_files = [] + self._collect_files_recursive(path, start, end, all_files) + + # Count filename occurrences + filename_counts: dict[str, int] = {} + for entry, _ in all_files: + filename_counts[entry.name] = filename_counts.get(entry.name, 0) + 1 + + # Process files in batches + batch: list[Document] = [] + for entry, downloaded_file in all_files: + modified_time = entry.client_modified + if modified_time.tzinfo is None: + modified_time = modified_time.replace(tzinfo=timezone.utc) + else: + modified_time = modified_time.astimezone(timezone.utc) + + # Use full path only if filename appears multiple times + if filename_counts.get(entry.name, 0) > 1: + # Remove leading slash and replace slashes with ' / ' + relative_path = entry.path_display.lstrip('/') + semantic_id = relative_path.replace('/', ' / ') if relative_path else entry.name + else: + semantic_id = entry.name + + batch.append( + Document( + id=f"dropbox:{entry.id}", + blob=downloaded_file, + source=DocumentSource.DROPBOX, + semantic_identifier=semantic_id, + extension=get_file_ext(entry.name), + doc_updated_at=modified_time, + size_bytes=entry.size if getattr(entry, "size", None) is not None else len(downloaded_file), + ) + ) + + if len(batch) == self.batch_size: + yield batch + batch = [] + + if batch: + yield batch + + def _collect_files_recursive( + self, + path: str, + start: SecondsSinceUnixEpoch | None, + end: SecondsSinceUnixEpoch | None, + all_files: list, + ) -> None: + """Recursively collect all files matching time criteria.""" + if self.dropbox_client is None: + raise ConnectorMissingCredentialError("Dropbox") + result = self.dropbox_client.files_list_folder( path, - limit=self.batch_size, recursive=False, include_non_downloadable_files=False, ) while True: - batch: list[Document] = [] for entry in result.entries: if isinstance(entry, FileMetadata): modified_time = entry.client_modified @@ -112,27 +166,13 @@ class DropboxConnector(LoadConnector, PollConnector): try: downloaded_file = self._download_file(entry.path_display) + all_files.append((entry, downloaded_file)) except Exception: logger.exception(f"[Dropbox]: Error downloading file {entry.path_display}") continue - batch.append( - Document( - id=f"dropbox:{entry.id}", - blob=downloaded_file, - source=DocumentSource.DROPBOX, - semantic_identifier=entry.name, - extension=get_file_ext(entry.name), - doc_updated_at=modified_time, - size_bytes=entry.size if getattr(entry, "size", None) is not None else len(downloaded_file), - ) - ) - elif isinstance(entry, FolderMetadata): - yield from self._yield_files_recursive(entry.path_lower, start, end) - - if batch: - yield batch + self._collect_files_recursive(entry.path_lower, start, end, all_files) if not result.has_more: break diff --git a/common/data_source/notion_connector.py b/common/data_source/notion_connector.py index e29bbbe76..85e3a5258 100644 --- a/common/data_source/notion_connector.py +++ b/common/data_source/notion_connector.py @@ -447,6 +447,17 @@ class NotionConnector(LoadConnector, PollConnector): raw_page_title = self._read_page_title(page) page_title = raw_page_title or f"Untitled Page with ID {page.id}" + + # Count attachment semantic_identifier occurrences within this page 
+ attachment_name_counts: dict[str, int] = {} + for att_doc in attachment_docs: + name = att_doc.semantic_identifier + attachment_name_counts[name] = attachment_name_counts.get(name, 0) + 1 + + # Update semantic identifiers for duplicate attachments + for att_doc in attachment_docs: + if attachment_name_counts.get(att_doc.semantic_identifier, 0) > 1: + att_doc.semantic_identifier = f"{page_title} / {att_doc.semantic_identifier}" if not page_blocks: if not raw_page_title: From df4219d30edbf7e2b34fe5d690aa0f675e9bdc6d Mon Sep 17 00:00:00 2001 From: Jonah879 Date: Thu, 11 Dec 2025 11:48:53 +0000 Subject: [PATCH 3/6] fix confluence connector --- common/data_source/confluence_connector.py | 89 ++++++++++------------ rag/svr/sync_data_source.py | 46 +++++++++-- 2 files changed, 82 insertions(+), 53 deletions(-) diff --git a/common/data_source/confluence_connector.py b/common/data_source/confluence_connector.py index 256ce4a69..43dfcaa43 100644 --- a/common/data_source/confluence_connector.py +++ b/common/data_source/confluence_connector.py @@ -1494,7 +1494,7 @@ class ConfluenceConnector( return comment_string def _convert_page_to_document( - self, page: dict[str, Any], title_counts: dict[str, int] | None = None + self, page: dict[str, Any] ) -> Document | ConnectorFailure: """ Converts a Confluence page to a Document object. @@ -1510,6 +1510,27 @@ class ConfluenceConnector( self.wiki_base, page["_links"]["webui"], self.is_cloud ) + # Build hierarchical path for semantic identifier + space_name = page.get("space", {}).get("name", "") + + # Build path from ancestors + path_parts = [] + if space_name: + path_parts.append(space_name) + + # Add ancestor pages to path if available + if "ancestors" in page and page["ancestors"]: + for ancestor in page["ancestors"]: + ancestor_title = ancestor.get("title", "") + if ancestor_title: + path_parts.append(ancestor_title) + + # Add current page title + path_parts.append(page_title) + + # Create full path identifier + semantic_identifier = " / ".join(path_parts) if len(path_parts) > 1 else page_title + # Get the page content page_content = extract_text_from_confluence_html( self.confluence_client, page, self._fetched_titles @@ -1552,18 +1573,11 @@ class ConfluenceConnector( BasicExpertInfo(display_name=display_name, email=email) ) - # Build semantic identifier - use full path only if title appears multiple times - semantic_id = page_title - if title_counts and title_counts.get(page_title, 0) > 1: - space_name = page.get("space", {}).get("name", "") - if space_name: - semantic_id = f"{space_name} / {page_title}" - # Create the document return Document( id=page_url, source=DocumentSource.CONFLUENCE, - semantic_identifier=semantic_id, + semantic_identifier=semantic_identifier, extension=".html", # Confluence pages are HTML blob=page_content.encode("utf-8"), # Encode page content as bytes size_bytes=len(page_content.encode("utf-8")), # Calculate size in bytes @@ -1600,23 +1614,11 @@ class ConfluenceConnector( attachment_docs: list[Document] = [] page_url = "" - # Collect all attachments first to count filename occurrences - all_attachments = [] for attachment in self.confluence_client.paginated_cql_retrieval( cql=attachment_query, expand=",".join(_ATTACHMENT_EXPANSION_FIELDS), ): - all_attachments.append(attachment) - - # Count attachment title occurrences - attachment_title_counts: dict[str, int] = {} - for attachment in all_attachments: - attachment_title = attachment.get("title", "") - attachment_title_counts[attachment_title] = 
attachment_title_counts.get(attachment_title, 0) + 1 - - for attachment in all_attachments: media_type: str = attachment.get("metadata", {}).get("mediaType", "") - # TODO(rkuo): this check is partially redundant with validate_attachment_filetype # and checks in convert_attachment_to_content/process_attachment # but doing the check here avoids an unnecessary download. Due for refactoring. @@ -1684,6 +1686,21 @@ class ConfluenceConnector( self.wiki_base, attachment["_links"]["webui"], self.is_cloud ) + # Build semantic identifier with space and page context + attachment_title = attachment.get("title", object_url) + space_name = page.get("space", {}).get("name", "") + page_title = page.get("title", "") + + # Create hierarchical name: Space > Page > Attachment + attachment_path_parts = [] + if space_name: + attachment_path_parts.append(space_name) + if page_title: + attachment_path_parts.append(page_title) + attachment_path_parts.append(attachment_title) + + attachment_semantic_identifier = " / ".join(attachment_path_parts) if len(attachment_path_parts) > 1 else attachment_title + primary_owners: list[BasicExpertInfo] | None = None if "version" in attachment and "by" in attachment["version"]: author = attachment["version"]["by"] @@ -1695,22 +1712,12 @@ class ConfluenceConnector( extension = Path(attachment.get("title", "")).suffix or ".unknown" - # Build semantic identifier - use full path only if title appears multiple times - attachment_title = attachment.get("title", object_url) - semantic_id = attachment_title - if attachment_title_counts.get(attachment_title, 0) > 1: - space_name = attachment.get("space", {}).get("name", "") - page_title = page.get("title", "") - if space_name and page_title: - semantic_id = f"{space_name} / {page_title} / {attachment_title}" - elif page_title: - semantic_id = f"{page_title} / {attachment_title}" attachment_doc = Document( id=attachment_id, # sections=sections, source=DocumentSource.CONFLUENCE, - semantic_identifier=semantic_id, + semantic_identifier=attachment_semantic_identifier, extension=extension, blob=file_blob, size_bytes=len(file_blob), @@ -1767,10 +1774,8 @@ class ConfluenceConnector( start_ts, end, self.batch_size ) logging.debug(f"page_query_url: {page_query_url}") - - # Collect all pages first to count title occurrences - all_pages = [] + # store the next page start for confluence server, cursor for confluence cloud def store_next_page_url(next_page_url: str) -> None: checkpoint.next_page_url = next_page_url @@ -1779,18 +1784,8 @@ class ConfluenceConnector( limit=self.batch_size, next_page_callback=store_next_page_url, ): - all_pages.append(page) - - # Count page title occurrences - title_counts: dict[str, int] = {} - for page in all_pages: - page_title = page.get("title", "") - title_counts[page_title] = title_counts.get(page_title, 0) + 1 - - # Process all pages - for page in all_pages: - # Build doc from page with conditional semantic_id - doc_or_failure = self._convert_page_to_document(page, title_counts) + # Build doc from page + doc_or_failure = self._convert_page_to_document(page) if isinstance(doc_or_failure, ConnectorFailure): yield doc_or_failure diff --git a/rag/svr/sync_data_source.py b/rag/svr/sync_data_source.py index 4349b6f55..991ad7467 100644 --- a/rag/svr/sync_data_source.py +++ b/rag/svr/sync_data_source.py @@ -196,14 +196,48 @@ class Confluence(SyncBase): end_time = datetime.now(timezone.utc).timestamp() - document_generator = load_all_docs_from_checkpoint_connector( - connector=self.connector, - start=start_time, - 
end=end_time, - ) + raw_batch_size = self.conf.get("sync_batch_size") or self.conf.get("batch_size") or INDEX_BATCH_SIZE + try: + batch_size = int(raw_batch_size) + except (TypeError, ValueError): + batch_size = INDEX_BATCH_SIZE + if batch_size <= 0: + batch_size = INDEX_BATCH_SIZE + def document_batches(): + checkpoint = self.connector.build_dummy_checkpoint() + pending_docs = [] + iterations = 0 + iteration_limit = 100_000 + + while checkpoint.has_more: + wrapper = CheckpointOutputWrapper() + doc_generator = wrapper(self.connector.load_from_checkpoint(start_time, end_time, checkpoint)) + for document, failure, next_checkpoint in doc_generator: + if failure is not None: + logging.warning("Confluence connector failure: %s", getattr(failure, "failure_message", failure)) + continue + if document is not None: + pending_docs.append(document) + if len(pending_docs) >= batch_size: + yield pending_docs + pending_docs = [] + if next_checkpoint is not None: + checkpoint = next_checkpoint + + iterations += 1 + if iterations > iteration_limit: + raise RuntimeError("Too many iterations while loading Confluence documents.") + + if pending_docs: + yield pending_docs + + async def async_wrapper(): + for batch in document_batches(): + yield batch + logging.info("Connect to Confluence: {} {}".format(self.conf["wiki_base"], begin_info)) - return [document_generator] + return async_wrapper() class Notion(SyncBase): From fd0411b789076efbf4c950d71ba5f1ab410cba12 Mon Sep 17 00:00:00 2001 From: Jonah879 Date: Thu, 11 Dec 2025 11:51:03 +0000 Subject: [PATCH 4/6] fix errors --- rag/svr/sync_data_source.py | 1 - 1 file changed, 1 deletion(-) diff --git a/rag/svr/sync_data_source.py b/rag/svr/sync_data_source.py index ce422ac77..d1d1da802 100644 --- a/rag/svr/sync_data_source.py +++ b/rag/svr/sync_data_source.py @@ -42,7 +42,6 @@ from common.data_source.config import INDEX_BATCH_SIZE from common.data_source.confluence_connector import ConfluenceConnector from common.data_source.gmail_connector import GmailConnector from common.data_source.interfaces import CheckpointOutputWrapper -from common.data_source.utils import load_all_docs_from_checkpoint_connector from common.log_utils import init_root_logger from common.signal_utils import start_tracemalloc_and_snapshot, stop_tracemalloc from common.versions import get_ragflow_version From 311f48c87f2352b443fa265cc1470f85c0f5c026 Mon Sep 17 00:00:00 2001 From: Jonah879 Date: Thu, 11 Dec 2025 12:07:50 +0000 Subject: [PATCH 5/6] add ancestors for filepath in confluence --- common/data_source/config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/common/data_source/config.py b/common/data_source/config.py index a3d86720c..6e4679b18 100644 --- a/common/data_source/config.py +++ b/common/data_source/config.py @@ -83,6 +83,7 @@ _PAGE_EXPANSION_FIELDS = [ "space", "metadata.labels", "history.lastUpdated", + "ancestors", ] From e86fff19a81a5d28dfa92179dfdbfb9d630f0b5a Mon Sep 17 00:00:00 2001 From: Jonah879 Date: Thu, 11 Dec 2025 12:53:35 +0000 Subject: [PATCH 6/6] show full path for duplicates only --- common/data_source/confluence_connector.py | 37 +++++++++++++++++++--- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/common/data_source/confluence_connector.py b/common/data_source/confluence_connector.py index 62e44de96..647e4c63a 100644 --- a/common/data_source/confluence_connector.py +++ b/common/data_source/confluence_connector.py @@ -1311,6 +1311,9 @@ class ConfluenceConnector( self._low_timeout_confluence_client: OnyxConfluence | None = None 
self._fetched_titles: set[str] = set() self.allow_images = False + # Track document names to detect duplicates + self._document_name_counts: dict[str, int] = {} + self._document_name_paths: dict[str, list[str]] = {} # Remove trailing slash from wiki_base if present self.wiki_base = wiki_base.rstrip("/") @@ -1531,8 +1534,21 @@ class ConfluenceConnector( # Add current page title path_parts.append(page_title) - # Create full path identifier - semantic_identifier = " / ".join(path_parts) if len(path_parts) > 1 else page_title + # Track page names for duplicate detection + full_path = " / ".join(path_parts) if len(path_parts) > 1 else page_title + + # Count occurrences of this page title + if page_title not in self._document_name_counts: + self._document_name_counts[page_title] = 0 + self._document_name_paths[page_title] = [] + self._document_name_counts[page_title] += 1 + self._document_name_paths[page_title].append(full_path) + + # Use simple name if no duplicates, otherwise use full path + if self._document_name_counts[page_title] == 1: + semantic_identifier = page_title + else: + semantic_identifier = full_path # Get the page content page_content = extract_text_from_confluence_html( @@ -1694,7 +1710,7 @@ class ConfluenceConnector( space_name = page.get("space", {}).get("name", "") page_title = page.get("title", "") - # Create hierarchical name: Space > Page > Attachment + # Create hierarchical name: Space / Page / Attachment attachment_path_parts = [] if space_name: attachment_path_parts.append(space_name) @@ -1702,7 +1718,20 @@ class ConfluenceConnector( attachment_path_parts.append(page_title) attachment_path_parts.append(attachment_title) - attachment_semantic_identifier = " / ".join(attachment_path_parts) if len(attachment_path_parts) > 1 else attachment_title + full_attachment_path = " / ".join(attachment_path_parts) if len(attachment_path_parts) > 1 else attachment_title + + # Track attachment names for duplicate detection + if attachment_title not in self._document_name_counts: + self._document_name_counts[attachment_title] = 0 + self._document_name_paths[attachment_title] = [] + self._document_name_counts[attachment_title] += 1 + self._document_name_paths[attachment_title].append(full_attachment_path) + + # Use simple name if no duplicates, otherwise use full path + if self._document_name_counts[attachment_title] == 1: + attachment_semantic_identifier = attachment_title + else: + attachment_semantic_identifier = full_attachment_path primary_owners: list[BasicExpertInfo] | None = None if "version" in attachment and "by" in attachment["version"]:
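
Note on the approach (editorial sketch, not part of the patches): the blob storage, Dropbox, and Notion changes all use a two-pass disambiguation: count how often each bare name occurs, then expand only the duplicated names into a path-based identifier joined with " / ". The final Confluence patch applies the same idea incrementally while streaming, so only the second and later occurrences of a title receive the expanded path. Below is a minimal, self-contained sketch of the two-pass variant; the helper name build_semantic_ids and its example inputs are illustrative, not taken from the connectors.

    import os

    def build_semantic_ids(keys: list[str], prefix: str = "") -> dict[str, str]:
        """Map each object key to a display name, expanding only duplicates."""
        # First pass: count how often each basename occurs across all keys.
        counts: dict[str, int] = {}
        for key in keys:
            name = os.path.basename(key)
            counts[name] = counts.get(name, 0) + 1

        # Second pass: unique basenames keep the bare name; duplicated ones
        # fall back to the prefix-relative path joined with " / ", matching
        # the semantic_identifier format used in the patches above.
        ids: dict[str, str] = {}
        for key in keys:
            name = os.path.basename(key)
            if counts[name] > 1:
                rel = key[len(prefix):] if prefix and key.startswith(prefix) else key
                ids[key] = rel.replace("/", " / ") if rel else name
            else:
                ids[key] = name
        return ids

    # Only the duplicated "notes.txt" is expanded:
    # build_semantic_ids(["a/notes.txt", "b/notes.txt", "report.pdf"])
    # -> {"a/notes.txt": "a / notes.txt",
    #     "b/notes.txt": "b / notes.txt",
    #     "report.pdf": "report.pdf"}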