From 7e3bf6beca2f9ee5734c8a14497696afc669ce48 Mon Sep 17 00:00:00 2001 From: Jonah879 Date: Thu, 4 Dec 2025 15:01:17 +0000 Subject: [PATCH 1/6] Feat: use filepath for files with the same name --- common/data_source/blob_connector.py | 89 +++++++++++++++++----------- 1 file changed, 53 insertions(+), 36 deletions(-) diff --git a/common/data_source/blob_connector.py b/common/data_source/blob_connector.py index 4cc893649..c4b4fba11 100644 --- a/common/data_source/blob_connector.py +++ b/common/data_source/blob_connector.py @@ -120,55 +120,72 @@ class BlobStorageConnector(LoadConnector, PollConnector): paginator = self.s3_client.get_paginator("list_objects_v2") pages = paginator.paginate(Bucket=self.bucket_name, Prefix=self.prefix) - batch: list[Document] = [] + # Collect all objects first to count filename occurrences + all_objects = [] for page in pages: if "Contents" not in page: continue - for obj in page["Contents"]: if obj["Key"].endswith("/"): continue - last_modified = obj["LastModified"].replace(tzinfo=timezone.utc) + if start < last_modified <= end: + all_objects.append(obj) + + # Count filename occurrences to determine which need full paths + filename_counts: dict[str, int] = {} + for obj in all_objects: + file_name = os.path.basename(obj["Key"]) + filename_counts[file_name] = filename_counts.get(file_name, 0) + 1 - if not (start < last_modified <= end): + batch: list[Document] = [] + for obj in all_objects: + last_modified = obj["LastModified"].replace(tzinfo=timezone.utc) + file_name = os.path.basename(obj["Key"]) + key = obj["Key"] + + size_bytes = extract_size_bytes(obj) + if ( + self.size_threshold is not None + and isinstance(size_bytes, int) + and size_bytes > self.size_threshold + ): + logging.warning( + f"{file_name} exceeds size threshold of {self.size_threshold}. Skipping." + ) + continue + + try: + blob = download_object(self.s3_client, self.bucket_name, key, self.size_threshold) + if blob is None: continue - file_name = os.path.basename(obj["Key"]) - key = obj["Key"] + # Use full path only if filename appears multiple times + if filename_counts.get(file_name, 0) > 1: + relative_path = key + if self.prefix and key.startswith(self.prefix): + relative_path = key[len(self.prefix):] + semantic_id = relative_path.replace('/', ' / ') if relative_path else file_name + else: + semantic_id = file_name - size_bytes = extract_size_bytes(obj) - if ( - self.size_threshold is not None - and isinstance(size_bytes, int) - and size_bytes > self.size_threshold - ): - logging.warning( - f"{file_name} exceeds size threshold of {self.size_threshold}. Skipping." 
+ batch.append( + Document( + id=f"{self.bucket_type}:{self.bucket_name}:{key}", + blob=blob, + source=DocumentSource(self.bucket_type.value), + semantic_identifier=semantic_id, + extension=get_file_ext(file_name), + doc_updated_at=last_modified, + size_bytes=size_bytes if size_bytes else 0 ) - continue - try: - blob = download_object(self.s3_client, self.bucket_name, key, self.size_threshold) - if blob is None: - continue + ) + if len(batch) == self.batch_size: + yield batch + batch = [] - batch.append( - Document( - id=f"{self.bucket_type}:{self.bucket_name}:{key}", - blob=blob, - source=DocumentSource(self.bucket_type.value), - semantic_identifier=file_name, - extension=get_file_ext(file_name), - doc_updated_at=last_modified, - size_bytes=size_bytes if size_bytes else 0 - ) - ) - if len(batch) == self.batch_size: - yield batch - batch = [] - - except Exception: - logging.exception(f"Error decoding object {key}") + except Exception: + logging.exception(f"Error decoding object {key}") if batch: yield batch From 052772b60476c1e6f5e920aea5e7339cc6bf3e97 Mon Sep 17 00:00:00 2001 From: Jonah879 Date: Mon, 8 Dec 2025 12:54:22 +0000 Subject: [PATCH 2/6] Feat: handle duplicates for confluence, dropbox and notion --- common/data_source/confluence_connector.py | 53 +++++++++++++-- common/data_source/dropbox_connector.py | 76 +++++++++++++++++----- common/data_source/notion_connector.py | 11 ++++ 3 files changed, 116 insertions(+), 24 deletions(-) diff --git a/common/data_source/confluence_connector.py b/common/data_source/confluence_connector.py index a057d0694..256ce4a69 100644 --- a/common/data_source/confluence_connector.py +++ b/common/data_source/confluence_connector.py @@ -1494,7 +1494,7 @@ class ConfluenceConnector( return comment_string def _convert_page_to_document( - self, page: dict[str, Any] + self, page: dict[str, Any], title_counts: dict[str, int] | None = None ) -> Document | ConnectorFailure: """ Converts a Confluence page to a Document object. 
@@ -1552,11 +1552,18 @@ class ConfluenceConnector( BasicExpertInfo(display_name=display_name, email=email) ) + # Build semantic identifier - use full path only if title appears multiple times + semantic_id = page_title + if title_counts and title_counts.get(page_title, 0) > 1: + space_name = page.get("space", {}).get("name", "") + if space_name: + semantic_id = f"{space_name} / {page_title}" + # Create the document return Document( id=page_url, source=DocumentSource.CONFLUENCE, - semantic_identifier=page_title, + semantic_identifier=semantic_id, extension=".html", # Confluence pages are HTML blob=page_content.encode("utf-8"), # Encode page content as bytes size_bytes=len(page_content.encode("utf-8")), # Calculate size in bytes @@ -1593,10 +1600,21 @@ class ConfluenceConnector( attachment_docs: list[Document] = [] page_url = "" + # Collect all attachments first to count filename occurrences + all_attachments = [] for attachment in self.confluence_client.paginated_cql_retrieval( cql=attachment_query, expand=",".join(_ATTACHMENT_EXPANSION_FIELDS), ): + all_attachments.append(attachment) + + # Count attachment title occurrences + attachment_title_counts: dict[str, int] = {} + for attachment in all_attachments: + attachment_title = attachment.get("title", "") + attachment_title_counts[attachment_title] = attachment_title_counts.get(attachment_title, 0) + 1 + + for attachment in all_attachments: media_type: str = attachment.get("metadata", {}).get("mediaType", "") # TODO(rkuo): this check is partially redundant with validate_attachment_filetype @@ -1677,11 +1695,22 @@ class ConfluenceConnector( extension = Path(attachment.get("title", "")).suffix or ".unknown" + # Build semantic identifier - use full path only if title appears multiple times + attachment_title = attachment.get("title", object_url) + semantic_id = attachment_title + if attachment_title_counts.get(attachment_title, 0) > 1: + space_name = attachment.get("space", {}).get("name", "") + page_title = page.get("title", "") + if space_name and page_title: + semantic_id = f"{space_name} / {page_title} / {attachment_title}" + elif page_title: + semantic_id = f"{page_title} / {attachment_title}" + attachment_doc = Document( id=attachment_id, # sections=sections, source=DocumentSource.CONFLUENCE, - semantic_identifier=attachment.get("title", object_url), + semantic_identifier=semantic_id, extension=extension, blob=file_blob, size_bytes=len(file_blob), @@ -1739,7 +1768,9 @@ class ConfluenceConnector( ) logging.debug(f"page_query_url: {page_query_url}") - # store the next page start for confluence server, cursor for confluence cloud + # Collect all pages first to count title occurrences + all_pages = [] + def store_next_page_url(next_page_url: str) -> None: checkpoint.next_page_url = next_page_url @@ -1748,8 +1779,18 @@ class ConfluenceConnector( limit=self.batch_size, next_page_callback=store_next_page_url, ): - # Build doc from page - doc_or_failure = self._convert_page_to_document(page) + all_pages.append(page) + + # Count page title occurrences + title_counts: dict[str, int] = {} + for page in all_pages: + page_title = page.get("title", "") + title_counts[page_title] = title_counts.get(page_title, 0) + 1 + + # Process all pages + for page in all_pages: + # Build doc from page with conditional semantic_id + doc_or_failure = self._convert_page_to_document(page, title_counts) if isinstance(doc_or_failure, ConnectorFailure): yield doc_or_failure diff --git a/common/data_source/dropbox_connector.py b/common/data_source/dropbox_connector.py 
index 0a0a3c2de..0e7131d8f 100644 --- a/common/data_source/dropbox_connector.py +++ b/common/data_source/dropbox_connector.py @@ -87,15 +87,69 @@ class DropboxConnector(LoadConnector, PollConnector): if self.dropbox_client is None: raise ConnectorMissingCredentialError("Dropbox") + # Collect all files first to count filename occurrences + all_files = [] + self._collect_files_recursive(path, start, end, all_files) + + # Count filename occurrences + filename_counts: dict[str, int] = {} + for entry, _ in all_files: + filename_counts[entry.name] = filename_counts.get(entry.name, 0) + 1 + + # Process files in batches + batch: list[Document] = [] + for entry, downloaded_file in all_files: + modified_time = entry.client_modified + if modified_time.tzinfo is None: + modified_time = modified_time.replace(tzinfo=timezone.utc) + else: + modified_time = modified_time.astimezone(timezone.utc) + + # Use full path only if filename appears multiple times + if filename_counts.get(entry.name, 0) > 1: + # Remove leading slash and replace slashes with ' / ' + relative_path = entry.path_display.lstrip('/') + semantic_id = relative_path.replace('/', ' / ') if relative_path else entry.name + else: + semantic_id = entry.name + + batch.append( + Document( + id=f"dropbox:{entry.id}", + blob=downloaded_file, + source=DocumentSource.DROPBOX, + semantic_identifier=semantic_id, + extension=get_file_ext(entry.name), + doc_updated_at=modified_time, + size_bytes=entry.size if getattr(entry, "size", None) is not None else len(downloaded_file), + ) + ) + + if len(batch) == self.batch_size: + yield batch + batch = [] + + if batch: + yield batch + + def _collect_files_recursive( + self, + path: str, + start: SecondsSinceUnixEpoch | None, + end: SecondsSinceUnixEpoch | None, + all_files: list, + ) -> None: + """Recursively collect all files matching time criteria.""" + if self.dropbox_client is None: + raise ConnectorMissingCredentialError("Dropbox") + result = self.dropbox_client.files_list_folder( path, - limit=self.batch_size, recursive=False, include_non_downloadable_files=False, ) while True: - batch: list[Document] = [] for entry in result.entries: if isinstance(entry, FileMetadata): modified_time = entry.client_modified @@ -112,27 +166,13 @@ class DropboxConnector(LoadConnector, PollConnector): try: downloaded_file = self._download_file(entry.path_display) + all_files.append((entry, downloaded_file)) except Exception: logger.exception(f"[Dropbox]: Error downloading file {entry.path_display}") continue - batch.append( - Document( - id=f"dropbox:{entry.id}", - blob=downloaded_file, - source=DocumentSource.DROPBOX, - semantic_identifier=entry.name, - extension=get_file_ext(entry.name), - doc_updated_at=modified_time, - size_bytes=entry.size if getattr(entry, "size", None) is not None else len(downloaded_file), - ) - ) - elif isinstance(entry, FolderMetadata): - yield from self._yield_files_recursive(entry.path_lower, start, end) - - if batch: - yield batch + self._collect_files_recursive(entry.path_lower, start, end, all_files) if not result.has_more: break diff --git a/common/data_source/notion_connector.py b/common/data_source/notion_connector.py index e29bbbe76..85e3a5258 100644 --- a/common/data_source/notion_connector.py +++ b/common/data_source/notion_connector.py @@ -447,6 +447,17 @@ class NotionConnector(LoadConnector, PollConnector): raw_page_title = self._read_page_title(page) page_title = raw_page_title or f"Untitled Page with ID {page.id}" + + # Count attachment semantic_identifier occurrences within this page 
+ attachment_name_counts: dict[str, int] = {} + for att_doc in attachment_docs: + name = att_doc.semantic_identifier + attachment_name_counts[name] = attachment_name_counts.get(name, 0) + 1 + + # Update semantic identifiers for duplicate attachments + for att_doc in attachment_docs: + if attachment_name_counts.get(att_doc.semantic_identifier, 0) > 1: + att_doc.semantic_identifier = f"{page_title} / {att_doc.semantic_identifier}" if not page_blocks: if not raw_page_title: From df4219d30edbf7e2b34fe5d690aa0f675e9bdc6d Mon Sep 17 00:00:00 2001 From: Jonah879 Date: Thu, 11 Dec 2025 11:48:53 +0000 Subject: [PATCH 3/6] fix confluence connector --- common/data_source/confluence_connector.py | 89 ++++++++++------------ rag/svr/sync_data_source.py | 46 +++++++++-- 2 files changed, 82 insertions(+), 53 deletions(-) diff --git a/common/data_source/confluence_connector.py b/common/data_source/confluence_connector.py index 256ce4a69..43dfcaa43 100644 --- a/common/data_source/confluence_connector.py +++ b/common/data_source/confluence_connector.py @@ -1494,7 +1494,7 @@ class ConfluenceConnector( return comment_string def _convert_page_to_document( - self, page: dict[str, Any], title_counts: dict[str, int] | None = None + self, page: dict[str, Any] ) -> Document | ConnectorFailure: """ Converts a Confluence page to a Document object. @@ -1510,6 +1510,27 @@ class ConfluenceConnector( self.wiki_base, page["_links"]["webui"], self.is_cloud ) + # Build hierarchical path for semantic identifier + space_name = page.get("space", {}).get("name", "") + + # Build path from ancestors + path_parts = [] + if space_name: + path_parts.append(space_name) + + # Add ancestor pages to path if available + if "ancestors" in page and page["ancestors"]: + for ancestor in page["ancestors"]: + ancestor_title = ancestor.get("title", "") + if ancestor_title: + path_parts.append(ancestor_title) + + # Add current page title + path_parts.append(page_title) + + # Create full path identifier + semantic_identifier = " / ".join(path_parts) if len(path_parts) > 1 else page_title + # Get the page content page_content = extract_text_from_confluence_html( self.confluence_client, page, self._fetched_titles @@ -1552,18 +1573,11 @@ class ConfluenceConnector( BasicExpertInfo(display_name=display_name, email=email) ) - # Build semantic identifier - use full path only if title appears multiple times - semantic_id = page_title - if title_counts and title_counts.get(page_title, 0) > 1: - space_name = page.get("space", {}).get("name", "") - if space_name: - semantic_id = f"{space_name} / {page_title}" - # Create the document return Document( id=page_url, source=DocumentSource.CONFLUENCE, - semantic_identifier=semantic_id, + semantic_identifier=semantic_identifier, extension=".html", # Confluence pages are HTML blob=page_content.encode("utf-8"), # Encode page content as bytes size_bytes=len(page_content.encode("utf-8")), # Calculate size in bytes @@ -1600,23 +1614,11 @@ class ConfluenceConnector( attachment_docs: list[Document] = [] page_url = "" - # Collect all attachments first to count filename occurrences - all_attachments = [] for attachment in self.confluence_client.paginated_cql_retrieval( cql=attachment_query, expand=",".join(_ATTACHMENT_EXPANSION_FIELDS), ): - all_attachments.append(attachment) - - # Count attachment title occurrences - attachment_title_counts: dict[str, int] = {} - for attachment in all_attachments: - attachment_title = attachment.get("title", "") - attachment_title_counts[attachment_title] = 
attachment_title_counts.get(attachment_title, 0) + 1 - - for attachment in all_attachments: media_type: str = attachment.get("metadata", {}).get("mediaType", "") - # TODO(rkuo): this check is partially redundant with validate_attachment_filetype # and checks in convert_attachment_to_content/process_attachment # but doing the check here avoids an unnecessary download. Due for refactoring. @@ -1684,6 +1686,21 @@ class ConfluenceConnector( self.wiki_base, attachment["_links"]["webui"], self.is_cloud ) + # Build semantic identifier with space and page context + attachment_title = attachment.get("title", object_url) + space_name = page.get("space", {}).get("name", "") + page_title = page.get("title", "") + + # Create hierarchical name: Space > Page > Attachment + attachment_path_parts = [] + if space_name: + attachment_path_parts.append(space_name) + if page_title: + attachment_path_parts.append(page_title) + attachment_path_parts.append(attachment_title) + + attachment_semantic_identifier = " / ".join(attachment_path_parts) if len(attachment_path_parts) > 1 else attachment_title + primary_owners: list[BasicExpertInfo] | None = None if "version" in attachment and "by" in attachment["version"]: author = attachment["version"]["by"] @@ -1695,22 +1712,12 @@ class ConfluenceConnector( extension = Path(attachment.get("title", "")).suffix or ".unknown" - # Build semantic identifier - use full path only if title appears multiple times - attachment_title = attachment.get("title", object_url) - semantic_id = attachment_title - if attachment_title_counts.get(attachment_title, 0) > 1: - space_name = attachment.get("space", {}).get("name", "") - page_title = page.get("title", "") - if space_name and page_title: - semantic_id = f"{space_name} / {page_title} / {attachment_title}" - elif page_title: - semantic_id = f"{page_title} / {attachment_title}" attachment_doc = Document( id=attachment_id, # sections=sections, source=DocumentSource.CONFLUENCE, - semantic_identifier=semantic_id, + semantic_identifier=attachment_semantic_identifier, extension=extension, blob=file_blob, size_bytes=len(file_blob), @@ -1767,10 +1774,8 @@ class ConfluenceConnector( start_ts, end, self.batch_size ) logging.debug(f"page_query_url: {page_query_url}") - - # Collect all pages first to count title occurrences - all_pages = [] + # store the next page start for confluence server, cursor for confluence cloud def store_next_page_url(next_page_url: str) -> None: checkpoint.next_page_url = next_page_url @@ -1779,18 +1784,8 @@ class ConfluenceConnector( limit=self.batch_size, next_page_callback=store_next_page_url, ): - all_pages.append(page) - - # Count page title occurrences - title_counts: dict[str, int] = {} - for page in all_pages: - page_title = page.get("title", "") - title_counts[page_title] = title_counts.get(page_title, 0) + 1 - - # Process all pages - for page in all_pages: - # Build doc from page with conditional semantic_id - doc_or_failure = self._convert_page_to_document(page, title_counts) + # Build doc from page + doc_or_failure = self._convert_page_to_document(page) if isinstance(doc_or_failure, ConnectorFailure): yield doc_or_failure diff --git a/rag/svr/sync_data_source.py b/rag/svr/sync_data_source.py index 4349b6f55..991ad7467 100644 --- a/rag/svr/sync_data_source.py +++ b/rag/svr/sync_data_source.py @@ -196,14 +196,48 @@ class Confluence(SyncBase): end_time = datetime.now(timezone.utc).timestamp() - document_generator = load_all_docs_from_checkpoint_connector( - connector=self.connector, - start=start_time, - 
end=end_time, - ) + raw_batch_size = self.conf.get("sync_batch_size") or self.conf.get("batch_size") or INDEX_BATCH_SIZE + try: + batch_size = int(raw_batch_size) + except (TypeError, ValueError): + batch_size = INDEX_BATCH_SIZE + if batch_size <= 0: + batch_size = INDEX_BATCH_SIZE + def document_batches(): + checkpoint = self.connector.build_dummy_checkpoint() + pending_docs = [] + iterations = 0 + iteration_limit = 100_000 + + while checkpoint.has_more: + wrapper = CheckpointOutputWrapper() + doc_generator = wrapper(self.connector.load_from_checkpoint(start_time, end_time, checkpoint)) + for document, failure, next_checkpoint in doc_generator: + if failure is not None: + logging.warning("Confluence connector failure: %s", getattr(failure, "failure_message", failure)) + continue + if document is not None: + pending_docs.append(document) + if len(pending_docs) >= batch_size: + yield pending_docs + pending_docs = [] + if next_checkpoint is not None: + checkpoint = next_checkpoint + + iterations += 1 + if iterations > iteration_limit: + raise RuntimeError("Too many iterations while loading Confluence documents.") + + if pending_docs: + yield pending_docs + + async def async_wrapper(): + for batch in document_batches(): + yield batch + logging.info("Connect to Confluence: {} {}".format(self.conf["wiki_base"], begin_info)) - return [document_generator] + return async_wrapper() class Notion(SyncBase): From fd0411b789076efbf4c950d71ba5f1ab410cba12 Mon Sep 17 00:00:00 2001 From: Jonah879 Date: Thu, 11 Dec 2025 11:51:03 +0000 Subject: [PATCH 4/6] fix errors --- rag/svr/sync_data_source.py | 1 - 1 file changed, 1 deletion(-) diff --git a/rag/svr/sync_data_source.py b/rag/svr/sync_data_source.py index ce422ac77..d1d1da802 100644 --- a/rag/svr/sync_data_source.py +++ b/rag/svr/sync_data_source.py @@ -42,7 +42,6 @@ from common.data_source.config import INDEX_BATCH_SIZE from common.data_source.confluence_connector import ConfluenceConnector from common.data_source.gmail_connector import GmailConnector from common.data_source.interfaces import CheckpointOutputWrapper -from common.data_source.utils import load_all_docs_from_checkpoint_connector from common.log_utils import init_root_logger from common.signal_utils import start_tracemalloc_and_snapshot, stop_tracemalloc from common.versions import get_ragflow_version From 311f48c87f2352b443fa265cc1470f85c0f5c026 Mon Sep 17 00:00:00 2001 From: Jonah879 Date: Thu, 11 Dec 2025 12:07:50 +0000 Subject: [PATCH 5/6] add ancestors for filepath in confluence --- common/data_source/config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/common/data_source/config.py b/common/data_source/config.py index a3d86720c..6e4679b18 100644 --- a/common/data_source/config.py +++ b/common/data_source/config.py @@ -83,6 +83,7 @@ _PAGE_EXPANSION_FIELDS = [ "space", "metadata.labels", "history.lastUpdated", + "ancestors", ] From e86fff19a81a5d28dfa92179dfdbfb9d630f0b5a Mon Sep 17 00:00:00 2001 From: Jonah879 Date: Thu, 11 Dec 2025 12:53:35 +0000 Subject: [PATCH 6/6] show full path for duplicates only --- common/data_source/confluence_connector.py | 37 +++++++++++++++++++--- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/common/data_source/confluence_connector.py b/common/data_source/confluence_connector.py index 62e44de96..647e4c63a 100644 --- a/common/data_source/confluence_connector.py +++ b/common/data_source/confluence_connector.py @@ -1311,6 +1311,9 @@ class ConfluenceConnector( self._low_timeout_confluence_client: OnyxConfluence | None = None 
self._fetched_titles: set[str] = set() self.allow_images = False + # Track document names to detect duplicates + self._document_name_counts: dict[str, int] = {} + self._document_name_paths: dict[str, list[str]] = {} # Remove trailing slash from wiki_base if present self.wiki_base = wiki_base.rstrip("/") @@ -1531,8 +1534,21 @@ class ConfluenceConnector( # Add current page title path_parts.append(page_title) - # Create full path identifier - semantic_identifier = " / ".join(path_parts) if len(path_parts) > 1 else page_title + # Track page names for duplicate detection + full_path = " / ".join(path_parts) if len(path_parts) > 1 else page_title + + # Count occurrences of this page title + if page_title not in self._document_name_counts: + self._document_name_counts[page_title] = 0 + self._document_name_paths[page_title] = [] + self._document_name_counts[page_title] += 1 + self._document_name_paths[page_title].append(full_path) + + # Use simple name if no duplicates, otherwise use full path + if self._document_name_counts[page_title] == 1: + semantic_identifier = page_title + else: + semantic_identifier = full_path # Get the page content page_content = extract_text_from_confluence_html( @@ -1694,7 +1710,7 @@ class ConfluenceConnector( space_name = page.get("space", {}).get("name", "") page_title = page.get("title", "") - # Create hierarchical name: Space > Page > Attachment + # Create hierarchical name: Space / Page / Attachment attachment_path_parts = [] if space_name: attachment_path_parts.append(space_name) @@ -1702,7 +1718,20 @@ class ConfluenceConnector( attachment_path_parts.append(page_title) attachment_path_parts.append(attachment_title) - attachment_semantic_identifier = " / ".join(attachment_path_parts) if len(attachment_path_parts) > 1 else attachment_title + full_attachment_path = " / ".join(attachment_path_parts) if len(attachment_path_parts) > 1 else attachment_title + + # Track attachment names for duplicate detection + if attachment_title not in self._document_name_counts: + self._document_name_counts[attachment_title] = 0 + self._document_name_paths[attachment_title] = [] + self._document_name_counts[attachment_title] += 1 + self._document_name_paths[attachment_title].append(full_attachment_path) + + # Use simple name if no duplicates, otherwise use full path + if self._document_name_counts[attachment_title] == 1: + attachment_semantic_identifier = attachment_title + else: + attachment_semantic_identifier = full_attachment_path primary_owners: list[BasicExpertInfo] | None = None if "version" in attachment and "by" in attachment["version"]:
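
Note on the approach (editorial sketch, not part of the patches): the blob storage, Dropbox, and Notion changes all use a two-pass disambiguation: count how often each bare name occurs, then expand only the duplicated names into a path-based identifier joined with " / ". The final Confluence patch applies the same idea incrementally while streaming, so only the second and later occurrences of a title receive the expanded path. Below is a minimal, self-contained sketch of the two-pass variant; the helper name build_semantic_ids and its example inputs are illustrative, not taken from the connectors.

    import os

    def build_semantic_ids(keys: list[str], prefix: str = "") -> dict[str, str]:
        """Map each object key to a display name, expanding only duplicates."""
        # First pass: count how often each basename occurs across all keys.
        counts: dict[str, int] = {}
        for key in keys:
            name = os.path.basename(key)
            counts[name] = counts.get(name, 0) + 1

        # Second pass: unique basenames keep the bare name; duplicated ones
        # fall back to the prefix-relative path joined with " / ", matching
        # the semantic_identifier format used in the patches above.
        ids: dict[str, str] = {}
        for key in keys:
            name = os.path.basename(key)
            if counts[name] > 1:
                rel = key[len(prefix):] if prefix and key.startswith(prefix) else key
                ids[key] = rel.replace("/", " / ") if rel else name
            else:
                ids[key] = name
        return ids

    # Only the duplicated "notes.txt" is expanded:
    # build_semantic_ids(["a/notes.txt", "b/notes.txt", "report.pdf"])
    # -> {"a/notes.txt": "a / notes.txt",
    #     "b/notes.txt": "b / notes.txt",
    #     "report.pdf": "report.pdf"}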