From ef31e3471384d20a3abb4430f3aa21dfe4bd0f54 Mon Sep 17 00:00:00 2001
From: Billy Bao <newyorkupperbay@gmail.com>
Date: Thu, 27 Nov 2025 18:02:28 +0800
Subject: [PATCH] Gmail connector: sanitize thread titles and move clean_string into google_util

---
 common/data_source/gmail_connector.py  | 44 ++-------------
 common/data_source/google_util/util.py | 76 +++++++++++++++++++++++++-
 2 files changed, 79 insertions(+), 41 deletions(-)

diff --git a/common/data_source/gmail_connector.py b/common/data_source/gmail_connector.py
index 96017c917..1757f4ffe 100644
--- a/common/data_source/gmail_connector.py
+++ b/common/data_source/gmail_connector.py
@@ -1,8 +1,6 @@
 import logging
-import re
-import unicodedata
+import os
 from typing import Any
-
 from google.oauth2.credentials import Credentials as OAuthCredentials
 from google.oauth2.service_account import Credentials as ServiceAccountCredentials
 from googleapiclient.errors import HttpError
@@ -11,7 +9,7 @@ from common.data_source.config import INDEX_BATCH_SIZE, SLIM_BATCH_SIZE, Documen
 from common.data_source.google_util.auth import get_google_creds
 from common.data_source.google_util.constant import DB_CREDENTIALS_PRIMARY_ADMIN_KEY, MISSING_SCOPES_ERROR_STR, SCOPE_INSTRUCTIONS, USER_FIELDS
 from common.data_source.google_util.resource import get_admin_service, get_gmail_service
-from common.data_source.google_util.util import _execute_single_retrieval, execute_paginated_retrieval
+from common.data_source.google_util.util import _execute_single_retrieval, execute_paginated_retrieval, sanitize_filename, clean_string
 from common.data_source.interfaces import LoadConnector, PollConnector, SecondsSinceUnixEpoch, SlimConnectorWithPermSync
 from common.data_source.models import BasicExpertInfo, Document, ExternalAccess, GenerateDocumentsOutput, GenerateSlimDocumentOutput, SlimDocument, TextSection
 from common.data_source.utils import build_time_range_query, clean_email_and_extract_name, get_message_body, is_mail_service_disabled_error, gmail_time_str_to_utc
@@ -95,43 +93,11 @@ def thread_to_document(full_thread: dict[str, Any], email_used_to_fetch_thread:
                     from_emails[email] = display_name if not from_emails.get(email) else None
                 else:
                     other_emails[email] = display_name if not other_emails.get(email) else None
+
         if not semantic_identifier:
             semantic_identifier = message_metadata.get("subject", "")
-
-            def clean_string(text: str | None) -> str | None:
-                """
-                Clean a string to make it safe for insertion into MySQL (utf8mb4).
-                - Normalize Unicode
-                - Remove control characters / zero-width characters
-                - Optionally remove high-plane emoji and symbols
-                """
-                if text is None:
-                    return None
-
-                # 0. Ensure the value is a string
-                text = str(text)
-
-                # 1. Normalize Unicode (NFC)
-                text = unicodedata.normalize("NFC", text)
-
-                # 2. Remove ASCII control characters (except tab, newline, carriage return)
-                text = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]", "", text)
-
-                # 3. Remove zero-width characters / BOM
-                text = re.sub(r"[\u200b-\u200d\uFEFF]", "", text)
-
-                # 4. Remove high Unicode characters (emoji, special symbols)
-                text = re.sub(r"[\U00010000-\U0010FFFF]", "", text)
-
-                # 5. Final fallback: strip any invalid UTF-8 sequences
-                try:
-                    text.encode("utf-8")
-                except UnicodeEncodeError:
-                    text = text.encode("utf-8", errors="ignore").decode("utf-8")
-
-                return text
-
             semantic_identifier = clean_string(semantic_identifier)
+            semantic_identifier = sanitize_filename(semantic_identifier)
 
         if message_metadata.get("updated_at"):
             updated_at = message_metadata.get("updated_at")
@@ -167,7 +133,7 @@ def thread_to_document(full_thread: dict[str, Any], email_used_to_fetch_thread:
         primary_owners=primary_owners,
         secondary_owners=secondary_owners,
         doc_updated_at=updated_at_datetime,
-        metadata={},
+        metadata=message_metadata,
         external_access=ExternalAccess(
             external_user_emails={email_used_to_fetch_thread},
             external_user_group_ids=set(),
diff --git a/common/data_source/google_util/util.py b/common/data_source/google_util/util.py
index 6cf15200f..164445e7e 100644
--- a/common/data_source/google_util/util.py
+++ b/common/data_source/google_util/util.py
@@ -1,11 +1,12 @@
 import json
 import logging
 import os
+import re
 import socket
 from collections.abc import Callable, Iterator
 from enum import Enum
 from typing import Any
-
+import unicodedata
 from googleapiclient.errors import HttpError  # type: ignore  # type: ignore
 
 from common.data_source.config import DocumentSource
@@ -188,4 +189,75 @@ def get_credentials_from_env(email: str, oauth: bool = False, source="drive") ->
         cred_key: refried_credential_string,
         DB_CREDENTIALS_PRIMARY_ADMIN_KEY: email,
         DB_CREDENTIALS_AUTHENTICATION_METHOD: "uploaded",
-    }
\ No newline at end of file
+    }
+
+def sanitize_filename(name: str | None) -> str:
+    """
+    Soft-sanitize a name for use as a MinIO/S3 object key.
+
+    Prohibited characters (including "/") are replaced with spaces rather than
+    underscores to keep the name readable, whitespace runs are collapsed, the
+    result is capped at 200 characters, and ".txt" is appended when there is
+    no extension. None or an empty result yields "file.txt".
+    """
+    if name is None:
+        return "file.txt"
+
+    # Characters that must not appear in S3/MinIO object keys; "/" is included
+    # because S3 interprets it as a folder separator.
+    name = re.sub(r'[\\?#%*:|<>"/]', " ", str(name))
+
+    # Collapse whitespace runs and trim both ends.
+    name = re.sub(r"\s+", " ", name).strip()
+    if not name:
+        # Nothing usable survived; avoid returning a bare ".txt".
+        return "file.txt"
+
+    # Enforce a reasonable max length (at most 180 + 20 = 200 characters).
+    if len(name) > 200:
+        base, ext = os.path.splitext(name)
+        if len(ext) > 20:
+            # A dot early in a long name is punctuation, not an extension;
+            # keeping that "extension" would defeat the cap.
+            base, ext = name, ""
+        name = base[:180].rstrip() + ext
+
+    # Ensure there is an extension.
+    if not os.path.splitext(name)[1]:
+        name += ".txt"
+
+    return name
+
+
+def clean_string(text: str | None) -> str | None:
+    """
+    Make a string safe to store in MySQL (utf8mb4).
+
+    The value is coerced to str, NFC-normalized, and stripped of ASCII control
+    characters (tab, newline and carriage return are kept), zero-width
+    characters / BOM, and every code point above U+FFFF (emoji and other
+    supplementary-plane symbols). None is returned unchanged.
+    """
+    if text is None:
+        return None
+
+    # Character classes deleted after normalization, applied in this order.
+    removal_patterns = (
+        # ASCII control characters (except tab, newline, carriage return)
+        r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]",
+        # Zero-width characters / BOM
+        r"[\u200b-\u200d\uFEFF]",
+        # High Unicode characters (emoji, special symbols)
+        r"[\U00010000-\U0010FFFF]",
+    )
+
+    # Coerce to str first so non-string values are accepted.
+    cleaned = str(text)
+    cleaned = unicodedata.normalize("NFC", cleaned)
+    for pattern in removal_patterns:
+        cleaned = re.sub(pattern, "", cleaned)
+
+    # Final fallback: drop anything UTF-8 cannot encode (such as lone
+    # surrogates). For text that already encodes cleanly this round trip
+    # returns an identical string.
+    return cleaned.encode("utf-8", errors="ignore").decode("utf-8")
\ No newline at end of file