fix

2025-11-27 18:02:28 +08:00 · 2025-11-27 18:02:28 +08:00 · ef31e34713
commit ef31e34713
parent ad5e67a50e
2 changed files with 79 additions and 41 deletions
--- a/common/data_source/gmail_connector.py
+++ b/common/data_source/gmail_connector.py
@ -1,8 +1,6 @@
 import logging
-import re
+import os
 import unicodedata
 from typing import Any
 from google.oauth2.credentials import Credentials as OAuthCredentials
 from google.oauth2.service_account import Credentials as ServiceAccountCredentials
 from googleapiclient.errors import HttpError
@ -11,7 +9,7 @@ from common.data_source.config import INDEX_BATCH_SIZE, SLIM_BATCH_SIZE, Documen
 from common.data_source.google_util.auth import get_google_creds
 from common.data_source.google_util.constant import DB_CREDENTIALS_PRIMARY_ADMIN_KEY, MISSING_SCOPES_ERROR_STR, SCOPE_INSTRUCTIONS, USER_FIELDS
 from common.data_source.google_util.resource import get_admin_service, get_gmail_service
-from common.data_source.google_util.util import _execute_single_retrieval, execute_paginated_retrieval
+from common.data_source.google_util.util import _execute_single_retrieval, execute_paginated_retrieval, sanitize_filename, clean_string
 from common.data_source.interfaces import LoadConnector, PollConnector, SecondsSinceUnixEpoch, SlimConnectorWithPermSync
 from common.data_source.models import BasicExpertInfo, Document, ExternalAccess, GenerateDocumentsOutput, GenerateSlimDocumentOutput, SlimDocument, TextSection
 from common.data_source.utils import build_time_range_query, clean_email_and_extract_name, get_message_body, is_mail_service_disabled_error, gmail_time_str_to_utc
@ -95,43 +93,11 @@ def thread_to_document(full_thread: dict[str, Any], email_used_to_fetch_thread:
                    from_emails[email] = display_name if not from_emails.get(email) else None
                else:
                    other_emails[email] = display_name if not other_emails.get(email) else None
        if not semantic_identifier:
            semantic_identifier = message_metadata.get("subject", "")
            def clean_string(text: str | None) -> str | None:
                """
                Clean a string to make it safe for insertion into MySQL (utf8mb4).
                - Normalize Unicode
                - Remove control characters / zero-width characters
                - Optionally remove high-plane emoji and symbols
                """
                if text is None:
                    return None
                # 0. Ensure the value is a string
                text = str(text)
                # 1. Normalize Unicode (NFC)
                text = unicodedata.normalize("NFC", text)
                # 2. Remove ASCII control characters (except tab, newline, carriage return)
                text = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]", "", text)
                # 3. Remove zero-width characters / BOM
                text = re.sub(r"[\u200b-\u200d\uFEFF]", "", text)
                # 4. Remove high Unicode characters (emoji, special symbols)
                text = re.sub(r"[\U00010000-\U0010FFFF]", "", text)
                # 5. Final fallback: strip any invalid UTF-8 sequences
                try:
                    text.encode("utf-8")
                except UnicodeEncodeError:
                    text = text.encode("utf-8", errors="ignore").decode("utf-8")
                return text
            semantic_identifier = clean_string(semantic_identifier)
            semantic_identifier = sanitize_filename(semantic_identifier)
        if message_metadata.get("updated_at"):
            updated_at = message_metadata.get("updated_at")
@ -167,7 +133,7 @@ def thread_to_document(full_thread: dict[str, Any], email_used_to_fetch_thread:
        primary_owners=primary_owners,
        secondary_owners=secondary_owners,
        doc_updated_at=updated_at_datetime,
-        metadata={},
+        metadata=message_metadata,
        external_access=ExternalAccess(
            external_user_emails={email_used_to_fetch_thread},
            external_user_group_ids=set(),
--- a/common/data_source/google_util/util.py
+++ b/common/data_source/google_util/util.py
@ -1,11 +1,12 @@
 import json
 import logging
 import os
 import re
 import socket
 from collections.abc import Callable, Iterator
 from enum import Enum
 from typing import Any
-
+import unicodedata
 from googleapiclient.errors import HttpError  # type: ignore  # type: ignore
 from common.data_source.config import DocumentSource
@ -188,4 +189,75 @@ def get_credentials_from_env(email: str, oauth: bool = False, source="drive") ->
        cred_key: refried_credential_string,
        DB_CREDENTIALS_PRIMARY_ADMIN_KEY: email,
        DB_CREDENTIALS_AUTHENTICATION_METHOD: "uploaded",
-    }
+    }
 def sanitize_filename(name: str) -> str:
    """
    Soft sanitize for MinIO/S3:
    - Replace only prohibited characters with a space.
    - Preserve readability (no ugly underscores).
    - Collapse multiple spaces.
    """
    if name is None:
        return "file.txt"
    name = str(name).strip()
    # Characters that MUST NOT appear in S3/MinIO object keys
    # Replace them with a space (not underscore)
    forbidden = r'[\\\?\#\%\*\:\|\<\>"]'
    name = re.sub(forbidden, " ", name)
    # Replace slashes "/" (S3 interprets as folder) with space
    name = name.replace("/", " ")
    # Collapse multiple spaces into one
    name = re.sub(r"\s+", " ", name)
    # Trim both ends
    name = name.strip()
    # Enforce reasonable max length
    if len(name) > 200:
        base, ext = os.path.splitext(name)
        name = base[:180].rstrip() + ext
    # Ensure there is an extension (your original logic)
    if not os.path.splitext(name)[1]:
        name += ".txt"
    return name
 def clean_string(text: str | None) -> str | None:
    """
    Clean a string to make it safe for insertion into MySQL (utf8mb4).
    - Normalize Unicode
    - Remove control characters / zero-width characters
    - Optionally remove high-plane emoji and symbols
    """
    if text is None:
        return None
    # 0. Ensure the value is a string
    text = str(text)
    # 1. Normalize Unicode (NFC)
    text = unicodedata.normalize("NFC", text)
    # 2. Remove ASCII control characters (except tab, newline, carriage return)
    text = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]", "", text)
    # 3. Remove zero-width characters / BOM
    text = re.sub(r"[\u200b-\u200d\uFEFF]", "", text)
    # 4. Remove high Unicode characters (emoji, special symbols)
    text = re.sub(r"[\U00010000-\U0010FFFF]", "", text)
    # 5. Final fallback: strip any invalid UTF-8 sequences
    try:
        text.encode("utf-8")
    except UnicodeEncodeError:
        text = text.encode("utf-8", errors="ignore").decode("utf-8")
    return text