From ef31e3471384d20a3abb4430f3aa21dfe4bd0f54 Mon Sep 17 00:00:00 2001 From: Billy Bao Date: Thu, 27 Nov 2025 18:02:28 +0800 Subject: [PATCH] fix --- common/data_source/gmail_connector.py | 44 ++------------- common/data_source/google_util/util.py | 76 +++++++++++++++++++++++++- 2 files changed, 79 insertions(+), 41 deletions(-) diff --git a/common/data_source/gmail_connector.py b/common/data_source/gmail_connector.py index 96017c917..1757f4ffe 100644 --- a/common/data_source/gmail_connector.py +++ b/common/data_source/gmail_connector.py @@ -1,8 +1,6 @@ import logging -import re -import unicodedata +import os from typing import Any - from google.oauth2.credentials import Credentials as OAuthCredentials from google.oauth2.service_account import Credentials as ServiceAccountCredentials from googleapiclient.errors import HttpError @@ -11,7 +9,7 @@ from common.data_source.config import INDEX_BATCH_SIZE, SLIM_BATCH_SIZE, Documen from common.data_source.google_util.auth import get_google_creds from common.data_source.google_util.constant import DB_CREDENTIALS_PRIMARY_ADMIN_KEY, MISSING_SCOPES_ERROR_STR, SCOPE_INSTRUCTIONS, USER_FIELDS from common.data_source.google_util.resource import get_admin_service, get_gmail_service -from common.data_source.google_util.util import _execute_single_retrieval, execute_paginated_retrieval +from common.data_source.google_util.util import _execute_single_retrieval, execute_paginated_retrieval, sanitize_filename, clean_string from common.data_source.interfaces import LoadConnector, PollConnector, SecondsSinceUnixEpoch, SlimConnectorWithPermSync from common.data_source.models import BasicExpertInfo, Document, ExternalAccess, GenerateDocumentsOutput, GenerateSlimDocumentOutput, SlimDocument, TextSection from common.data_source.utils import build_time_range_query, clean_email_and_extract_name, get_message_body, is_mail_service_disabled_error, gmail_time_str_to_utc @@ -95,43 +93,11 @@ def thread_to_document(full_thread: dict[str, Any], email_used_to_fetch_thread: from_emails[email] = display_name if not from_emails.get(email) else None else: other_emails[email] = display_name if not other_emails.get(email) else None + if not semantic_identifier: semantic_identifier = message_metadata.get("subject", "") - - def clean_string(text: str | None) -> str | None: - """ - Clean a string to make it safe for insertion into MySQL (utf8mb4). - - Normalize Unicode - - Remove control characters / zero-width characters - - Optionally remove high-plane emoji and symbols - """ - if text is None: - return None - - # 0. Ensure the value is a string - text = str(text) - - # 1. Normalize Unicode (NFC) - text = unicodedata.normalize("NFC", text) - - # 2. Remove ASCII control characters (except tab, newline, carriage return) - text = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]", "", text) - - # 3. Remove zero-width characters / BOM - text = re.sub(r"[\u200b-\u200d\uFEFF]", "", text) - - # 4. Remove high Unicode characters (emoji, special symbols) - text = re.sub(r"[\U00010000-\U0010FFFF]", "", text) - - # 5. Final fallback: strip any invalid UTF-8 sequences - try: - text.encode("utf-8") - except UnicodeEncodeError: - text = text.encode("utf-8", errors="ignore").decode("utf-8") - - return text - semantic_identifier = clean_string(semantic_identifier) + semantic_identifier = sanitize_filename(semantic_identifier) if message_metadata.get("updated_at"): updated_at = message_metadata.get("updated_at") @@ -167,7 +133,7 @@ def thread_to_document(full_thread: dict[str, Any], email_used_to_fetch_thread: primary_owners=primary_owners, secondary_owners=secondary_owners, doc_updated_at=updated_at_datetime, - metadata={}, + metadata=message_metadata, external_access=ExternalAccess( external_user_emails={email_used_to_fetch_thread}, external_user_group_ids=set(), diff --git a/common/data_source/google_util/util.py b/common/data_source/google_util/util.py index 6cf15200f..164445e7e 100644 --- a/common/data_source/google_util/util.py +++ b/common/data_source/google_util/util.py @@ -1,11 +1,12 @@ import json import logging import os +import re import socket from collections.abc import Callable, Iterator from enum import Enum from typing import Any - +import unicodedata from googleapiclient.errors import HttpError # type: ignore # type: ignore from common.data_source.config import DocumentSource @@ -188,4 +189,75 @@ def get_credentials_from_env(email: str, oauth: bool = False, source="drive") -> cred_key: refried_credential_string, DB_CREDENTIALS_PRIMARY_ADMIN_KEY: email, DB_CREDENTIALS_AUTHENTICATION_METHOD: "uploaded", - } \ No newline at end of file + } + +def sanitize_filename(name: str) -> str: + """ + Soft sanitize for MinIO/S3: + - Replace only prohibited characters with a space. + - Preserve readability (no ugly underscores). + - Collapse multiple spaces. + """ + if name is None: + return "file.txt" + + name = str(name).strip() + + # Characters that MUST NOT appear in S3/MinIO object keys + # Replace them with a space (not underscore) + forbidden = r'[\\\?\#\%\*\:\|\<\>"]' + name = re.sub(forbidden, " ", name) + + # Replace slashes "/" (S3 interprets as folder) with space + name = name.replace("/", " ") + + # Collapse multiple spaces into one + name = re.sub(r"\s+", " ", name) + + # Trim both ends + name = name.strip() + + # Enforce reasonable max length + if len(name) > 200: + base, ext = os.path.splitext(name) + name = base[:180].rstrip() + ext + + # Ensure there is an extension (your original logic) + if not os.path.splitext(name)[1]: + name += ".txt" + + return name + + +def clean_string(text: str | None) -> str | None: + """ + Clean a string to make it safe for insertion into MySQL (utf8mb4). + - Normalize Unicode + - Remove control characters / zero-width characters + - Optionally remove high-plane emoji and symbols + """ + if text is None: + return None + + # 0. Ensure the value is a string + text = str(text) + + # 1. Normalize Unicode (NFC) + text = unicodedata.normalize("NFC", text) + + # 2. Remove ASCII control characters (except tab, newline, carriage return) + text = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]", "", text) + + # 3. Remove zero-width characters / BOM + text = re.sub(r"[\u200b-\u200d\uFEFF]", "", text) + + # 4. Remove high Unicode characters (emoji, special symbols) + text = re.sub(r"[\U00010000-\U0010FFFF]", "", text) + + # 5. Final fallback: strip any invalid UTF-8 sequences + try: + text.encode("utf-8") + except UnicodeEncodeError: + text = text.encode("utf-8", errors="ignore").decode("utf-8") + + return text \ No newline at end of file