From 9570f4a3061c53159ab7f32446f8f22d1167df90 Mon Sep 17 00:00:00 2001 From: Billy Bao Date: Thu, 27 Nov 2025 15:56:52 +0800 Subject: [PATCH] fix gmail connector --- api/apps/connector_app.py | 16 ++++----- common/data_source/gmail_connector.py | 48 +++++++++++++++++++++++---- common/data_source/utils.py | 2 +- 3 files changed, 51 insertions(+), 15 deletions(-) diff --git a/api/apps/connector_app.py b/api/apps/connector_app.py index 51c11db4f..34da2293b 100644 --- a/api/apps/connector_app.py +++ b/api/apps/connector_app.py @@ -194,16 +194,16 @@ async def _render_web_oauth_popup(flow_id: str, success: bool, message: str, sou @login_required @validate_request("credentials") async def start_google_web_oauth(): - source = request.args.get("type", "drive") - if source not in ("drive", "gmail"): + source = request.args.get("type", "google-drive") + if source not in ("google-drive", "gmail"): return get_json_result(code=RetCode.ARGUMENT_ERROR, message="Invalid Google OAuth type.") if source == "gmail": redirect_uri = GMAIL_WEB_OAUTH_REDIRECT_URI scopes = GOOGLE_SCOPES[DocumentSource.GMAIL] else: - redirect_uri = GOOGLE_DRIVE_WEB_OAUTH_REDIRECT_URI if source == "drive" else GMAIL_WEB_OAUTH_REDIRECT_URI - scopes = GOOGLE_SCOPES[DocumentSource.GOOGLE_DRIVE if source == "drive" else DocumentSource.GMAIL] + redirect_uri = GOOGLE_DRIVE_WEB_OAUTH_REDIRECT_URI if source == "google-drive" else GMAIL_WEB_OAUTH_REDIRECT_URI + scopes = GOOGLE_SCOPES[DocumentSource.GOOGLE_DRIVE if source == "google-drive" else DocumentSource.GMAIL] if not redirect_uri: return get_json_result( @@ -269,7 +269,7 @@ async def google_gmail_web_oauth_callback(): state_id = request.args.get("state") error = request.args.get("error") source = "gmail" - if source not in ("drive", "gmail"): + if source != 'gmail': return await _render_web_oauth_popup("", False, "Invalid Google OAuth type.", source) error_description = request.args.get("error_description") or error @@ -323,8 +323,8 @@ async def google_gmail_web_oauth_callback(): async def google_drive_web_oauth_callback(): state_id = request.args.get("state") error = request.args.get("error") - source = "drive" - if source not in ("drive", "gmail"): + source = "google-drive" + if source not in ("google-drive", "gmail"): return await _render_web_oauth_popup("", False, "Invalid Google OAuth type.", source) error_description = request.args.get("error_description") or error @@ -376,7 +376,7 @@ async def google_drive_web_oauth_callback(): async def poll_google_web_result(): req = await request.json or {} source = request.args.get("type") - if source not in ("drive", "gmail"): + if source not in ("google-drive", "gmail"): return get_json_result(code=RetCode.ARGUMENT_ERROR, message="Invalid Google OAuth type.") flow_id = req.get("flow_id") cache_raw = REDIS_CONN.get(_web_result_cache_key(flow_id, source)) diff --git a/common/data_source/gmail_connector.py b/common/data_source/gmail_connector.py index 78e2818c9..96017c917 100644 --- a/common/data_source/gmail_connector.py +++ b/common/data_source/gmail_connector.py @@ -1,6 +1,6 @@ -import json import logging -import sys +import re +import unicodedata from typing import Any from google.oauth2.credentials import Credentials as OAuthCredentials @@ -98,9 +98,44 @@ def thread_to_document(full_thread: dict[str, Any], email_used_to_fetch_thread: if not semantic_identifier: semantic_identifier = message_metadata.get("subject", "") + def clean_string(text: str | None) -> str | None: + """ + Clean a string to make it safe for insertion into MySQL (utf8mb4). + - Normalize Unicode + - Remove control characters / zero-width characters + - Optionally remove high-plane emoji and symbols + """ + if text is None: + return None + + # 0. Ensure the value is a string + text = str(text) + + # 1. Normalize Unicode (NFC) + text = unicodedata.normalize("NFC", text) + + # 2. Remove ASCII control characters (except tab, newline, carriage return) + text = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]", "", text) + + # 3. Remove zero-width characters / BOM + text = re.sub(r"[\u200b-\u200d\uFEFF]", "", text) + + # 4. Remove high Unicode characters (emoji, special symbols) + text = re.sub(r"[\U00010000-\U0010FFFF]", "", text) + + # 5. Final fallback: strip any invalid UTF-8 sequences + try: + text.encode("utf-8") + except UnicodeEncodeError: + text = text.encode("utf-8", errors="ignore").decode("utf-8") + + return text + + semantic_identifier = clean_string(semantic_identifier) + if message_metadata.get("updated_at"): updated_at = message_metadata.get("updated_at") - + updated_at_datetime = None if updated_at: updated_at_datetime = gmail_time_str_to_utc(updated_at) @@ -118,7 +153,7 @@ def thread_to_document(full_thread: dict[str, Any], email_used_to_fetch_thread: combined_sections = "\n\n".join( sec.text for sec in sections if hasattr(sec, "text") ) - blob = combined_sections.encode("utf-8") + blob = combined_sections size_bytes = len(blob) extension = '.txt' @@ -323,7 +358,7 @@ if __name__ == "__main__": logging.basicConfig(level=logging.INFO) try: email = os.environ.get("GMAIL_TEST_EMAIL", "newyorkupperbay@gmail.com") - creds = get_credentials_from_env(email, oauth=True) + creds = get_credentials_from_env(email, oauth=True, source="gmail") print("Credentials loaded successfully") print(f"{creds=}") @@ -335,9 +370,10 @@ if __name__ == "__main__": print("Gmail is ready to use") for file in connector._fetch_threads( - int(time.time()) - 3 * 24 * 60 * 60, + int(time.time()) - 1 * 24 * 60 * 60, int(time.time()), ): + print("new batch","-"*80) for f in file: print(f) print("\n\n") diff --git a/common/data_source/utils.py b/common/data_source/utils.py index 9f676f81d..a18b9321f 100644 --- a/common/data_source/utils.py +++ b/common/data_source/utils.py @@ -733,7 +733,7 @@ def build_time_range_query( """Build time range query for Gmail API""" query = "" if time_range_start is not None and time_range_start != 0: - query += f"after:{int(time_range_start)}" + query += f"after:{int(time_range_start) + 1}" if time_range_end is not None and time_range_end != 0: query += f" before:{int(time_range_end)}" query = query.strip()