fix gmail connector

This commit is contained in:
Billy Bao 2025-11-27 15:56:52 +08:00
parent 1bb1f67f7a
commit 9570f4a306
3 changed files with 51 additions and 15 deletions

View file

@ -194,16 +194,16 @@ async def _render_web_oauth_popup(flow_id: str, success: bool, message: str, sou
@login_required
@validate_request("credentials")
async def start_google_web_oauth():
source = request.args.get("type", "drive")
if source not in ("drive", "gmail"):
source = request.args.get("type", "google-drive")
if source not in ("google-drive", "gmail"):
return get_json_result(code=RetCode.ARGUMENT_ERROR, message="Invalid Google OAuth type.")
if source == "gmail":
redirect_uri = GMAIL_WEB_OAUTH_REDIRECT_URI
scopes = GOOGLE_SCOPES[DocumentSource.GMAIL]
else:
redirect_uri = GOOGLE_DRIVE_WEB_OAUTH_REDIRECT_URI if source == "drive" else GMAIL_WEB_OAUTH_REDIRECT_URI
scopes = GOOGLE_SCOPES[DocumentSource.GOOGLE_DRIVE if source == "drive" else DocumentSource.GMAIL]
redirect_uri = GOOGLE_DRIVE_WEB_OAUTH_REDIRECT_URI if source == "google-drive" else GMAIL_WEB_OAUTH_REDIRECT_URI
scopes = GOOGLE_SCOPES[DocumentSource.GOOGLE_DRIVE if source == "google-drive" else DocumentSource.GMAIL]
if not redirect_uri:
return get_json_result(
@ -269,7 +269,7 @@ async def google_gmail_web_oauth_callback():
state_id = request.args.get("state")
error = request.args.get("error")
source = "gmail"
if source not in ("drive", "gmail"):
if source != 'gmail':
return await _render_web_oauth_popup("", False, "Invalid Google OAuth type.", source)
error_description = request.args.get("error_description") or error
@ -323,8 +323,8 @@ async def google_gmail_web_oauth_callback():
async def google_drive_web_oauth_callback():
state_id = request.args.get("state")
error = request.args.get("error")
source = "drive"
if source not in ("drive", "gmail"):
source = "google-drive"
if source not in ("google-drive", "gmail"):
return await _render_web_oauth_popup("", False, "Invalid Google OAuth type.", source)
error_description = request.args.get("error_description") or error
@ -376,7 +376,7 @@ async def google_drive_web_oauth_callback():
async def poll_google_web_result():
req = await request.json or {}
source = request.args.get("type")
if source not in ("drive", "gmail"):
if source not in ("google-drive", "gmail"):
return get_json_result(code=RetCode.ARGUMENT_ERROR, message="Invalid Google OAuth type.")
flow_id = req.get("flow_id")
cache_raw = REDIS_CONN.get(_web_result_cache_key(flow_id, source))

View file

@ -1,6 +1,6 @@
import json
import logging
import sys
import re
import unicodedata
from typing import Any
from google.oauth2.credentials import Credentials as OAuthCredentials
@ -98,9 +98,44 @@ def thread_to_document(full_thread: dict[str, Any], email_used_to_fetch_thread:
if not semantic_identifier:
semantic_identifier = message_metadata.get("subject", "")
def clean_string(text: str | None) -> str | None:
"""
Clean a string to make it safe for insertion into MySQL (utf8mb4).
- Normalize Unicode
- Remove control characters / zero-width characters
- Optionally remove high-plane emoji and symbols
"""
if text is None:
return None
# 0. Ensure the value is a string
text = str(text)
# 1. Normalize Unicode (NFC)
text = unicodedata.normalize("NFC", text)
# 2. Remove ASCII control characters (except tab, newline, carriage return)
text = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]", "", text)
# 3. Remove zero-width characters / BOM
text = re.sub(r"[\u200b-\u200d\uFEFF]", "", text)
# 4. Remove high Unicode characters (emoji, special symbols)
text = re.sub(r"[\U00010000-\U0010FFFF]", "", text)
# 5. Final fallback: strip any invalid UTF-8 sequences
try:
text.encode("utf-8")
except UnicodeEncodeError:
text = text.encode("utf-8", errors="ignore").decode("utf-8")
return text
semantic_identifier = clean_string(semantic_identifier)
if message_metadata.get("updated_at"):
updated_at = message_metadata.get("updated_at")
updated_at_datetime = None
if updated_at:
updated_at_datetime = gmail_time_str_to_utc(updated_at)
@ -118,7 +153,7 @@ def thread_to_document(full_thread: dict[str, Any], email_used_to_fetch_thread:
combined_sections = "\n\n".join(
sec.text for sec in sections if hasattr(sec, "text")
)
blob = combined_sections.encode("utf-8")
blob = combined_sections
size_bytes = len(blob)
extension = '.txt'
@ -323,7 +358,7 @@ if __name__ == "__main__":
logging.basicConfig(level=logging.INFO)
try:
email = os.environ.get("GMAIL_TEST_EMAIL", "newyorkupperbay@gmail.com")
creds = get_credentials_from_env(email, oauth=True)
creds = get_credentials_from_env(email, oauth=True, source="gmail")
print("Credentials loaded successfully")
print(f"{creds=}")
@ -335,9 +370,10 @@ if __name__ == "__main__":
print("Gmail is ready to use")
for file in connector._fetch_threads(
int(time.time()) - 3 * 24 * 60 * 60,
int(time.time()) - 1 * 24 * 60 * 60,
int(time.time()),
):
print("new batch","-"*80)
for f in file:
print(f)
print("\n\n")

View file

@ -733,7 +733,7 @@ def build_time_range_query(
"""Build time range query for Gmail API"""
query = ""
if time_range_start is not None and time_range_start != 0:
query += f"after:{int(time_range_start)}"
query += f"after:{int(time_range_start) + 1}"
if time_range_end is not None and time_range_end != 0:
query += f" before:{int(time_range_end)}"
query = query.strip()