fix
This commit is contained in:
parent
ad5e67a50e
commit
ef31e34713
2 changed files with 79 additions and 41 deletions
|
|
@ -1,8 +1,6 @@
|
||||||
import logging
|
import logging
|
||||||
import re
|
import os
|
||||||
import unicodedata
|
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
from google.oauth2.credentials import Credentials as OAuthCredentials
|
from google.oauth2.credentials import Credentials as OAuthCredentials
|
||||||
from google.oauth2.service_account import Credentials as ServiceAccountCredentials
|
from google.oauth2.service_account import Credentials as ServiceAccountCredentials
|
||||||
from googleapiclient.errors import HttpError
|
from googleapiclient.errors import HttpError
|
||||||
|
|
@ -11,7 +9,7 @@ from common.data_source.config import INDEX_BATCH_SIZE, SLIM_BATCH_SIZE, Documen
|
||||||
from common.data_source.google_util.auth import get_google_creds
|
from common.data_source.google_util.auth import get_google_creds
|
||||||
from common.data_source.google_util.constant import DB_CREDENTIALS_PRIMARY_ADMIN_KEY, MISSING_SCOPES_ERROR_STR, SCOPE_INSTRUCTIONS, USER_FIELDS
|
from common.data_source.google_util.constant import DB_CREDENTIALS_PRIMARY_ADMIN_KEY, MISSING_SCOPES_ERROR_STR, SCOPE_INSTRUCTIONS, USER_FIELDS
|
||||||
from common.data_source.google_util.resource import get_admin_service, get_gmail_service
|
from common.data_source.google_util.resource import get_admin_service, get_gmail_service
|
||||||
from common.data_source.google_util.util import _execute_single_retrieval, execute_paginated_retrieval
|
from common.data_source.google_util.util import _execute_single_retrieval, execute_paginated_retrieval, sanitize_filename, clean_string
|
||||||
from common.data_source.interfaces import LoadConnector, PollConnector, SecondsSinceUnixEpoch, SlimConnectorWithPermSync
|
from common.data_source.interfaces import LoadConnector, PollConnector, SecondsSinceUnixEpoch, SlimConnectorWithPermSync
|
||||||
from common.data_source.models import BasicExpertInfo, Document, ExternalAccess, GenerateDocumentsOutput, GenerateSlimDocumentOutput, SlimDocument, TextSection
|
from common.data_source.models import BasicExpertInfo, Document, ExternalAccess, GenerateDocumentsOutput, GenerateSlimDocumentOutput, SlimDocument, TextSection
|
||||||
from common.data_source.utils import build_time_range_query, clean_email_and_extract_name, get_message_body, is_mail_service_disabled_error, gmail_time_str_to_utc
|
from common.data_source.utils import build_time_range_query, clean_email_and_extract_name, get_message_body, is_mail_service_disabled_error, gmail_time_str_to_utc
|
||||||
|
|
@ -95,43 +93,11 @@ def thread_to_document(full_thread: dict[str, Any], email_used_to_fetch_thread:
|
||||||
from_emails[email] = display_name if not from_emails.get(email) else None
|
from_emails[email] = display_name if not from_emails.get(email) else None
|
||||||
else:
|
else:
|
||||||
other_emails[email] = display_name if not other_emails.get(email) else None
|
other_emails[email] = display_name if not other_emails.get(email) else None
|
||||||
|
|
||||||
if not semantic_identifier:
|
if not semantic_identifier:
|
||||||
semantic_identifier = message_metadata.get("subject", "")
|
semantic_identifier = message_metadata.get("subject", "")
|
||||||
|
|
||||||
def clean_string(text: str | None) -> str | None:
|
|
||||||
"""
|
|
||||||
Clean a string to make it safe for insertion into MySQL (utf8mb4).
|
|
||||||
- Normalize Unicode
|
|
||||||
- Remove control characters / zero-width characters
|
|
||||||
- Optionally remove high-plane emoji and symbols
|
|
||||||
"""
|
|
||||||
if text is None:
|
|
||||||
return None
|
|
||||||
|
|
||||||
# 0. Ensure the value is a string
|
|
||||||
text = str(text)
|
|
||||||
|
|
||||||
# 1. Normalize Unicode (NFC)
|
|
||||||
text = unicodedata.normalize("NFC", text)
|
|
||||||
|
|
||||||
# 2. Remove ASCII control characters (except tab, newline, carriage return)
|
|
||||||
text = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]", "", text)
|
|
||||||
|
|
||||||
# 3. Remove zero-width characters / BOM
|
|
||||||
text = re.sub(r"[\u200b-\u200d\uFEFF]", "", text)
|
|
||||||
|
|
||||||
# 4. Remove high Unicode characters (emoji, special symbols)
|
|
||||||
text = re.sub(r"[\U00010000-\U0010FFFF]", "", text)
|
|
||||||
|
|
||||||
# 5. Final fallback: strip any invalid UTF-8 sequences
|
|
||||||
try:
|
|
||||||
text.encode("utf-8")
|
|
||||||
except UnicodeEncodeError:
|
|
||||||
text = text.encode("utf-8", errors="ignore").decode("utf-8")
|
|
||||||
|
|
||||||
return text
|
|
||||||
|
|
||||||
semantic_identifier = clean_string(semantic_identifier)
|
semantic_identifier = clean_string(semantic_identifier)
|
||||||
|
semantic_identifier = sanitize_filename(semantic_identifier)
|
||||||
|
|
||||||
if message_metadata.get("updated_at"):
|
if message_metadata.get("updated_at"):
|
||||||
updated_at = message_metadata.get("updated_at")
|
updated_at = message_metadata.get("updated_at")
|
||||||
|
|
@ -167,7 +133,7 @@ def thread_to_document(full_thread: dict[str, Any], email_used_to_fetch_thread:
|
||||||
primary_owners=primary_owners,
|
primary_owners=primary_owners,
|
||||||
secondary_owners=secondary_owners,
|
secondary_owners=secondary_owners,
|
||||||
doc_updated_at=updated_at_datetime,
|
doc_updated_at=updated_at_datetime,
|
||||||
metadata={},
|
metadata=message_metadata,
|
||||||
external_access=ExternalAccess(
|
external_access=ExternalAccess(
|
||||||
external_user_emails={email_used_to_fetch_thread},
|
external_user_emails={email_used_to_fetch_thread},
|
||||||
external_user_group_ids=set(),
|
external_user_group_ids=set(),
|
||||||
|
|
|
||||||
|
|
@ -1,11 +1,12 @@
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
import socket
|
import socket
|
||||||
from collections.abc import Callable, Iterator
|
from collections.abc import Callable, Iterator
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
import unicodedata
|
||||||
from googleapiclient.errors import HttpError # type: ignore # type: ignore
|
from googleapiclient.errors import HttpError # type: ignore # type: ignore
|
||||||
|
|
||||||
from common.data_source.config import DocumentSource
|
from common.data_source.config import DocumentSource
|
||||||
|
|
@ -188,4 +189,75 @@ def get_credentials_from_env(email: str, oauth: bool = False, source="drive") ->
|
||||||
cred_key: refried_credential_string,
|
cred_key: refried_credential_string,
|
||||||
DB_CREDENTIALS_PRIMARY_ADMIN_KEY: email,
|
DB_CREDENTIALS_PRIMARY_ADMIN_KEY: email,
|
||||||
DB_CREDENTIALS_AUTHENTICATION_METHOD: "uploaded",
|
DB_CREDENTIALS_AUTHENTICATION_METHOD: "uploaded",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def sanitize_filename(name: str) -> str:
    """
    Soft-sanitize a name so it can be used as a MinIO/S3 object key.

    - Replace only prohibited characters (including "/") with a space.
    - Preserve readability (no underscores are introduced).
    - Collapse runs of whitespace and trim both ends.
    - Cap the length and make sure the result carries an extension.

    Args:
        name: Proposed file name. Any non-string value is converted with
            ``str()``; ``None`` is accepted and mapped to the fallback name.

    Returns:
        The sanitized name. ``"file.txt"`` is returned when ``name`` is
        ``None`` or when nothing usable is left after sanitizing.
    """
    fallback = "file.txt"
    max_length = 200  # upper bound for the part kept from an over-long name
    max_base_length = 180  # leaves room for an extension of up to 20 chars

    if name is None:
        return fallback

    # Characters that must not appear in the object key are replaced with a
    # space (not an underscore). "/" is included because S3 would interpret it
    # as a folder separator.
    name = re.sub(r'[\\?#%*:|<>"/]', " ", str(name))

    # Collapse whitespace runs into a single space and trim both ends.
    name = re.sub(r"\s+", " ", name).strip()

    # Nothing left (e.g. "" or "???"): use the same fallback as for None
    # instead of producing the base-less name ".txt".
    if not name:
        return fallback

    # Enforce a reasonable maximum length.
    if len(name) > max_length:
        base, ext = os.path.splitext(name)
        if len(ext) > max_length - max_base_length:
            # The text after the last dot is too long to be a real extension
            # (typically a sentence containing a period). Keeping it would
            # defeat the length cap, so truncate the name as a whole.
            name = name[:max_base_length].rstrip()
        else:
            name = base[:max_base_length].rstrip() + ext

    # Ensure there is an extension.
    if not os.path.splitext(name)[1]:
        name += ".txt"

    return name
|
||||||
|
|
||||||
|
|
||||||
|
def clean_string(text: str | None) -> str | None:
|
||||||
|
"""
|
||||||
|
Clean a string to make it safe for insertion into MySQL (utf8mb4).
|
||||||
|
- Normalize Unicode
|
||||||
|
- Remove control characters / zero-width characters
|
||||||
|
- Optionally remove high-plane emoji and symbols
|
||||||
|
"""
|
||||||
|
if text is None:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# 0. Ensure the value is a string
|
||||||
|
text = str(text)
|
||||||
|
|
||||||
|
# 1. Normalize Unicode (NFC)
|
||||||
|
text = unicodedata.normalize("NFC", text)
|
||||||
|
|
||||||
|
# 2. Remove ASCII control characters (except tab, newline, carriage return)
|
||||||
|
text = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]", "", text)
|
||||||
|
|
||||||
|
# 3. Remove zero-width characters / BOM
|
||||||
|
text = re.sub(r"[\u200b-\u200d\uFEFF]", "", text)
|
||||||
|
|
||||||
|
# 4. Remove high Unicode characters (emoji, special symbols)
|
||||||
|
text = re.sub(r"[\U00010000-\U0010FFFF]", "", text)
|
||||||
|
|
||||||
|
# 5. Final fallback: strip any invalid UTF-8 sequences
|
||||||
|
try:
|
||||||
|
text.encode("utf-8")
|
||||||
|
except UnicodeEncodeError:
|
||||||
|
text = text.encode("utf-8", errors="ignore").decode("utf-8")
|
||||||
|
|
||||||
|
return text
|
||||||
Loading…
Add table
Reference in a new issue