From 1cf0a202cecf21dbc49878736c48083516c9b367 Mon Sep 17 00:00:00 2001
From: Faizan Shaikh
Date: Fri, 19 Dec 2025 18:20:17 +0530
Subject: [PATCH] test: Add reproduction script for LiteLLMEmbeddingEngine
 tokenizer model name parsing.

---
 reproduce_issue_1915.py | 50 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)
 create mode 100644 reproduce_issue_1915.py

diff --git a/reproduce_issue_1915.py b/reproduce_issue_1915.py
new file mode 100644
index 000000000..29e89d051
--- /dev/null
+++ b/reproduce_issue_1915.py
@@ -0,0 +1,50 @@
+import sys
+import os
+import logging
+
+# Add project root to path
+sys.path.append(os.getcwd())
+
+# Configure logging to see warnings/errors
+logging.basicConfig(level=logging.INFO)
+
+from cognee.infrastructure.databases.vector.embeddings.LiteLLMEmbeddingEngine import LiteLLMEmbeddingEngine
+from cognee.infrastructure.llm.tokenizer.HuggingFace import HuggingFaceTokenizer
+from cognee.infrastructure.llm.tokenizer.TikToken import TikTokenTokenizer
+
+def test_tokenizer_model_name():
+    print("--- Starting Reproduction Test ---")
+
+    # Scenario: provider="custom", model="openai/BAAI/bge-m3".
+    # This mirrors the configuration reported in issue #1915.
+    engine = LiteLLMEmbeddingEngine(
+        model="openai/BAAI/bge-m3",
+        provider="custom",
+        api_key="fake",
+        endpoint="fake"
+    )
+
+    tokenizer = engine.get_tokenizer()
+
+    print(f"Provider: {engine.provider}")
+    print(f"Model Input: {engine.model}")
+    print(f"Resulting Tokenizer: {type(tokenizer).__name__}")
+
+    if hasattr(tokenizer, 'model'):
+        print(f"Tokenizer Model: {tokenizer.model}")
+
+    # Expected: HuggingFaceTokenizer with model="BAAI/bge-m3".
+
+    if isinstance(tokenizer, HuggingFaceTokenizer):
+        if tokenizer.model == "BAAI/bge-m3":
+            print("SUCCESS: Tokenizer model is 'BAAI/bge-m3'")
+        else:
+            print(f"FAILURE: Tokenizer model is '{tokenizer.model}' (Expected 'BAAI/bge-m3')")
+
+    elif isinstance(tokenizer, TikTokenTokenizer):
+        # If it fell back to TikToken, the HuggingFace tokenizer failed to load,
+        # likely because the raw 'openai/BAAI/bge-m3' string was passed through.
+        print("FAILURE: Fell back to TikToken (likely due to HF loading error with 'openai/BAAI/bge-m3')")
+
+if __name__ == "__main__":
+    test_tokenizer_model_name()
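
Note for reviewers: the behavior this script probes is whether the LiteLLM
provider prefix ("openai/") is stripped from "openai/BAAI/bge-m3" before the
model name reaches Hugging Face. A minimal sketch of that parsing step,
assuming a hypothetical strip_provider_prefix helper (illustration only, not
cognee's actual get_tokenizer implementation):

    def strip_provider_prefix(model: str) -> str:
        # Hypothetical helper, not cognee's real code. Strip a leading provider
        # segment only when at least two '/' are present, so
        # "openai/BAAI/bge-m3" -> "BAAI/bge-m3", while a bare HF repo id such
        # as "BAAI/bge-m3" or a plain name like "text-embedding-3-large" is
        # left untouched.
        _, _, rest = model.partition("/")
        return rest if "/" in rest else model

    assert strip_provider_prefix("openai/BAAI/bge-m3") == "BAAI/bge-m3"
    assert strip_provider_prefix("BAAI/bge-m3") == "BAAI/bge-m3"
    assert strip_provider_prefix("text-embedding-3-large") == "text-embedding-3-large"

If the engine applies parsing along these lines, the script above should print
SUCCESS; a TikToken fallback indicates the raw string was passed through.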