test: Add reproduction script for LiteLLMEmbeddingEngine tokenizer model name parsing.

Author: Faizan Shaikh
Date: 2025-12-19 18:20:17 +05:30
parent d8d3844805
commit 1cf0a202ce

reproduce_issue_1915.py (new file, 50 lines added)

@@ -0,0 +1,50 @@
import sys
import os
import logging

# Add the project root to sys.path so the local cognee package is importable
sys.path.append(os.getcwd())

# Configure logging so tokenizer warnings/errors are visible
logging.basicConfig(level=logging.INFO)

from cognee.infrastructure.databases.vector.embeddings.LiteLLMEmbeddingEngine import (
    LiteLLMEmbeddingEngine,
)
from cognee.infrastructure.llm.tokenizer.HuggingFace import HuggingFaceTokenizer
from cognee.infrastructure.llm.tokenizer.TikToken import TikTokenTokenizer
def test_tokenizer_model_name():
    print("--- Starting Reproduction Test ---")

    # Scenario: provider="custom", model="openai/BAAI/bge-m3"
    # This simulates the configuration the user reports using.
    engine = LiteLLMEmbeddingEngine(
        model="openai/BAAI/bge-m3",
        provider="custom",
        api_key="fake",
        endpoint="fake",
    )

    tokenizer = engine.get_tokenizer()

    print(f"Provider: {engine.provider}")
    print(f"Model Input: {engine.model}")
    print(f"Resulting Tokenizer: {type(tokenizer).__name__}")
    if hasattr(tokenizer, "model"):
        print(f"Tokenizer Model: {tokenizer.model}")

    # We expect a HuggingFaceTokenizer whose model is "BAAI/bge-m3",
    # i.e. the provider prefix stripped from the configured model name.
    if isinstance(tokenizer, HuggingFaceTokenizer):
        if tokenizer.model == "BAAI/bge-m3":
            print("SUCCESS: Tokenizer model is 'BAAI/bge-m3'")
        else:
            print(f"FAILURE: Tokenizer model is '{tokenizer.model}' (Expected 'BAAI/bge-m3')")
    elif isinstance(tokenizer, TikTokenTokenizer):
        # A TikToken fallback means Hugging Face loading failed, likely because
        # the unstripped "openai/BAAI/bge-m3" name was passed to it.
        print("FAILURE: Fell back to TikToken (likely due to HF loading error with 'openai/BAAI/bge-m3')")


if __name__ == "__main__":
    test_tokenizer_model_name()
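
For context, the parsing behavior this script checks for can be sketched as below. This is a hypothetical illustration, not code from this commit or the cognee codebase: strip_provider_prefix and the prefix list are assumptions about how a LiteLLM-style "provider/model" string could be reduced to a plain Hugging Face model id before loading a tokenizer.

# Hypothetical sketch (assumption, not part of this commit): one way an
# embedding engine could strip a LiteLLM-style provider prefix before
# handing the model name to Hugging Face.
KNOWN_PROVIDER_PREFIXES = ("openai/", "azure/", "huggingface/")  # assumed list

def strip_provider_prefix(model_name: str) -> str:
    # "openai/BAAI/bge-m3" -> "BAAI/bge-m3"; names without a known
    # provider prefix pass through unchanged.
    for prefix in KNOWN_PROVIDER_PREFIXES:
        if model_name.startswith(prefix):
            return model_name[len(prefix):]
    return model_name

assert strip_provider_prefix("openai/BAAI/bge-m3") == "BAAI/bge-m3"
assert strip_provider_prefix("BAAI/bge-m3") == "BAAI/bge-m3"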