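# Reproduction script for a cognee tokenizer-selection issue: with
# provider="custom" and model="openai/BAAI/bge-m3", get_tokenizer() should
# return a HuggingFaceTokenizer configured for "BAAI/bge-m3" instead of
# falling back to TikToken.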
import sys
import os
import logging

# Add project root to path
sys.path.append(os.getcwd())

# Configure logging to see warnings/errors
logging.basicConfig(level=logging.INFO)

from cognee.infrastructure.databases.vector.embeddings.LiteLLMEmbeddingEngine import LiteLLMEmbeddingEngine
from cognee.infrastructure.llm.tokenizer.HuggingFace import HuggingFaceTokenizer
from cognee.infrastructure.llm.tokenizer.TikToken import TikTokenTokenizer


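# A minimal sketch (hypothetical, not cognee code) of the normalization this
# repro expects the engine to perform internally: strip a single LiteLLM
# provider prefix such as "openai/" before handing the model name to Hugging
# Face. The helper name and the prefix set are assumptions for illustration.
def strip_provider_prefix(model: str) -> str:
    prefix, _, rest = model.partition("/")
    # Assumed set of LiteLLM provider prefixes relevant to this repro.
    if rest and prefix in {"openai", "azure", "huggingface", "ollama"}:
        return rest  # "openai/BAAI/bge-m3" -> "BAAI/bge-m3"
    return model  # bare repo ids like "BAAI/bge-m3" pass through unchanged

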
def test_tokenizer_model_name():
    print("--- Starting Reproduction Test ---")

    # Scenario: provider="custom", model="openai/BAAI/bge-m3".
    # This simulates what the user reports using.
    engine = LiteLLMEmbeddingEngine(
        model="openai/BAAI/bge-m3",
        provider="custom",
        api_key="fake",
        endpoint="fake",
    )
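
    # Dummy credentials: assumed to be sufficient for this repro, since
    # get_tokenizer() resolves a tokenizer locally rather than calling the
    # embedding endpoint.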
    tokenizer = engine.get_tokenizer()

    print(f"Provider: {engine.provider}")
    print(f"Model Input: {engine.model}")
    print(f"Resulting Tokenizer: {type(tokenizer).__name__}")

    if hasattr(tokenizer, "model"):
        print(f"Tokenizer Model: {tokenizer.model}")

    # We expect a HuggingFaceTokenizer with model="BAAI/bge-m3".
    if isinstance(tokenizer, HuggingFaceTokenizer):
        if tokenizer.model == "BAAI/bge-m3":
            print("SUCCESS: Tokenizer model is 'BAAI/bge-m3'")
        else:
            print(f"FAILURE: Tokenizer model is '{tokenizer.model}' (expected 'BAAI/bge-m3')")
    elif isinstance(tokenizer, TikTokenTokenizer):
        # A fallback to TikToken means the Hugging Face tokenizer failed to
        # load, likely because the full string 'openai/BAAI/bge-m3' was
        # passed through as the repository name.
        print("FAILURE: Fell back to TikToken (likely due to HF loading error with 'openai/BAAI/bge-m3')")


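# Optional control case, a sketch not present in the original repro: with a
# bare Hugging Face repo id (no LiteLLM provider prefix), the same constructor
# arguments are assumed to resolve the HuggingFace tokenizer directly.
def test_tokenizer_without_prefix():
    engine = LiteLLMEmbeddingEngine(
        model="BAAI/bge-m3",
        provider="custom",
        api_key="fake",
        endpoint="fake",
    )
    tokenizer = engine.get_tokenizer()
    print(f"Control Tokenizer: {type(tokenizer).__name__}")

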
if __name__ == "__main__":
    test_tokenizer_model_name()