test: Add reproduction script for LiteLLMEmbeddingEngine tokenizer model name parsing.
This commit is contained in:
parent
d8d3844805
commit
1cf0a202ce
1 changed files with 50 additions and 0 deletions
50
reproduce_issue_1915.py
Normal file
50
reproduce_issue_1915.py
Normal file
|
|
@ -0,0 +1,50 @@
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
|
||||||
|
# Treat the current working directory as the project root and make it
# importable, so the `cognee` package resolves when run from a checkout.
sys.path.append(os.getcwd())

# Show INFO-level (and more severe) log records, so warnings and errors
# emitted while the tokenizer is being built are visible on the console.
logging.basicConfig(level=logging.INFO)
|
||||||
|
|
||||||
|
from cognee.infrastructure.databases.vector.embeddings.LiteLLMEmbeddingEngine import LiteLLMEmbeddingEngine
|
||||||
|
from cognee.infrastructure.llm.tokenizer.HuggingFace import HuggingFaceTokenizer
|
||||||
|
from cognee.infrastructure.llm.tokenizer.TikToken import TikTokenTokenizer
|
||||||
|
|
||||||
|
def test_tokenizer_model_name():
    """Reproduce the tokenizer model-name parsing problem (issue 1915).

    Builds a ``LiteLLMEmbeddingEngine`` with ``provider="custom"`` and the
    provider-prefixed model name ``"openai/BAAI/bge-m3"``, asks it for its
    tokenizer, and prints diagnostics followed by a SUCCESS/FAILURE verdict.

    The desired outcome is a ``HuggingFaceTokenizer`` whose ``model`` is
    ``"BAAI/bge-m3"`` (i.e. the ``openai/`` prefix has been stripped).

    The function only prints; it returns ``None`` and raises nothing itself.
    """
    print("--- Starting Reproduction Test ---")

    # Scenario: provider="custom", model="openai/BAAI/bge-m3"
    # This simulates what the user reports using. The api_key/endpoint values
    # are placeholders.
    engine = LiteLLMEmbeddingEngine(
        model="openai/BAAI/bge-m3",
        provider="custom",
        api_key="fake",
        endpoint="fake",
    )

    tokenizer = engine.get_tokenizer()

    print(f"Provider: {engine.provider}")
    print(f"Model Input: {engine.model}")
    print(f"Resulting Tokenizer: {type(tokenizer).__name__}")

    # Not every tokenizer class is guaranteed to expose a `model` attribute.
    if hasattr(tokenizer, 'model'):
        print(f"Tokenizer Model: {tokenizer.model}")

    # We expect or want HuggingFaceTokenizer with model="BAAI/bge-m3"
    if isinstance(tokenizer, HuggingFaceTokenizer):
        if tokenizer.model == "BAAI/bge-m3":
            print("SUCCESS: Tokenizer model is 'BAAI/bge-m3'")
        else:
            print(f"FAILURE: Tokenizer model is '{tokenizer.model}' (Expected 'BAAI/bge-m3')")
    elif isinstance(tokenizer, TikTokenTokenizer):
        # If it fell back to TikToken, that means HF failed (likely because
        # 'openai/BAAI/bge-m3' was passed)
        print("FAILURE: Fell back to TikToken (likely due to HF loading error with 'openai/BAAI/bge-m3')")
    else:
        # Any other tokenizer type is also not the expected outcome; report it
        # explicitly so the script never finishes without a verdict.
        print(f"FAILURE: Unexpected tokenizer type '{type(tokenizer).__name__}' (Expected 'HuggingFaceTokenizer')")
|
||||||
|
|
||||||
|
# Run the reproduction only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    test_tokenizer_model_name()
|
||||||
Loading…
Add table
Reference in a new issue