From 93249c72c514c84face1aa9faf57f94db7605487 Mon Sep 17 00:00:00 2001
From: Igor Ilic
Date: Tue, 21 Jan 2025 19:53:22 +0100
Subject: [PATCH] fix: Initial commit to resolve issue with selecting the tokenizer based on the LLM

Currently, TikToken is used for tokenizing by default, but it only supports
OpenAI models. This is an initial commit in an attempt to add tokenizing
support for multiple LLMs to Cognee.
---
 cognee/api/v1/cognify/code_graph_pipeline.py  |  2 +-
 .../embeddings/LiteLLMEmbeddingEngine.py      | 23 +++++++
 .../databases/vector/embeddings/config.py     |  2 +-
 .../vector/embeddings/get_embedding_engine.py |  1 +
 .../llm/tokenizer/HuggingFace/__init__.py     |  1 +
 .../llm/tokenizer/HuggingFace/adapter.py      | 22 ++++++
 .../llm/tokenizer/TikToken/__init__.py        |  1 +
 .../llm/tokenizer/TikToken/adapter.py         | 69 +++++++++++++++++++
 .../infrastructure/llm/tokenizer/__init__.py  |  1 +
 .../llm/tokenizer/tokenizer_interface.py      | 18 +++++
 cognee/modules/chunking/TextChunker.py        | 16 +++--
 cognee/modules/cognify/config.py              |  1 -
 .../document_types/AudioDocument.py           |  6 +-
 .../processing/document_types/Document.py     |  2 +-
 .../document_types/ImageDocument.py           |  6 +-
 .../processing/document_types/PdfDocument.py  |  6 +-
 .../processing/document_types/TextDocument.py |  6 +-
 .../document_types/UnstructuredDocument.py    |  4 +-
 cognee/shared/utils.py                        | 39 -----------
 cognee/tasks/chunks/chunk_by_paragraph.py     | 16 ++---
 .../extract_chunks_from_documents.py          |  5 +-
 .../repo_processor/get_source_code_chunks.py  | 13 ++--
 22 files changed, 176 insertions(+), 84 deletions(-)
 create mode 100644 cognee/infrastructure/llm/tokenizer/HuggingFace/__init__.py
 create mode 100644 cognee/infrastructure/llm/tokenizer/HuggingFace/adapter.py
 create mode 100644 cognee/infrastructure/llm/tokenizer/TikToken/__init__.py
 create mode 100644 cognee/infrastructure/llm/tokenizer/TikToken/adapter.py
 create mode 100644 cognee/infrastructure/llm/tokenizer/__init__.py
 create mode 100644 cognee/infrastructure/llm/tokenizer/tokenizer_interface.py

diff --git a/cognee/api/v1/cognify/code_graph_pipeline.py b/cognee/api/v1/cognify/code_graph_pipeline.py
index dc2af0cd5..4a864eb0e 100644
--- a/cognee/api/v1/cognify/code_graph_pipeline.py
+++ b/cognee/api/v1/cognify/code_graph_pipeline.py
@@ -71,7 +71,7 @@ async def run_code_graph_pipeline(repo_path, include_docs=True):
         Task(ingest_data, dataset_name="repo_docs", user=user),
         Task(get_data_list_for_user, dataset_name="repo_docs", user=user),
         Task(classify_documents),
-        Task(extract_chunks_from_documents, max_tokens=cognee_config.max_tokens),
+        Task(extract_chunks_from_documents),
         Task(
             extract_graph_from_data, graph_model=KnowledgeGraph, task_config={"batch_size": 50}
         ),
diff --git a/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py b/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py
index f0a40ca36..842256659 100644
--- a/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py
+++ b/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py
@@ -6,6 +6,8 @@ import litellm
 import os
 from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine
 from cognee.infrastructure.databases.exceptions.EmbeddingException import EmbeddingException
+from cognee.infrastructure.llm.tokenizer.TikToken import TikTokenTokenizer
+from transformers import AutoTokenizer
 
 litellm.set_verbose = False
 logger = logging.getLogger("LiteLLMEmbeddingEngine")
@@ -15,23 +17,30 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine):
     api_key: str
     endpoint: str
     api_version: str
+    provider: str
     model: str
     dimensions: int
     mock: bool
 
     def __init__(
         self,
+        provider: str = "openai",
         model: Optional[str] = "text-embedding-3-large",
         dimensions: Optional[int] = 3072,
         api_key: str = None,
         endpoint: str = None,
         api_version: str = None,
+        max_tokens: float = float("inf"),
     ):
         self.api_key = api_key
         self.endpoint = endpoint
         self.api_version = api_version
+        # TODO: Add or remove provider info
+        self.provider = provider
         self.model = model
         self.dimensions = dimensions
+        self.max_tokens = max_tokens
+        self.tokenizer = self.set_tokenizer()
 
         enable_mocking = os.getenv("MOCK_EMBEDDING", "false")
 
         if isinstance(enable_mocking, bool):
@@ -104,3 +113,16 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine):
 
     def get_vector_size(self) -> int:
         return self.dimensions
+
+    def set_tokenizer(self):
+        logger.debug(f"Loading tokenizer for model {self.model}...")
+        # If model also contains provider information, extract only model information
+        model = self.model.split("/")[-1]
+
+        if "openai" in self.provider.lower() or "gpt" in self.model:
+            tokenizer = TikTokenTokenizer(model=model, max_tokens=self.max_tokens)
+        else:
+            tokenizer = AutoTokenizer.from_pretrained(self.model)
+
+        logger.debug(f"Tokenizer loaded for model: {self.model}")
+        return tokenizer
diff --git a/cognee/infrastructure/databases/vector/embeddings/config.py b/cognee/infrastructure/databases/vector/embeddings/config.py
index 042c063f8..62335ea41 100644
--- a/cognee/infrastructure/databases/vector/embeddings/config.py
+++ b/cognee/infrastructure/databases/vector/embeddings/config.py
@@ -9,7 +9,7 @@ class EmbeddingConfig(BaseSettings):
     embedding_endpoint: Optional[str] = None
     embedding_api_key: Optional[str] = None
     embedding_api_version: Optional[str] = None
-
+    embedding_max_tokens: Optional[float] = float("inf")
     model_config = SettingsConfigDict(env_file=".env", extra="allow")
 
diff --git a/cognee/infrastructure/databases/vector/embeddings/get_embedding_engine.py b/cognee/infrastructure/databases/vector/embeddings/get_embedding_engine.py
index 6bfb4dd15..e894da892 100644
--- a/cognee/infrastructure/databases/vector/embeddings/get_embedding_engine.py
+++ b/cognee/infrastructure/databases/vector/embeddings/get_embedding_engine.py
@@ -15,4 +15,5 @@ def get_embedding_engine() -> EmbeddingEngine:
         api_version=config.embedding_api_version,
         model=config.embedding_model,
         dimensions=config.embedding_dimensions,
+        max_tokens=config.embedding_max_tokens,
     )
diff --git a/cognee/infrastructure/llm/tokenizer/HuggingFace/__init__.py b/cognee/infrastructure/llm/tokenizer/HuggingFace/__init__.py
new file mode 100644
index 000000000..7cdfb9aa0
--- /dev/null
+++ b/cognee/infrastructure/llm/tokenizer/HuggingFace/__init__.py
@@ -0,0 +1 @@
+from .adapter import HuggingFaceTokenizer
diff --git a/cognee/infrastructure/llm/tokenizer/HuggingFace/adapter.py b/cognee/infrastructure/llm/tokenizer/HuggingFace/adapter.py
new file mode 100644
index 000000000..19238b62e
--- /dev/null
+++ b/cognee/infrastructure/llm/tokenizer/HuggingFace/adapter.py
@@ -0,0 +1,22 @@
+from typing import List, Any
+
+from ..tokenizer_interface import TokenizerInterface
+
+
+class HuggingFaceTokenizer(TokenizerInterface):
+    def __init__(
+        self,
+        model: str,
+        max_tokens: float = float("inf"),
+    ):
+        self.model = model
+        self.max_tokens = max_tokens
+
+    def extract_tokens(self, text: str) -> List[Any]:
+        raise NotImplementedError
+
+    def num_tokens_from_text(self, text: str) -> int:
+        raise NotImplementedError
+
+    def trim_text_to_max_tokens(self, text: str) -> str:
+        raise NotImplementedError
diff --git a/cognee/infrastructure/llm/tokenizer/TikToken/__init__.py b/cognee/infrastructure/llm/tokenizer/TikToken/__init__.py
new file mode 100644
index 000000000..4c7a39401
--- /dev/null
+++ b/cognee/infrastructure/llm/tokenizer/TikToken/__init__.py
@@ -0,0 +1 @@
+from .adapter import TikTokenTokenizer
diff --git a/cognee/infrastructure/llm/tokenizer/TikToken/adapter.py b/cognee/infrastructure/llm/tokenizer/TikToken/adapter.py
new file mode 100644
index 000000000..6ba1e0027
--- /dev/null
+++ b/cognee/infrastructure/llm/tokenizer/TikToken/adapter.py
@@ -0,0 +1,69 @@
+from typing import List, Any
+import tiktoken
+
+from ..tokenizer_interface import TokenizerInterface
+
+
+class TikTokenTokenizer(TokenizerInterface):
+    """
+    Tokenizer adapter for OpenAI models.
+    Intended to be used as part of the LLM embedding and LLM adapter classes.
+    """
+
+    def __init__(
+        self,
+        model: str,
+        max_tokens: float = float("inf"),
+    ):
+        self.model = model
+        self.max_tokens = max_tokens
+        # Initialize the TikToken encoding for the given model
+        self.tokenizer = tiktoken.encoding_for_model(self.model)
+
+    def extract_tokens(self, text: str) -> List[Any]:
+        tokens = []
+        # Use TikToken to tokenize the text into token ids
+        token_ids = self.tokenizer.encode(text)
+        # Decode each token id back to its text value
+        for token_id in token_ids:
+            token = self.tokenizer.decode([token_id])
+            tokens.append(token)
+        return tokens
+
+    def num_tokens_from_text(self, text: str) -> int:
+        """
+        Returns the number of tokens in the given text.
+
+        Args:
+            text: str
+
+        Returns:
+            number of tokens in the given text
+        """
+        num_tokens = len(self.tokenizer.encode(text))
+        return num_tokens
+
+    def trim_text_to_max_tokens(self, text: str) -> str:
+        """
+        Trims the text so that the number of tokens does not exceed max_tokens.
+
+        Args:
+            text (str): Original text string to be trimmed.
+
+        Returns:
+            str: Trimmed version of text or original text if under the limit.
+ """ + # First check the number of tokens + num_tokens = self.num_tokens_from_string(text) + + # If the number of tokens is within the limit, return the text as is + if num_tokens <= self.max_tokens: + return text + + # If the number exceeds the limit, trim the text + # This is a simple trim, it may cut words in half; consider using word boundaries for a cleaner cut + encoded_text = self.tokenizer.encode(text) + trimmed_encoded_text = encoded_text[: self.max_tokens] + # Decoding the trimmed text + trimmed_text = self.tokenizer.decode(trimmed_encoded_text) + return trimmed_text diff --git a/cognee/infrastructure/llm/tokenizer/__init__.py b/cognee/infrastructure/llm/tokenizer/__init__.py new file mode 100644 index 000000000..0cc895c13 --- /dev/null +++ b/cognee/infrastructure/llm/tokenizer/__init__.py @@ -0,0 +1 @@ +from .tokenizer_interface import TokenizerInterface diff --git a/cognee/infrastructure/llm/tokenizer/tokenizer_interface.py b/cognee/infrastructure/llm/tokenizer/tokenizer_interface.py new file mode 100644 index 000000000..abd111f12 --- /dev/null +++ b/cognee/infrastructure/llm/tokenizer/tokenizer_interface.py @@ -0,0 +1,18 @@ +from typing import List, Protocol, Any +from abc import abstractmethod + + +class TokenizerInterface(Protocol): + """Tokenizer interface""" + + @abstractmethod + def extract_tokens(self, text: str) -> List[Any]: + raise NotImplementedError + + @abstractmethod + def num_tokens_from_text(self, text: str) -> int: + raise NotImplementedError + + @abstractmethod + def trim_text_to_max_tokens(self, text: str) -> str: + raise NotImplementedError diff --git a/cognee/modules/chunking/TextChunker.py b/cognee/modules/chunking/TextChunker.py index cd71dd128..4fa246272 100644 --- a/cognee/modules/chunking/TextChunker.py +++ b/cognee/modules/chunking/TextChunker.py @@ -14,17 +14,22 @@ class TextChunker: chunk_size = 0 token_count = 0 - def __init__( - self, document, get_text: callable, max_tokens: Optional[int] = None, chunk_size: int = 1024 - ): + def __init__(self, document, get_text: callable, chunk_size: int = 1024): self.document = document self.max_chunk_size = chunk_size self.get_text = get_text - self.max_tokens = max_tokens if max_tokens else float("inf") def check_word_count_and_token_count(self, word_count_before, token_count_before, chunk_data): word_count_fits = word_count_before + chunk_data["word_count"] <= self.max_chunk_size - token_count_fits = token_count_before + chunk_data["token_count"] <= self.max_tokens + + # Get embedding engine related to vector database + from cognee.infrastructure.databases.vector.get_vector_engine import get_vector_engine + + embedding_engine = get_vector_engine().embedding_engine + + token_count_fits = ( + token_count_before + chunk_data["token_count"] <= embedding_engine.max_tokens + ) return word_count_fits and token_count_fits def read(self): @@ -32,7 +37,6 @@ class TextChunker: for content_text in self.get_text(): for chunk_data in chunk_by_paragraph( content_text, - self.max_tokens, self.max_chunk_size, batch_paragraphs=True, ): diff --git a/cognee/modules/cognify/config.py b/cognee/modules/cognify/config.py index dd94d8b41..4ba0f4bd6 100644 --- a/cognee/modules/cognify/config.py +++ b/cognee/modules/cognify/config.py @@ -8,7 +8,6 @@ import os class CognifyConfig(BaseSettings): classification_model: object = DefaultContentPrediction summarization_model: object = SummarizedContent - max_tokens: Optional[int] = os.getenv("MAX_TOKENS") model_config = SettingsConfigDict(env_file=".env", extra="allow") def to_dict(self) 
diff --git a/cognee/modules/data/processing/document_types/AudioDocument.py b/cognee/modules/data/processing/document_types/AudioDocument.py
index b7d2476b4..77c700482 100644
--- a/cognee/modules/data/processing/document_types/AudioDocument.py
+++ b/cognee/modules/data/processing/document_types/AudioDocument.py
@@ -13,14 +13,12 @@ class AudioDocument(Document):
         result = get_llm_client().create_transcript(self.raw_data_location)
         return result.text
 
-    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None):
+    def read(self, chunk_size: int, chunker: str):
         # Transcribe the audio file
         text = self.create_transcript()
 
         chunker_func = ChunkerConfig.get_chunker(chunker)
-        chunker = chunker_func(
-            self, chunk_size=chunk_size, get_text=lambda: [text], max_tokens=max_tokens
-        )
+        chunker = chunker_func(self, chunk_size=chunk_size, get_text=lambda: [text])
 
         yield from chunker.read()
diff --git a/cognee/modules/data/processing/document_types/Document.py b/cognee/modules/data/processing/document_types/Document.py
index e1bbb70ba..e3a56a0a5 100644
--- a/cognee/modules/data/processing/document_types/Document.py
+++ b/cognee/modules/data/processing/document_types/Document.py
@@ -11,5 +11,5 @@ class Document(DataPoint):
     mime_type: str
     _metadata: dict = {"index_fields": ["name"], "type": "Document"}
 
-    def read(self, chunk_size: int, chunker=str, max_tokens: Optional[int] = None) -> str:
+    def read(self, chunk_size: int, chunker: str) -> str:
         pass
diff --git a/cognee/modules/data/processing/document_types/ImageDocument.py b/cognee/modules/data/processing/document_types/ImageDocument.py
index c055b8253..3b5c27d75 100644
--- a/cognee/modules/data/processing/document_types/ImageDocument.py
+++ b/cognee/modules/data/processing/document_types/ImageDocument.py
@@ -13,13 +13,11 @@ class ImageDocument(Document):
         result = get_llm_client().transcribe_image(self.raw_data_location)
         return result.choices[0].message.content
 
-    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None):
+    def read(self, chunk_size: int, chunker: str):
         # Transcribe the image file
         text = self.transcribe_image()
 
         chunker_func = ChunkerConfig.get_chunker(chunker)
-        chunker = chunker_func(
-            self, chunk_size=chunk_size, get_text=lambda: [text], max_tokens=max_tokens
-        )
+        chunker = chunker_func(self, chunk_size=chunk_size, get_text=lambda: [text])
 
         yield from chunker.read()
diff --git a/cognee/modules/data/processing/document_types/PdfDocument.py b/cognee/modules/data/processing/document_types/PdfDocument.py
index 768f91264..0142610bf 100644
--- a/cognee/modules/data/processing/document_types/PdfDocument.py
+++ b/cognee/modules/data/processing/document_types/PdfDocument.py
@@ -9,7 +9,7 @@ from .Document import Document
 class PdfDocument(Document):
     type: str = "pdf"
 
-    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None):
+    def read(self, chunk_size: int, chunker: str):
         file = PdfReader(self.raw_data_location)
 
         def get_text():
@@ -18,9 +18,7 @@ class PdfDocument(Document):
             yield page_text
 
         chunker_func = ChunkerConfig.get_chunker(chunker)
-        chunker = chunker_func(
-            self, chunk_size=chunk_size, get_text=get_text, max_tokens=max_tokens
-        )
+        chunker = chunker_func(self, chunk_size=chunk_size, get_text=get_text)
 
         yield from chunker.read()
diff --git a/cognee/modules/data/processing/document_types/TextDocument.py b/cognee/modules/data/processing/document_types/TextDocument.py
index b62ccd56e..692edcb89 100644
--- a/cognee/modules/data/processing/document_types/TextDocument.py
+++ b/cognee/modules/data/processing/document_types/TextDocument.py
@@ -7,7 +7,7 @@ from .Document import Document
 class TextDocument(Document):
     type: str = "text"
 
-    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None):
+    def read(self, chunk_size: int, chunker: str):
         def get_text():
             with open(self.raw_data_location, mode="r", encoding="utf-8") as file:
                 while True:
@@ -20,8 +20,6 @@ class TextDocument(Document):
 
         chunker_func = ChunkerConfig.get_chunker(chunker)
 
-        chunker = chunker_func(
-            self, chunk_size=chunk_size, get_text=get_text, max_tokens=max_tokens
-        )
+        chunker = chunker_func(self, chunk_size=chunk_size, get_text=get_text)
 
         yield from chunker.read()
diff --git a/cognee/modules/data/processing/document_types/UnstructuredDocument.py b/cognee/modules/data/processing/document_types/UnstructuredDocument.py
index 1c291d0dc..b3849616f 100644
--- a/cognee/modules/data/processing/document_types/UnstructuredDocument.py
+++ b/cognee/modules/data/processing/document_types/UnstructuredDocument.py
@@ -10,7 +10,7 @@ from .Document import Document
 class UnstructuredDocument(Document):
     type: str = "unstructured"
 
-    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None) -> str:
+    def read(self, chunk_size: int, chunker: str) -> str:
         def get_text():
             try:
                 from unstructured.partition.auto import partition
@@ -29,6 +29,6 @@ class UnstructuredDocument(Document):
 
             yield text
 
-        chunker = TextChunker(self, chunk_size=chunk_size, get_text=get_text, max_tokens=max_tokens)
+        chunker = TextChunker(self, chunk_size=chunk_size, get_text=get_text)
 
         yield from chunker.read()
diff --git a/cognee/shared/utils.py b/cognee/shared/utils.py
index 6b1ca7f8f..4e5523fd2 100644
--- a/cognee/shared/utils.py
+++ b/cognee/shared/utils.py
@@ -10,8 +10,6 @@ import graphistry
 import networkx as nx
 import pandas as pd
 import matplotlib.pyplot as plt
-import tiktoken
-import time
 
 import logging
 import sys
@@ -100,15 +98,6 @@ def send_telemetry(event_name: str, user_id, additional_properties: dict = {}):
         print(f"Error sending telemetry through proxy: {response.status_code}")
 
 
-def num_tokens_from_string(string: str, encoding_name: str) -> int:
-    """Returns the number of tokens in a text string."""
-
-    # tiktoken.get_encoding("cl100k_base")
-    encoding = tiktoken.encoding_for_model(encoding_name)
-    num_tokens = len(encoding.encode(string))
-    return num_tokens
-
-
 def get_file_content_hash(file_obj: Union[str, BinaryIO]) -> str:
     h = hashlib.md5()
 
@@ -134,34 +123,6 @@ def get_file_content_hash(file_obj: Union[str, BinaryIO]) -> str:
         raise IngestionError(message=f"Failed to load data from {file}: {e}")
 
 
-def trim_text_to_max_tokens(text: str, max_tokens: int, encoding_name: str) -> str:
-    """
-    Trims the text so that the number of tokens does not exceed max_tokens.
-
-    Args:
-        text (str): Original text string to be trimmed.
-        max_tokens (int): Maximum number of tokens allowed.
-        encoding_name (str): The name of the token encoding to use.
-
-    Returns:
-        str: Trimmed version of text or original text if under the limit.
- """ - # First check the number of tokens - num_tokens = num_tokens_from_string(text, encoding_name) - - # If the number of tokens is within the limit, return the text as is - if num_tokens <= max_tokens: - return text - - # If the number exceeds the limit, trim the text - # This is a simple trim, it may cut words in half; consider using word boundaries for a cleaner cut - encoded_text = tiktoken.get_encoding(encoding_name).encode(text) - trimmed_encoded_text = encoded_text[:max_tokens] - # Decoding the trimmed text - trimmed_text = tiktoken.get_encoding(encoding_name).decode(trimmed_encoded_text) - return trimmed_text - - def generate_color_palette(unique_layers): colormap = plt.cm.get_cmap("viridis", len(unique_layers)) colors = [colormap(i) for i in range(len(unique_layers))] diff --git a/cognee/tasks/chunks/chunk_by_paragraph.py b/cognee/tasks/chunks/chunk_by_paragraph.py index 24d566074..9cb88935e 100644 --- a/cognee/tasks/chunks/chunk_by_paragraph.py +++ b/cognee/tasks/chunks/chunk_by_paragraph.py @@ -4,13 +4,13 @@ from uuid import NAMESPACE_OID, uuid5 import tiktoken from cognee.infrastructure.databases.vector import get_vector_engine +from cognee.infrastructure.databases.vector.embeddings import get_embedding_engine from .chunk_by_sentence import chunk_by_sentence def chunk_by_paragraph( data: str, - max_tokens: Optional[Union[int, float]] = None, paragraph_length: int = 1024, batch_paragraphs: bool = True, ) -> Iterator[Dict[str, Any]]: @@ -24,24 +24,22 @@ def chunk_by_paragraph( paragraph_ids = [] last_cut_type = None current_token_count = 0 - if not max_tokens: - max_tokens = float("inf") + # Get vector and embedding engine vector_engine = get_vector_engine() - embedding_model = vector_engine.embedding_engine.model - embedding_model = embedding_model.split("/")[-1] + embedding_engine = vector_engine.embedding_engine + + # embedding_model = embedding_engine.model.split("/")[-1] for paragraph_id, sentence, word_count, end_type in chunk_by_sentence( data, maximum_length=paragraph_length ): # Check if this sentence would exceed length limit - - tokenizer = tiktoken.encoding_for_model(embedding_model) - token_count = len(tokenizer.encode(sentence)) + token_count = embedding_engine.tokenizer.num_tokens_from_text(sentence) if current_word_count > 0 and ( current_word_count + word_count > paragraph_length - or current_token_count + token_count > max_tokens + or current_token_count + token_count > embedding_engine.max_tokens ): # Yield current chunk chunk_dict = { diff --git a/cognee/tasks/documents/extract_chunks_from_documents.py b/cognee/tasks/documents/extract_chunks_from_documents.py index 5ce224002..6b239975c 100644 --- a/cognee/tasks/documents/extract_chunks_from_documents.py +++ b/cognee/tasks/documents/extract_chunks_from_documents.py @@ -7,10 +7,7 @@ async def extract_chunks_from_documents( documents: list[Document], chunk_size: int = 1024, chunker="text_chunker", - max_tokens: Optional[int] = None, ): for document in documents: - for document_chunk in document.read( - chunk_size=chunk_size, chunker=chunker, max_tokens=max_tokens - ): + for document_chunk in document.read(chunk_size=chunk_size, chunker=chunker): yield document_chunk diff --git a/cognee/tasks/repo_processor/get_source_code_chunks.py b/cognee/tasks/repo_processor/get_source_code_chunks.py index 82fa46cf0..ada71e596 100644 --- a/cognee/tasks/repo_processor/get_source_code_chunks.py +++ b/cognee/tasks/repo_processor/get_source_code_chunks.py @@ -89,26 +89,31 @@ def _get_subchunk_token_counts( def 
-    code_token_counts: list[tuple[str, int]], overlap: float, max_tokens: int
+    code_token_counts: list[tuple[str, int]], overlap: float
 ) -> tuple[list[tuple[str, int]], str]:
     """Generates a chunk of source code from tokenized subchunks with overlap handling."""
     current_count = 0
     cumulative_counts = []
     current_source_code = ""
+    # Get the embedding engine used by the vector database
+    from cognee.infrastructure.databases.vector.get_vector_engine import get_vector_engine
+
+    embedding_engine = get_vector_engine().embedding_engine
+
     for i, (child_code, token_count) in enumerate(code_token_counts):
         current_count += token_count
         cumulative_counts.append(current_count)
-        if current_count > max_tokens:
+        if current_count > embedding_engine.max_tokens:
             break
         current_source_code += f"\n{child_code}"
 
-    if current_count <= max_tokens:
+    if current_count <= embedding_engine.max_tokens:
         return [], current_source_code.strip()
 
     cutoff = 1
     for i, cum_count in enumerate(cumulative_counts):
-        if cum_count > (1 - overlap) * max_tokens:
+        if cum_count > (1 - overlap) * embedding_engine.max_tokens:
             break
         cutoff = i
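
Note: the HuggingFaceTokenizer adapter introduced above is still a stub (all three methods raise NotImplementedError), and set_tokenizer() returns a raw AutoTokenizer for non-OpenAI providers, which does not expose the num_tokens_from_text() method that chunk_by_paragraph() now calls via embedding_engine.tokenizer. Below is a minimal sketch of how the adapter could be completed on top of transformers' AutoTokenizer; it assumes the configured model name is a valid Hugging Face checkpoint id, and the handling of special tokens is an illustrative design choice, not part of this patch:

    from typing import List, Any

    from transformers import AutoTokenizer

    from ..tokenizer_interface import TokenizerInterface


    class HuggingFaceTokenizer(TokenizerInterface):
        def __init__(self, model: str, max_tokens: float = float("inf")):
            self.model = model
            self.max_tokens = max_tokens
            # Assumes `model` is a Hugging Face checkpoint id, e.g. "BAAI/bge-large-en-v1.5" (hypothetical example)
            self.tokenizer = AutoTokenizer.from_pretrained(model)

        def extract_tokens(self, text: str) -> List[Any]:
            # Return the subword strings rather than raw token ids
            token_ids = self.tokenizer.encode(text, add_special_tokens=False)
            return self.tokenizer.convert_ids_to_tokens(token_ids)

        def num_tokens_from_text(self, text: str) -> int:
            return len(self.tokenizer.encode(text, add_special_tokens=False))

        def trim_text_to_max_tokens(self, text: str) -> str:
            # Trim on token boundaries; like the TikToken adapter, this may cut words in half
            token_ids = self.tokenizer.encode(text, add_special_tokens=False)
            if len(token_ids) <= self.max_tokens:
                return text
            return self.tokenizer.decode(token_ids[: int(self.max_tokens)])

With an adapter along these lines, set_tokenizer() could return HuggingFaceTokenizer(model=self.model, max_tokens=self.max_tokens) instead of a raw AutoTokenizer, so every tokenizer that reaches TextChunker and chunk_by_paragraph() satisfies the same TokenizerInterface.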