refactor: Rework get_source_code_chunks_from_code_part to use the embedding engine's tokenizer and max_tokens

This commit is contained in:
Igor Ilic 2025-01-24 13:40:10 +01:00
parent 844d99cb72
commit 902979c1de

View file

@ -122,21 +122,18 @@ def _get_chunk_source_code(
def get_source_code_chunks_from_code_part( def get_source_code_chunks_from_code_part(
code_file_part: CodePart, code_file_part: CodePart,
max_tokens: int = 8192,
overlap: float = 0.25, overlap: float = 0.25,
granularity: float = 0.1, granularity: float = 0.1,
model_name: str = "text-embedding-3-large",
) -> Generator[SourceCodeChunk, None, None]: ) -> Generator[SourceCodeChunk, None, None]:
"""Yields source code chunks from a CodePart object, with configurable token limits and overlap.""" """Yields source code chunks from a CodePart object, with configurable token limits and overlap."""
if not code_file_part.source_code: if not code_file_part.source_code:
logger.error(f"No source code in CodeFile {code_file_part.id}") logger.error(f"No source code in CodeFile {code_file_part.id}")
return return
vector_engine = get_vector_engine() embedding_engine = get_vector_engine().embedding_engine
embedding_model = vector_engine.embedding_engine.model tokenizer = embedding_engine.tokenizer
model_name = embedding_model.split("/")[-1]
tokenizer = tiktoken.encoding_for_model(model_name) max_subchunk_tokens = max(1, int(granularity * embedding_engine.max_tokens))
max_subchunk_tokens = max(1, int(granularity * max_tokens))
subchunk_token_counts = _get_subchunk_token_counts( subchunk_token_counts = _get_subchunk_token_counts(
tokenizer, code_file_part.source_code, max_subchunk_tokens tokenizer, code_file_part.source_code, max_subchunk_tokens
) )
@ -144,7 +141,7 @@ def get_source_code_chunks_from_code_part(
previous_chunk = None previous_chunk = None
while subchunk_token_counts: while subchunk_token_counts:
subchunk_token_counts, chunk_source_code = _get_chunk_source_code( subchunk_token_counts, chunk_source_code = _get_chunk_source_code(
subchunk_token_counts, overlap, max_tokens subchunk_token_counts, overlap
) )
if not chunk_source_code: if not chunk_source_code:
continue continue