refactor: Refactor get source code chunks based on tokenizer rework
This commit is contained in:
parent
844d99cb72
commit
902979c1de
1 changed file with 5 additions and 8 deletions
|
|
@@ -122,21 +122,18 @@ def _get_chunk_source_code(
|
||||||
|
|
||||||
def get_source_code_chunks_from_code_part(
|
def get_source_code_chunks_from_code_part(
|
||||||
code_file_part: CodePart,
|
code_file_part: CodePart,
|
||||||
max_tokens: int = 8192,
|
|
||||||
overlap: float = 0.25,
|
overlap: float = 0.25,
|
||||||
granularity: float = 0.1,
|
granularity: float = 0.1,
|
||||||
model_name: str = "text-embedding-3-large",
|
|
||||||
) -> Generator[SourceCodeChunk, None, None]:
|
) -> Generator[SourceCodeChunk, None, None]:
|
||||||
"""Yields source code chunks from a CodePart object, with configurable token limits and overlap."""
|
"""Yields source code chunks from a CodePart object, with configurable token limits and overlap."""
|
||||||
if not code_file_part.source_code:
|
if not code_file_part.source_code:
|
||||||
logger.error(f"No source code in CodeFile {code_file_part.id}")
|
logger.error(f"No source code in CodeFile {code_file_part.id}")
|
||||||
return
|
return
|
||||||
|
|
||||||
vector_engine = get_vector_engine()
|
embedding_engine = get_vector_engine().embedding_engine
|
||||||
embedding_model = vector_engine.embedding_engine.model
|
tokenizer = embedding_engine.tokenizer
|
||||||
model_name = embedding_model.split("/")[-1]
|
|
||||||
tokenizer = tiktoken.encoding_for_model(model_name)
|
max_subchunk_tokens = max(1, int(granularity * embedding_engine.max_tokens))
|
||||||
max_subchunk_tokens = max(1, int(granularity * max_tokens))
|
|
||||||
subchunk_token_counts = _get_subchunk_token_counts(
|
subchunk_token_counts = _get_subchunk_token_counts(
|
||||||
tokenizer, code_file_part.source_code, max_subchunk_tokens
|
tokenizer, code_file_part.source_code, max_subchunk_tokens
|
||||||
)
|
)
|
||||||
|
|
@@ -144,7 +141,7 @@ def get_source_code_chunks_from_code_part(
|
||||||
previous_chunk = None
|
previous_chunk = None
|
||||||
while subchunk_token_counts:
|
while subchunk_token_counts:
|
||||||
subchunk_token_counts, chunk_source_code = _get_chunk_source_code(
|
subchunk_token_counts, chunk_source_code = _get_chunk_source_code(
|
||||||
subchunk_token_counts, overlap, max_tokens
|
subchunk_token_counts, overlap
|
||||||
)
|
)
|
||||||
if not chunk_source_code:
|
if not chunk_source_code:
|
||||||
continue
|
continue
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue