refactor: Add max chunk tokens to code graph pipeline
parent dc0450d30e
commit 4e56cd64a1
4 changed files with 20 additions and 13 deletions
@@ -21,6 +21,7 @@ from cognee.tasks.repo_processor import (
 from cognee.tasks.repo_processor.get_source_code_chunks import get_source_code_chunks
 from cognee.tasks.storage import add_data_points
 from cognee.tasks.summarization import summarize_code, summarize_text
+from cognee.infrastructure.llm import get_max_chunk_tokens

 monitoring = get_base_config().monitoring_tool
 if monitoring == MonitoringTool.LANGFUSE:
@@ -71,7 +72,7 @@ async def run_code_graph_pipeline(repo_path, include_docs=True):
         Task(ingest_data, dataset_name="repo_docs", user=user),
         Task(get_data_list_for_user, dataset_name="repo_docs", user=user),
         Task(classify_documents),
-        Task(extract_chunks_from_documents),
+        Task(extract_chunks_from_documents, max_chunk_tokens=get_max_chunk_tokens()),
         Task(
             extract_graph_from_data, graph_model=KnowledgeGraph, task_config={"batch_size": 50}
         ),
@@ -4,8 +4,7 @@ from typing import Union

 from pydantic import BaseModel

-from cognee.infrastructure.databases.vector import get_vector_engine
-from cognee.infrastructure.llm.get_llm_client import get_llm_client
+from cognee.infrastructure.llm import get_max_chunk_tokens
 from cognee.modules.cognify.config import get_cognify_config
 from cognee.modules.data.methods import get_datasets, get_datasets_by_name
 from cognee.modules.data.methods.get_dataset_data import get_dataset_data
@@ -148,22 +147,13 @@ async def get_default_tasks(
     if user is None:
         user = await get_default_user()

-    # Calculate max chunk size based on the following formula
-    embedding_engine = get_vector_engine().embedding_engine
-    llm_client = get_llm_client()
-
-    # We need to make sure chunk size won't take more than half of LLM max context token size
-    # but it also can't be bigger than the embedding engine max token size
-    llm_cutoff_point = llm_client.max_tokens // 2  # Round down the division
-    max_chunk_tokens = min(embedding_engine.max_tokens, llm_cutoff_point)
-
     try:
         cognee_config = get_cognify_config()
         default_tasks = [
             Task(classify_documents),
             Task(check_permissions_on_documents, user=user, permissions=["write"]),
             Task(
-                extract_chunks_from_documents, max_chunk_tokens=max_chunk_tokens
+                extract_chunks_from_documents, max_chunk_tokens=get_max_chunk_tokens()
             ),  # Extract text chunks based on the document type.
             Task(
                 extract_graph_from_data, graph_model=graph_model, task_config={"batch_size": 10}
@@ -1 +1,2 @@
 from .config import get_llm_config
+from .utils import get_max_chunk_tokens
cognee/infrastructure/llm/utils.py (new file, +15)
@@ -0,0 +1,15 @@
+from cognee.infrastructure.databases.vector import get_vector_engine
+from cognee.infrastructure.llm.get_llm_client import get_llm_client
+
+
+def get_max_chunk_tokens():
+    # Calculate max chunk size based on the following formula
+    embedding_engine = get_vector_engine().embedding_engine
+    llm_client = get_llm_client()
+
+    # We need to make sure chunk size won't take more than half of LLM max context token size
+    # but it also can't be bigger than the embedding engine max token size
+    llm_cutoff_point = llm_client.max_tokens // 2  # Round down the division
+    max_chunk_tokens = min(embedding_engine.max_tokens, llm_cutoff_point)
+
+    return max_chunk_tokens
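
Note: a minimal worked sketch (not part of the commit) of how the formula in get_max_chunk_tokens() behaves. The token limits below are assumed, illustrative values, not cognee defaults; in the real helper they come from llm_client.max_tokens and embedding_engine.max_tokens.

    # Illustrative stand-ins for the configured LLM and embedding engine limits
    llm_max_tokens = 128_000        # assumed LLM context window
    embedding_max_tokens = 8_191    # assumed embedding model token limit

    llm_cutoff_point = llm_max_tokens // 2                          # 64000: at most half the LLM context
    max_chunk_tokens = min(embedding_max_tokens, llm_cutoff_point)  # 8191: embedding limit is the binding cap
    print(max_chunk_tokens)  # 8191

With these numbers the embedding engine caps the chunk size; with a small-context LLM (say 8192 tokens) the cutoff would instead be 4096 and the half-context rule would win.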