diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 42f12ea51..96bfe6d32 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -10,7 +10,7 @@ repos: - id: check-added-large-files - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. - rev: v0.8.3 + rev: v0.9.0 hooks: # Run the linter. - id: ruff diff --git a/cognee/api/v1/cognify/code_graph_pipeline.py b/cognee/api/v1/cognify/code_graph_pipeline.py index 405cb0b40..2d077f39b 100644 --- a/cognee/api/v1/cognify/code_graph_pipeline.py +++ b/cognee/api/v1/cognify/code_graph_pipeline.py @@ -3,7 +3,6 @@ import logging from pathlib import Path from cognee.base_config import get_base_config -from cognee.infrastructure.databases.vector.embeddings import get_embedding_engine from cognee.modules.cognify.config import get_cognify_config from cognee.modules.pipelines import run_tasks from cognee.modules.pipelines.tasks.Task import Task @@ -54,8 +53,6 @@ async def run_code_graph_pipeline(repo_path, include_docs=True): await cognee.prune.prune_system(metadata=True) await create_db_and_tables() - embedding_engine = get_embedding_engine() - cognee_config = get_cognify_config() user = await get_default_user() @@ -63,11 +60,7 @@ async def run_code_graph_pipeline(repo_path, include_docs=True): Task(get_repo_file_dependencies), Task(enrich_dependency_graph), Task(expand_dependency_graph, task_config={"batch_size": 50}), - Task( - get_source_code_chunks, - embedding_model=embedding_engine.model, - task_config={"batch_size": 50}, - ), + Task(get_source_code_chunks, task_config={"batch_size": 50}), Task(summarize_code, task_config={"batch_size": 50}), Task(add_data_points, task_config={"batch_size": 50}), ] @@ -78,7 +71,7 @@ async def run_code_graph_pipeline(repo_path, include_docs=True): Task(ingest_data_with_metadata, dataset_name="repo_docs", user=user), Task(get_data_list_for_user, dataset_name="repo_docs", user=user), Task(classify_documents), - Task(extract_chunks_from_documents), + Task(extract_chunks_from_documents, max_tokens=cognee_config.max_tokens), Task( extract_graph_from_data, graph_model=KnowledgeGraph, task_config={"batch_size": 50} ), diff --git a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py index 5490f6b43..3543418fc 100644 --- a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +++ b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py @@ -493,7 +493,7 @@ class Neo4jAdapter(GraphDBInterface): query_edges = f""" MATCH (n)-[r]->(m) - WHERE {where_clause} AND {where_clause.replace('n.', 'm.')} + WHERE {where_clause} AND {where_clause.replace("n.", "m.")} RETURN ID(n) AS source, ID(m) AS target, TYPE(r) AS type, properties(r) AS properties """ result_edges = await self.query(query_edges) diff --git a/cognee/infrastructure/llm/prompts/patch_gen_kg_instructions.txt b/cognee/infrastructure/llm/prompts/patch_gen_kg_instructions.txt index ebbb03f75..3117ac9f1 100644 --- a/cognee/infrastructure/llm/prompts/patch_gen_kg_instructions.txt +++ b/cognee/infrastructure/llm/prompts/patch_gen_kg_instructions.txt @@ -1,3 +1,6 @@ -I need you to solve this issue by looking at the provided edges retrieved from a knowledge graph and -generate a single patch file that I can apply directly to this repository using git apply. -Please respond with a single patch file in the following format. \ No newline at end of file +You are a senior software engineer. 
I need you to solve this issue by looking at the provided context and +generate a single patch file that I can apply directly to this repository using git apply. +Additionally, please make sure that the code you provide is syntactically correct and +that the patch targets the relevant files (including their paths, which you can infer from the GitHub issue). Don't change the names of existing +functions or classes, as they may be referenced from other code. +Please respond only with a single patch file in the following format, without any additional context or text. diff --git a/cognee/modules/chunking/TextChunker.py b/cognee/modules/chunking/TextChunker.py index 7bb8a1c1c..78c02b9c9 100644 --- a/cognee/modules/chunking/TextChunker.py +++ b/cognee/modules/chunking/TextChunker.py @@ -1,7 +1,9 @@ -from uuid import uuid5, NAMESPACE_OID +from typing import Optional +from uuid import NAMESPACE_OID, uuid5 + +from cognee.tasks.chunks import chunk_by_paragraph from .models.DocumentChunk import DocumentChunk -from cognee.tasks.chunks import chunk_by_paragraph class TextChunker: @@ -10,23 +12,36 @@ class TextChunker: chunk_index = 0 chunk_size = 0 + token_count = 0 - def __init__(self, document, get_text: callable, chunk_size: int = 1024): + def __init__( + self, document, get_text: callable, max_tokens: Optional[int] = None, chunk_size: int = 1024 + ): self.document = document self.max_chunk_size = chunk_size self.get_text = get_text + self.max_tokens = max_tokens if max_tokens else float("inf") + + def check_word_count_and_token_count(self, word_count_before, token_count_before, chunk_data): + word_count_fits = word_count_before + chunk_data["word_count"] <= self.max_chunk_size + token_count_fits = token_count_before + chunk_data["token_count"] <= self.max_tokens + return word_count_fits and token_count_fits def read(self): paragraph_chunks = [] for content_text in self.get_text(): for chunk_data in chunk_by_paragraph( content_text, + self.max_tokens, self.max_chunk_size, batch_paragraphs=True, ): - if self.chunk_size + chunk_data["word_count"] <= self.max_chunk_size: + if self.check_word_count_and_token_count( + self.chunk_size, self.token_count, chunk_data + ): paragraph_chunks.append(chunk_data) self.chunk_size += chunk_data["word_count"] + self.token_count += chunk_data["token_count"] else: if len(paragraph_chunks) == 0: yield DocumentChunk( @@ -66,6 +81,7 @@ class TextChunker: print(e) paragraph_chunks = [chunk_data] self.chunk_size = chunk_data["word_count"] + self.token_count = chunk_data["token_count"] self.chunk_index += 1 diff --git a/cognee/modules/chunking/models/DocumentChunk.py b/cognee/modules/chunking/models/DocumentChunk.py index 4920e9b06..a232d50a1 100644 --- a/cognee/modules/chunking/models/DocumentChunk.py +++ b/cognee/modules/chunking/models/DocumentChunk.py @@ -12,6 +12,7 @@ class DocumentChunk(DataPoint): chunk_index: int cut_type: str is_part_of: Document + pydantic_type: str = "DocumentChunk" contains: List[Entity] = None _metadata: dict = {"index_fields": ["text"], "type": "DocumentChunk"} diff --git a/cognee/modules/cognify/config.py b/cognee/modules/cognify/config.py index d40410bfc..dd94d8b41 100644 --- a/cognee/modules/cognify/config.py +++ b/cognee/modules/cognify/config.py @@ -1,12 +1,14 @@ from functools import lru_cache from pydantic_settings import BaseSettings, SettingsConfigDict from cognee.shared.data_models import DefaultContentPrediction, SummarizedContent +from typing import Optional +import os class CognifyConfig(BaseSettings): 
classification_model: object = DefaultContentPrediction summarization_model: object = SummarizedContent - + max_tokens: Optional[int] = os.getenv("MAX_TOKENS") model_config = SettingsConfigDict(env_file=".env", extra="allow") def to_dict(self) -> dict: diff --git a/cognee/modules/data/processing/document_types/AudioDocument.py b/cognee/modules/data/processing/document_types/AudioDocument.py index a33d4e7fc..b7d2476b4 100644 --- a/cognee/modules/data/processing/document_types/AudioDocument.py +++ b/cognee/modules/data/processing/document_types/AudioDocument.py @@ -1,6 +1,9 @@ +from typing import Optional + from cognee.infrastructure.llm.get_llm_client import get_llm_client -from .Document import Document + from .ChunkerMapping import ChunkerConfig +from .Document import Document class AudioDocument(Document): @@ -10,12 +13,14 @@ class AudioDocument(Document): result = get_llm_client().create_transcript(self.raw_data_location) return result.text - def read(self, chunk_size: int, chunker: str): + def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None): # Transcribe the audio file text = self.create_transcript() chunker_func = ChunkerConfig.get_chunker(chunker) - chunker = chunker_func(self, chunk_size=chunk_size, get_text=lambda: [text]) + chunker = chunker_func( + self, chunk_size=chunk_size, get_text=lambda: [text], max_tokens=max_tokens + ) yield from chunker.read() diff --git a/cognee/modules/data/processing/document_types/Document.py b/cognee/modules/data/processing/document_types/Document.py index 08380e809..7ecdf289e 100644 --- a/cognee/modules/data/processing/document_types/Document.py +++ b/cognee/modules/data/processing/document_types/Document.py @@ -1,3 +1,4 @@ +from typing import Optional from uuid import UUID from cognee.infrastructure.engine import DataPoint @@ -10,5 +11,5 @@ class Document(DataPoint): mime_type: str _metadata: dict = {"index_fields": ["name"], "type": "Document"} - def read(self, chunk_size: int, chunker=str) -> str: + def read(self, chunk_size: int, chunker=str, max_tokens: Optional[int] = None) -> str: pass diff --git a/cognee/modules/data/processing/document_types/ImageDocument.py b/cognee/modules/data/processing/document_types/ImageDocument.py index 424cd059c..c055b8253 100644 --- a/cognee/modules/data/processing/document_types/ImageDocument.py +++ b/cognee/modules/data/processing/document_types/ImageDocument.py @@ -1,6 +1,9 @@ +from typing import Optional + from cognee.infrastructure.llm.get_llm_client import get_llm_client -from .Document import Document + from .ChunkerMapping import ChunkerConfig +from .Document import Document class ImageDocument(Document): @@ -10,11 +13,13 @@ class ImageDocument(Document): result = get_llm_client().transcribe_image(self.raw_data_location) return result.choices[0].message.content - def read(self, chunk_size: int, chunker: str): + def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None): # Transcribe the image file text = self.transcribe_image() chunker_func = ChunkerConfig.get_chunker(chunker) - chunker = chunker_func(self, chunk_size=chunk_size, get_text=lambda: [text]) + chunker = chunker_func( + self, chunk_size=chunk_size, get_text=lambda: [text], max_tokens=max_tokens + ) yield from chunker.read() diff --git a/cognee/modules/data/processing/document_types/PdfDocument.py b/cognee/modules/data/processing/document_types/PdfDocument.py index 684fb428c..768f91264 100644 --- a/cognee/modules/data/processing/document_types/PdfDocument.py +++ 
b/cognee/modules/data/processing/document_types/PdfDocument.py @@ -1,12 +1,15 @@ +from typing import Optional + from pypdf import PdfReader -from .Document import Document + from .ChunkerMapping import ChunkerConfig +from .Document import Document class PdfDocument(Document): type: str = "pdf" - def read(self, chunk_size: int, chunker: str): + def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None): file = PdfReader(self.raw_data_location) def get_text(): @@ -15,7 +18,9 @@ class PdfDocument(Document): yield page_text chunker_func = ChunkerConfig.get_chunker(chunker) - chunker = chunker_func(self, chunk_size=chunk_size, get_text=get_text) + chunker = chunker_func( + self, chunk_size=chunk_size, get_text=get_text, max_tokens=max_tokens + ) yield from chunker.read() diff --git a/cognee/modules/data/processing/document_types/TextDocument.py b/cognee/modules/data/processing/document_types/TextDocument.py index f993ff221..b62ccd56e 100644 --- a/cognee/modules/data/processing/document_types/TextDocument.py +++ b/cognee/modules/data/processing/document_types/TextDocument.py @@ -1,11 +1,13 @@ -from .Document import Document +from typing import Optional + from .ChunkerMapping import ChunkerConfig +from .Document import Document class TextDocument(Document): type: str = "text" - def read(self, chunk_size: int, chunker: str): + def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None): def get_text(): with open(self.raw_data_location, mode="r", encoding="utf-8") as file: while True: @@ -18,6 +20,8 @@ class TextDocument(Document): chunker_func = ChunkerConfig.get_chunker(chunker) - chunker = chunker_func(self, chunk_size=chunk_size, get_text=get_text) + chunker = chunker_func( + self, chunk_size=chunk_size, get_text=get_text, max_tokens=max_tokens + ) yield from chunker.read() diff --git a/cognee/modules/data/processing/document_types/UnstructuredDocument.py b/cognee/modules/data/processing/document_types/UnstructuredDocument.py index cd5c72e3b..1c291d0dc 100644 --- a/cognee/modules/data/processing/document_types/UnstructuredDocument.py +++ b/cognee/modules/data/processing/document_types/UnstructuredDocument.py @@ -1,14 +1,16 @@ from io import StringIO +from typing import Optional from cognee.modules.chunking.TextChunker import TextChunker -from .Document import Document from cognee.modules.data.exceptions import UnstructuredLibraryImportError +from .Document import Document + class UnstructuredDocument(Document): type: str = "unstructured" - def read(self, chunk_size: int): + def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None) -> str: def get_text(): try: from unstructured.partition.auto import partition @@ -27,6 +29,6 @@ class UnstructuredDocument(Document): yield text - chunker = TextChunker(self, chunk_size=chunk_size, get_text=get_text) + chunker = TextChunker(self, chunk_size=chunk_size, get_text=get_text, max_tokens=max_tokens) yield from chunker.read() diff --git a/cognee/modules/engine/models/Entity.py b/cognee/modules/engine/models/Entity.py index 63a153bf2..0e57d5dba 100644 --- a/cognee/modules/engine/models/Entity.py +++ b/cognee/modules/engine/models/Entity.py @@ -7,5 +7,6 @@ class Entity(DataPoint): name: str is_a: EntityType description: str + pydantic_type: str = "Entity" _metadata: dict = {"index_fields": ["name"], "type": "Entity"} diff --git a/cognee/modules/engine/models/EntityType.py b/cognee/modules/engine/models/EntityType.py index 7225bb3ae..10799bb33 100644 --- a/cognee/modules/engine/models/EntityType.py 
+++ b/cognee/modules/engine/models/EntityType.py @@ -5,5 +5,6 @@ class EntityType(DataPoint): __tablename__ = "entity_type" name: str description: str + pydantic_type: str = "EntityType" _metadata: dict = {"index_fields": ["name"], "type": "EntityType"} diff --git a/cognee/modules/retrieval/brute_force_triplet_search.py b/cognee/modules/retrieval/brute_force_triplet_search.py index fdd312480..9c778505d 100644 --- a/cognee/modules/retrieval/brute_force_triplet_search.py +++ b/cognee/modules/retrieval/brute_force_triplet_search.py @@ -43,7 +43,7 @@ def format_triplets(edges): edge_info = {key: value for key, value in edge_attributes.items() if value is not None} # Create the formatted triplet - triplet = f"Node1: {node1_info}\n" f"Edge: {edge_info}\n" f"Node2: {node2_info}\n\n\n" + triplet = f"Node1: {node1_info}\nEdge: {edge_info}\nNode2: {node2_info}\n\n\n" triplets.append(triplet) return "".join(triplets) diff --git a/cognee/modules/retrieval/description_to_codepart_search.py b/cognee/modules/retrieval/description_to_codepart_search.py index ecd187907..243fdbde3 100644 --- a/cognee/modules/retrieval/description_to_codepart_search.py +++ b/cognee/modules/retrieval/description_to_codepart_search.py @@ -8,20 +8,27 @@ from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph from cognee.modules.users.methods import get_default_user from cognee.modules.users.models import User from cognee.shared.utils import send_telemetry +from cognee.api.v1.search import SearchType +from cognee.api.v1.search.search_v2 import search +from cognee.infrastructure.llm.get_llm_client import get_llm_client -async def code_description_to_code_part_search(query: str, user: User = None, top_k=2) -> list: +async def code_description_to_code_part_search( + query: str, include_docs=False, user: User = None, top_k=5 +) -> list: if user is None: user = await get_default_user() if user is None: raise PermissionError("No user found in the system. Please create a user.") - retrieved_codeparts = await code_description_to_code_part(query, user, top_k) + retrieved_codeparts = await code_description_to_code_part(query, user, top_k, include_docs) return retrieved_codeparts -async def code_description_to_code_part(query: str, user: User, top_k: int) -> List[str]: +async def code_description_to_code_part( + query: str, user: User, top_k: int, include_docs: bool = False +) -> List[str]: """ Maps a code description query to relevant code parts using a CodeGraph pipeline. @@ -29,6 +36,7 @@ async def code_description_to_code_part(query: str, user: User, top_k: int) -> L query (str): The search query describing the code parts. user (User): The user performing the search. top_k (int): Number of codegraph descriptions to match ( num of corresponding codeparts will be higher) + include_docs (bool): Whether repository docs have also been ingested into the graph Returns: Set[str]: A set of unique code parts matching the query. 
@@ -55,21 +63,48 @@ async def code_description_to_code_part(query: str, user: User, top_k: int) -> L ) try: - results = await vector_engine.search("code_summary_text", query_text=query, limit=top_k) - if not results: + if include_docs: + search_results = await search(SearchType.INSIGHTS, query_text=query) + + concatenated_descriptions = " ".join( + obj["description"] + for tpl in search_results + for obj in tpl + if isinstance(obj, dict) and "description" in obj + ) + + llm_client = get_llm_client() + context_from_documents = await llm_client.acreate_structured_output( + text_input=f"The retrieved context from documents is {concatenated_descriptions}.", + system_prompt="You are a Senior Software Engineer. Summarize the context from documents" + f" so that it can be provided as context next to code parts" + f" while solving this GitHub issue connected to the project: {query}", + response_model=str, + ) + + code_summaries = await vector_engine.search( + "code_summary_text", query_text=query, limit=top_k + ) + if not code_summaries: logging.warning("No results found for query: '%s' by user: %s", query, user.id) return [] memory_fragment = CogneeGraph() await memory_fragment.project_graph_from_db( graph_engine, - node_properties_to_project=["id", "type", "text", "source_code"], + node_properties_to_project=[ + "id", + "type", + "text", + "source_code", + "pydantic_type", + ], edge_properties_to_project=["relationship_name"], ) code_pieces_to_return = set() - for node in results: + for node in code_summaries: node_id = str(node.id) node_to_search_from = memory_fragment.get_node(node_id) @@ -78,9 +113,16 @@ async def code_description_to_code_part(query: str, user: User, top_k: int) -> L continue for code_file in node_to_search_from.get_skeleton_neighbours(): - for code_file_edge in code_file.get_skeleton_edges(): - if code_file_edge.get_attribute("relationship_name") == "contains": - code_pieces_to_return.add(code_file_edge.get_destination_node()) + if code_file.get_attribute("pydantic_type") == "SourceCodeChunk": + for code_file_edge in code_file.get_skeleton_edges(): + if code_file_edge.get_attribute("relationship_name") == "code_chunk_of": + code_pieces_to_return.add(code_file_edge.get_destination_node()) + elif code_file.get_attribute("pydantic_type") == "CodePart": + code_pieces_to_return.add(code_file) + elif code_file.get_attribute("pydantic_type") == "CodeFile": + for code_file_edge in code_file.get_skeleton_edges(): + if code_file_edge.get_attribute("relationship_name") == "contains": + code_pieces_to_return.add(code_file_edge.get_destination_node()) logging.info( "Search completed for user: %s, query: '%s'. 
Found %d code pieces.", @@ -89,7 +131,14 @@ async def code_description_to_code_part(query: str, user: User, top_k: int) -> L len(code_pieces_to_return), ) - return list(code_pieces_to_return) + context = "" + for code_piece in code_pieces_to_return: + context = context + code_piece.get_attribute("source_code") + + if include_docs: + context = context_from_documents + context + + return context except Exception as exec_error: logging.error( diff --git a/cognee/shared/CodeGraphEntities.py b/cognee/shared/CodeGraphEntities.py index 164327da0..926aae9fa 100644 --- a/cognee/shared/CodeGraphEntities.py +++ b/cognee/shared/CodeGraphEntities.py @@ -5,12 +5,14 @@ from cognee.infrastructure.engine import DataPoint class Repository(DataPoint): __tablename__ = "Repository" path: str + pydantic_type: str = "Repository" _metadata: dict = {"index_fields": [], "type": "Repository"} class CodeFile(DataPoint): __tablename__ = "codefile" extracted_id: str # actually file path + pydantic_type: str = "CodeFile" source_code: Optional[str] = None part_of: Optional[Repository] = None depends_on: Optional[List["CodeFile"]] = None @@ -22,6 +24,7 @@ class CodeFile(DataPoint): class CodePart(DataPoint): __tablename__ = "codepart" # part_of: Optional[CodeFile] = None + pydantic_type: str = "CodePart" source_code: Optional[str] = None _metadata: dict = {"index_fields": [], "type": "CodePart"} @@ -30,6 +33,7 @@ class SourceCodeChunk(DataPoint): __tablename__ = "sourcecodechunk" code_chunk_of: Optional[CodePart] = None source_code: Optional[str] = None + pydantic_type: str = "SourceCodeChunk" previous_chunk: Optional["SourceCodeChunk"] = None _metadata: dict = {"index_fields": ["source_code"], "type": "SourceCodeChunk"} diff --git a/cognee/shared/data_models.py b/cognee/shared/data_models.py index d23d2841c..a36a09010 100644 --- a/cognee/shared/data_models.py +++ b/cognee/shared/data_models.py @@ -231,6 +231,7 @@ class SummarizedContent(BaseModel): summary: str description: str + pydantic_type: str = "SummarizedContent" class SummarizedFunction(BaseModel): @@ -239,6 +240,7 @@ class SummarizedFunction(BaseModel): inputs: Optional[List[str]] = None outputs: Optional[List[str]] = None decorators: Optional[List[str]] = None + pydantic_type: str = "SummarizedFunction" class SummarizedClass(BaseModel): @@ -246,6 +248,7 @@ class SummarizedClass(BaseModel): description: str methods: Optional[List[SummarizedFunction]] = None decorators: Optional[List[str]] = None + pydantic_type: str = "SummarizedClass" class SummarizedCode(BaseModel): @@ -256,6 +259,7 @@ class SummarizedCode(BaseModel): classes: List[SummarizedClass] = [] functions: List[SummarizedFunction] = [] workflow_description: Optional[str] = None + pydantic_type: str = "SummarizedCode" class GraphDBType(Enum): diff --git a/cognee/tasks/chunks/chunk_by_paragraph.py b/cognee/tasks/chunks/chunk_by_paragraph.py index 5c95e97b7..24d566074 100644 --- a/cognee/tasks/chunks/chunk_by_paragraph.py +++ b/cognee/tasks/chunks/chunk_by_paragraph.py @@ -1,10 +1,18 @@ -from uuid import uuid5, NAMESPACE_OID -from typing import Dict, Any, Iterator +from typing import Any, Dict, Iterator, Optional, Union +from uuid import NAMESPACE_OID, uuid5 + +import tiktoken + +from cognee.infrastructure.databases.vector import get_vector_engine + from .chunk_by_sentence import chunk_by_sentence def chunk_by_paragraph( - data: str, paragraph_length: int = 1024, batch_paragraphs: bool = True + data: str, + max_tokens: Optional[Union[int, float]] = None, + paragraph_length: int = 1024, + batch_paragraphs: 
bool = True, ) -> Iterator[Dict[str, Any]]: """ Chunks text by paragraph while preserving exact text reconstruction capability. @@ -15,16 +23,31 @@ def chunk_by_paragraph( chunk_index = 0 paragraph_ids = [] last_cut_type = None + current_token_count = 0 + if not max_tokens: + max_tokens = float("inf") + + vector_engine = get_vector_engine() + embedding_model = vector_engine.embedding_engine.model + embedding_model = embedding_model.split("/")[-1] for paragraph_id, sentence, word_count, end_type in chunk_by_sentence( data, maximum_length=paragraph_length ): # Check if this sentence would exceed length limit - if current_word_count > 0 and current_word_count + word_count > paragraph_length: + + tokenizer = tiktoken.encoding_for_model(embedding_model) + token_count = len(tokenizer.encode(sentence)) + + if current_word_count > 0 and ( + current_word_count + word_count > paragraph_length + or current_token_count + token_count > max_tokens + ): # Yield current chunk chunk_dict = { "text": current_chunk, "word_count": current_word_count, + "token_count": current_token_count, "chunk_id": uuid5(NAMESPACE_OID, current_chunk), "paragraph_ids": paragraph_ids, "chunk_index": chunk_index, @@ -37,11 +60,13 @@ def chunk_by_paragraph( paragraph_ids = [] current_chunk = "" current_word_count = 0 + current_token_count = 0 chunk_index += 1 paragraph_ids.append(paragraph_id) current_chunk += sentence current_word_count += word_count + current_token_count += token_count # Handle end of paragraph if end_type in ("paragraph_end", "sentence_cut") and not batch_paragraphs: @@ -49,6 +74,7 @@ def chunk_by_paragraph( chunk_dict = { "text": current_chunk, "word_count": current_word_count, + "token_count": current_token_count, "paragraph_ids": paragraph_ids, "chunk_id": uuid5(NAMESPACE_OID, current_chunk), "chunk_index": chunk_index, @@ -58,6 +84,7 @@ def chunk_by_paragraph( paragraph_ids = [] current_chunk = "" current_word_count = 0 + current_token_count = 0 chunk_index += 1 last_cut_type = end_type @@ -67,6 +94,7 @@ def chunk_by_paragraph( chunk_dict = { "text": current_chunk, "word_count": current_word_count, + "token_count": current_token_count, "chunk_id": uuid5(NAMESPACE_OID, current_chunk), "paragraph_ids": paragraph_ids, "chunk_index": chunk_index, diff --git a/cognee/tasks/documents/extract_chunks_from_documents.py b/cognee/tasks/documents/extract_chunks_from_documents.py index 437d2a3e4..5ce224002 100644 --- a/cognee/tasks/documents/extract_chunks_from_documents.py +++ b/cognee/tasks/documents/extract_chunks_from_documents.py @@ -1,9 +1,16 @@ +from typing import Optional + from cognee.modules.data.processing.document_types.Document import Document async def extract_chunks_from_documents( - documents: list[Document], chunk_size: int = 1024, chunker="text_chunker" + documents: list[Document], + chunk_size: int = 1024, + chunker="text_chunker", + max_tokens: Optional[int] = None, ): for document in documents: - for document_chunk in document.read(chunk_size=chunk_size, chunker=chunker): + for document_chunk in document.read( + chunk_size=chunk_size, chunker=chunker, max_tokens=max_tokens + ): yield document_chunk diff --git a/cognee/tasks/repo_processor/extract_code_parts.py b/cognee/tasks/repo_processor/extract_code_parts.py index f25146232..c8e478692 100644 --- a/cognee/tasks/repo_processor/extract_code_parts.py +++ b/cognee/tasks/repo_processor/extract_code_parts.py @@ -1,6 +1,5 @@ from typing import Dict, List import parso - import logging logger = logging.getLogger(__name__) diff --git 
a/cognee/tasks/repo_processor/get_local_dependencies.py b/cognee/tasks/repo_processor/get_local_dependencies.py index b0ac2829f..888f847da 100644 --- a/cognee/tasks/repo_processor/get_local_dependencies.py +++ b/cognee/tasks/repo_processor/get_local_dependencies.py @@ -9,7 +9,6 @@ import aiofiles import jedi import parso from parso.tree import BaseNode - import logging logger = logging.getLogger(__name__) diff --git a/cognee/tasks/repo_processor/get_non_code_files.py b/cognee/tasks/repo_processor/get_non_code_files.py index 9c69afd00..12f32e841 100644 --- a/cognee/tasks/repo_processor/get_non_code_files.py +++ b/cognee/tasks/repo_processor/get_non_code_files.py @@ -29,8 +29,105 @@ async def get_non_py_files(repo_path): "*.egg-info", } + ALLOWED_EXTENSIONS = { + ".txt", + ".md", + ".csv", + ".json", + ".xml", + ".yaml", + ".yml", + ".html", + ".css", + ".js", + ".ts", + ".jsx", + ".tsx", + ".sql", + ".log", + ".ini", + ".toml", + ".properties", + ".sh", + ".bash", + ".dockerfile", + ".gitignore", + ".gitattributes", + ".makefile", + ".pyproject", + ".requirements", + ".env", + ".pdf", + ".doc", + ".docx", + ".dot", + ".dotx", + ".rtf", + ".wps", + ".wpd", + ".odt", + ".ott", + ".ottx", + ".txt", + ".wp", + ".sdw", + ".sdx", + ".docm", + ".dotm", + # Additional extensions for other programming languages + ".java", + ".c", + ".cpp", + ".h", + ".cs", + ".go", + ".php", + ".rb", + ".swift", + ".pl", + ".lua", + ".rs", + ".scala", + ".kt", + ".sh", + ".sql", + ".v", + ".asm", + ".pas", + ".d", + ".ml", + ".clj", + ".cljs", + ".erl", + ".ex", + ".exs", + ".f", + ".fs", + ".r", + ".pyi", + ".pdb", + ".ipynb", + ".rmd", + ".cabal", + ".hs", + ".nim", + ".vhdl", + ".verilog", + ".svelte", + ".html", + ".css", + ".scss", + ".less", + ".json5", + ".yaml", + ".yml", + } + def should_process(path): - return not any(pattern in path for pattern in IGNORED_PATTERNS) + _, ext = os.path.splitext(path) + return ext in ALLOWED_EXTENSIONS and not any( + pattern in path for pattern in IGNORED_PATTERNS + ) non_py_files_paths = [ os.path.join(root, file) diff --git a/cognee/tasks/repo_processor/get_source_code_chunks.py b/cognee/tasks/repo_processor/get_source_code_chunks.py index 980a86539..82fa46cf0 100644 --- a/cognee/tasks/repo_processor/get_source_code_chunks.py +++ b/cognee/tasks/repo_processor/get_source_code_chunks.py @@ -5,6 +5,7 @@ from uuid import NAMESPACE_OID, uuid5 import parso import tiktoken +from cognee.infrastructure.databases.vector import get_vector_engine from cognee.infrastructure.engine import DataPoint from cognee.shared.CodeGraphEntities import CodeFile, CodePart, SourceCodeChunk @@ -126,6 +127,9 @@ def get_source_code_chunks_from_code_part( logger.error(f"No source code in CodeFile {code_file_part.id}") return + vector_engine = get_vector_engine() + embedding_model = vector_engine.embedding_engine.model + model_name = embedding_model.split("/")[-1] tokenizer = tiktoken.encoding_for_model(model_name) max_subchunk_tokens = max(1, int(granularity * max_tokens)) subchunk_token_counts = _get_subchunk_token_counts( @@ -150,7 +154,7 @@ def get_source_code_chunks_from_code_part( async def get_source_code_chunks( - data_points: list[DataPoint], embedding_model="text-embedding-3-large" + data_points: list[DataPoint], ) -> AsyncGenerator[list[DataPoint], None]: """Processes code graph datapoints, create SourceCodeChink datapoints.""" # TODO: Add support for other embedding models, with max_token mapping @@ -165,9 +169,7 @@ async def get_source_code_chunks( for code_part in data_point.contains: try: 
yield code_part - for source_code_chunk in get_source_code_chunks_from_code_part( - code_part, model_name=embedding_model - ): + for source_code_chunk in get_source_code_chunks_from_code_part(code_part): yield source_code_chunk except Exception as e: logger.error(f"Error processing code part: {e}") diff --git a/cognee/tasks/summarization/models.py b/cognee/tasks/summarization/models.py index fc62209ce..bc7b4886d 100644 --- a/cognee/tasks/summarization/models.py +++ b/cognee/tasks/summarization/models.py @@ -17,5 +17,6 @@ class CodeSummary(DataPoint): __tablename__ = "code_summary" text: str summarizes: Union[CodeFile, CodePart, SourceCodeChunk] + pydantic_type: str = "CodeSummary" _metadata: dict = {"index_fields": ["text"], "type": "CodeSummary"} diff --git a/cognee/tests/integration/documents/AudioDocument_test.py b/cognee/tests/integration/documents/AudioDocument_test.py index dbd43ddda..e07a2431b 100644 --- a/cognee/tests/integration/documents/AudioDocument_test.py +++ b/cognee/tests/integration/documents/AudioDocument_test.py @@ -36,12 +36,12 @@ def test_AudioDocument(): for ground_truth, paragraph_data in zip( GROUND_TRUTH, document.read(chunk_size=64, chunker="text_chunker") ): - assert ( - ground_truth["word_count"] == paragraph_data.word_count - ), f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }' - assert ground_truth["len_text"] == len( - paragraph_data.text - ), f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }' - assert ( - ground_truth["cut_type"] == paragraph_data.cut_type - ), f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }' + assert ground_truth["word_count"] == paragraph_data.word_count, ( + f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }' + ) + assert ground_truth["len_text"] == len(paragraph_data.text), ( + f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }' + ) + assert ground_truth["cut_type"] == paragraph_data.cut_type, ( + f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }' + ) diff --git a/cognee/tests/integration/documents/ImageDocument_test.py b/cognee/tests/integration/documents/ImageDocument_test.py index c0877ae99..b8d585419 100644 --- a/cognee/tests/integration/documents/ImageDocument_test.py +++ b/cognee/tests/integration/documents/ImageDocument_test.py @@ -25,12 +25,12 @@ def test_ImageDocument(): for ground_truth, paragraph_data in zip( GROUND_TRUTH, document.read(chunk_size=64, chunker="text_chunker") ): - assert ( - ground_truth["word_count"] == paragraph_data.word_count - ), f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }' - assert ground_truth["len_text"] == len( - paragraph_data.text - ), f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }' - assert ( - ground_truth["cut_type"] == paragraph_data.cut_type - ), f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }' + assert ground_truth["word_count"] == paragraph_data.word_count, ( + f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }' + ) + assert ground_truth["len_text"] == len(paragraph_data.text), ( + f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }' + ) + assert ground_truth["cut_type"] == paragraph_data.cut_type, ( + f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }' + ) diff --git a/cognee/tests/integration/documents/PdfDocument_test.py b/cognee/tests/integration/documents/PdfDocument_test.py index 8f28815d3..fc4307846 100644 --- a/cognee/tests/integration/documents/PdfDocument_test.py +++ 
b/cognee/tests/integration/documents/PdfDocument_test.py @@ -27,12 +27,12 @@ def test_PdfDocument(): for ground_truth, paragraph_data in zip( GROUND_TRUTH, document.read(chunk_size=1024, chunker="text_chunker") ): - assert ( - ground_truth["word_count"] == paragraph_data.word_count - ), f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }' - assert ground_truth["len_text"] == len( - paragraph_data.text - ), f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }' - assert ( - ground_truth["cut_type"] == paragraph_data.cut_type - ), f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }' + assert ground_truth["word_count"] == paragraph_data.word_count, ( + f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }' + ) + assert ground_truth["len_text"] == len(paragraph_data.text), ( + f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }' + ) + assert ground_truth["cut_type"] == paragraph_data.cut_type, ( + f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }' + ) diff --git a/cognee/tests/integration/documents/TextDocument_test.py b/cognee/tests/integration/documents/TextDocument_test.py index 1e143d563..6daec62b7 100644 --- a/cognee/tests/integration/documents/TextDocument_test.py +++ b/cognee/tests/integration/documents/TextDocument_test.py @@ -39,12 +39,12 @@ def test_TextDocument(input_file, chunk_size): for ground_truth, paragraph_data in zip( GROUND_TRUTH[input_file], document.read(chunk_size=chunk_size, chunker="text_chunker") ): - assert ( - ground_truth["word_count"] == paragraph_data.word_count - ), f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }' - assert ground_truth["len_text"] == len( - paragraph_data.text - ), f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }' - assert ( - ground_truth["cut_type"] == paragraph_data.cut_type - ), f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }' + assert ground_truth["word_count"] == paragraph_data.word_count, ( + f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }' + ) + assert ground_truth["len_text"] == len(paragraph_data.text), ( + f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }' + ) + assert ground_truth["cut_type"] == paragraph_data.cut_type, ( + f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }' + ) diff --git a/cognee/tests/integration/documents/UnstructuredDocument_test.py b/cognee/tests/integration/documents/UnstructuredDocument_test.py index 03b8deb49..773dc2293 100644 --- a/cognee/tests/integration/documents/UnstructuredDocument_test.py +++ b/cognee/tests/integration/documents/UnstructuredDocument_test.py @@ -68,35 +68,35 @@ def test_UnstructuredDocument(): ) # Test PPTX - for paragraph_data in pptx_document.read(chunk_size=1024): + for paragraph_data in pptx_document.read(chunk_size=1024, chunker="text_chunker"): assert 19 == paragraph_data.word_count, f" 19 != {paragraph_data.word_count = }" assert 104 == len(paragraph_data.text), f" 104 != {len(paragraph_data.text) = }" - assert ( - "sentence_cut" == paragraph_data.cut_type - ), f" sentence_cut != {paragraph_data.cut_type = }" + assert "sentence_cut" == paragraph_data.cut_type, ( + f" sentence_cut != {paragraph_data.cut_type = }" + ) # Test DOCX - for paragraph_data in docx_document.read(chunk_size=1024): + for paragraph_data in docx_document.read(chunk_size=1024, chunker="text_chunker"): assert 16 == paragraph_data.word_count, f" 16 != {paragraph_data.word_count = }" assert 145 == len(paragraph_data.text), f" 145 != 
{len(paragraph_data.text) = }" - assert ( - "sentence_end" == paragraph_data.cut_type - ), f" sentence_end != {paragraph_data.cut_type = }" + assert "sentence_end" == paragraph_data.cut_type, ( + f" sentence_end != {paragraph_data.cut_type = }" + ) # TEST CSV - for paragraph_data in csv_document.read(chunk_size=1024): + for paragraph_data in csv_document.read(chunk_size=1024, chunker="text_chunker"): assert 15 == paragraph_data.word_count, f" 15 != {paragraph_data.word_count = }" - assert ( - "A A A A A A A A A,A A A A A A,A A" == paragraph_data.text - ), f"Read text doesn't match expected text: {paragraph_data.text}" - assert ( - "sentence_cut" == paragraph_data.cut_type - ), f" sentence_cut != {paragraph_data.cut_type = }" + assert "A A A A A A A A A,A A A A A A,A A" == paragraph_data.text, ( + f"Read text doesn't match expected text: {paragraph_data.text}" + ) + assert "sentence_cut" == paragraph_data.cut_type, ( + f" sentence_cut != {paragraph_data.cut_type = }" + ) # Test XLSX - for paragraph_data in xlsx_document.read(chunk_size=1024): + for paragraph_data in xlsx_document.read(chunk_size=1024, chunker="text_chunker"): assert 36 == paragraph_data.word_count, f" 36 != {paragraph_data.word_count = }" assert 171 == len(paragraph_data.text), f" 171 != {len(paragraph_data.text) = }" - assert ( - "sentence_cut" == paragraph_data.cut_type - ), f" sentence_cut != {paragraph_data.cut_type = }" + assert "sentence_cut" == paragraph_data.cut_type, ( + f" sentence_cut != {paragraph_data.cut_type = }" + ) diff --git a/cognee/tests/test_deduplication.py b/cognee/tests/test_deduplication.py index 9c2df032d..89c866f12 100644 --- a/cognee/tests/test_deduplication.py +++ b/cognee/tests/test_deduplication.py @@ -30,9 +30,9 @@ async def test_deduplication(): result = await relational_engine.get_all_data_from_table("data") assert len(result) == 1, "More than one data entity was found." - assert ( - result[0]["name"] == "Natural_language_processing_copy" - ), "Result name does not match expected value." + assert result[0]["name"] == "Natural_language_processing_copy", ( + "Result name does not match expected value." + ) result = await relational_engine.get_all_data_from_table("datasets") assert len(result) == 2, "Unexpected number of datasets found." @@ -61,9 +61,9 @@ async def test_deduplication(): result = await relational_engine.get_all_data_from_table("data") assert len(result) == 1, "More than one data entity was found." - assert ( - hashlib.md5(text.encode("utf-8")).hexdigest() in result[0]["name"] - ), "Content hash is not a part of file name." + assert hashlib.md5(text.encode("utf-8")).hexdigest() in result[0]["name"], ( + "Content hash is not a part of file name." 
+ ) await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) diff --git a/cognee/tests/test_falkordb.py b/cognee/tests/test_falkordb.py index 07ece9eb2..af0e87916 100755 --- a/cognee/tests/test_falkordb.py +++ b/cognee/tests/test_falkordb.py @@ -85,9 +85,9 @@ async def main(): from cognee.infrastructure.databases.relational import get_relational_engine - assert not os.path.exists( - get_relational_engine().db_path - ), "SQLite relational database is not empty" + assert not os.path.exists(get_relational_engine().db_path), ( + "SQLite relational database is not empty" + ) from cognee.infrastructure.databases.graph import get_graph_config diff --git a/cognee/tests/test_library.py b/cognee/tests/test_library.py index 8352b4161..192b67506 100755 --- a/cognee/tests/test_library.py +++ b/cognee/tests/test_library.py @@ -82,9 +82,9 @@ async def main(): from cognee.infrastructure.databases.relational import get_relational_engine - assert not os.path.exists( - get_relational_engine().db_path - ), "SQLite relational database is not empty" + assert not os.path.exists(get_relational_engine().db_path), ( + "SQLite relational database is not empty" + ) from cognee.infrastructure.databases.graph import get_graph_config diff --git a/cognee/tests/test_pgvector.py b/cognee/tests/test_pgvector.py index c241177f0..73b6be974 100644 --- a/cognee/tests/test_pgvector.py +++ b/cognee/tests/test_pgvector.py @@ -24,28 +24,28 @@ async def test_local_file_deletion(data_text, file_location): data_hash = hashlib.md5(encoded_text).hexdigest() # Get data entry from database based on hash contents data = (await session.scalars(select(Data).where(Data.content_hash == data_hash))).one() - assert os.path.isfile( - data.raw_data_location - ), f"Data location doesn't exist: {data.raw_data_location}" + assert os.path.isfile(data.raw_data_location), ( + f"Data location doesn't exist: {data.raw_data_location}" + ) # Test deletion of data along with local files created by cognee await engine.delete_data_entity(data.id) - assert not os.path.exists( - data.raw_data_location - ), f"Data location still exists after deletion: {data.raw_data_location}" + assert not os.path.exists(data.raw_data_location), ( + f"Data location still exists after deletion: {data.raw_data_location}" + ) async with engine.get_async_session() as session: # Get data entry from database based on file path data = ( await session.scalars(select(Data).where(Data.raw_data_location == file_location)) ).one() - assert os.path.isfile( - data.raw_data_location - ), f"Data location doesn't exist: {data.raw_data_location}" + assert os.path.isfile(data.raw_data_location), ( + f"Data location doesn't exist: {data.raw_data_location}" + ) # Test local files not created by cognee won't get deleted await engine.delete_data_entity(data.id) - assert os.path.exists( - data.raw_data_location - ), f"Data location doesn't exists: {data.raw_data_location}" + assert os.path.exists(data.raw_data_location), ( + f"Data location doesn't exists: {data.raw_data_location}" + ) async def test_getting_of_documents(dataset_name_1): @@ -54,16 +54,16 @@ async def test_getting_of_documents(dataset_name_1): user = await get_default_user() document_ids = await get_document_ids_for_user(user.id, [dataset_name_1]) - assert ( - len(document_ids) == 1 - ), f"Number of expected documents doesn't match {len(document_ids)} != 1" + assert len(document_ids) == 1, ( + f"Number of expected documents doesn't match {len(document_ids)} != 1" + ) # Test getting of documents for search when no 
dataset is provided user = await get_default_user() document_ids = await get_document_ids_for_user(user.id) - assert ( - len(document_ids) == 2 - ), f"Number of expected documents doesn't match {len(document_ids)} != 2" + assert len(document_ids) == 2, ( + f"Number of expected documents doesn't match {len(document_ids)} != 2" + ) async def main(): diff --git a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_2_test.py b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_2_test.py index 728b5cda4..d8680a604 100644 --- a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_2_test.py +++ b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_2_test.py @@ -17,9 +17,9 @@ batch_paragraphs_vals = [True, False] def test_chunk_by_paragraph_isomorphism(input_text, paragraph_length, batch_paragraphs): chunks = chunk_by_paragraph(input_text, paragraph_length, batch_paragraphs) reconstructed_text = "".join([chunk["text"] for chunk in chunks]) - assert ( - reconstructed_text == input_text - ), f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }" + assert reconstructed_text == input_text, ( + f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }" + ) @pytest.mark.parametrize( @@ -27,14 +27,18 @@ def test_chunk_by_paragraph_isomorphism(input_text, paragraph_length, batch_para list(product(list(INPUT_TEXTS.values()), paragraph_lengths, batch_paragraphs_vals)), ) def test_paragraph_chunk_length(input_text, paragraph_length, batch_paragraphs): - chunks = list(chunk_by_paragraph(input_text, paragraph_length, batch_paragraphs)) + chunks = list( + chunk_by_paragraph( + data=input_text, paragraph_length=paragraph_length, batch_paragraphs=batch_paragraphs + ) + ) chunk_lengths = np.array([len(list(chunk_by_word(chunk["text"]))) for chunk in chunks]) larger_chunks = chunk_lengths[chunk_lengths > paragraph_length] - assert np.all( - chunk_lengths <= paragraph_length - ), f"{paragraph_length = }: {larger_chunks} are too large" + assert np.all(chunk_lengths <= paragraph_length), ( + f"{paragraph_length = }: {larger_chunks} are too large" + ) @pytest.mark.parametrize( @@ -42,8 +46,10 @@ def test_paragraph_chunk_length(input_text, paragraph_length, batch_paragraphs): list(product(list(INPUT_TEXTS.values()), paragraph_lengths, batch_paragraphs_vals)), ) def test_chunk_by_paragraph_chunk_numbering(input_text, paragraph_length, batch_paragraphs): - chunks = chunk_by_paragraph(input_text, paragraph_length, batch_paragraphs) + chunks = chunk_by_paragraph( + data=input_text, paragraph_length=paragraph_length, batch_paragraphs=batch_paragraphs + ) chunk_indices = np.array([chunk["chunk_index"] for chunk in chunks]) - assert np.all( - chunk_indices == np.arange(len(chunk_indices)) - ), f"{chunk_indices = } are not monotonically increasing" + assert np.all(chunk_indices == np.arange(len(chunk_indices))), ( + f"{chunk_indices = } are not monotonically increasing" + ) diff --git a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py index 3ddc6f4f5..e420b2e9f 100644 --- a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py +++ b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py @@ -49,16 +49,18 @@ Third paragraph is cut and is missing the dot at the end""", def run_chunking_test(test_text, expected_chunks): chunks = [] - for chunk_data in chunk_by_paragraph(test_text, 12, batch_paragraphs=False): + for chunk_data in chunk_by_paragraph( + data=test_text, 
paragraph_length=12, batch_paragraphs=False + ): chunks.append(chunk_data) assert len(chunks) == 3 for expected_chunks_item, chunk in zip(expected_chunks, chunks): for key in ["text", "word_count", "cut_type"]: - assert ( - chunk[key] == expected_chunks_item[key] - ), f"{key = }: {chunk[key] = } != {expected_chunks_item[key] = }" + assert chunk[key] == expected_chunks_item[key], ( + f"{key = }: {chunk[key] = } != {expected_chunks_item[key] = }" + ) def test_chunking_whole_text(): diff --git a/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py b/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py index d1c75d7ed..efa053077 100644 --- a/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py +++ b/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py @@ -16,9 +16,9 @@ maximum_length_vals = [None, 8, 64] def test_chunk_by_sentence_isomorphism(input_text, maximum_length): chunks = chunk_by_sentence(input_text, maximum_length) reconstructed_text = "".join([chunk[1] for chunk in chunks]) - assert ( - reconstructed_text == input_text - ), f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }" + assert reconstructed_text == input_text, ( + f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }" + ) @pytest.mark.parametrize( @@ -36,6 +36,6 @@ def test_paragraph_chunk_length(input_text, maximum_length): chunk_lengths = np.array([len(list(chunk_by_word(chunk[1]))) for chunk in chunks]) larger_chunks = chunk_lengths[chunk_lengths > maximum_length] - assert np.all( - chunk_lengths <= maximum_length - ), f"{maximum_length = }: {larger_chunks} are too large" + assert np.all(chunk_lengths <= maximum_length), ( + f"{maximum_length = }: {larger_chunks} are too large" + ) diff --git a/cognee/tests/unit/processing/chunks/chunk_by_word_test.py b/cognee/tests/unit/processing/chunks/chunk_by_word_test.py index fb26638cb..d79fcdbc8 100644 --- a/cognee/tests/unit/processing/chunks/chunk_by_word_test.py +++ b/cognee/tests/unit/processing/chunks/chunk_by_word_test.py @@ -17,9 +17,9 @@ from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS def test_chunk_by_word_isomorphism(input_text): chunks = chunk_by_word(input_text) reconstructed_text = "".join([chunk[0] for chunk in chunks]) - assert ( - reconstructed_text == input_text - ), f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }" + assert reconstructed_text == input_text, ( + f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }" + ) @pytest.mark.parametrize( diff --git a/evals/eval_swe_bench.py b/evals/eval_swe_bench.py index 789c95ab4..a8b4c8a1d 100644 --- a/evals/eval_swe_bench.py +++ b/evals/eval_swe_bench.py @@ -11,8 +11,9 @@ from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline from cognee.api.v1.search import SearchType from cognee.infrastructure.llm.get_llm_client import get_llm_client from cognee.infrastructure.llm.prompts import read_query_prompt -from cognee.modules.retrieval.brute_force_triplet_search import brute_force_triplet_search -from cognee.shared.utils import render_graph +from cognee.modules.retrieval.description_to_codepart_search import ( + code_description_to_code_part_search, +) from evals.eval_utils import download_github_repo, retrieved_edges_to_string @@ -32,25 +33,18 @@ def check_install_package(package_name): return False -async def generate_patch_with_cognee(instance, llm_client, search_type=SearchType.CHUNKS): +async def generate_patch_with_cognee(instance): 
repo_path = download_github_repo(instance, "../RAW_GIT_REPOS") - pipeline = await run_code_graph_pipeline(repo_path) - - async for result in pipeline: - print(result) - - print("Here we have the repo under the repo_path") - - await render_graph(None, include_labels=True, include_nodes=True) - + include_docs = True problem_statement = instance["problem_statement"] instructions = read_query_prompt("patch_gen_kg_instructions.txt") - retrieved_edges = await brute_force_triplet_search( - problem_statement, top_k=3, collections=["data_point_source_code", "data_point_text"] - ) + async for result in run_code_graph_pipeline(repo_path, include_docs=include_docs): + print(result) - retrieved_edges_str = retrieved_edges_to_string(retrieved_edges) + retrieved_codeparts = await code_description_to_code_part_search( + problem_statement, include_docs=include_docs + ) prompt = "\n".join( [ @@ -58,8 +52,8 @@ async def generate_patch_with_cognee(instance, llm_client, search_type=SearchTyp "", PATCH_EXAMPLE, "", - "These are the retrieved edges:", - retrieved_edges_str, + "This is the additional context to solve the problem (description from documentation together with codeparts):", + retrieved_codeparts, ] ) @@ -85,8 +79,6 @@ async def generate_patch_without_cognee(instance, llm_client): async def get_preds(dataset, with_cognee=True): - llm_client = get_llm_client() - if with_cognee: model_name = "with_cognee" pred_func = generate_patch_with_cognee @@ -94,17 +86,18 @@ async def get_preds(dataset, with_cognee=True): model_name = "without_cognee" pred_func = generate_patch_without_cognee - futures = [(instance["instance_id"], pred_func(instance, llm_client)) for instance in dataset] - model_patches = await asyncio.gather(*[x[1] for x in futures]) + preds = [] - preds = [ - { - "instance_id": instance_id, - "model_patch": model_patch, - "model_name_or_path": model_name, - } - for (instance_id, _), model_patch in zip(futures, model_patches) - ] + for instance in dataset: + instance_id = instance["instance_id"] + model_patch = await pred_func(instance) # Sequentially await the async function + preds.append( + { + "instance_id": instance_id, + "model_patch": model_patch, + "model_name_or_path": model_name, + } + ) return preds @@ -134,6 +127,7 @@ async def main(): with open(predictions_path, "w") as file: json.dump(preds, file) + """ This part is for the evaluation subprocess.run( [ "python", @@ -151,6 +145,7 @@ async def main(): "test_run", ] ) + """ if __name__ == "__main__": diff --git a/examples/python/code_graph_example.py b/examples/python/code_graph_example.py index c90a0b606..59229344f 100644 --- a/examples/python/code_graph_example.py +++ b/examples/python/code_graph_example.py @@ -1,7 +1,9 @@ import argparse import asyncio +import logging from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline +from cognee.shared.utils import setup_logging async def main(repo_path, include_docs): @@ -9,7 +11,7 @@ async def main(repo_path, include_docs): print(result) -if __name__ == "__main__": +def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("--repo_path", type=str, required=True, help="Path to the repository") parser.add_argument( @@ -18,5 +20,28 @@ if __name__ == "__main__": default=True, help="Whether or not to process non-code files", ) - args = parser.parse_args() - asyncio.run(main(args.repo_path, args.include_docs)) + parser.add_argument( + "--time", + type=lambda x: x.lower() in ("true", "1"), + default=True, + help="Whether or not to time the pipeline run", + ) + 
return parser.parse_args() + + +if __name__ == "__main__": + setup_logging(logging.ERROR) + + args = parse_args() + + if args.time: + import time + + start_time = time.time() + asyncio.run(main(args.repo_path, args.include_docs)) + end_time = time.time() + print("\n" + "=" * 50) + print(f"Pipeline Execution Time: {end_time - start_time:.2f} seconds") + print("=" * 50 + "\n") + else: + asyncio.run(main(args.repo_path, args.include_docs))
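
For reference, below is a minimal usage sketch of the token-aware chunking path introduced in this diff, assuming the module layout shown above. The chunk_documents helper and the fallback token budget are hypothetical; they only illustrate how max_tokens flows from CognifyConfig through extract_chunks_from_documents and Document.read() into the chunker. The documents argument is assumed to be a list of Document instances such as those produced by the classify_documents task, and a configured embedding engine is still required at runtime because chunk_by_paragraph derives its tokenizer from it.

from typing import Optional

from cognee.modules.cognify.config import get_cognify_config
from cognee.tasks.documents.extract_chunks_from_documents import extract_chunks_from_documents


async def chunk_documents(documents, chunk_size: int = 1024, fallback_max_tokens: Optional[int] = 8192):
    # max_tokens now comes from the MAX_TOKENS environment variable via CognifyConfig;
    # fall back to a hypothetical budget when it is unset.
    cognify_config = get_cognify_config()
    max_tokens = cognify_config.max_tokens or fallback_max_tokens

    chunks = []
    # extract_chunks_from_documents forwards max_tokens through Document.read() to the
    # chunker, which enforces both the word-count and the token-count limit per chunk.
    async for chunk in extract_chunks_from_documents(
        documents, chunk_size=chunk_size, chunker="text_chunker", max_tokens=max_tokens
    ):
        chunks.append(chunk)
    return chunks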