From a8a83fffff17c882c554e1a6ea95481de773c6b1 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Fri, 20 Dec 2024 10:53:57 +0100 Subject: [PATCH 01/24] Ingest non-code files --- cognee/api/v1/cognify/code_graph_pipeline.py | 43 ++++++++++++++++--- cognee/tasks/repo_processor/__init__.py | 1 + .../repo_processor/get_non_code_files.py | 36 ++++++++++++++++ examples/python/code_graph_example.py | 11 ++--- 4 files changed, 80 insertions(+), 11 deletions(-) create mode 100644 cognee/tasks/repo_processor/get_non_code_files.py diff --git a/cognee/api/v1/cognify/code_graph_pipeline.py b/cognee/api/v1/cognify/code_graph_pipeline.py index c35f9719f..8e92d08e0 100644 --- a/cognee/api/v1/cognify/code_graph_pipeline.py +++ b/cognee/api/v1/cognify/code_graph_pipeline.py @@ -2,29 +2,37 @@ import asyncio import logging from pathlib import Path +from cognee.base_config import get_base_config +from cognee.modules.cognify.config import get_cognify_config from cognee.modules.pipelines import run_tasks from cognee.modules.pipelines.tasks.Task import Task +from cognee.modules.users.methods import get_default_user +from cognee.shared.data_models import KnowledgeGraph, MonitoringTool +from cognee.tasks.documents import (classify_documents, + extract_chunks_from_documents) +from cognee.tasks.graph import extract_graph_from_data +from cognee.tasks.ingestion import ingest_data_with_metadata from cognee.tasks.repo_processor import (enrich_dependency_graph, expand_dependency_graph, + get_data_list_for_user, + get_non_code_files, get_repo_file_dependencies) from cognee.tasks.storage import add_data_points -from cognee.base_config import get_base_config -from cognee.shared.data_models import MonitoringTool - monitoring = get_base_config().monitoring_tool if monitoring == MonitoringTool.LANGFUSE: from langfuse.decorators import observe -from cognee.tasks.summarization import summarize_code +from cognee.tasks.summarization import summarize_code, summarize_text logger = logging.getLogger("code_graph_pipeline") update_status_lock = asyncio.Lock() @observe -async def run_code_graph_pipeline(repo_path): +async def run_code_graph_pipeline(repo_path, include_docs=True): import os import pathlib + import cognee from cognee.infrastructure.databases.relational import create_db_and_tables @@ -38,6 +46,9 @@ async def run_code_graph_pipeline(repo_path): await cognee.prune.prune_system(metadata=True) await create_db_and_tables() + cognee_config = get_cognify_config() + user = await get_default_user() + tasks = [ Task(get_repo_file_dependencies), Task(enrich_dependency_graph, task_config={"batch_size": 50}), @@ -46,4 +57,24 @@ async def run_code_graph_pipeline(repo_path): Task(add_data_points, task_config={"batch_size": 50}), ] - return run_tasks(tasks, repo_path, "cognify_code_pipeline") + if include_docs: + non_code_tasks = [ + Task(get_non_code_files, task_config={"batch_size": 50}), + Task(ingest_data_with_metadata, dataset_name="repo_docs", user=user), + Task(get_data_list_for_user, dataset_name="repo_docs", user=user), + Task(classify_documents), + Task(extract_chunks_from_documents), + Task(extract_graph_from_data, graph_model=KnowledgeGraph, task_config={"batch_size": 50}), + Task( + summarize_text, + summarization_model=cognee_config.summarization_model, + task_config={"batch_size": 50} + ), + ] + + if include_docs: + async for result in run_tasks(non_code_tasks, repo_path): + yield result + + async for result in run_tasks(tasks, repo_path, "cognify_code_pipeline"): + yield result \ No newline at end of file diff --git 
a/cognee/tasks/repo_processor/__init__.py b/cognee/tasks/repo_processor/__init__.py index 05e111b29..fa754028e 100644 --- a/cognee/tasks/repo_processor/__init__.py +++ b/cognee/tasks/repo_processor/__init__.py @@ -4,4 +4,5 @@ logger = logging.getLogger("task:repo_processor") from .enrich_dependency_graph import enrich_dependency_graph from .expand_dependency_graph import expand_dependency_graph +from .get_non_code_files import get_data_list_for_user, get_non_py_files from .get_repo_file_dependencies import get_repo_file_dependencies diff --git a/cognee/tasks/repo_processor/get_non_code_files.py b/cognee/tasks/repo_processor/get_non_code_files.py new file mode 100644 index 000000000..5a8a34f64 --- /dev/null +++ b/cognee/tasks/repo_processor/get_non_code_files.py @@ -0,0 +1,36 @@ +import os + +import aiofiles + +import cognee.modules.ingestion as ingestion +from cognee.infrastructure.engine import DataPoint +from cognee.modules.data.methods import get_datasets +from cognee.modules.data.methods.get_dataset_data import get_dataset_data +from cognee.modules.data.methods.get_datasets_by_name import \ + get_datasets_by_name +from cognee.modules.data.models import Data +from cognee.modules.data.operations.write_metadata import write_metadata +from cognee.modules.ingestion.data_types import BinaryData +from cognee.modules.users.methods import get_default_user +from cognee.shared.CodeGraphEntities import Repository + + +async def get_non_py_files(repo_path): + """Get files that are not .py files and their contents""" + if not os.path.exists(repo_path): + return {} + + non_py_files_paths = [ + os.path.join(root, file) + for root, _, files in os.walk(repo_path) for file in files if not file.endswith(".py") + ] + return non_py_files_paths + + +async def get_data_list_for_user(_, dataset_name, user): + datasets = await get_datasets_by_name(dataset_name, user.id) + data_documents: list[Data] = [] + for dataset in datasets: + data_docs: list[Data] = await get_dataset_data(dataset_id=dataset.id) + data_documents.extend(data_docs) + return data_documents \ No newline at end of file diff --git a/examples/python/code_graph_example.py b/examples/python/code_graph_example.py index 9189de46c..c0b91972b 100644 --- a/examples/python/code_graph_example.py +++ b/examples/python/code_graph_example.py @@ -1,15 +1,16 @@ import argparse import asyncio + from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline -async def main(repo_path): - async for result in await run_code_graph_pipeline(repo_path): +async def main(repo_path, include_docs): + async for result in run_code_graph_pipeline(repo_path, include_docs): print(result) if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--repo-path", type=str, required=True, help="Path to the repository") + parser.add_argument("--repo_path", type=str, required=True, help="Path to the repository") + parser.add_argument("--include_docs", type=bool, default=True, help="Whether or not to process non-code files") args = parser.parse_args() - asyncio.run(main(args.repo_path)) - + asyncio.run(main(args.repo_path, args.include_docs)) \ No newline at end of file From 399faf9ca0e445e477e7c8e2201ec3478d62e928 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Fri, 20 Dec 2024 13:37:24 +0100 Subject: [PATCH 02/24] Fixing review findings --- cognee/tasks/repo_processor/get_non_code_files.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/cognee/tasks/repo_processor/get_non_code_files.py 
b/cognee/tasks/repo_processor/get_non_code_files.py index 5a8a34f64..671b998d9 100644 --- a/cognee/tasks/repo_processor/get_non_code_files.py +++ b/cognee/tasks/repo_processor/get_non_code_files.py @@ -20,14 +20,26 @@ async def get_non_py_files(repo_path): if not os.path.exists(repo_path): return {} + IGNORED_PATTERNS = { + '.git', '__pycache__', '*.pyc', '*.pyo', '*.pyd', + 'node_modules', '*.egg-info' + } + + def should_process(path): + return not any(pattern in path for pattern in IGNORED_PATTERNS) + non_py_files_paths = [ os.path.join(root, file) - for root, _, files in os.walk(repo_path) for file in files if not file.endswith(".py") + for root, _, files in os.walk(repo_path) for file in files + if not file.endswith(".py") and should_process(os.path.join(root, file)) ] return non_py_files_paths async def get_data_list_for_user(_, dataset_name, user): + # Note: This method is meant to be used as a Task in a pipeline. + # By the nature of pipelines, the output of the previous Task will be passed as the first argument here, + # but it is not needed here, hence the "_" input. datasets = await get_datasets_by_name(dataset_name, user.id) data_documents: list[Data] = [] for dataset in datasets: From 4cee9a16ce16048aedf4678201ed16b5ce22273b Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Mon, 6 Jan 2025 11:22:45 +0100 Subject: [PATCH 03/24] fix: add allowed extensions --- .../repo_processor/get_non_code_files.py | 21 ++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/cognee/tasks/repo_processor/get_non_code_files.py b/cognee/tasks/repo_processor/get_non_code_files.py index 671b998d9..f060782b6 100644 --- a/cognee/tasks/repo_processor/get_non_code_files.py +++ b/cognee/tasks/repo_processor/get_non_code_files.py @@ -25,8 +25,27 @@ async def get_non_py_files(repo_path): 'node_modules', '*.egg-info' } + ALLOWED_EXTENSIONS = { + '.txt', '.md', '.csv', '.json', '.xml', '.yaml', '.yml', '.html', + '.css', '.js', '.ts', '.jsx', '.tsx', '.sql', '.log', '.ini', + '.toml', '.properties', '.sh', '.bash', '.dockerfile', '.gitignore', + '.gitattributes', '.makefile', '.pyproject', '.requirements', + '.env', '.pdf', '.doc', '.docx', '.dot', '.dotx', '.rtf', + '.wps', '.wpd', '.odt', '.ott', '.ottx', '.txt', '.wp', + '.sdw', '.sdx', '.docm', '.dotm', + # Additional extensions for other programming languages + '.java', '.c', '.cpp', '.h', '.cs', '.go', '.php', '.rb', + '.swift', '.pl', '.lua', '.rs', '.scala', '.kt', '.sh', + '.sql', '.v', '.asm', '.pas', '.d', '.ml', '.clj', '.cljs', + '.erl', '.ex', '.exs', '.f', '.fs', '.r', '.pyi', + '.pdb', '.ipynb', '.rmd', '.cabal', '.hs', '.nim', + '.vhdl', '.verilog', '.svelte', '.html', '.css', '.scss', + '.less', '.json5', '.yaml', '.yml' + } + def should_process(path): - return not any(pattern in path for pattern in IGNORED_PATTERNS) + _, ext = os.path.splitext(path) + return ext in ALLOWED_EXTENSIONS and not any(pattern in path for pattern in IGNORED_PATTERNS) non_py_files_paths = [ os.path.join(root, file) From dbc33a6478944991f93223a3e443c3acf12460a7 Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Mon, 6 Jan 2025 11:23:55 +0100 Subject: [PATCH 04/24] fix: adhere UnstructuredDocument.read() to Document --- .../data/processing/document_types/UnstructuredDocument.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cognee/modules/data/processing/document_types/UnstructuredDocument.py 
b/cognee/modules/data/processing/document_types/UnstructuredDocument.py index 62632cd08..8da065ff5 100644 --- a/cognee/modules/data/processing/document_types/UnstructuredDocument.py +++ b/cognee/modules/data/processing/document_types/UnstructuredDocument.py @@ -8,7 +8,7 @@ from cognee.modules.data.exceptions import UnstructuredLibraryImportError class UnstructuredDocument(Document): type: str = "unstructured" - def read(self, chunk_size: int): + def read(self, chunk_size: int, chunker = str) -> str: def get_text(): try: from unstructured.partition.auto import partition From 5e79dc53c55405217925c5a511a64efccb67578c Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Mon, 6 Jan 2025 11:25:04 +0100 Subject: [PATCH 05/24] feat: time code graph run and add mock support --- examples/python/code_graph_example.py | 62 ++++++++++++++++++++++++--- 1 file changed, 57 insertions(+), 5 deletions(-) diff --git a/examples/python/code_graph_example.py b/examples/python/code_graph_example.py index 44ab33aad..9cd9f99c4 100644 --- a/examples/python/code_graph_example.py +++ b/examples/python/code_graph_example.py @@ -8,9 +8,61 @@ async def main(repo_path, include_docs): async for result in run_code_graph_pipeline(repo_path, include_docs): print(result) -if __name__ == "__main__": +def parse_args(): parser = argparse.ArgumentParser() - parser.add_argument("--repo_path", type=str, required=True, help="Path to the repository") - parser.add_argument("--include_docs", type=lambda x: x.lower() in ("true", "1"), default=True, help="Whether or not to process non-code files") - args = parser.parse_args() - asyncio.run(main(args.repo_path, args.include_docs)) \ No newline at end of file + parser.add_argument( + "--repo_path", + type=str, + required=True, + help="Path to the repository" + ) + parser.add_argument( + "--include_docs", + type=lambda x: x.lower() in ("true", "1"), + default=True, + help="Whether or not to process non-code files" + ) + parser.add_argument( + "--mock_embedding", + type=lambda x: x.lower() in ("true", "1"), + default=True, + help="Whether or not to mock embedding and code summary" + ) + parser.add_argument( + "--mock_code_summary", + type=lambda x: x.lower() in ("true", "1"), + default=True, + help="Whether or not to mock code summary" + ) + parser.add_argument( + "--time", + type=lambda x: x.lower() in ("true", "1"), + default=True, + help="Whether or not to time the pipeline run" + ) + return parser.parse_args() + +if __name__ == "__main__": + import os + + args = parse_args() + + if args.mock_embedding: + os.environ["MOCK_EMBEDDING"] = "true" + print("Mocking embedding.") + + if args.mock_code_summary: + os.environ["MOCK_CODE_SUMMARY"] = "true" + print("Mocking code summary.") + + if args.time: + import time + start_time = time.time() + asyncio.run(main(args.repo_path, args.include_docs)) + end_time = time.time() + print("\n" + "="*50) + print(f"Pipeline Execution Time: {end_time - start_time:.2f} seconds") + print("="*50 + "\n") + else: + asyncio.run(main(args.repo_path, args.include_docs)) + \ No newline at end of file From 4802567871d34b3926f2e3dfd33ea9655dedcb44 Mon Sep 17 00:00:00 2001 From: alekszievr <44192193+alekszievr@users.noreply.github.com> Date: Tue, 7 Jan 2025 11:46:46 +0100 Subject: [PATCH 06/24] Overcome ContextWindowExceededError by checking token count while chunking (#413) --- cognee/api/v1/cognify/code_graph_pipeline.py | 2 +- cognee/modules/chunking/TextChunker.py | 23 +++++++++++--- .../processing/document_types/Document.py | 3 +- 
.../document_types/ImageDocument.py | 10 ++++-- .../processing/document_types/PdfDocument.py | 10 ++++-- .../processing/document_types/TextDocument.py | 9 ++++-- .../document_types/UnstructuredDocument.py | 8 +++-- cognee/tasks/chunks/chunk_by_paragraph.py | 31 ++++++++++++++++--- .../extract_chunks_from_documents.py | 12 +++++-- 9 files changed, 84 insertions(+), 24 deletions(-) diff --git a/cognee/api/v1/cognify/code_graph_pipeline.py b/cognee/api/v1/cognify/code_graph_pipeline.py index 3d31b4000..2648d0731 100644 --- a/cognee/api/v1/cognify/code_graph_pipeline.py +++ b/cognee/api/v1/cognify/code_graph_pipeline.py @@ -71,7 +71,7 @@ async def run_code_graph_pipeline(repo_path, include_docs=True): Task(ingest_data_with_metadata, dataset_name="repo_docs", user=user), Task(get_data_list_for_user, dataset_name="repo_docs", user=user), Task(classify_documents), - Task(extract_chunks_from_documents), + Task(extract_chunks_from_documents, embedding_model=embedding_engine.model, max_tokens=8192), Task(extract_graph_from_data, graph_model=KnowledgeGraph, task_config={"batch_size": 50}), Task( summarize_text, diff --git a/cognee/modules/chunking/TextChunker.py b/cognee/modules/chunking/TextChunker.py index 64c7aae5c..8ef4bfda9 100644 --- a/cognee/modules/chunking/TextChunker.py +++ b/cognee/modules/chunking/TextChunker.py @@ -1,7 +1,10 @@ -from uuid import uuid5, NAMESPACE_OID +from typing import Optional +from uuid import NAMESPACE_OID, uuid5 + +from cognee.tasks.chunks import chunk_by_paragraph from .models.DocumentChunk import DocumentChunk -from cognee.tasks.chunks import chunk_by_paragraph + class TextChunker(): document = None @@ -9,23 +12,34 @@ class TextChunker(): chunk_index = 0 chunk_size = 0 + token_count = 0 - def __init__(self, document, get_text: callable, chunk_size: int = 1024): + def __init__(self, document, get_text: callable, embedding_model: Optional[str] = None, max_tokens: Optional[int] = None, chunk_size: int = 1024): self.document = document self.max_chunk_size = chunk_size self.get_text = get_text + self.max_tokens = max_tokens if max_tokens else float("inf") + self.embedding_model = embedding_model + + def check_word_count_and_token_count(self, word_count_before, token_count_before, chunk_data): + word_count_fits = word_count_before + chunk_data["word_count"] <= self.max_chunk_size + token_count_fits = token_count_before + chunk_data["token_count"] <= self.max_tokens + return word_count_fits and token_count_fits def read(self): paragraph_chunks = [] for content_text in self.get_text(): for chunk_data in chunk_by_paragraph( content_text, + self.embedding_model, + self.max_tokens, self.max_chunk_size, batch_paragraphs = True, ): - if self.chunk_size + chunk_data["word_count"] <= self.max_chunk_size: + if self.check_word_count_and_token_count(self.chunk_size, self.token_count, chunk_data): paragraph_chunks.append(chunk_data) self.chunk_size += chunk_data["word_count"] + self.token_count += chunk_data["token_count"] else: if len(paragraph_chunks) == 0: yield DocumentChunk( @@ -63,6 +77,7 @@ class TextChunker(): print(e) paragraph_chunks = [chunk_data] self.chunk_size = chunk_data["word_count"] + self.token_count = chunk_data["token_count"] self.chunk_index += 1 diff --git a/cognee/modules/data/processing/document_types/Document.py b/cognee/modules/data/processing/document_types/Document.py index 8d6a3dafb..6712175fb 100644 --- a/cognee/modules/data/processing/document_types/Document.py +++ b/cognee/modules/data/processing/document_types/Document.py @@ -1,3 +1,4 @@ +from typing 
import Optional from uuid import UUID from cognee.infrastructure.engine import DataPoint @@ -13,5 +14,5 @@ class Document(DataPoint): "type": "Document" } - def read(self, chunk_size: int, chunker = str) -> str: + def read(self, chunk_size: int, embedding_model: Optional[str], max_tokens: Optional[int], chunker = str) -> str: pass diff --git a/cognee/modules/data/processing/document_types/ImageDocument.py b/cognee/modules/data/processing/document_types/ImageDocument.py index 352486bd8..1f4f281f8 100644 --- a/cognee/modules/data/processing/document_types/ImageDocument.py +++ b/cognee/modules/data/processing/document_types/ImageDocument.py @@ -1,6 +1,10 @@ +from typing import Optional + from cognee.infrastructure.llm.get_llm_client import get_llm_client -from .Document import Document + from .ChunkerMapping import ChunkerConfig +from .Document import Document + class ImageDocument(Document): type: str = "image" @@ -10,11 +14,11 @@ class ImageDocument(Document): result = get_llm_client().transcribe_image(self.raw_data_location) return(result.choices[0].message.content) - def read(self, chunk_size: int, chunker: str): + def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]): # Transcribe the image file text = self.transcribe_image() chunker_func = ChunkerConfig.get_chunker(chunker) - chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text]) + chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text], embedding_model=embedding_model, max_tokens=max_tokens) yield from chunker.read() diff --git a/cognee/modules/data/processing/document_types/PdfDocument.py b/cognee/modules/data/processing/document_types/PdfDocument.py index 361214718..27dadda33 100644 --- a/cognee/modules/data/processing/document_types/PdfDocument.py +++ b/cognee/modules/data/processing/document_types/PdfDocument.py @@ -1,11 +1,15 @@ +from typing import Optional + from pypdf import PdfReader -from .Document import Document + from .ChunkerMapping import ChunkerConfig +from .Document import Document + class PdfDocument(Document): type: str = "pdf" - def read(self, chunk_size: int, chunker: str): + def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]): file = PdfReader(self.raw_data_location) def get_text(): @@ -14,7 +18,7 @@ class PdfDocument(Document): yield page_text chunker_func = ChunkerConfig.get_chunker(chunker) - chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text) + chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text, embedding_model=embedding_model, max_tokens=max_tokens) yield from chunker.read() diff --git a/cognee/modules/data/processing/document_types/TextDocument.py b/cognee/modules/data/processing/document_types/TextDocument.py index 3952d9845..895a6f8b6 100644 --- a/cognee/modules/data/processing/document_types/TextDocument.py +++ b/cognee/modules/data/processing/document_types/TextDocument.py @@ -1,10 +1,13 @@ -from .Document import Document +from typing import Optional + from .ChunkerMapping import ChunkerConfig +from .Document import Document + class TextDocument(Document): type: str = "text" - def read(self, chunk_size: int, chunker: str): + def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]): def get_text(): with open(self.raw_data_location, mode = "r", encoding = "utf-8") as file: while True: @@ -17,6 +20,6 @@ class TextDocument(Document): chunker_func = 
ChunkerConfig.get_chunker(chunker) - chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text) + chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text, embedding_model=embedding_model, max_tokens=max_tokens) yield from chunker.read() diff --git a/cognee/modules/data/processing/document_types/UnstructuredDocument.py b/cognee/modules/data/processing/document_types/UnstructuredDocument.py index 8da065ff5..c94ca4a25 100644 --- a/cognee/modules/data/processing/document_types/UnstructuredDocument.py +++ b/cognee/modules/data/processing/document_types/UnstructuredDocument.py @@ -1,14 +1,16 @@ from io import StringIO +from typing import Optional from cognee.modules.chunking.TextChunker import TextChunker -from .Document import Document from cognee.modules.data.exceptions import UnstructuredLibraryImportError +from .Document import Document + class UnstructuredDocument(Document): type: str = "unstructured" - def read(self, chunk_size: int, chunker = str) -> str: + def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]) -> str: def get_text(): try: from unstructured.partition.auto import partition @@ -27,6 +29,6 @@ class UnstructuredDocument(Document): yield text - chunker = TextChunker(self, chunk_size = chunk_size, get_text = get_text) + chunker = TextChunker(self, chunk_size = chunk_size, get_text = get_text, embedding_model=embedding_model, max_tokens=max_tokens) yield from chunker.read() diff --git a/cognee/tasks/chunks/chunk_by_paragraph.py b/cognee/tasks/chunks/chunk_by_paragraph.py index 00bb5670c..546d4a1a7 100644 --- a/cognee/tasks/chunks/chunk_by_paragraph.py +++ b/cognee/tasks/chunks/chunk_by_paragraph.py @@ -1,8 +1,18 @@ -from uuid import uuid5, NAMESPACE_OID -from typing import Dict, Any, Iterator +from typing import Any, Dict, Iterator, Optional, Union +from uuid import NAMESPACE_OID, uuid5 + +import tiktoken + from .chunk_by_sentence import chunk_by_sentence -def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs: bool = True) -> Iterator[Dict[str, Any]]: + +def chunk_by_paragraph( + data: str, + embedding_model: Optional[str], + max_tokens: Optional[Union[int, float]], + paragraph_length: int = 1024, + batch_paragraphs: bool = True + ) -> Iterator[Dict[str, Any]]: """ Chunks text by paragraph while preserving exact text reconstruction capability. When chunks are joined with empty string "", they reproduce the original text exactly. 
@@ -12,14 +22,22 @@ def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs chunk_index = 0 paragraph_ids = [] last_cut_type = None + current_token_count = 0 for paragraph_id, sentence, word_count, end_type in chunk_by_sentence(data, maximum_length=paragraph_length): # Check if this sentence would exceed length limit - if current_word_count > 0 and current_word_count + word_count > paragraph_length: + if embedding_model: + tokenizer = tiktoken.encoding_for_model(embedding_model) + token_count = len(tokenizer.encode(sentence)) + else: + token_count = 0 + + if current_word_count > 0 and (current_word_count + word_count > paragraph_length or current_token_count + token_count > max_tokens): # Yield current chunk chunk_dict = { "text": current_chunk, "word_count": current_word_count, + "token_count": current_token_count, "chunk_id": uuid5(NAMESPACE_OID, current_chunk), "paragraph_ids": paragraph_ids, "chunk_index": chunk_index, @@ -32,11 +50,13 @@ def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs paragraph_ids = [] current_chunk = "" current_word_count = 0 + current_token_count = 0 chunk_index += 1 paragraph_ids.append(paragraph_id) current_chunk += sentence current_word_count += word_count + current_token_count += token_count # Handle end of paragraph if end_type in ("paragraph_end", "sentence_cut") and not batch_paragraphs: @@ -44,6 +64,7 @@ def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs chunk_dict = { "text": current_chunk, "word_count": current_word_count, + "token_count": current_token_count, "paragraph_ids": paragraph_ids, "chunk_id": uuid5(NAMESPACE_OID, current_chunk), "chunk_index": chunk_index, @@ -53,6 +74,7 @@ def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs paragraph_ids = [] current_chunk = "" current_word_count = 0 + current_token_count = 0 chunk_index += 1 last_cut_type = end_type @@ -62,6 +84,7 @@ def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs chunk_dict = { "text": current_chunk, "word_count": current_word_count, + "token_count": current_token_count, "chunk_id": uuid5(NAMESPACE_OID, current_chunk), "paragraph_ids": paragraph_ids, "chunk_index": chunk_index, diff --git a/cognee/tasks/documents/extract_chunks_from_documents.py b/cognee/tasks/documents/extract_chunks_from_documents.py index 423b87b69..ddcdb8765 100644 --- a/cognee/tasks/documents/extract_chunks_from_documents.py +++ b/cognee/tasks/documents/extract_chunks_from_documents.py @@ -1,7 +1,15 @@ +from typing import Optional + from cognee.modules.data.processing.document_types.Document import Document -async def extract_chunks_from_documents(documents: list[Document], chunk_size: int = 1024, chunker = 'text_chunker'): +async def extract_chunks_from_documents( + documents: list[Document], + chunk_size: int = 1024, + chunker='text_chunker', + embedding_model: Optional[str] = None, + max_tokens: Optional[int] = None, + ): for document in documents: - for document_chunk in document.read(chunk_size = chunk_size, chunker = chunker): + for document_chunk in document.read(chunk_size=chunk_size, chunker=chunker, embedding_model=embedding_model, max_tokens=max_tokens): yield document_chunk From a774191ed3153442bbdc29a79e90f45c51bc5cc5 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Tue, 7 Jan 2025 13:38:23 +0100 Subject: [PATCH 07/24] Adjust AudioDocument and handle None token limit --- .../data/processing/document_types/AudioDocument.py | 10 +++++++--- 
cognee/tasks/chunks/chunk_by_paragraph.py | 2 ++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/cognee/modules/data/processing/document_types/AudioDocument.py b/cognee/modules/data/processing/document_types/AudioDocument.py index 268338703..a59064674 100644 --- a/cognee/modules/data/processing/document_types/AudioDocument.py +++ b/cognee/modules/data/processing/document_types/AudioDocument.py @@ -1,6 +1,10 @@ +from typing import Optional + from cognee.infrastructure.llm.get_llm_client import get_llm_client -from .Document import Document + from .ChunkerMapping import ChunkerConfig +from .Document import Document + class AudioDocument(Document): type: str = "audio" @@ -9,12 +13,12 @@ class AudioDocument(Document): result = get_llm_client().create_transcript(self.raw_data_location) return(result.text) - def read(self, chunk_size: int, chunker: str): + def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]): # Transcribe the audio file text = self.create_transcript() chunker_func = ChunkerConfig.get_chunker(chunker) - chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text]) + chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text], embedding_model=embedding_model, max_tokens=max_tokens) yield from chunker.read() diff --git a/cognee/tasks/chunks/chunk_by_paragraph.py b/cognee/tasks/chunks/chunk_by_paragraph.py index 546d4a1a7..2bbd9689f 100644 --- a/cognee/tasks/chunks/chunk_by_paragraph.py +++ b/cognee/tasks/chunks/chunk_by_paragraph.py @@ -23,6 +23,8 @@ def chunk_by_paragraph( paragraph_ids = [] last_cut_type = None current_token_count = 0 + if not max_tokens: + max_tokens = float("inf") for paragraph_id, sentence, word_count, end_type in chunk_by_sentence(data, maximum_length=paragraph_length): # Check if this sentence would exceed length limit From fb13a1b61a42c6b02ad85e70644c73aef722c1d7 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Tue, 7 Jan 2025 15:00:58 +0100 Subject: [PATCH 08/24] Handle azure models as well --- cognee/tasks/chunks/chunk_by_paragraph.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cognee/tasks/chunks/chunk_by_paragraph.py b/cognee/tasks/chunks/chunk_by_paragraph.py index 2bbd9689f..b3c191e29 100644 --- a/cognee/tasks/chunks/chunk_by_paragraph.py +++ b/cognee/tasks/chunks/chunk_by_paragraph.py @@ -29,6 +29,8 @@ def chunk_by_paragraph( for paragraph_id, sentence, word_count, end_type in chunk_by_sentence(data, maximum_length=paragraph_length): # Check if this sentence would exceed length limit if embedding_model: + if embedding_model.startswith("azure/"): + embedding_model = embedding_model.split("/")[-1] tokenizer = tiktoken.encoding_for_model(embedding_model) token_count = len(tokenizer.encode(sentence)) else: From 8ffef5034ae560c7514d21ae2b58d1f30013354d Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Wed, 8 Jan 2025 12:25:31 +0100 Subject: [PATCH 09/24] Add clean logging to code graph example --- examples/python/code_graph_example.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/python/code_graph_example.py b/examples/python/code_graph_example.py index 9cd9f99c4..afc83beb0 100644 --- a/examples/python/code_graph_example.py +++ b/examples/python/code_graph_example.py @@ -1,7 +1,9 @@ import argparse import asyncio +import logging from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline +from cognee.shared.utils import setup_logging async def main(repo_path, include_docs): @@ -43,6 +45,8 @@ def 
parse_args(): return parser.parse_args() if __name__ == "__main__": + setup_logging(logging.ERROR) + import os args = parse_args() From f4397bf940e3a54a745ac1be19cadb9e33a28ae4 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Wed, 8 Jan 2025 12:33:14 +0100 Subject: [PATCH 10/24] Remove setting envvars from arg --- examples/python/code_graph_example.py | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/examples/python/code_graph_example.py b/examples/python/code_graph_example.py index afc83beb0..16eafb024 100644 --- a/examples/python/code_graph_example.py +++ b/examples/python/code_graph_example.py @@ -24,18 +24,6 @@ def parse_args(): default=True, help="Whether or not to process non-code files" ) - parser.add_argument( - "--mock_embedding", - type=lambda x: x.lower() in ("true", "1"), - default=True, - help="Whether or not to mock embedding and code summary" - ) - parser.add_argument( - "--mock_code_summary", - type=lambda x: x.lower() in ("true", "1"), - default=True, - help="Whether or not to mock code summary" - ) parser.add_argument( "--time", type=lambda x: x.lower() in ("true", "1"), @@ -47,18 +35,8 @@ def parse_args(): if __name__ == "__main__": setup_logging(logging.ERROR) - import os - args = parse_args() - if args.mock_embedding: - os.environ["MOCK_EMBEDDING"] = "true" - print("Mocking embedding.") - - if args.mock_code_summary: - os.environ["MOCK_CODE_SUMMARY"] = "true" - print("Mocking code summary.") - if args.time: import time start_time = time.time() From 34a9267f414efc9553509bfdbf63bbee6aa5be69 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Wed, 8 Jan 2025 13:23:17 +0100 Subject: [PATCH 11/24] Get embedding engine instead of passing it. Get it from vector engine instead of direct getter. --- cognee/api/v1/cognify/code_graph_pipeline.py | 2 +- cognee/modules/chunking/TextChunker.py | 6 ++---- .../document_types/AudioDocument.py | 4 ++-- .../processing/document_types/Document.py | 2 +- .../document_types/ImageDocument.py | 4 ++-- .../processing/document_types/PdfDocument.py | 4 ++-- .../processing/document_types/TextDocument.py | 4 ++-- .../document_types/UnstructuredDocument.py | 4 ++-- cognee/tasks/chunks/chunk_by_paragraph.py | 19 ++++++++++--------- .../extract_chunks_from_documents.py | 3 +-- 10 files changed, 25 insertions(+), 27 deletions(-) diff --git a/cognee/api/v1/cognify/code_graph_pipeline.py b/cognee/api/v1/cognify/code_graph_pipeline.py index 2648d0731..7ba461f88 100644 --- a/cognee/api/v1/cognify/code_graph_pipeline.py +++ b/cognee/api/v1/cognify/code_graph_pipeline.py @@ -71,7 +71,7 @@ async def run_code_graph_pipeline(repo_path, include_docs=True): Task(ingest_data_with_metadata, dataset_name="repo_docs", user=user), Task(get_data_list_for_user, dataset_name="repo_docs", user=user), Task(classify_documents), - Task(extract_chunks_from_documents, embedding_model=embedding_engine.model, max_tokens=8192), + Task(extract_chunks_from_documents, max_tokens=8192), Task(extract_graph_from_data, graph_model=KnowledgeGraph, task_config={"batch_size": 50}), Task( summarize_text, diff --git a/cognee/modules/chunking/TextChunker.py b/cognee/modules/chunking/TextChunker.py index 8ef4bfda9..a9cb52bf0 100644 --- a/cognee/modules/chunking/TextChunker.py +++ b/cognee/modules/chunking/TextChunker.py @@ -14,13 +14,12 @@ class TextChunker(): chunk_size = 0 token_count = 0 - def __init__(self, document, get_text: callable, embedding_model: Optional[str] = None, max_tokens: Optional[int] = None, chunk_size: int = 1024): + def __init__(self, document, 
get_text: callable, max_tokens: Optional[int] = None, chunk_size: int = 1024): self.document = document self.max_chunk_size = chunk_size self.get_text = get_text self.max_tokens = max_tokens if max_tokens else float("inf") - self.embedding_model = embedding_model - + def check_word_count_and_token_count(self, word_count_before, token_count_before, chunk_data): word_count_fits = word_count_before + chunk_data["word_count"] <= self.max_chunk_size token_count_fits = token_count_before + chunk_data["token_count"] <= self.max_tokens @@ -31,7 +30,6 @@ class TextChunker(): for content_text in self.get_text(): for chunk_data in chunk_by_paragraph( content_text, - self.embedding_model, self.max_tokens, self.max_chunk_size, batch_paragraphs = True, diff --git a/cognee/modules/data/processing/document_types/AudioDocument.py b/cognee/modules/data/processing/document_types/AudioDocument.py index a59064674..c4e6ae87c 100644 --- a/cognee/modules/data/processing/document_types/AudioDocument.py +++ b/cognee/modules/data/processing/document_types/AudioDocument.py @@ -13,12 +13,12 @@ class AudioDocument(Document): result = get_llm_client().create_transcript(self.raw_data_location) return(result.text) - def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]): + def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]): # Transcribe the audio file text = self.create_transcript() chunker_func = ChunkerConfig.get_chunker(chunker) - chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text], embedding_model=embedding_model, max_tokens=max_tokens) + chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text], max_tokens=max_tokens) yield from chunker.read() diff --git a/cognee/modules/data/processing/document_types/Document.py b/cognee/modules/data/processing/document_types/Document.py index 6712175fb..7c76d3f23 100644 --- a/cognee/modules/data/processing/document_types/Document.py +++ b/cognee/modules/data/processing/document_types/Document.py @@ -14,5 +14,5 @@ class Document(DataPoint): "type": "Document" } - def read(self, chunk_size: int, embedding_model: Optional[str], max_tokens: Optional[int], chunker = str) -> str: + def read(self, chunk_size: int, max_tokens: Optional[int], chunker = str) -> str: pass diff --git a/cognee/modules/data/processing/document_types/ImageDocument.py b/cognee/modules/data/processing/document_types/ImageDocument.py index 1f4f281f8..ffe8ff3f9 100644 --- a/cognee/modules/data/processing/document_types/ImageDocument.py +++ b/cognee/modules/data/processing/document_types/ImageDocument.py @@ -14,11 +14,11 @@ class ImageDocument(Document): result = get_llm_client().transcribe_image(self.raw_data_location) return(result.choices[0].message.content) - def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]): + def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]): # Transcribe the image file text = self.transcribe_image() chunker_func = ChunkerConfig.get_chunker(chunker) - chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text], embedding_model=embedding_model, max_tokens=max_tokens) + chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text], max_tokens=max_tokens) yield from chunker.read() diff --git a/cognee/modules/data/processing/document_types/PdfDocument.py b/cognee/modules/data/processing/document_types/PdfDocument.py index 27dadda33..463911d5b 100644 --- 
a/cognee/modules/data/processing/document_types/PdfDocument.py +++ b/cognee/modules/data/processing/document_types/PdfDocument.py @@ -9,7 +9,7 @@ from .Document import Document class PdfDocument(Document): type: str = "pdf" - def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]): + def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]): file = PdfReader(self.raw_data_location) def get_text(): @@ -18,7 +18,7 @@ class PdfDocument(Document): yield page_text chunker_func = ChunkerConfig.get_chunker(chunker) - chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text, embedding_model=embedding_model, max_tokens=max_tokens) + chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text, max_tokens=max_tokens) yield from chunker.read() diff --git a/cognee/modules/data/processing/document_types/TextDocument.py b/cognee/modules/data/processing/document_types/TextDocument.py index 895a6f8b6..582f47737 100644 --- a/cognee/modules/data/processing/document_types/TextDocument.py +++ b/cognee/modules/data/processing/document_types/TextDocument.py @@ -7,7 +7,7 @@ from .Document import Document class TextDocument(Document): type: str = "text" - def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]): + def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]): def get_text(): with open(self.raw_data_location, mode = "r", encoding = "utf-8") as file: while True: @@ -20,6 +20,6 @@ class TextDocument(Document): chunker_func = ChunkerConfig.get_chunker(chunker) - chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text, embedding_model=embedding_model, max_tokens=max_tokens) + chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text, max_tokens=max_tokens) yield from chunker.read() diff --git a/cognee/modules/data/processing/document_types/UnstructuredDocument.py b/cognee/modules/data/processing/document_types/UnstructuredDocument.py index c94ca4a25..6c70744a0 100644 --- a/cognee/modules/data/processing/document_types/UnstructuredDocument.py +++ b/cognee/modules/data/processing/document_types/UnstructuredDocument.py @@ -10,7 +10,7 @@ from .Document import Document class UnstructuredDocument(Document): type: str = "unstructured" - def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]) -> str: + def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]) -> str: def get_text(): try: from unstructured.partition.auto import partition @@ -29,6 +29,6 @@ class UnstructuredDocument(Document): yield text - chunker = TextChunker(self, chunk_size = chunk_size, get_text = get_text, embedding_model=embedding_model, max_tokens=max_tokens) + chunker = TextChunker(self, chunk_size = chunk_size, get_text = get_text, max_tokens=max_tokens) yield from chunker.read() diff --git a/cognee/tasks/chunks/chunk_by_paragraph.py b/cognee/tasks/chunks/chunk_by_paragraph.py index b3c191e29..8ab66bd7f 100644 --- a/cognee/tasks/chunks/chunk_by_paragraph.py +++ b/cognee/tasks/chunks/chunk_by_paragraph.py @@ -3,12 +3,13 @@ from uuid import NAMESPACE_OID, uuid5 import tiktoken +from cognee.infrastructure.databases.vector import get_vector_engine + from .chunk_by_sentence import chunk_by_sentence def chunk_by_paragraph( data: str, - embedding_model: Optional[str], max_tokens: Optional[Union[int, float]], paragraph_length: int = 1024, batch_paragraphs: bool = True @@ -26,16 +27,16 @@ def 
chunk_by_paragraph( if not max_tokens: max_tokens = float("inf") + vector_engine = get_vector_engine() + embedding_model = vector_engine.embedding_engine.model + for paragraph_id, sentence, word_count, end_type in chunk_by_sentence(data, maximum_length=paragraph_length): # Check if this sentence would exceed length limit - if embedding_model: - if embedding_model.startswith("azure/"): - embedding_model = embedding_model.split("/")[-1] - tokenizer = tiktoken.encoding_for_model(embedding_model) - token_count = len(tokenizer.encode(sentence)) - else: - token_count = 0 - + + embedding_model = embedding_model.split("/")[-1] + tokenizer = tiktoken.encoding_for_model(embedding_model) + token_count = len(tokenizer.encode(sentence)) + if current_word_count > 0 and (current_word_count + word_count > paragraph_length or current_token_count + token_count > max_tokens): # Yield current chunk chunk_dict = { diff --git a/cognee/tasks/documents/extract_chunks_from_documents.py b/cognee/tasks/documents/extract_chunks_from_documents.py index ddcdb8765..e647afbef 100644 --- a/cognee/tasks/documents/extract_chunks_from_documents.py +++ b/cognee/tasks/documents/extract_chunks_from_documents.py @@ -7,9 +7,8 @@ async def extract_chunks_from_documents( documents: list[Document], chunk_size: int = 1024, chunker='text_chunker', - embedding_model: Optional[str] = None, max_tokens: Optional[int] = None, ): for document in documents: - for document_chunk in document.read(chunk_size=chunk_size, chunker=chunker, embedding_model=embedding_model, max_tokens=max_tokens): + for document_chunk in document.read(chunk_size=chunk_size, chunker=chunker, max_tokens=max_tokens): yield document_chunk From 97814e334f282b344cb0357df387b70cbf801397 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Wed, 8 Jan 2025 13:45:04 +0100 Subject: [PATCH 12/24] Get embedding engine instead of passing it in code chunking. 
--- cognee/api/v1/cognify/code_graph_pipeline.py | 6 +----- cognee/tasks/chunks/chunk_by_paragraph.py | 4 ++-- cognee/tasks/repo_processor/get_source_code_chunks.py | 9 ++++++--- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/cognee/api/v1/cognify/code_graph_pipeline.py b/cognee/api/v1/cognify/code_graph_pipeline.py index 7ba461f88..6e06edfa3 100644 --- a/cognee/api/v1/cognify/code_graph_pipeline.py +++ b/cognee/api/v1/cognify/code_graph_pipeline.py @@ -3,8 +3,6 @@ import logging from pathlib import Path from cognee.base_config import get_base_config -from cognee.infrastructure.databases.vector.embeddings import \ - get_embedding_engine from cognee.modules.cognify.config import get_cognify_config from cognee.modules.pipelines import run_tasks from cognee.modules.pipelines.tasks.Task import Task @@ -51,8 +49,6 @@ async def run_code_graph_pipeline(repo_path, include_docs=True): await cognee.prune.prune_system(metadata=True) await create_db_and_tables() - embedding_engine = get_embedding_engine() - cognee_config = get_cognify_config() user = await get_default_user() @@ -60,7 +56,7 @@ async def run_code_graph_pipeline(repo_path, include_docs=True): Task(get_repo_file_dependencies), Task(enrich_dependency_graph), Task(expand_dependency_graph, task_config={"batch_size": 50}), - Task(get_source_code_chunks, embedding_model=embedding_engine.model, task_config={"batch_size": 50}), + Task(get_source_code_chunks, task_config={"batch_size": 50}), Task(summarize_code, task_config={"batch_size": 50}), Task(add_data_points, task_config={"batch_size": 50}), ] diff --git a/cognee/tasks/chunks/chunk_by_paragraph.py b/cognee/tasks/chunks/chunk_by_paragraph.py index 8ab66bd7f..44355a1ad 100644 --- a/cognee/tasks/chunks/chunk_by_paragraph.py +++ b/cognee/tasks/chunks/chunk_by_paragraph.py @@ -29,11 +29,11 @@ def chunk_by_paragraph( vector_engine = get_vector_engine() embedding_model = vector_engine.embedding_engine.model - + embedding_model = embedding_model.split("/")[-1] + for paragraph_id, sentence, word_count, end_type in chunk_by_sentence(data, maximum_length=paragraph_length): # Check if this sentence would exceed length limit - embedding_model = embedding_model.split("/")[-1] tokenizer = tiktoken.encoding_for_model(embedding_model) token_count = len(tokenizer.encode(sentence)) diff --git a/cognee/tasks/repo_processor/get_source_code_chunks.py b/cognee/tasks/repo_processor/get_source_code_chunks.py index 4d0ce3200..0bf7ebe32 100644 --- a/cognee/tasks/repo_processor/get_source_code_chunks.py +++ b/cognee/tasks/repo_processor/get_source_code_chunks.py @@ -5,6 +5,7 @@ from uuid import NAMESPACE_OID, uuid5 import parso import tiktoken +from cognee.infrastructure.databases.vector import get_vector_engine from cognee.infrastructure.engine import DataPoint from cognee.shared.CodeGraphEntities import CodeFile, CodePart, SourceCodeChunk @@ -115,13 +116,15 @@ def get_source_code_chunks_from_code_part( max_tokens: int = 8192, overlap: float = 0.25, granularity: float = 0.1, - model_name: str = "text-embedding-3-large" ) -> Generator[SourceCodeChunk, None, None]: """Yields source code chunks from a CodePart object, with configurable token limits and overlap.""" if not code_file_part.source_code: logger.error(f"No source code in CodeFile {code_file_part.id}") return + vector_engine = get_vector_engine() + embedding_model = vector_engine.embedding_engine.model + model_name = embedding_model.split("/")[-1] tokenizer = tiktoken.encoding_for_model(model_name) max_subchunk_tokens = max(1, int(granularity * 
max_tokens)) subchunk_token_counts = _get_subchunk_token_counts(tokenizer, code_file_part.source_code, max_subchunk_tokens) @@ -141,7 +144,7 @@ def get_source_code_chunks_from_code_part( previous_chunk = current_chunk -async def get_source_code_chunks(data_points: list[DataPoint], embedding_model="text-embedding-3-large") -> \ +async def get_source_code_chunks(data_points: list[DataPoint]) -> \ AsyncGenerator[list[DataPoint], None]: """Processes code graph datapoints, create SourceCodeChink datapoints.""" # TODO: Add support for other embedding models, with max_token mapping @@ -156,7 +159,7 @@ async def get_source_code_chunks(data_points: list[DataPoint], embedding_model=" for code_part in data_point.contains: try: yield code_part - for source_code_chunk in get_source_code_chunks_from_code_part(code_part, model_name=embedding_model): + for source_code_chunk in get_source_code_chunks_from_code_part(code_part): yield source_code_chunk except Exception as e: logger.error(f"Error processing code part: {e}") From abb3ea6d219f8221500fb7a7e7f6cc404cf75b08 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Thu, 9 Jan 2025 11:31:16 +0100 Subject: [PATCH 13/24] Adjust integration tests --- .../data/processing/document_types/AudioDocument.py | 2 +- cognee/modules/data/processing/document_types/Document.py | 2 +- .../data/processing/document_types/ImageDocument.py | 2 +- .../modules/data/processing/document_types/PdfDocument.py | 2 +- .../data/processing/document_types/TextDocument.py | 2 +- .../processing/document_types/UnstructuredDocument.py | 2 +- .../integration/documents/UnstructuredDocument_test.py | 8 ++++---- 7 files changed, 10 insertions(+), 10 deletions(-) diff --git a/cognee/modules/data/processing/document_types/AudioDocument.py b/cognee/modules/data/processing/document_types/AudioDocument.py index faace056b..b7d2476b4 100644 --- a/cognee/modules/data/processing/document_types/AudioDocument.py +++ b/cognee/modules/data/processing/document_types/AudioDocument.py @@ -13,7 +13,7 @@ class AudioDocument(Document): result = get_llm_client().create_transcript(self.raw_data_location) return result.text - def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]): + def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None): # Transcribe the audio file text = self.create_transcript() diff --git a/cognee/modules/data/processing/document_types/Document.py b/cognee/modules/data/processing/document_types/Document.py index 9a29e7797..7ecdf289e 100644 --- a/cognee/modules/data/processing/document_types/Document.py +++ b/cognee/modules/data/processing/document_types/Document.py @@ -11,5 +11,5 @@ class Document(DataPoint): mime_type: str _metadata: dict = {"index_fields": ["name"], "type": "Document"} - def read(self, chunk_size: int, max_tokens: Optional[int], chunker=str) -> str: + def read(self, chunk_size: int, chunker=str, max_tokens: Optional[int] = None) -> str: pass diff --git a/cognee/modules/data/processing/document_types/ImageDocument.py b/cognee/modules/data/processing/document_types/ImageDocument.py index f0c7a6d61..c055b8253 100644 --- a/cognee/modules/data/processing/document_types/ImageDocument.py +++ b/cognee/modules/data/processing/document_types/ImageDocument.py @@ -13,7 +13,7 @@ class ImageDocument(Document): result = get_llm_client().transcribe_image(self.raw_data_location) return result.choices[0].message.content - def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]): + def read(self, chunk_size: int, chunker: str, max_tokens: 
Optional[int] = None): # Transcribe the image file text = self.transcribe_image() diff --git a/cognee/modules/data/processing/document_types/PdfDocument.py b/cognee/modules/data/processing/document_types/PdfDocument.py index 56969c7f8..768f91264 100644 --- a/cognee/modules/data/processing/document_types/PdfDocument.py +++ b/cognee/modules/data/processing/document_types/PdfDocument.py @@ -9,7 +9,7 @@ from .Document import Document class PdfDocument(Document): type: str = "pdf" - def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]): + def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None): file = PdfReader(self.raw_data_location) def get_text(): diff --git a/cognee/modules/data/processing/document_types/TextDocument.py b/cognee/modules/data/processing/document_types/TextDocument.py index 11dc798aa..b62ccd56e 100644 --- a/cognee/modules/data/processing/document_types/TextDocument.py +++ b/cognee/modules/data/processing/document_types/TextDocument.py @@ -7,7 +7,7 @@ from .Document import Document class TextDocument(Document): type: str = "text" - def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]): + def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None): def get_text(): with open(self.raw_data_location, mode="r", encoding="utf-8") as file: while True: diff --git a/cognee/modules/data/processing/document_types/UnstructuredDocument.py b/cognee/modules/data/processing/document_types/UnstructuredDocument.py index d6b64498c..1c291d0dc 100644 --- a/cognee/modules/data/processing/document_types/UnstructuredDocument.py +++ b/cognee/modules/data/processing/document_types/UnstructuredDocument.py @@ -10,7 +10,7 @@ from .Document import Document class UnstructuredDocument(Document): type: str = "unstructured" - def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]) -> str: + def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None) -> str: def get_text(): try: from unstructured.partition.auto import partition diff --git a/cognee/tests/integration/documents/UnstructuredDocument_test.py b/cognee/tests/integration/documents/UnstructuredDocument_test.py index 03b8deb49..e0278de81 100644 --- a/cognee/tests/integration/documents/UnstructuredDocument_test.py +++ b/cognee/tests/integration/documents/UnstructuredDocument_test.py @@ -68,7 +68,7 @@ def test_UnstructuredDocument(): ) # Test PPTX - for paragraph_data in pptx_document.read(chunk_size=1024): + for paragraph_data in pptx_document.read(chunk_size=1024, chunker="text_chunker"): assert 19 == paragraph_data.word_count, f" 19 != {paragraph_data.word_count = }" assert 104 == len(paragraph_data.text), f" 104 != {len(paragraph_data.text) = }" assert ( @@ -76,7 +76,7 @@ def test_UnstructuredDocument(): ), f" sentence_cut != {paragraph_data.cut_type = }" # Test DOCX - for paragraph_data in docx_document.read(chunk_size=1024): + for paragraph_data in docx_document.read(chunk_size=1024, chunker="text_chunker"): assert 16 == paragraph_data.word_count, f" 16 != {paragraph_data.word_count = }" assert 145 == len(paragraph_data.text), f" 145 != {len(paragraph_data.text) = }" assert ( @@ -84,7 +84,7 @@ def test_UnstructuredDocument(): ), f" sentence_end != {paragraph_data.cut_type = }" # TEST CSV - for paragraph_data in csv_document.read(chunk_size=1024): + for paragraph_data in csv_document.read(chunk_size=1024, chunker="text_chunker"): assert 15 == paragraph_data.word_count, f" 15 != {paragraph_data.word_count = }" assert ( "A A A A A A A A 
A,A A A A A A,A A" == paragraph_data.text @@ -94,7 +94,7 @@ def test_UnstructuredDocument(): ), f" sentence_cut != {paragraph_data.cut_type = }" # Test XLSX - for paragraph_data in xlsx_document.read(chunk_size=1024): + for paragraph_data in xlsx_document.read(chunk_size=1024, chunker="text_chunker"): assert 36 == paragraph_data.word_count, f" 36 != {paragraph_data.word_count = }" assert 171 == len(paragraph_data.text), f" 171 != {len(paragraph_data.text) = }" assert ( From cdaae161a8e006415988a45ca529d5bc71f9d632 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Thu, 9 Jan 2025 12:08:42 +0100 Subject: [PATCH 14/24] Handle circular import --- cognee/tasks/repo_processor/__init__.py | 3 --- cognee/tasks/repo_processor/expand_dependency_graph.py | 5 ++++- cognee/tasks/repo_processor/extract_code_parts.py | 4 +++- cognee/tasks/repo_processor/get_local_dependencies.py | 4 +++- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/cognee/tasks/repo_processor/__init__.py b/cognee/tasks/repo_processor/__init__.py index 6dc032547..8f0df23d8 100644 --- a/cognee/tasks/repo_processor/__init__.py +++ b/cognee/tasks/repo_processor/__init__.py @@ -2,6 +2,3 @@ from .enrich_dependency_graph import enrich_dependency_graph from .expand_dependency_graph import expand_dependency_graph from .get_non_code_files import get_data_list_for_user, get_non_py_files from .get_repo_file_dependencies import get_repo_file_dependencies -import logging - -logger = logging.getLogger("task:repo_processor") diff --git a/cognee/tasks/repo_processor/expand_dependency_graph.py b/cognee/tasks/repo_processor/expand_dependency_graph.py index de26fe8d4..d3f5d1b07 100644 --- a/cognee/tasks/repo_processor/expand_dependency_graph.py +++ b/cognee/tasks/repo_processor/expand_dependency_graph.py @@ -5,7 +5,10 @@ from uuid import NAMESPACE_OID, uuid5 from cognee.infrastructure.engine import DataPoint from cognee.shared.CodeGraphEntities import CodeFile, CodePart from cognee.tasks.repo_processor.extract_code_parts import extract_code_parts -from cognee.tasks.repo_processor import logger + +import logging + +logger = logging.getLogger("task:repo_processor") def _add_code_parts_nodes_and_edges(code_file: CodeFile, part_type, code_parts) -> None: diff --git a/cognee/tasks/repo_processor/extract_code_parts.py b/cognee/tasks/repo_processor/extract_code_parts.py index 76cfef538..c181a87d9 100644 --- a/cognee/tasks/repo_processor/extract_code_parts.py +++ b/cognee/tasks/repo_processor/extract_code_parts.py @@ -1,7 +1,9 @@ from typing import Dict, List import parso -from cognee.tasks.repo_processor import logger +import logging + +logger = logging.getLogger("task:repo_processor") def _extract_parts_from_module(module, parts_dict: Dict[str, List[str]]) -> Dict[str, List[str]]: diff --git a/cognee/tasks/repo_processor/get_local_dependencies.py b/cognee/tasks/repo_processor/get_local_dependencies.py index b443829c9..92b50cd0b 100644 --- a/cognee/tasks/repo_processor/get_local_dependencies.py +++ b/cognee/tasks/repo_processor/get_local_dependencies.py @@ -10,7 +10,9 @@ import jedi import parso from parso.tree import BaseNode -from cognee.tasks.repo_processor import logger +import logging + +logger = logging.getLogger("task:repo_processor") @contextmanager From 626bc76f5ccdb830e741cf74464e1e3a967dec75 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Thu, 9 Jan 2025 12:53:26 +0100 Subject: [PATCH 15/24] Set max_tokens in config --- cognee/api/v1/cognify/code_graph_pipeline.py | 2 +- cognee/modules/cognify/config.py | 4 +++- 2 files changed, 
4 insertions(+), 2 deletions(-) diff --git a/cognee/api/v1/cognify/code_graph_pipeline.py b/cognee/api/v1/cognify/code_graph_pipeline.py index 53c41d43b..2d077f39b 100644 --- a/cognee/api/v1/cognify/code_graph_pipeline.py +++ b/cognee/api/v1/cognify/code_graph_pipeline.py @@ -71,7 +71,7 @@ async def run_code_graph_pipeline(repo_path, include_docs=True): Task(ingest_data_with_metadata, dataset_name="repo_docs", user=user), Task(get_data_list_for_user, dataset_name="repo_docs", user=user), Task(classify_documents), - Task(extract_chunks_from_documents, max_tokens=8192), + Task(extract_chunks_from_documents, max_tokens=cognee_config.max_tokens), Task( extract_graph_from_data, graph_model=KnowledgeGraph, task_config={"batch_size": 50} ), diff --git a/cognee/modules/cognify/config.py b/cognee/modules/cognify/config.py index d40410bfc..dd94d8b41 100644 --- a/cognee/modules/cognify/config.py +++ b/cognee/modules/cognify/config.py @@ -1,12 +1,14 @@ from functools import lru_cache from pydantic_settings import BaseSettings, SettingsConfigDict from cognee.shared.data_models import DefaultContentPrediction, SummarizedContent +from typing import Optional +import os class CognifyConfig(BaseSettings): classification_model: object = DefaultContentPrediction summarization_model: object = SummarizedContent - + max_tokens: Optional[int] = os.getenv("MAX_TOKENS") model_config = SettingsConfigDict(env_file=".env", extra="allow") def to_dict(self) -> dict: From d7b2186300db585ee5dd1affb20e85e3017a1606 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Thu, 9 Jan 2025 14:27:37 +0100 Subject: [PATCH 16/24] Adjust SWE-bench script to code graph pipeline call --- evals/eval_swe_bench.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/evals/eval_swe_bench.py b/evals/eval_swe_bench.py index 789c95ab4..509530685 100644 --- a/evals/eval_swe_bench.py +++ b/evals/eval_swe_bench.py @@ -34,9 +34,8 @@ def check_install_package(package_name): async def generate_patch_with_cognee(instance, llm_client, search_type=SearchType.CHUNKS): repo_path = download_github_repo(instance, "../RAW_GIT_REPOS") - pipeline = await run_code_graph_pipeline(repo_path) - async for result in pipeline: + async for result in run_code_graph_pipeline(repo_path, include_docs=True): print(result) print("Here we have the repo under the repo_path") @@ -47,7 +46,16 @@ async def generate_patch_with_cognee(instance, llm_client, search_type=SearchTyp instructions = read_query_prompt("patch_gen_kg_instructions.txt") retrieved_edges = await brute_force_triplet_search( - problem_statement, top_k=3, collections=["data_point_source_code", "data_point_text"] + problem_statement, + top_k=3, + collections=[ + "code_summary_text", + "data_point_name", + "document_chunk_text", + "entity_name", + "entity_type_name", + "sourcecodechunk_source_code", + ], ) retrieved_edges_str = retrieved_edges_to_string(retrieved_edges) From 18bb282fbc7fb3c7e770641bd0b64fa38af7dd92 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Thu, 9 Jan 2025 14:27:37 +0100 Subject: [PATCH 17/24] Adjust SWE-bench script to code graph pipeline call --- evals/eval_swe_bench.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/evals/eval_swe_bench.py b/evals/eval_swe_bench.py index 789c95ab4..20e005751 100644 --- a/evals/eval_swe_bench.py +++ b/evals/eval_swe_bench.py @@ -34,9 +34,8 @@ def check_install_package(package_name): async def generate_patch_with_cognee(instance, llm_client, search_type=SearchType.CHUNKS): repo_path = 
download_github_repo(instance, "../RAW_GIT_REPOS") - pipeline = await run_code_graph_pipeline(repo_path) - async for result in pipeline: + async for result in run_code_graph_pipeline(repo_path, include_docs=True): print(result) print("Here we have the repo under the repo_path") @@ -47,7 +46,9 @@ async def generate_patch_with_cognee(instance, llm_client, search_type=SearchTyp instructions = read_query_prompt("patch_gen_kg_instructions.txt") retrieved_edges = await brute_force_triplet_search( - problem_statement, top_k=3, collections=["data_point_source_code", "data_point_text"] + problem_statement, + top_k=3, + collections=["code_summary_text"], ) retrieved_edges_str = retrieved_edges_to_string(retrieved_edges) From 56cc2233027e80dda25fe0fb7ee14347915800d6 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Thu, 9 Jan 2025 16:46:41 +0100 Subject: [PATCH 18/24] feat: adds pydantic types to graph layer models --- cognee/modules/chunking/models/DocumentChunk.py | 1 + cognee/modules/engine/models/Entity.py | 1 + cognee/modules/engine/models/EntityType.py | 1 + cognee/shared/CodeGraphEntities.py | 4 ++++ cognee/tasks/summarization/models.py | 1 + 5 files changed, 8 insertions(+) diff --git a/cognee/modules/chunking/models/DocumentChunk.py b/cognee/modules/chunking/models/DocumentChunk.py index 4920e9b06..a232d50a1 100644 --- a/cognee/modules/chunking/models/DocumentChunk.py +++ b/cognee/modules/chunking/models/DocumentChunk.py @@ -12,6 +12,7 @@ class DocumentChunk(DataPoint): chunk_index: int cut_type: str is_part_of: Document + pydantic_type: str = "DocumentChunk" contains: List[Entity] = None _metadata: dict = {"index_fields": ["text"], "type": "DocumentChunk"} diff --git a/cognee/modules/engine/models/Entity.py b/cognee/modules/engine/models/Entity.py index 63a153bf2..0e57d5dba 100644 --- a/cognee/modules/engine/models/Entity.py +++ b/cognee/modules/engine/models/Entity.py @@ -7,5 +7,6 @@ class Entity(DataPoint): name: str is_a: EntityType description: str + pydantic_type: str = "Entity" _metadata: dict = {"index_fields": ["name"], "type": "Entity"} diff --git a/cognee/modules/engine/models/EntityType.py b/cognee/modules/engine/models/EntityType.py index 7225bb3ae..10799bb33 100644 --- a/cognee/modules/engine/models/EntityType.py +++ b/cognee/modules/engine/models/EntityType.py @@ -5,5 +5,6 @@ class EntityType(DataPoint): __tablename__ = "entity_type" name: str description: str + pydantic_type: str = "EntityType" _metadata: dict = {"index_fields": ["name"], "type": "EntityType"} diff --git a/cognee/shared/CodeGraphEntities.py b/cognee/shared/CodeGraphEntities.py index 164327da0..926aae9fa 100644 --- a/cognee/shared/CodeGraphEntities.py +++ b/cognee/shared/CodeGraphEntities.py @@ -5,12 +5,14 @@ from cognee.infrastructure.engine import DataPoint class Repository(DataPoint): __tablename__ = "Repository" path: str + pydantic_type: str = "Repository" _metadata: dict = {"index_fields": [], "type": "Repository"} class CodeFile(DataPoint): __tablename__ = "codefile" extracted_id: str # actually file path + pydantic_type: str = "CodeFile" source_code: Optional[str] = None part_of: Optional[Repository] = None depends_on: Optional[List["CodeFile"]] = None @@ -22,6 +24,7 @@ class CodeFile(DataPoint): class CodePart(DataPoint): __tablename__ = "codepart" # part_of: Optional[CodeFile] = None + pydantic_type: str = "CodePart" source_code: Optional[str] = None _metadata: dict = {"index_fields": [], "type": "CodePart"} @@ -30,6 +33,7 @@ class SourceCodeChunk(DataPoint): 
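The pydantic_type fields introduced above act as a plain string discriminator that is stored with each node, so retrieval code can tell node kinds apart after graph projection without importing the concrete model classes. A small illustrative sketch of that pattern (not part of the patch series; ConfigFile is a hypothetical model invented here, and the projected node is modelled as a plain dict, whereas the retriever below reads the same field via node.get_attribute("pydantic_type")):

from typing import Optional

from cognee.infrastructure.engine import DataPoint


class ConfigFile(DataPoint):
    # Hypothetical model, shown only to illustrate the discriminator convention.
    __tablename__ = "configfile"
    path: str
    raw_text: Optional[str] = None
    pydantic_type: str = "ConfigFile"  # plain string discriminator serialized with the node
    _metadata: dict = {"index_fields": [], "type": "ConfigFile"}


def describe(node_properties: dict) -> str:
    # After projecting the graph, only the stored properties remain; the
    # discriminator tells us which model a node came from.
    kind = node_properties.get("pydantic_type")
    if kind == "SourceCodeChunk":
        return "chunk of a code part"
    if kind in ("CodePart", "CodeFile"):
        return "code container"
    return f"other node kind: {kind}"


print(describe({"pydantic_type": "CodePart"}))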
__tablename__ = "sourcecodechunk" code_chunk_of: Optional[CodePart] = None source_code: Optional[str] = None + pydantic_type: str = "SourceCodeChunk" previous_chunk: Optional["SourceCodeChunk"] = None _metadata: dict = {"index_fields": ["source_code"], "type": "SourceCodeChunk"} diff --git a/cognee/tasks/summarization/models.py b/cognee/tasks/summarization/models.py index fc62209ce..bc7b4886d 100644 --- a/cognee/tasks/summarization/models.py +++ b/cognee/tasks/summarization/models.py @@ -17,5 +17,6 @@ class CodeSummary(DataPoint): __tablename__ = "code_summary" text: str summarizes: Union[CodeFile, CodePart, SourceCodeChunk] + pydantic_type: str = "CodeSummary" _metadata: dict = {"index_fields": ["text"], "type": "CodeSummary"} From 9604d95ba515ecb1056f2f103a1e83e581c546dc Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Thu, 9 Jan 2025 19:54:58 +0100 Subject: [PATCH 19/24] feat: adds basic retriever for swe bench --- .../description_to_codepart_search.py | 31 +++++++---- cognee/shared/data_models.py | 4 ++ evals/eval_swe_bench.py | 53 ++++++++----------- 3 files changed, 49 insertions(+), 39 deletions(-) diff --git a/cognee/modules/retrieval/description_to_codepart_search.py b/cognee/modules/retrieval/description_to_codepart_search.py index ecd187907..fec17fb16 100644 --- a/cognee/modules/retrieval/description_to_codepart_search.py +++ b/cognee/modules/retrieval/description_to_codepart_search.py @@ -10,7 +10,7 @@ from cognee.modules.users.models import User from cognee.shared.utils import send_telemetry -async def code_description_to_code_part_search(query: str, user: User = None, top_k=2) -> list: +async def code_description_to_code_part_search(query: str, user: User = None, top_k=5) -> list: if user is None: user = await get_default_user() @@ -55,21 +55,23 @@ async def code_description_to_code_part(query: str, user: User, top_k: int) -> L ) try: - results = await vector_engine.search("code_summary_text", query_text=query, limit=top_k) - if not results: + code_summaries = await vector_engine.search( + "code_summary_text", query_text=query, limit=top_k + ) + if not code_summaries: logging.warning("No results found for query: '%s' by user: %s", query, user.id) return [] memory_fragment = CogneeGraph() await memory_fragment.project_graph_from_db( graph_engine, - node_properties_to_project=["id", "type", "text", "source_code"], + node_properties_to_project=["id", "type", "text", "source_code", "pydantic_type"], edge_properties_to_project=["relationship_name"], ) code_pieces_to_return = set() - for node in results: + for node in code_summaries: node_id = str(node.id) node_to_search_from = memory_fragment.get_node(node_id) @@ -78,9 +80,16 @@ async def code_description_to_code_part(query: str, user: User, top_k: int) -> L continue for code_file in node_to_search_from.get_skeleton_neighbours(): - for code_file_edge in code_file.get_skeleton_edges(): - if code_file_edge.get_attribute("relationship_name") == "contains": - code_pieces_to_return.add(code_file_edge.get_destination_node()) + if code_file.get_attribute("pydantic_type") == "SourceCodeChunk": + for code_file_edge in code_file.get_skeleton_edges(): + if code_file_edge.get_attribute("relationship_name") == "code_chunk_of": + code_pieces_to_return.add(code_file_edge.get_destination_node()) + elif code_file.get_attribute("pydantic_type") == "CodePart": + code_pieces_to_return.add(code_file) + elif code_file.get_attribute("pydantic_type") == "CodeFile": + for code_file_edge in 
code_file.get_skeleton_edges(): + if code_file_edge.get_attribute("relationship_name") == "contains": + code_pieces_to_return.add(code_file_edge.get_destination_node()) logging.info( "Search completed for user: %s, query: '%s'. Found %d code pieces.", @@ -89,7 +98,11 @@ async def code_description_to_code_part(query: str, user: User, top_k: int) -> L len(code_pieces_to_return), ) - return list(code_pieces_to_return) + context = "" + for code_piece in code_pieces_to_return: + context = context + code_piece.get_attribute("source_code") + + return context except Exception as exec_error: logging.error( diff --git a/cognee/shared/data_models.py b/cognee/shared/data_models.py index d23d2841c..a36a09010 100644 --- a/cognee/shared/data_models.py +++ b/cognee/shared/data_models.py @@ -231,6 +231,7 @@ class SummarizedContent(BaseModel): summary: str description: str + pydantic_type: str = "SummarizedContent" class SummarizedFunction(BaseModel): @@ -239,6 +240,7 @@ class SummarizedFunction(BaseModel): inputs: Optional[List[str]] = None outputs: Optional[List[str]] = None decorators: Optional[List[str]] = None + pydantic_type: str = "SummarizedFunction" class SummarizedClass(BaseModel): @@ -246,6 +248,7 @@ class SummarizedClass(BaseModel): description: str methods: Optional[List[SummarizedFunction]] = None decorators: Optional[List[str]] = None + pydantic_type: str = "SummarizedClass" class SummarizedCode(BaseModel): @@ -256,6 +259,7 @@ class SummarizedCode(BaseModel): classes: List[SummarizedClass] = [] functions: List[SummarizedFunction] = [] workflow_description: Optional[str] = None + pydantic_type: str = "SummarizedCode" class GraphDBType(Enum): diff --git a/evals/eval_swe_bench.py b/evals/eval_swe_bench.py index 20e005751..b5fcc616b 100644 --- a/evals/eval_swe_bench.py +++ b/evals/eval_swe_bench.py @@ -11,7 +11,9 @@ from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline from cognee.api.v1.search import SearchType from cognee.infrastructure.llm.get_llm_client import get_llm_client from cognee.infrastructure.llm.prompts import read_query_prompt -from cognee.modules.retrieval.brute_force_triplet_search import brute_force_triplet_search +from cognee.modules.retrieval.description_to_codepart_search import ( + code_description_to_code_part_search, +) from cognee.shared.utils import render_graph from evals.eval_utils import download_github_repo, retrieved_edges_to_string @@ -32,26 +34,16 @@ def check_install_package(package_name): return False -async def generate_patch_with_cognee(instance, llm_client, search_type=SearchType.CHUNKS): - repo_path = download_github_repo(instance, "../RAW_GIT_REPOS") - - async for result in run_code_graph_pipeline(repo_path, include_docs=True): - print(result) - - print("Here we have the repo under the repo_path") - - await render_graph(None, include_labels=True, include_nodes=True) - +async def generate_patch_with_cognee(instance): + """repo_path = download_github_repo(instance, "../RAW_GIT_REPOS")""" problem_statement = instance["problem_statement"] instructions = read_query_prompt("patch_gen_kg_instructions.txt") - retrieved_edges = await brute_force_triplet_search( - problem_statement, - top_k=3, - collections=["code_summary_text"], - ) + repo_path = "/Users/laszlohajdu/Documents/GitHub/test/" + async for result in run_code_graph_pipeline(repo_path, include_docs=False): + print(result) - retrieved_edges_str = retrieved_edges_to_string(retrieved_edges) + retrieved_codeparts = await code_description_to_code_part_search(problem_statement) prompt = 
"\n".join( [ @@ -60,7 +52,7 @@ async def generate_patch_with_cognee(instance, llm_client, search_type=SearchTyp PATCH_EXAMPLE, "", "These are the retrieved edges:", - retrieved_edges_str, + retrieved_codeparts, ] ) @@ -86,8 +78,6 @@ async def generate_patch_without_cognee(instance, llm_client): async def get_preds(dataset, with_cognee=True): - llm_client = get_llm_client() - if with_cognee: model_name = "with_cognee" pred_func = generate_patch_with_cognee @@ -95,17 +85,18 @@ async def get_preds(dataset, with_cognee=True): model_name = "without_cognee" pred_func = generate_patch_without_cognee - futures = [(instance["instance_id"], pred_func(instance, llm_client)) for instance in dataset] - model_patches = await asyncio.gather(*[x[1] for x in futures]) + preds = [] - preds = [ - { - "instance_id": instance_id, - "model_patch": model_patch, - "model_name_or_path": model_name, - } - for (instance_id, _), model_patch in zip(futures, model_patches) - ] + for instance in dataset: + instance_id = instance["instance_id"] + model_patch = await pred_func(instance) # Sequentially await the async function + preds.append( + { + "instance_id": instance_id, + "model_patch": model_patch, + "model_name_or_path": model_name, + } + ) return preds @@ -135,6 +126,7 @@ async def main(): with open(predictions_path, "w") as file: json.dump(preds, file) + """ This part is for the evaluation subprocess.run( [ "python", @@ -152,6 +144,7 @@ async def main(): "test_run", ] ) + """ if __name__ == "__main__": From 8327c053990f9e71c19a087603865e3cbfa54a8c Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Fri, 10 Jan 2025 11:47:21 +0100 Subject: [PATCH 20/24] Match Ruff version in config to the one in github actions --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 42f12ea51..96bfe6d32 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -10,7 +10,7 @@ repos: - id: check-added-large-files - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. - rev: v0.8.3 + rev: v0.9.0 hooks: # Run the linter. - id: ruff From 6177d04b44e8e16ebd7070fb1b4f127e0aea4d6f Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Fri, 10 Jan 2025 13:03:34 +0100 Subject: [PATCH 21/24] feat: implements code retreiver --- .../llm/prompts/patch_gen_kg_instructions.txt | 9 +++-- .../description_to_codepart_search.py | 38 +++++++++++++++++-- evals/eval_swe_bench.py | 13 ++++--- 3 files changed, 48 insertions(+), 12 deletions(-) diff --git a/cognee/infrastructure/llm/prompts/patch_gen_kg_instructions.txt b/cognee/infrastructure/llm/prompts/patch_gen_kg_instructions.txt index ebbb03f75..3117ac9f1 100644 --- a/cognee/infrastructure/llm/prompts/patch_gen_kg_instructions.txt +++ b/cognee/infrastructure/llm/prompts/patch_gen_kg_instructions.txt @@ -1,3 +1,6 @@ -I need you to solve this issue by looking at the provided edges retrieved from a knowledge graph and -generate a single patch file that I can apply directly to this repository using git apply. -Please respond with a single patch file in the following format. \ No newline at end of file +You are a senior software engineer. I need you to solve this issue by looking at the provided context and +generate a single patch file that I can apply directly to this repository using git apply. 
+Additionally, please make sure that you provide code only with correct syntax and +you apply the patch on the relevant files (together with their path that you can try to find out from the github issue). Don't change the names of existing +functions or classes, as they may be referenced from other code. +Please respond only with a single patch file in the following format without adding any additional context or string. diff --git a/cognee/modules/retrieval/description_to_codepart_search.py b/cognee/modules/retrieval/description_to_codepart_search.py index fec17fb16..538f76a6e 100644 --- a/cognee/modules/retrieval/description_to_codepart_search.py +++ b/cognee/modules/retrieval/description_to_codepart_search.py @@ -8,20 +8,27 @@ from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph from cognee.modules.users.methods import get_default_user from cognee.modules.users.models import User from cognee.shared.utils import send_telemetry +from cognee.api.v1.search import SearchType +from cognee.api.v1.search.search_v2 import search +from cognee.infrastructure.llm.get_llm_client import get_llm_client -async def code_description_to_code_part_search(query: str, user: User = None, top_k=5) -> list: +async def code_description_to_code_part_search( + query: str, include_docs=False, user: User = None, top_k=5 +) -> list: if user is None: user = await get_default_user() if user is None: raise PermissionError("No user found in the system. Please create a user.") - retrieved_codeparts = await code_description_to_code_part(query, user, top_k) + retrieved_codeparts = await code_description_to_code_part(query, user, top_k, include_docs) return retrieved_codeparts -async def code_description_to_code_part(query: str, user: User, top_k: int) -> List[str]: +async def code_description_to_code_part( + query: str, user: User, top_k: int, include_docs: bool +) -> List[str]: """ Maps a code description query to relevant code parts using a CodeGraph pipeline. @@ -29,6 +36,7 @@ async def code_description_to_code_part(query: str, user: User, top_k: int) -> L query (str): The search query describing the code parts. user (User): The user performing the search. top_k (int): Number of codegraph descriptions to match ( num of corresponding codeparts will be higher) + include_docs(bool): Boolean showing whether we have the docs in the graph or not Returns: Set[str]: A set of unique code parts matching the query. @@ -37,6 +45,7 @@ async def code_description_to_code_part(query: str, user: User, top_k: int) -> L ValueError: If arguments are invalid. RuntimeError: If an unexpected error occurs during execution. 
""" + print(include_docs) if not query or not isinstance(query, str): raise ValueError("The query must be a non-empty string.") if top_k <= 0 or not isinstance(top_k, int): @@ -55,6 +64,26 @@ async def code_description_to_code_part(query: str, user: User, top_k: int) -> L ) try: + if include_docs: + search_results = await search(SearchType.INSIGHTS, query_text=query) + + concatenated_descriptions = " ".join( + obj["description"] + for tpl in search_results + for obj in tpl + if isinstance(obj, dict) and "description" in obj + ) + + llm_client = get_llm_client() + context_from_documents = await llm_client.acreate_structured_output( + text_input=f"The retrieved context from documents" + f" is {concatenated_descriptions}.", + system_prompt="You are a Senior Software Engineer, summarize the context from documents" + f" in a way that it is gonna be provided next to codeparts as context" + f" while trying to solve this github issue connected to the project: {query}]", + response_model=str, + ) + code_summaries = await vector_engine.search( "code_summary_text", query_text=query, limit=top_k ) @@ -102,6 +131,9 @@ async def code_description_to_code_part(query: str, user: User, top_k: int) -> L for code_piece in code_pieces_to_return: context = context + code_piece.get_attribute("source_code") + if include_docs: + context = context_from_documents + context + return context except Exception as exec_error: diff --git a/evals/eval_swe_bench.py b/evals/eval_swe_bench.py index b5fcc616b..894acf1bb 100644 --- a/evals/eval_swe_bench.py +++ b/evals/eval_swe_bench.py @@ -14,8 +14,6 @@ from cognee.infrastructure.llm.prompts import read_query_prompt from cognee.modules.retrieval.description_to_codepart_search import ( code_description_to_code_part_search, ) -from cognee.shared.utils import render_graph -from evals.eval_utils import download_github_repo, retrieved_edges_to_string def check_install_package(package_name): @@ -36,14 +34,17 @@ def check_install_package(package_name): async def generate_patch_with_cognee(instance): """repo_path = download_github_repo(instance, "../RAW_GIT_REPOS")""" + include_docs = True problem_statement = instance["problem_statement"] instructions = read_query_prompt("patch_gen_kg_instructions.txt") - repo_path = "/Users/laszlohajdu/Documents/GitHub/test/" - async for result in run_code_graph_pipeline(repo_path, include_docs=False): + repo_path = "/Users/laszlohajdu/Documents/GitHub/graph_rag/" + async for result in run_code_graph_pipeline(repo_path, include_docs=include_docs): print(result) - retrieved_codeparts = await code_description_to_code_part_search(problem_statement) + retrieved_codeparts = await code_description_to_code_part_search( + problem_statement, include_docs=include_docs + ) prompt = "\n".join( [ @@ -51,7 +52,7 @@ async def generate_patch_with_cognee(instance): "", PATCH_EXAMPLE, "", - "These are the retrieved edges:", + "This is the additional context to solve the problem (description from documentation together with codeparts):", retrieved_codeparts, ] ) From 06e8d2268b231abb680a89ff86c2525d16599389 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Fri, 10 Jan 2025 13:52:26 +0100 Subject: [PATCH 22/24] Fix: fixes unit test for codepart search --- .../retrieval/description_to_codepart_search.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/cognee/modules/retrieval/description_to_codepart_search.py b/cognee/modules/retrieval/description_to_codepart_search.py index 538f76a6e..bd506f76a 100644 
--- a/cognee/modules/retrieval/description_to_codepart_search.py +++ b/cognee/modules/retrieval/description_to_codepart_search.py @@ -27,7 +27,7 @@ async def code_description_to_code_part_search( async def code_description_to_code_part( - query: str, user: User, top_k: int, include_docs: bool + query: str, user: User, top_k: int, include_docs: bool = False ) -> List[str]: """ Maps a code description query to relevant code parts using a CodeGraph pipeline. @@ -45,7 +45,6 @@ async def code_description_to_code_part( ValueError: If arguments are invalid. RuntimeError: If an unexpected error occurs during execution. """ - print(include_docs) if not query or not isinstance(query, str): raise ValueError("The query must be a non-empty string.") if top_k <= 0 or not isinstance(top_k, int): @@ -94,7 +93,13 @@ async def code_description_to_code_part( memory_fragment = CogneeGraph() await memory_fragment.project_graph_from_db( graph_engine, - node_properties_to_project=["id", "type", "text", "source_code", "pydantic_type"], + node_properties_to_project=[ + "id", + "type", + "text", + "source_code", + "pydantic_type", + ], edge_properties_to_project=["relationship_name"], ) From 872bc8964843894c3f122f767516ea8e8214e275 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Fri, 10 Jan 2025 15:11:00 +0100 Subject: [PATCH 23/24] Format with Ruff 0.9.0 --- .../databases/graph/neo4j_driver/adapter.py | 2 +- .../retrieval/brute_force_triplet_search.py | 2 +- .../description_to_codepart_search.py | 3 +- .../documents/AudioDocument_test.py | 18 +++++----- .../documents/ImageDocument_test.py | 18 +++++----- .../integration/documents/PdfDocument_test.py | 18 +++++----- .../documents/TextDocument_test.py | 18 +++++----- .../documents/UnstructuredDocument_test.py | 30 ++++++++-------- cognee/tests/test_deduplication.py | 12 +++---- cognee/tests/test_falkordb.py | 6 ++-- cognee/tests/test_library.py | 6 ++-- cognee/tests/test_pgvector.py | 36 +++++++++---------- .../chunks/chunk_by_paragraph_2_test.py | 18 +++++----- .../chunks/chunk_by_paragraph_test.py | 6 ++-- .../chunks/chunk_by_sentence_test.py | 12 +++---- .../processing/chunks/chunk_by_word_test.py | 6 ++-- 16 files changed, 105 insertions(+), 106 deletions(-) diff --git a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py index 5490f6b43..3543418fc 100644 --- a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +++ b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py @@ -493,7 +493,7 @@ class Neo4jAdapter(GraphDBInterface): query_edges = f""" MATCH (n)-[r]->(m) - WHERE {where_clause} AND {where_clause.replace('n.', 'm.')} + WHERE {where_clause} AND {where_clause.replace("n.", "m.")} RETURN ID(n) AS source, ID(m) AS target, TYPE(r) AS type, properties(r) AS properties """ result_edges = await self.query(query_edges) diff --git a/cognee/modules/retrieval/brute_force_triplet_search.py b/cognee/modules/retrieval/brute_force_triplet_search.py index fdd312480..9c778505d 100644 --- a/cognee/modules/retrieval/brute_force_triplet_search.py +++ b/cognee/modules/retrieval/brute_force_triplet_search.py @@ -43,7 +43,7 @@ def format_triplets(edges): edge_info = {key: value for key, value in edge_attributes.items() if value is not None} # Create the formatted triplet - triplet = f"Node1: {node1_info}\n" f"Edge: {edge_info}\n" f"Node2: {node2_info}\n\n\n" + triplet = f"Node1: {node1_info}\nEdge: {edge_info}\nNode2: {node2_info}\n\n\n" triplets.append(triplet) return 
"".join(triplets) diff --git a/cognee/modules/retrieval/description_to_codepart_search.py b/cognee/modules/retrieval/description_to_codepart_search.py index bd506f76a..243fdbde3 100644 --- a/cognee/modules/retrieval/description_to_codepart_search.py +++ b/cognee/modules/retrieval/description_to_codepart_search.py @@ -75,8 +75,7 @@ async def code_description_to_code_part( llm_client = get_llm_client() context_from_documents = await llm_client.acreate_structured_output( - text_input=f"The retrieved context from documents" - f" is {concatenated_descriptions}.", + text_input=f"The retrieved context from documents is {concatenated_descriptions}.", system_prompt="You are a Senior Software Engineer, summarize the context from documents" f" in a way that it is gonna be provided next to codeparts as context" f" while trying to solve this github issue connected to the project: {query}]", diff --git a/cognee/tests/integration/documents/AudioDocument_test.py b/cognee/tests/integration/documents/AudioDocument_test.py index dbd43ddda..e07a2431b 100644 --- a/cognee/tests/integration/documents/AudioDocument_test.py +++ b/cognee/tests/integration/documents/AudioDocument_test.py @@ -36,12 +36,12 @@ def test_AudioDocument(): for ground_truth, paragraph_data in zip( GROUND_TRUTH, document.read(chunk_size=64, chunker="text_chunker") ): - assert ( - ground_truth["word_count"] == paragraph_data.word_count - ), f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }' - assert ground_truth["len_text"] == len( - paragraph_data.text - ), f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }' - assert ( - ground_truth["cut_type"] == paragraph_data.cut_type - ), f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }' + assert ground_truth["word_count"] == paragraph_data.word_count, ( + f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }' + ) + assert ground_truth["len_text"] == len(paragraph_data.text), ( + f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }' + ) + assert ground_truth["cut_type"] == paragraph_data.cut_type, ( + f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }' + ) diff --git a/cognee/tests/integration/documents/ImageDocument_test.py b/cognee/tests/integration/documents/ImageDocument_test.py index c0877ae99..b8d585419 100644 --- a/cognee/tests/integration/documents/ImageDocument_test.py +++ b/cognee/tests/integration/documents/ImageDocument_test.py @@ -25,12 +25,12 @@ def test_ImageDocument(): for ground_truth, paragraph_data in zip( GROUND_TRUTH, document.read(chunk_size=64, chunker="text_chunker") ): - assert ( - ground_truth["word_count"] == paragraph_data.word_count - ), f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }' - assert ground_truth["len_text"] == len( - paragraph_data.text - ), f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }' - assert ( - ground_truth["cut_type"] == paragraph_data.cut_type - ), f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }' + assert ground_truth["word_count"] == paragraph_data.word_count, ( + f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }' + ) + assert ground_truth["len_text"] == len(paragraph_data.text), ( + f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }' + ) + assert ground_truth["cut_type"] == paragraph_data.cut_type, ( + f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }' + ) diff --git a/cognee/tests/integration/documents/PdfDocument_test.py 
b/cognee/tests/integration/documents/PdfDocument_test.py index 8f28815d3..fc4307846 100644 --- a/cognee/tests/integration/documents/PdfDocument_test.py +++ b/cognee/tests/integration/documents/PdfDocument_test.py @@ -27,12 +27,12 @@ def test_PdfDocument(): for ground_truth, paragraph_data in zip( GROUND_TRUTH, document.read(chunk_size=1024, chunker="text_chunker") ): - assert ( - ground_truth["word_count"] == paragraph_data.word_count - ), f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }' - assert ground_truth["len_text"] == len( - paragraph_data.text - ), f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }' - assert ( - ground_truth["cut_type"] == paragraph_data.cut_type - ), f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }' + assert ground_truth["word_count"] == paragraph_data.word_count, ( + f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }' + ) + assert ground_truth["len_text"] == len(paragraph_data.text), ( + f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }' + ) + assert ground_truth["cut_type"] == paragraph_data.cut_type, ( + f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }' + ) diff --git a/cognee/tests/integration/documents/TextDocument_test.py b/cognee/tests/integration/documents/TextDocument_test.py index 1e143d563..6daec62b7 100644 --- a/cognee/tests/integration/documents/TextDocument_test.py +++ b/cognee/tests/integration/documents/TextDocument_test.py @@ -39,12 +39,12 @@ def test_TextDocument(input_file, chunk_size): for ground_truth, paragraph_data in zip( GROUND_TRUTH[input_file], document.read(chunk_size=chunk_size, chunker="text_chunker") ): - assert ( - ground_truth["word_count"] == paragraph_data.word_count - ), f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }' - assert ground_truth["len_text"] == len( - paragraph_data.text - ), f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }' - assert ( - ground_truth["cut_type"] == paragraph_data.cut_type - ), f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }' + assert ground_truth["word_count"] == paragraph_data.word_count, ( + f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }' + ) + assert ground_truth["len_text"] == len(paragraph_data.text), ( + f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }' + ) + assert ground_truth["cut_type"] == paragraph_data.cut_type, ( + f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }' + ) diff --git a/cognee/tests/integration/documents/UnstructuredDocument_test.py b/cognee/tests/integration/documents/UnstructuredDocument_test.py index e0278de81..773dc2293 100644 --- a/cognee/tests/integration/documents/UnstructuredDocument_test.py +++ b/cognee/tests/integration/documents/UnstructuredDocument_test.py @@ -71,32 +71,32 @@ def test_UnstructuredDocument(): for paragraph_data in pptx_document.read(chunk_size=1024, chunker="text_chunker"): assert 19 == paragraph_data.word_count, f" 19 != {paragraph_data.word_count = }" assert 104 == len(paragraph_data.text), f" 104 != {len(paragraph_data.text) = }" - assert ( - "sentence_cut" == paragraph_data.cut_type - ), f" sentence_cut != {paragraph_data.cut_type = }" + assert "sentence_cut" == paragraph_data.cut_type, ( + f" sentence_cut != {paragraph_data.cut_type = }" + ) # Test DOCX for paragraph_data in docx_document.read(chunk_size=1024, chunker="text_chunker"): assert 16 == paragraph_data.word_count, f" 16 != {paragraph_data.word_count = }" assert 145 == 
len(paragraph_data.text), f" 145 != {len(paragraph_data.text) = }" - assert ( - "sentence_end" == paragraph_data.cut_type - ), f" sentence_end != {paragraph_data.cut_type = }" + assert "sentence_end" == paragraph_data.cut_type, ( + f" sentence_end != {paragraph_data.cut_type = }" + ) # TEST CSV for paragraph_data in csv_document.read(chunk_size=1024, chunker="text_chunker"): assert 15 == paragraph_data.word_count, f" 15 != {paragraph_data.word_count = }" - assert ( - "A A A A A A A A A,A A A A A A,A A" == paragraph_data.text - ), f"Read text doesn't match expected text: {paragraph_data.text}" - assert ( - "sentence_cut" == paragraph_data.cut_type - ), f" sentence_cut != {paragraph_data.cut_type = }" + assert "A A A A A A A A A,A A A A A A,A A" == paragraph_data.text, ( + f"Read text doesn't match expected text: {paragraph_data.text}" + ) + assert "sentence_cut" == paragraph_data.cut_type, ( + f" sentence_cut != {paragraph_data.cut_type = }" + ) # Test XLSX for paragraph_data in xlsx_document.read(chunk_size=1024, chunker="text_chunker"): assert 36 == paragraph_data.word_count, f" 36 != {paragraph_data.word_count = }" assert 171 == len(paragraph_data.text), f" 171 != {len(paragraph_data.text) = }" - assert ( - "sentence_cut" == paragraph_data.cut_type - ), f" sentence_cut != {paragraph_data.cut_type = }" + assert "sentence_cut" == paragraph_data.cut_type, ( + f" sentence_cut != {paragraph_data.cut_type = }" + ) diff --git a/cognee/tests/test_deduplication.py b/cognee/tests/test_deduplication.py index 9c2df032d..89c866f12 100644 --- a/cognee/tests/test_deduplication.py +++ b/cognee/tests/test_deduplication.py @@ -30,9 +30,9 @@ async def test_deduplication(): result = await relational_engine.get_all_data_from_table("data") assert len(result) == 1, "More than one data entity was found." - assert ( - result[0]["name"] == "Natural_language_processing_copy" - ), "Result name does not match expected value." + assert result[0]["name"] == "Natural_language_processing_copy", ( + "Result name does not match expected value." + ) result = await relational_engine.get_all_data_from_table("datasets") assert len(result) == 2, "Unexpected number of datasets found." @@ -61,9 +61,9 @@ async def test_deduplication(): result = await relational_engine.get_all_data_from_table("data") assert len(result) == 1, "More than one data entity was found." - assert ( - hashlib.md5(text.encode("utf-8")).hexdigest() in result[0]["name"] - ), "Content hash is not a part of file name." + assert hashlib.md5(text.encode("utf-8")).hexdigest() in result[0]["name"], ( + "Content hash is not a part of file name." 
+ ) await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) diff --git a/cognee/tests/test_falkordb.py b/cognee/tests/test_falkordb.py index 07ece9eb2..af0e87916 100755 --- a/cognee/tests/test_falkordb.py +++ b/cognee/tests/test_falkordb.py @@ -85,9 +85,9 @@ async def main(): from cognee.infrastructure.databases.relational import get_relational_engine - assert not os.path.exists( - get_relational_engine().db_path - ), "SQLite relational database is not empty" + assert not os.path.exists(get_relational_engine().db_path), ( + "SQLite relational database is not empty" + ) from cognee.infrastructure.databases.graph import get_graph_config diff --git a/cognee/tests/test_library.py b/cognee/tests/test_library.py index 8352b4161..192b67506 100755 --- a/cognee/tests/test_library.py +++ b/cognee/tests/test_library.py @@ -82,9 +82,9 @@ async def main(): from cognee.infrastructure.databases.relational import get_relational_engine - assert not os.path.exists( - get_relational_engine().db_path - ), "SQLite relational database is not empty" + assert not os.path.exists(get_relational_engine().db_path), ( + "SQLite relational database is not empty" + ) from cognee.infrastructure.databases.graph import get_graph_config diff --git a/cognee/tests/test_pgvector.py b/cognee/tests/test_pgvector.py index c241177f0..73b6be974 100644 --- a/cognee/tests/test_pgvector.py +++ b/cognee/tests/test_pgvector.py @@ -24,28 +24,28 @@ async def test_local_file_deletion(data_text, file_location): data_hash = hashlib.md5(encoded_text).hexdigest() # Get data entry from database based on hash contents data = (await session.scalars(select(Data).where(Data.content_hash == data_hash))).one() - assert os.path.isfile( - data.raw_data_location - ), f"Data location doesn't exist: {data.raw_data_location}" + assert os.path.isfile(data.raw_data_location), ( + f"Data location doesn't exist: {data.raw_data_location}" + ) # Test deletion of data along with local files created by cognee await engine.delete_data_entity(data.id) - assert not os.path.exists( - data.raw_data_location - ), f"Data location still exists after deletion: {data.raw_data_location}" + assert not os.path.exists(data.raw_data_location), ( + f"Data location still exists after deletion: {data.raw_data_location}" + ) async with engine.get_async_session() as session: # Get data entry from database based on file path data = ( await session.scalars(select(Data).where(Data.raw_data_location == file_location)) ).one() - assert os.path.isfile( - data.raw_data_location - ), f"Data location doesn't exist: {data.raw_data_location}" + assert os.path.isfile(data.raw_data_location), ( + f"Data location doesn't exist: {data.raw_data_location}" + ) # Test local files not created by cognee won't get deleted await engine.delete_data_entity(data.id) - assert os.path.exists( - data.raw_data_location - ), f"Data location doesn't exists: {data.raw_data_location}" + assert os.path.exists(data.raw_data_location), ( + f"Data location doesn't exists: {data.raw_data_location}" + ) async def test_getting_of_documents(dataset_name_1): @@ -54,16 +54,16 @@ async def test_getting_of_documents(dataset_name_1): user = await get_default_user() document_ids = await get_document_ids_for_user(user.id, [dataset_name_1]) - assert ( - len(document_ids) == 1 - ), f"Number of expected documents doesn't match {len(document_ids)} != 1" + assert len(document_ids) == 1, ( + f"Number of expected documents doesn't match {len(document_ids)} != 1" + ) # Test getting of documents for search when no 
dataset is provided user = await get_default_user() document_ids = await get_document_ids_for_user(user.id) - assert ( - len(document_ids) == 2 - ), f"Number of expected documents doesn't match {len(document_ids)} != 2" + assert len(document_ids) == 2, ( + f"Number of expected documents doesn't match {len(document_ids)} != 2" + ) async def main(): diff --git a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_2_test.py b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_2_test.py index 53098fc67..d8680a604 100644 --- a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_2_test.py +++ b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_2_test.py @@ -17,9 +17,9 @@ batch_paragraphs_vals = [True, False] def test_chunk_by_paragraph_isomorphism(input_text, paragraph_length, batch_paragraphs): chunks = chunk_by_paragraph(input_text, paragraph_length, batch_paragraphs) reconstructed_text = "".join([chunk["text"] for chunk in chunks]) - assert ( - reconstructed_text == input_text - ), f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }" + assert reconstructed_text == input_text, ( + f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }" + ) @pytest.mark.parametrize( @@ -36,9 +36,9 @@ def test_paragraph_chunk_length(input_text, paragraph_length, batch_paragraphs): chunk_lengths = np.array([len(list(chunk_by_word(chunk["text"]))) for chunk in chunks]) larger_chunks = chunk_lengths[chunk_lengths > paragraph_length] - assert np.all( - chunk_lengths <= paragraph_length - ), f"{paragraph_length = }: {larger_chunks} are too large" + assert np.all(chunk_lengths <= paragraph_length), ( + f"{paragraph_length = }: {larger_chunks} are too large" + ) @pytest.mark.parametrize( @@ -50,6 +50,6 @@ def test_chunk_by_paragraph_chunk_numbering(input_text, paragraph_length, batch_ data=input_text, paragraph_length=paragraph_length, batch_paragraphs=batch_paragraphs ) chunk_indices = np.array([chunk["chunk_index"] for chunk in chunks]) - assert np.all( - chunk_indices == np.arange(len(chunk_indices)) - ), f"{chunk_indices = } are not monotonically increasing" + assert np.all(chunk_indices == np.arange(len(chunk_indices))), ( + f"{chunk_indices = } are not monotonically increasing" + ) diff --git a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py index e7d9a54ba..e420b2e9f 100644 --- a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py +++ b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py @@ -58,9 +58,9 @@ def run_chunking_test(test_text, expected_chunks): for expected_chunks_item, chunk in zip(expected_chunks, chunks): for key in ["text", "word_count", "cut_type"]: - assert ( - chunk[key] == expected_chunks_item[key] - ), f"{key = }: {chunk[key] = } != {expected_chunks_item[key] = }" + assert chunk[key] == expected_chunks_item[key], ( + f"{key = }: {chunk[key] = } != {expected_chunks_item[key] = }" + ) def test_chunking_whole_text(): diff --git a/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py b/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py index d1c75d7ed..efa053077 100644 --- a/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py +++ b/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py @@ -16,9 +16,9 @@ maximum_length_vals = [None, 8, 64] def test_chunk_by_sentence_isomorphism(input_text, maximum_length): chunks = chunk_by_sentence(input_text, maximum_length) reconstructed_text = "".join([chunk[1] 
for chunk in chunks]) - assert ( - reconstructed_text == input_text - ), f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }" + assert reconstructed_text == input_text, ( + f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }" + ) @pytest.mark.parametrize( @@ -36,6 +36,6 @@ def test_paragraph_chunk_length(input_text, maximum_length): chunk_lengths = np.array([len(list(chunk_by_word(chunk[1]))) for chunk in chunks]) larger_chunks = chunk_lengths[chunk_lengths > maximum_length] - assert np.all( - chunk_lengths <= maximum_length - ), f"{maximum_length = }: {larger_chunks} are too large" + assert np.all(chunk_lengths <= maximum_length), ( + f"{maximum_length = }: {larger_chunks} are too large" + ) diff --git a/cognee/tests/unit/processing/chunks/chunk_by_word_test.py b/cognee/tests/unit/processing/chunks/chunk_by_word_test.py index fb26638cb..d79fcdbc8 100644 --- a/cognee/tests/unit/processing/chunks/chunk_by_word_test.py +++ b/cognee/tests/unit/processing/chunks/chunk_by_word_test.py @@ -17,9 +17,9 @@ from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS def test_chunk_by_word_isomorphism(input_text): chunks = chunk_by_word(input_text) reconstructed_text = "".join([chunk[0] for chunk in chunks]) - assert ( - reconstructed_text == input_text - ), f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }" + assert reconstructed_text == input_text, ( + f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }" + ) @pytest.mark.parametrize( From e2ad54d88e93bed169182feb212fbcb9a3fb86f1 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Fri, 10 Jan 2025 15:54:45 +0100 Subject: [PATCH 24/24] Fix: deleting incorrect repo path --- evals/eval_swe_bench.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/evals/eval_swe_bench.py b/evals/eval_swe_bench.py index 894acf1bb..a8b4c8a1d 100644 --- a/evals/eval_swe_bench.py +++ b/evals/eval_swe_bench.py @@ -14,6 +14,7 @@ from cognee.infrastructure.llm.prompts import read_query_prompt from cognee.modules.retrieval.description_to_codepart_search import ( code_description_to_code_part_search, ) +from evals.eval_utils import download_github_repo, retrieved_edges_to_string def check_install_package(package_name): @@ -33,12 +34,11 @@ def check_install_package(package_name): async def generate_patch_with_cognee(instance): - """repo_path = download_github_repo(instance, "../RAW_GIT_REPOS")""" + repo_path = download_github_repo(instance, "../RAW_GIT_REPOS") include_docs = True problem_statement = instance["problem_statement"] instructions = read_query_prompt("patch_gen_kg_instructions.txt") - repo_path = "/Users/laszlohajdu/Documents/GitHub/graph_rag/" async for result in run_code_graph_pipeline(repo_path, include_docs=include_docs): print(result)
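With the hard-coded local path removed, the evaluation entry point can be driven end to end again. A minimal driver sketch (not part of the patch series), assuming the evals directory is importable from the repository root; the instance stub below is illustrative only, since download_github_repo will additionally need the repo and commit fields carried by a real SWE-bench record:

import asyncio
import json

from evals.eval_swe_bench import get_preds


async def main():
    # One SWE-bench style record; only instance_id and problem_statement are shown here.
    dataset = [
        {
            "instance_id": "example__project-0001",
            "problem_statement": "Calling foo() with an empty list raises IndexError.",
        }
    ]

    # get_preds runs generate_patch_with_cognee sequentially for each instance and
    # returns dicts with instance_id, model_patch and model_name_or_path.
    preds = await get_preds(dataset, with_cognee=True)
    with open("preds.json", "w") as file:
        json.dump(preds, file)


if __name__ == "__main__":
    asyncio.run(main())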