From 56cc2233027e80dda25fe0fb7ee14347915800d6 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Thu, 9 Jan 2025 16:46:41 +0100 Subject: [PATCH 1/4] feat: adds pydantic types to graph layer models --- cognee/modules/chunking/models/DocumentChunk.py | 1 + cognee/modules/engine/models/Entity.py | 1 + cognee/modules/engine/models/EntityType.py | 1 + cognee/shared/CodeGraphEntities.py | 4 ++++ cognee/tasks/summarization/models.py | 1 + 5 files changed, 8 insertions(+) diff --git a/cognee/modules/chunking/models/DocumentChunk.py b/cognee/modules/chunking/models/DocumentChunk.py index 4920e9b06..a232d50a1 100644 --- a/cognee/modules/chunking/models/DocumentChunk.py +++ b/cognee/modules/chunking/models/DocumentChunk.py @@ -12,6 +12,7 @@ class DocumentChunk(DataPoint): chunk_index: int cut_type: str is_part_of: Document + pydantic_type: str = "DocumentChunk" contains: List[Entity] = None _metadata: dict = {"index_fields": ["text"], "type": "DocumentChunk"} diff --git a/cognee/modules/engine/models/Entity.py b/cognee/modules/engine/models/Entity.py index 63a153bf2..0e57d5dba 100644 --- a/cognee/modules/engine/models/Entity.py +++ b/cognee/modules/engine/models/Entity.py @@ -7,5 +7,6 @@ class Entity(DataPoint): name: str is_a: EntityType description: str + pydantic_type: str = "Entity" _metadata: dict = {"index_fields": ["name"], "type": "Entity"} diff --git a/cognee/modules/engine/models/EntityType.py b/cognee/modules/engine/models/EntityType.py index 7225bb3ae..10799bb33 100644 --- a/cognee/modules/engine/models/EntityType.py +++ b/cognee/modules/engine/models/EntityType.py @@ -5,5 +5,6 @@ class EntityType(DataPoint): __tablename__ = "entity_type" name: str description: str + pydantic_type: str = "EntityType" _metadata: dict = {"index_fields": ["name"], "type": "EntityType"} diff --git a/cognee/shared/CodeGraphEntities.py b/cognee/shared/CodeGraphEntities.py index 164327da0..926aae9fa 100644 --- a/cognee/shared/CodeGraphEntities.py 
+++ b/cognee/shared/CodeGraphEntities.py @@ -5,12 +5,14 @@ from cognee.infrastructure.engine import DataPoint class Repository(DataPoint): __tablename__ = "Repository" path: str + pydantic_type: str = "Repository" _metadata: dict = {"index_fields": [], "type": "Repository"} class CodeFile(DataPoint): __tablename__ = "codefile" extracted_id: str # actually file path + pydantic_type: str = "CodeFile" source_code: Optional[str] = None part_of: Optional[Repository] = None depends_on: Optional[List["CodeFile"]] = None @@ -22,6 +24,7 @@ class CodeFile(DataPoint): class CodePart(DataPoint): __tablename__ = "codepart" # part_of: Optional[CodeFile] = None + pydantic_type: str = "CodePart" source_code: Optional[str] = None _metadata: dict = {"index_fields": [], "type": "CodePart"} @@ -30,6 +33,7 @@ class SourceCodeChunk(DataPoint): __tablename__ = "sourcecodechunk" code_chunk_of: Optional[CodePart] = None source_code: Optional[str] = None + pydantic_type: str = "SourceCodeChunk" previous_chunk: Optional["SourceCodeChunk"] = None _metadata: dict = {"index_fields": ["source_code"], "type": "SourceCodeChunk"} diff --git a/cognee/tasks/summarization/models.py b/cognee/tasks/summarization/models.py index fc62209ce..bc7b4886d 100644 --- a/cognee/tasks/summarization/models.py +++ b/cognee/tasks/summarization/models.py @@ -17,5 +17,6 @@ class CodeSummary(DataPoint): __tablename__ = "code_summary" text: str summarizes: Union[CodeFile, CodePart, SourceCodeChunk] + pydantic_type: str = "CodeSummary" _metadata: dict = {"index_fields": ["text"], "type": "CodeSummary"} From 9604d95ba515ecb1056f2f103a1e83e581c546dc Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Thu, 9 Jan 2025 19:54:58 +0100 Subject: [PATCH 2/4] feat: adds basic retriever for swe bench --- .../description_to_codepart_search.py | 31 +++++++---- cognee/shared/data_models.py | 4 ++ evals/eval_swe_bench.py | 53 ++++++++----------- 3 files changed, 49 insertions(+), 39 deletions(-) 
diff --git a/cognee/modules/retrieval/description_to_codepart_search.py b/cognee/modules/retrieval/description_to_codepart_search.py index ecd187907..fec17fb16 100644 --- a/cognee/modules/retrieval/description_to_codepart_search.py +++ b/cognee/modules/retrieval/description_to_codepart_search.py @@ -10,7 +10,7 @@ from cognee.modules.users.models import User from cognee.shared.utils import send_telemetry -async def code_description_to_code_part_search(query: str, user: User = None, top_k=2) -> list: +async def code_description_to_code_part_search(query: str, user: User = None, top_k=5) -> list: if user is None: user = await get_default_user() @@ -55,21 +55,23 @@ async def code_description_to_code_part(query: str, user: User, top_k: int) -> L ) try: - results = await vector_engine.search("code_summary_text", query_text=query, limit=top_k) - if not results: + code_summaries = await vector_engine.search( + "code_summary_text", query_text=query, limit=top_k + ) + if not code_summaries: logging.warning("No results found for query: '%s' by user: %s", query, user.id) return [] memory_fragment = CogneeGraph() await memory_fragment.project_graph_from_db( graph_engine, - node_properties_to_project=["id", "type", "text", "source_code"], + node_properties_to_project=["id", "type", "text", "source_code", "pydantic_type"], edge_properties_to_project=["relationship_name"], ) code_pieces_to_return = set() - for node in results: + for node in code_summaries: node_id = str(node.id) node_to_search_from = memory_fragment.get_node(node_id) @@ -78,9 +80,16 @@ async def code_description_to_code_part(query: str, user: User, top_k: int) -> L continue for code_file in node_to_search_from.get_skeleton_neighbours(): - for code_file_edge in code_file.get_skeleton_edges(): - if code_file_edge.get_attribute("relationship_name") == "contains": - code_pieces_to_return.add(code_file_edge.get_destination_node()) + if code_file.get_attribute("pydantic_type") == "SourceCodeChunk": + for code_file_edge 
in code_file.get_skeleton_edges(): + if code_file_edge.get_attribute("relationship_name") == "code_chunk_of": + code_pieces_to_return.add(code_file_edge.get_destination_node()) + elif code_file.get_attribute("pydantic_type") == "CodePart": + code_pieces_to_return.add(code_file) + elif code_file.get_attribute("pydantic_type") == "CodeFile": + for code_file_edge in code_file.get_skeleton_edges(): + if code_file_edge.get_attribute("relationship_name") == "contains": + code_pieces_to_return.add(code_file_edge.get_destination_node()) logging.info( "Search completed for user: %s, query: '%s'. Found %d code pieces.", @@ -89,7 +98,11 @@ async def code_description_to_code_part(query: str, user: User, top_k: int) -> L len(code_pieces_to_return), ) - return list(code_pieces_to_return) + context = "" + for code_piece in code_pieces_to_return: + context = context + code_piece.get_attribute("source_code") + + return context except Exception as exec_error: logging.error( diff --git a/cognee/shared/data_models.py b/cognee/shared/data_models.py index d23d2841c..a36a09010 100644 --- a/cognee/shared/data_models.py +++ b/cognee/shared/data_models.py @@ -231,6 +231,7 @@ class SummarizedContent(BaseModel): summary: str description: str + pydantic_type: str = "SummarizedContent" class SummarizedFunction(BaseModel): @@ -239,6 +240,7 @@ class SummarizedFunction(BaseModel): inputs: Optional[List[str]] = None outputs: Optional[List[str]] = None decorators: Optional[List[str]] = None + pydantic_type: str = "SummarizedFunction" class SummarizedClass(BaseModel): @@ -246,6 +248,7 @@ class SummarizedClass(BaseModel): description: str methods: Optional[List[SummarizedFunction]] = None decorators: Optional[List[str]] = None + pydantic_type: str = "SummarizedClass" class SummarizedCode(BaseModel): @@ -256,6 +259,7 @@ class SummarizedCode(BaseModel): classes: List[SummarizedClass] = [] functions: List[SummarizedFunction] = [] workflow_description: Optional[str] = None + pydantic_type: str = 
"SummarizedCode" class GraphDBType(Enum): diff --git a/evals/eval_swe_bench.py b/evals/eval_swe_bench.py index 20e005751..b5fcc616b 100644 --- a/evals/eval_swe_bench.py +++ b/evals/eval_swe_bench.py @@ -11,7 +11,9 @@ from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline from cognee.api.v1.search import SearchType from cognee.infrastructure.llm.get_llm_client import get_llm_client from cognee.infrastructure.llm.prompts import read_query_prompt -from cognee.modules.retrieval.brute_force_triplet_search import brute_force_triplet_search +from cognee.modules.retrieval.description_to_codepart_search import ( + code_description_to_code_part_search, +) from cognee.shared.utils import render_graph from evals.eval_utils import download_github_repo, retrieved_edges_to_string @@ -32,26 +34,16 @@ def check_install_package(package_name): return False -async def generate_patch_with_cognee(instance, llm_client, search_type=SearchType.CHUNKS): - repo_path = download_github_repo(instance, "../RAW_GIT_REPOS") - - async for result in run_code_graph_pipeline(repo_path, include_docs=True): - print(result) - - print("Here we have the repo under the repo_path") - - await render_graph(None, include_labels=True, include_nodes=True) - +async def generate_patch_with_cognee(instance): + """repo_path = download_github_repo(instance, "../RAW_GIT_REPOS")""" problem_statement = instance["problem_statement"] instructions = read_query_prompt("patch_gen_kg_instructions.txt") - retrieved_edges = await brute_force_triplet_search( - problem_statement, - top_k=3, - collections=["code_summary_text"], - ) + repo_path = "/Users/laszlohajdu/Documents/GitHub/test/" + async for result in run_code_graph_pipeline(repo_path, include_docs=False): + print(result) - retrieved_edges_str = retrieved_edges_to_string(retrieved_edges) + retrieved_codeparts = await code_description_to_code_part_search(problem_statement) prompt = "\n".join( [ @@ -60,7 +52,7 @@ async def 
generate_patch_with_cognee(instance, llm_client, search_type=SearchTyp PATCH_EXAMPLE, "", "These are the retrieved edges:", - retrieved_edges_str, + retrieved_codeparts, ] ) @@ -86,8 +78,6 @@ async def generate_patch_without_cognee(instance, llm_client): async def get_preds(dataset, with_cognee=True): - llm_client = get_llm_client() - if with_cognee: model_name = "with_cognee" pred_func = generate_patch_with_cognee @@ -95,17 +85,18 @@ async def get_preds(dataset, with_cognee=True): model_name = "without_cognee" pred_func = generate_patch_without_cognee - futures = [(instance["instance_id"], pred_func(instance, llm_client)) for instance in dataset] - model_patches = await asyncio.gather(*[x[1] for x in futures]) + preds = [] - preds = [ - { - "instance_id": instance_id, - "model_patch": model_patch, - "model_name_or_path": model_name, - } - for (instance_id, _), model_patch in zip(futures, model_patches) - ] + for instance in dataset: + instance_id = instance["instance_id"] + model_patch = await pred_func(instance) # Sequentially await the async function + preds.append( + { + "instance_id": instance_id, + "model_patch": model_patch, + "model_name_or_path": model_name, + } + ) return preds @@ -135,6 +126,7 @@ async def main(): with open(predictions_path, "w") as file: json.dump(preds, file) + """ This part is for the evaluation subprocess.run( [ "python", @@ -152,6 +144,7 @@ async def main(): "test_run", ] ) + """ if __name__ == "__main__": From 6177d04b44e8e16ebd7070fb1b4f127e0aea4d6f Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Fri, 10 Jan 2025 13:03:34 +0100 Subject: [PATCH 3/4] feat: implements code retriever --- .../llm/prompts/patch_gen_kg_instructions.txt | 9 +++-- .../description_to_codepart_search.py | 38 +++++++++++++++++-- evals/eval_swe_bench.py | 13 ++++--- 3 files changed, 48 insertions(+), 12 deletions(-) diff --git a/cognee/infrastructure/llm/prompts/patch_gen_kg_instructions.txt 
b/cognee/infrastructure/llm/prompts/patch_gen_kg_instructions.txt index ebbb03f75..3117ac9f1 100644 --- a/cognee/infrastructure/llm/prompts/patch_gen_kg_instructions.txt +++ b/cognee/infrastructure/llm/prompts/patch_gen_kg_instructions.txt @@ -1,3 +1,6 @@ -I need you to solve this issue by looking at the provided edges retrieved from a knowledge graph and -generate a single patch file that I can apply directly to this repository using git apply. -Please respond with a single patch file in the following format. \ No newline at end of file +You are a senior software engineer. I need you to solve this issue by looking at the provided context and +generate a single patch file that I can apply directly to this repository using git apply. +Additionally, please make sure that you provide code only with correct syntax and +you apply the patch on the relevant files (together with their path that you can try to find out from the github issue). Don't change the names of existing +functions or classes, as they may be referenced from other code. +Please respond only with a single patch file in the following format without adding any additional context or string. 
diff --git a/cognee/modules/retrieval/description_to_codepart_search.py b/cognee/modules/retrieval/description_to_codepart_search.py index fec17fb16..538f76a6e 100644 --- a/cognee/modules/retrieval/description_to_codepart_search.py +++ b/cognee/modules/retrieval/description_to_codepart_search.py @@ -8,20 +8,27 @@ from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph from cognee.modules.users.methods import get_default_user from cognee.modules.users.models import User from cognee.shared.utils import send_telemetry +from cognee.api.v1.search import SearchType +from cognee.api.v1.search.search_v2 import search +from cognee.infrastructure.llm.get_llm_client import get_llm_client -async def code_description_to_code_part_search(query: str, user: User = None, top_k=5) -> list: +async def code_description_to_code_part_search( + query: str, include_docs=False, user: User = None, top_k=5 +) -> list: if user is None: user = await get_default_user() if user is None: raise PermissionError("No user found in the system. Please create a user.") - retrieved_codeparts = await code_description_to_code_part(query, user, top_k) + retrieved_codeparts = await code_description_to_code_part(query, user, top_k, include_docs) return retrieved_codeparts -async def code_description_to_code_part(query: str, user: User, top_k: int) -> List[str]: +async def code_description_to_code_part( + query: str, user: User, top_k: int, include_docs: bool +) -> List[str]: """ Maps a code description query to relevant code parts using a CodeGraph pipeline. @@ -29,6 +36,7 @@ async def code_description_to_code_part(query: str, user: User, top_k: int) -> L query (str): The search query describing the code parts. user (User): The user performing the search. 
top_k (int): Number of codegraph descriptions to match ( num of corresponding codeparts will be higher) + include_docs(bool): Boolean showing whether we have the docs in the graph or not Returns: Set[str]: A set of unique code parts matching the query. @@ -37,6 +45,7 @@ async def code_description_to_code_part(query: str, user: User, top_k: int) -> L ValueError: If arguments are invalid. RuntimeError: If an unexpected error occurs during execution. """ + print(include_docs) if not query or not isinstance(query, str): raise ValueError("The query must be a non-empty string.") if top_k <= 0 or not isinstance(top_k, int): @@ -55,6 +64,26 @@ async def code_description_to_code_part(query: str, user: User, top_k: int) -> L ) try: + if include_docs: + search_results = await search(SearchType.INSIGHTS, query_text=query) + + concatenated_descriptions = " ".join( + obj["description"] + for tpl in search_results + for obj in tpl + if isinstance(obj, dict) and "description" in obj + ) + + llm_client = get_llm_client() + context_from_documents = await llm_client.acreate_structured_output( + text_input=f"The retrieved context from documents" + f" is {concatenated_descriptions}.", + system_prompt="You are a Senior Software Engineer, summarize the context from documents" + f" in a way that it is gonna be provided next to codeparts as context" + f" while trying to solve this github issue connected to the project: {query}]", + response_model=str, + ) + code_summaries = await vector_engine.search( "code_summary_text", query_text=query, limit=top_k ) @@ -102,6 +131,9 @@ async def code_description_to_code_part(query: str, user: User, top_k: int) -> L for code_piece in code_pieces_to_return: context = context + code_piece.get_attribute("source_code") + if include_docs: + context = context_from_documents + context + return context except Exception as exec_error: diff --git a/evals/eval_swe_bench.py b/evals/eval_swe_bench.py index b5fcc616b..894acf1bb 100644 --- a/evals/eval_swe_bench.py 
+++ b/evals/eval_swe_bench.py @@ -14,8 +14,6 @@ from cognee.infrastructure.llm.prompts import read_query_prompt from cognee.modules.retrieval.description_to_codepart_search import ( code_description_to_code_part_search, ) -from cognee.shared.utils import render_graph -from evals.eval_utils import download_github_repo, retrieved_edges_to_string def check_install_package(package_name): @@ -36,14 +34,17 @@ def check_install_package(package_name): async def generate_patch_with_cognee(instance): """repo_path = download_github_repo(instance, "../RAW_GIT_REPOS")""" + include_docs = True problem_statement = instance["problem_statement"] instructions = read_query_prompt("patch_gen_kg_instructions.txt") - repo_path = "/Users/laszlohajdu/Documents/GitHub/test/" - async for result in run_code_graph_pipeline(repo_path, include_docs=False): + repo_path = "/Users/laszlohajdu/Documents/GitHub/graph_rag/" + async for result in run_code_graph_pipeline(repo_path, include_docs=include_docs): print(result) - retrieved_codeparts = await code_description_to_code_part_search(problem_statement) + retrieved_codeparts = await code_description_to_code_part_search( + problem_statement, include_docs=include_docs + ) prompt = "\n".join( [ @@ -51,7 +52,7 @@ async def generate_patch_with_cognee(instance): "", PATCH_EXAMPLE, "", - "These are the retrieved edges:", + "This is the additional context to solve the problem (description from documentation together with codeparts):", retrieved_codeparts, ] ) From 06e8d2268b231abb680a89ff86c2525d16599389 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Fri, 10 Jan 2025 13:52:26 +0100 Subject: [PATCH 4/4] Fix: fixes unit test for codepart search --- .../retrieval/description_to_codepart_search.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/cognee/modules/retrieval/description_to_codepart_search.py b/cognee/modules/retrieval/description_to_codepart_search.py index 
538f76a6e..bd506f76a 100644 --- a/cognee/modules/retrieval/description_to_codepart_search.py +++ b/cognee/modules/retrieval/description_to_codepart_search.py @@ -27,7 +27,7 @@ async def code_description_to_code_part_search( async def code_description_to_code_part( - query: str, user: User, top_k: int, include_docs: bool + query: str, user: User, top_k: int, include_docs: bool = False ) -> List[str]: """ Maps a code description query to relevant code parts using a CodeGraph pipeline. @@ -45,7 +45,6 @@ async def code_description_to_code_part( ValueError: If arguments are invalid. RuntimeError: If an unexpected error occurs during execution. """ - print(include_docs) if not query or not isinstance(query, str): raise ValueError("The query must be a non-empty string.") if top_k <= 0 or not isinstance(top_k, int): @@ -94,7 +93,13 @@ async def code_description_to_code_part( memory_fragment = CogneeGraph() await memory_fragment.project_graph_from_db( graph_engine, - node_properties_to_project=["id", "type", "text", "source_code", "pydantic_type"], + node_properties_to_project=[ + "id", + "type", + "text", + "source_code", + "pydantic_type", + ], edge_properties_to_project=["relationship_name"], )