Merge pull request #424 from topoteretes/feature/cog-971-preparing-swe-bench-run

Feature/cog 971 preparing swe bench run
This commit is contained in:
Vasilije 2025-01-10 14:59:48 +01:00 committed by GitHub
commit f694ca283f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
11 changed files with 106 additions and 49 deletions

View file

@ -1,3 +1,6 @@
I need you to solve this issue by looking at the provided edges retrieved from a knowledge graph and
generate a single patch file that I can apply directly to this repository using git apply.
Please respond with a single patch file in the following format.
You are a senior software engineer. I need you to solve this issue by looking at the provided context and
generate a single patch file that I can apply directly to this repository using git apply.
Additionally, please make sure that you provide only syntactically correct code and
that the patch applies to the relevant files (together with their paths, which you can try to infer from the GitHub issue). Don't change the names of existing
functions or classes, as they may be referenced from other code.
Please respond only with a single patch file in the following format, without adding any additional context or text.

View file

@ -12,6 +12,7 @@ class DocumentChunk(DataPoint):
chunk_index: int
cut_type: str
is_part_of: Document
pydantic_type: str = "DocumentChunk"
contains: List[Entity] = None
_metadata: dict = {"index_fields": ["text"], "type": "DocumentChunk"}

View file

@ -7,5 +7,6 @@ class Entity(DataPoint):
name: str
is_a: EntityType
description: str
pydantic_type: str = "Entity"
_metadata: dict = {"index_fields": ["name"], "type": "Entity"}

View file

@ -5,5 +5,6 @@ class EntityType(DataPoint):
__tablename__ = "entity_type"
name: str
description: str
pydantic_type: str = "EntityType"
_metadata: dict = {"index_fields": ["name"], "type": "EntityType"}

View file

@ -8,20 +8,27 @@ from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph
from cognee.modules.users.methods import get_default_user
from cognee.modules.users.models import User
from cognee.shared.utils import send_telemetry
from cognee.api.v1.search import SearchType
from cognee.api.v1.search.search_v2 import search
from cognee.infrastructure.llm.get_llm_client import get_llm_client
async def code_description_to_code_part_search(query: str, user: User = None, top_k=2) -> list:
async def code_description_to_code_part_search(
query: str, include_docs=False, user: User = None, top_k=5
) -> list:
if user is None:
user = await get_default_user()
if user is None:
raise PermissionError("No user found in the system. Please create a user.")
retrieved_codeparts = await code_description_to_code_part(query, user, top_k)
retrieved_codeparts = await code_description_to_code_part(query, user, top_k, include_docs)
return retrieved_codeparts
async def code_description_to_code_part(query: str, user: User, top_k: int) -> List[str]:
async def code_description_to_code_part(
query: str, user: User, top_k: int, include_docs: bool = False
) -> List[str]:
"""
Maps a code description query to relevant code parts using a CodeGraph pipeline.
@ -29,6 +36,7 @@ async def code_description_to_code_part(query: str, user: User, top_k: int) -> L
query (str): The search query describing the code parts.
user (User): The user performing the search.
top_k (int): Number of code graph descriptions to match (the number of corresponding code parts will be higher).
include_docs (bool): Whether the docs are present in the graph.
Returns:
Set[str]: A set of unique code parts matching the query.
@ -55,21 +63,49 @@ async def code_description_to_code_part(query: str, user: User, top_k: int) -> L
)
try:
results = await vector_engine.search("code_summary_text", query_text=query, limit=top_k)
if not results:
if include_docs:
search_results = await search(SearchType.INSIGHTS, query_text=query)
concatenated_descriptions = " ".join(
obj["description"]
for tpl in search_results
for obj in tpl
if isinstance(obj, dict) and "description" in obj
)
llm_client = get_llm_client()
context_from_documents = await llm_client.acreate_structured_output(
text_input=f"The retrieved context from documents"
f" is {concatenated_descriptions}.",
system_prompt="You are a Senior Software Engineer, summarize the context from documents"
f" in a way that it is gonna be provided next to codeparts as context"
f" while trying to solve this github issue connected to the project: {query}]",
response_model=str,
)
code_summaries = await vector_engine.search(
"code_summary_text", query_text=query, limit=top_k
)
if not code_summaries:
logging.warning("No results found for query: '%s' by user: %s", query, user.id)
return []
memory_fragment = CogneeGraph()
await memory_fragment.project_graph_from_db(
graph_engine,
node_properties_to_project=["id", "type", "text", "source_code"],
node_properties_to_project=[
"id",
"type",
"text",
"source_code",
"pydantic_type",
],
edge_properties_to_project=["relationship_name"],
)
code_pieces_to_return = set()
for node in results:
for node in code_summaries:
node_id = str(node.id)
node_to_search_from = memory_fragment.get_node(node_id)
@ -78,9 +114,16 @@ async def code_description_to_code_part(query: str, user: User, top_k: int) -> L
continue
for code_file in node_to_search_from.get_skeleton_neighbours():
for code_file_edge in code_file.get_skeleton_edges():
if code_file_edge.get_attribute("relationship_name") == "contains":
code_pieces_to_return.add(code_file_edge.get_destination_node())
if code_file.get_attribute("pydantic_type") == "SourceCodeChunk":
for code_file_edge in code_file.get_skeleton_edges():
if code_file_edge.get_attribute("relationship_name") == "code_chunk_of":
code_pieces_to_return.add(code_file_edge.get_destination_node())
elif code_file.get_attribute("pydantic_type") == "CodePart":
code_pieces_to_return.add(code_file)
elif code_file.get_attribute("pydantic_type") == "CodeFile":
for code_file_edge in code_file.get_skeleton_edges():
if code_file_edge.get_attribute("relationship_name") == "contains":
code_pieces_to_return.add(code_file_edge.get_destination_node())
logging.info(
"Search completed for user: %s, query: '%s'. Found %d code pieces.",
@ -89,7 +132,14 @@ async def code_description_to_code_part(query: str, user: User, top_k: int) -> L
len(code_pieces_to_return),
)
return list(code_pieces_to_return)
context = ""
for code_piece in code_pieces_to_return:
context = context + code_piece.get_attribute("source_code")
if include_docs:
context = context_from_documents + context
return context
except Exception as exec_error:
logging.error(

View file

@ -5,12 +5,14 @@ from cognee.infrastructure.engine import DataPoint
class Repository(DataPoint):
__tablename__ = "Repository"
path: str
pydantic_type: str = "Repository"
_metadata: dict = {"index_fields": [], "type": "Repository"}
class CodeFile(DataPoint):
__tablename__ = "codefile"
extracted_id: str # actually file path
pydantic_type: str = "CodeFile"
source_code: Optional[str] = None
part_of: Optional[Repository] = None
depends_on: Optional[List["CodeFile"]] = None
@ -22,6 +24,7 @@ class CodeFile(DataPoint):
class CodePart(DataPoint):
__tablename__ = "codepart"
# part_of: Optional[CodeFile] = None
pydantic_type: str = "CodePart"
source_code: Optional[str] = None
_metadata: dict = {"index_fields": [], "type": "CodePart"}
@ -30,6 +33,7 @@ class SourceCodeChunk(DataPoint):
__tablename__ = "sourcecodechunk"
code_chunk_of: Optional[CodePart] = None
source_code: Optional[str] = None
pydantic_type: str = "SourceCodeChunk"
previous_chunk: Optional["SourceCodeChunk"] = None
_metadata: dict = {"index_fields": ["source_code"], "type": "SourceCodeChunk"}

View file

@ -231,6 +231,7 @@ class SummarizedContent(BaseModel):
summary: str
description: str
pydantic_type: str = "SummarizedContent"
class SummarizedFunction(BaseModel):
@ -239,6 +240,7 @@ class SummarizedFunction(BaseModel):
inputs: Optional[List[str]] = None
outputs: Optional[List[str]] = None
decorators: Optional[List[str]] = None
pydantic_type: str = "SummarizedFunction"
class SummarizedClass(BaseModel):
@ -246,6 +248,7 @@ class SummarizedClass(BaseModel):
description: str
methods: Optional[List[SummarizedFunction]] = None
decorators: Optional[List[str]] = None
pydantic_type: str = "SummarizedClass"
class SummarizedCode(BaseModel):
@ -256,6 +259,7 @@ class SummarizedCode(BaseModel):
classes: List[SummarizedClass] = []
functions: List[SummarizedFunction] = []
workflow_description: Optional[str] = None
pydantic_type: str = "SummarizedCode"
class GraphDBType(Enum):

View file

@ -1,6 +1,5 @@
from typing import Dict, List
import parso
import logging
logger = logging.getLogger(__name__)

View file

@ -9,7 +9,6 @@ import aiofiles
import jedi
import parso
from parso.tree import BaseNode
import logging
logger = logging.getLogger(__name__)

View file

@ -17,5 +17,6 @@ class CodeSummary(DataPoint):
__tablename__ = "code_summary"
text: str
summarizes: Union[CodeFile, CodePart, SourceCodeChunk]
pydantic_type: str = "CodeSummary"
_metadata: dict = {"index_fields": ["text"], "type": "CodeSummary"}

View file

@ -11,9 +11,9 @@ from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline
from cognee.api.v1.search import SearchType
from cognee.infrastructure.llm.get_llm_client import get_llm_client
from cognee.infrastructure.llm.prompts import read_query_prompt
from cognee.modules.retrieval.brute_force_triplet_search import brute_force_triplet_search
from cognee.shared.utils import render_graph
from evals.eval_utils import download_github_repo, retrieved_edges_to_string
from cognee.modules.retrieval.description_to_codepart_search import (
code_description_to_code_part_search,
)
def check_install_package(package_name):
@ -32,26 +32,19 @@ def check_install_package(package_name):
return False
async def generate_patch_with_cognee(instance, llm_client, search_type=SearchType.CHUNKS):
repo_path = download_github_repo(instance, "../RAW_GIT_REPOS")
async for result in run_code_graph_pipeline(repo_path, include_docs=True):
print(result)
print("Here we have the repo under the repo_path")
await render_graph(None, include_labels=True, include_nodes=True)
async def generate_patch_with_cognee(instance):
"""repo_path = download_github_repo(instance, "../RAW_GIT_REPOS")"""
include_docs = True
problem_statement = instance["problem_statement"]
instructions = read_query_prompt("patch_gen_kg_instructions.txt")
retrieved_edges = await brute_force_triplet_search(
problem_statement,
top_k=3,
collections=["code_summary_text"],
)
repo_path = "/Users/laszlohajdu/Documents/GitHub/graph_rag/"
async for result in run_code_graph_pipeline(repo_path, include_docs=include_docs):
print(result)
retrieved_edges_str = retrieved_edges_to_string(retrieved_edges)
retrieved_codeparts = await code_description_to_code_part_search(
problem_statement, include_docs=include_docs
)
prompt = "\n".join(
[
@ -59,8 +52,8 @@ async def generate_patch_with_cognee(instance, llm_client, search_type=SearchTyp
"<patch>",
PATCH_EXAMPLE,
"</patch>",
"These are the retrieved edges:",
retrieved_edges_str,
"This is the additional context to solve the problem (description from documentation together with codeparts):",
retrieved_codeparts,
]
)
@ -86,8 +79,6 @@ async def generate_patch_without_cognee(instance, llm_client):
async def get_preds(dataset, with_cognee=True):
llm_client = get_llm_client()
if with_cognee:
model_name = "with_cognee"
pred_func = generate_patch_with_cognee
@ -95,17 +86,18 @@ async def get_preds(dataset, with_cognee=True):
model_name = "without_cognee"
pred_func = generate_patch_without_cognee
futures = [(instance["instance_id"], pred_func(instance, llm_client)) for instance in dataset]
model_patches = await asyncio.gather(*[x[1] for x in futures])
preds = []
preds = [
{
"instance_id": instance_id,
"model_patch": model_patch,
"model_name_or_path": model_name,
}
for (instance_id, _), model_patch in zip(futures, model_patches)
]
for instance in dataset:
instance_id = instance["instance_id"]
model_patch = await pred_func(instance) # Sequentially await the async function
preds.append(
{
"instance_id": instance_id,
"model_patch": model_patch,
"model_name_or_path": model_name,
}
)
return preds
@ -135,6 +127,7 @@ async def main():
with open(predictions_path, "w") as file:
json.dump(preds, file)
""" This part is for the evaluation
subprocess.run(
[
"python",
@ -152,6 +145,7 @@ async def main():
"test_run",
]
)
"""
if __name__ == "__main__":