Merge pull request #424 from topoteretes/feature/cog-971-preparing-swe-bench-run
Feature/cog 971 preparing swe bench run
This commit is contained in:
commit
f694ca283f
11 changed files with 106 additions and 49 deletions
|
|
@ -1,3 +1,6 @@
|
||||||
I need you to solve this issue by looking at the provided edges retrieved from a knowledge graph and
|
You are a senior software engineer. I need you to solve this issue by looking at the provided context and
|
||||||
generate a single patch file that I can apply directly to this repository using git apply.
|
generate a single patch file that I can apply directly to this repository using git apply.
|
||||||
Please respond with a single patch file in the following format.
|
Additionally, please make sure that you provide code only with correct syntax and
|
||||||
|
you apply the patch on the relevant files (together with their path that you can try to find out from the github issue). Don't change the names of existing
|
||||||
|
functions or classes, as they may be referenced from other code.
|
||||||
|
Please respond only with a single patch file in the following format without adding any additional context or string.
|
||||||
|
|
|
||||||
|
|
@ -12,6 +12,7 @@ class DocumentChunk(DataPoint):
|
||||||
chunk_index: int
|
chunk_index: int
|
||||||
cut_type: str
|
cut_type: str
|
||||||
is_part_of: Document
|
is_part_of: Document
|
||||||
|
pydantic_type: str = "DocumentChunk"
|
||||||
contains: List[Entity] = None
|
contains: List[Entity] = None
|
||||||
|
|
||||||
_metadata: dict = {"index_fields": ["text"], "type": "DocumentChunk"}
|
_metadata: dict = {"index_fields": ["text"], "type": "DocumentChunk"}
|
||||||
|
|
|
||||||
|
|
@ -7,5 +7,6 @@ class Entity(DataPoint):
|
||||||
name: str
|
name: str
|
||||||
is_a: EntityType
|
is_a: EntityType
|
||||||
description: str
|
description: str
|
||||||
|
pydantic_type: str = "Entity"
|
||||||
|
|
||||||
_metadata: dict = {"index_fields": ["name"], "type": "Entity"}
|
_metadata: dict = {"index_fields": ["name"], "type": "Entity"}
|
||||||
|
|
|
||||||
|
|
@ -5,5 +5,6 @@ class EntityType(DataPoint):
|
||||||
__tablename__ = "entity_type"
|
__tablename__ = "entity_type"
|
||||||
name: str
|
name: str
|
||||||
description: str
|
description: str
|
||||||
|
pydantic_type: str = "EntityType"
|
||||||
|
|
||||||
_metadata: dict = {"index_fields": ["name"], "type": "EntityType"}
|
_metadata: dict = {"index_fields": ["name"], "type": "EntityType"}
|
||||||
|
|
|
||||||
|
|
@ -8,20 +8,27 @@ from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph
|
||||||
from cognee.modules.users.methods import get_default_user
|
from cognee.modules.users.methods import get_default_user
|
||||||
from cognee.modules.users.models import User
|
from cognee.modules.users.models import User
|
||||||
from cognee.shared.utils import send_telemetry
|
from cognee.shared.utils import send_telemetry
|
||||||
|
from cognee.api.v1.search import SearchType
|
||||||
|
from cognee.api.v1.search.search_v2 import search
|
||||||
|
from cognee.infrastructure.llm.get_llm_client import get_llm_client
|
||||||
|
|
||||||
|
|
||||||
async def code_description_to_code_part_search(query: str, user: User = None, top_k=2) -> list:
|
async def code_description_to_code_part_search(
|
||||||
|
query: str, include_docs=False, user: User = None, top_k=5
|
||||||
|
) -> list:
|
||||||
if user is None:
|
if user is None:
|
||||||
user = await get_default_user()
|
user = await get_default_user()
|
||||||
|
|
||||||
if user is None:
|
if user is None:
|
||||||
raise PermissionError("No user found in the system. Please create a user.")
|
raise PermissionError("No user found in the system. Please create a user.")
|
||||||
|
|
||||||
retrieved_codeparts = await code_description_to_code_part(query, user, top_k)
|
retrieved_codeparts = await code_description_to_code_part(query, user, top_k, include_docs)
|
||||||
return retrieved_codeparts
|
return retrieved_codeparts
|
||||||
|
|
||||||
|
|
||||||
async def code_description_to_code_part(query: str, user: User, top_k: int) -> List[str]:
|
async def code_description_to_code_part(
|
||||||
|
query: str, user: User, top_k: int, include_docs: bool = False
|
||||||
|
) -> List[str]:
|
||||||
"""
|
"""
|
||||||
Maps a code description query to relevant code parts using a CodeGraph pipeline.
|
Maps a code description query to relevant code parts using a CodeGraph pipeline.
|
||||||
|
|
||||||
|
|
@ -29,6 +36,7 @@ async def code_description_to_code_part(query: str, user: User, top_k: int) -> L
|
||||||
query (str): The search query describing the code parts.
|
query (str): The search query describing the code parts.
|
||||||
user (User): The user performing the search.
|
user (User): The user performing the search.
|
||||||
top_k (int): Number of codegraph descriptions to match ( num of corresponding codeparts will be higher)
|
top_k (int): Number of codegraph descriptions to match ( num of corresponding codeparts will be higher)
|
||||||
|
include_docs(bool): Boolean showing whether we have the docs in the graph or not
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Set[str]: A set of unique code parts matching the query.
|
Set[str]: A set of unique code parts matching the query.
|
||||||
|
|
@ -55,21 +63,49 @@ async def code_description_to_code_part(query: str, user: User, top_k: int) -> L
|
||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
results = await vector_engine.search("code_summary_text", query_text=query, limit=top_k)
|
if include_docs:
|
||||||
if not results:
|
search_results = await search(SearchType.INSIGHTS, query_text=query)
|
||||||
|
|
||||||
|
concatenated_descriptions = " ".join(
|
||||||
|
obj["description"]
|
||||||
|
for tpl in search_results
|
||||||
|
for obj in tpl
|
||||||
|
if isinstance(obj, dict) and "description" in obj
|
||||||
|
)
|
||||||
|
|
||||||
|
llm_client = get_llm_client()
|
||||||
|
context_from_documents = await llm_client.acreate_structured_output(
|
||||||
|
text_input=f"The retrieved context from documents"
|
||||||
|
f" is {concatenated_descriptions}.",
|
||||||
|
system_prompt="You are a Senior Software Engineer, summarize the context from documents"
|
||||||
|
f" in a way that it is gonna be provided next to codeparts as context"
|
||||||
|
f" while trying to solve this github issue connected to the project: {query}]",
|
||||||
|
response_model=str,
|
||||||
|
)
|
||||||
|
|
||||||
|
code_summaries = await vector_engine.search(
|
||||||
|
"code_summary_text", query_text=query, limit=top_k
|
||||||
|
)
|
||||||
|
if not code_summaries:
|
||||||
logging.warning("No results found for query: '%s' by user: %s", query, user.id)
|
logging.warning("No results found for query: '%s' by user: %s", query, user.id)
|
||||||
return []
|
return []
|
||||||
|
|
||||||
memory_fragment = CogneeGraph()
|
memory_fragment = CogneeGraph()
|
||||||
await memory_fragment.project_graph_from_db(
|
await memory_fragment.project_graph_from_db(
|
||||||
graph_engine,
|
graph_engine,
|
||||||
node_properties_to_project=["id", "type", "text", "source_code"],
|
node_properties_to_project=[
|
||||||
|
"id",
|
||||||
|
"type",
|
||||||
|
"text",
|
||||||
|
"source_code",
|
||||||
|
"pydantic_type",
|
||||||
|
],
|
||||||
edge_properties_to_project=["relationship_name"],
|
edge_properties_to_project=["relationship_name"],
|
||||||
)
|
)
|
||||||
|
|
||||||
code_pieces_to_return = set()
|
code_pieces_to_return = set()
|
||||||
|
|
||||||
for node in results:
|
for node in code_summaries:
|
||||||
node_id = str(node.id)
|
node_id = str(node.id)
|
||||||
node_to_search_from = memory_fragment.get_node(node_id)
|
node_to_search_from = memory_fragment.get_node(node_id)
|
||||||
|
|
||||||
|
|
@ -78,9 +114,16 @@ async def code_description_to_code_part(query: str, user: User, top_k: int) -> L
|
||||||
continue
|
continue
|
||||||
|
|
||||||
for code_file in node_to_search_from.get_skeleton_neighbours():
|
for code_file in node_to_search_from.get_skeleton_neighbours():
|
||||||
for code_file_edge in code_file.get_skeleton_edges():
|
if code_file.get_attribute("pydantic_type") == "SourceCodeChunk":
|
||||||
if code_file_edge.get_attribute("relationship_name") == "contains":
|
for code_file_edge in code_file.get_skeleton_edges():
|
||||||
code_pieces_to_return.add(code_file_edge.get_destination_node())
|
if code_file_edge.get_attribute("relationship_name") == "code_chunk_of":
|
||||||
|
code_pieces_to_return.add(code_file_edge.get_destination_node())
|
||||||
|
elif code_file.get_attribute("pydantic_type") == "CodePart":
|
||||||
|
code_pieces_to_return.add(code_file)
|
||||||
|
elif code_file.get_attribute("pydantic_type") == "CodeFile":
|
||||||
|
for code_file_edge in code_file.get_skeleton_edges():
|
||||||
|
if code_file_edge.get_attribute("relationship_name") == "contains":
|
||||||
|
code_pieces_to_return.add(code_file_edge.get_destination_node())
|
||||||
|
|
||||||
logging.info(
|
logging.info(
|
||||||
"Search completed for user: %s, query: '%s'. Found %d code pieces.",
|
"Search completed for user: %s, query: '%s'. Found %d code pieces.",
|
||||||
|
|
@ -89,7 +132,14 @@ async def code_description_to_code_part(query: str, user: User, top_k: int) -> L
|
||||||
len(code_pieces_to_return),
|
len(code_pieces_to_return),
|
||||||
)
|
)
|
||||||
|
|
||||||
return list(code_pieces_to_return)
|
context = ""
|
||||||
|
for code_piece in code_pieces_to_return:
|
||||||
|
context = context + code_piece.get_attribute("source_code")
|
||||||
|
|
||||||
|
if include_docs:
|
||||||
|
context = context_from_documents + context
|
||||||
|
|
||||||
|
return context
|
||||||
|
|
||||||
except Exception as exec_error:
|
except Exception as exec_error:
|
||||||
logging.error(
|
logging.error(
|
||||||
|
|
|
||||||
|
|
@ -5,12 +5,14 @@ from cognee.infrastructure.engine import DataPoint
|
||||||
class Repository(DataPoint):
|
class Repository(DataPoint):
|
||||||
__tablename__ = "Repository"
|
__tablename__ = "Repository"
|
||||||
path: str
|
path: str
|
||||||
|
pydantic_type: str = "Repository"
|
||||||
_metadata: dict = {"index_fields": [], "type": "Repository"}
|
_metadata: dict = {"index_fields": [], "type": "Repository"}
|
||||||
|
|
||||||
|
|
||||||
class CodeFile(DataPoint):
|
class CodeFile(DataPoint):
|
||||||
__tablename__ = "codefile"
|
__tablename__ = "codefile"
|
||||||
extracted_id: str # actually file path
|
extracted_id: str # actually file path
|
||||||
|
pydantic_type: str = "CodeFile"
|
||||||
source_code: Optional[str] = None
|
source_code: Optional[str] = None
|
||||||
part_of: Optional[Repository] = None
|
part_of: Optional[Repository] = None
|
||||||
depends_on: Optional[List["CodeFile"]] = None
|
depends_on: Optional[List["CodeFile"]] = None
|
||||||
|
|
@ -22,6 +24,7 @@ class CodeFile(DataPoint):
|
||||||
class CodePart(DataPoint):
|
class CodePart(DataPoint):
|
||||||
__tablename__ = "codepart"
|
__tablename__ = "codepart"
|
||||||
# part_of: Optional[CodeFile] = None
|
# part_of: Optional[CodeFile] = None
|
||||||
|
pydantic_type: str = "CodePart"
|
||||||
source_code: Optional[str] = None
|
source_code: Optional[str] = None
|
||||||
_metadata: dict = {"index_fields": [], "type": "CodePart"}
|
_metadata: dict = {"index_fields": [], "type": "CodePart"}
|
||||||
|
|
||||||
|
|
@ -30,6 +33,7 @@ class SourceCodeChunk(DataPoint):
|
||||||
__tablename__ = "sourcecodechunk"
|
__tablename__ = "sourcecodechunk"
|
||||||
code_chunk_of: Optional[CodePart] = None
|
code_chunk_of: Optional[CodePart] = None
|
||||||
source_code: Optional[str] = None
|
source_code: Optional[str] = None
|
||||||
|
pydantic_type: str = "SourceCodeChunk"
|
||||||
previous_chunk: Optional["SourceCodeChunk"] = None
|
previous_chunk: Optional["SourceCodeChunk"] = None
|
||||||
|
|
||||||
_metadata: dict = {"index_fields": ["source_code"], "type": "SourceCodeChunk"}
|
_metadata: dict = {"index_fields": ["source_code"], "type": "SourceCodeChunk"}
|
||||||
|
|
|
||||||
|
|
@ -231,6 +231,7 @@ class SummarizedContent(BaseModel):
|
||||||
|
|
||||||
summary: str
|
summary: str
|
||||||
description: str
|
description: str
|
||||||
|
pydantic_type: str = "SummarizedContent"
|
||||||
|
|
||||||
|
|
||||||
class SummarizedFunction(BaseModel):
|
class SummarizedFunction(BaseModel):
|
||||||
|
|
@ -239,6 +240,7 @@ class SummarizedFunction(BaseModel):
|
||||||
inputs: Optional[List[str]] = None
|
inputs: Optional[List[str]] = None
|
||||||
outputs: Optional[List[str]] = None
|
outputs: Optional[List[str]] = None
|
||||||
decorators: Optional[List[str]] = None
|
decorators: Optional[List[str]] = None
|
||||||
|
pydantic_type: str = "SummarizedFunction"
|
||||||
|
|
||||||
|
|
||||||
class SummarizedClass(BaseModel):
|
class SummarizedClass(BaseModel):
|
||||||
|
|
@ -246,6 +248,7 @@ class SummarizedClass(BaseModel):
|
||||||
description: str
|
description: str
|
||||||
methods: Optional[List[SummarizedFunction]] = None
|
methods: Optional[List[SummarizedFunction]] = None
|
||||||
decorators: Optional[List[str]] = None
|
decorators: Optional[List[str]] = None
|
||||||
|
pydantic_type: str = "SummarizedClass"
|
||||||
|
|
||||||
|
|
||||||
class SummarizedCode(BaseModel):
|
class SummarizedCode(BaseModel):
|
||||||
|
|
@ -256,6 +259,7 @@ class SummarizedCode(BaseModel):
|
||||||
classes: List[SummarizedClass] = []
|
classes: List[SummarizedClass] = []
|
||||||
functions: List[SummarizedFunction] = []
|
functions: List[SummarizedFunction] = []
|
||||||
workflow_description: Optional[str] = None
|
workflow_description: Optional[str] = None
|
||||||
|
pydantic_type: str = "SummarizedCode"
|
||||||
|
|
||||||
|
|
||||||
class GraphDBType(Enum):
|
class GraphDBType(Enum):
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,5 @@
|
||||||
from typing import Dict, List
|
from typing import Dict, List
|
||||||
import parso
|
import parso
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
|
||||||
|
|
@ -9,7 +9,6 @@ import aiofiles
|
||||||
import jedi
|
import jedi
|
||||||
import parso
|
import parso
|
||||||
from parso.tree import BaseNode
|
from parso.tree import BaseNode
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
|
||||||
|
|
@ -17,5 +17,6 @@ class CodeSummary(DataPoint):
|
||||||
__tablename__ = "code_summary"
|
__tablename__ = "code_summary"
|
||||||
text: str
|
text: str
|
||||||
summarizes: Union[CodeFile, CodePart, SourceCodeChunk]
|
summarizes: Union[CodeFile, CodePart, SourceCodeChunk]
|
||||||
|
pydantic_type: str = "CodeSummary"
|
||||||
|
|
||||||
_metadata: dict = {"index_fields": ["text"], "type": "CodeSummary"}
|
_metadata: dict = {"index_fields": ["text"], "type": "CodeSummary"}
|
||||||
|
|
|
||||||
|
|
@ -11,9 +11,9 @@ from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline
|
||||||
from cognee.api.v1.search import SearchType
|
from cognee.api.v1.search import SearchType
|
||||||
from cognee.infrastructure.llm.get_llm_client import get_llm_client
|
from cognee.infrastructure.llm.get_llm_client import get_llm_client
|
||||||
from cognee.infrastructure.llm.prompts import read_query_prompt
|
from cognee.infrastructure.llm.prompts import read_query_prompt
|
||||||
from cognee.modules.retrieval.brute_force_triplet_search import brute_force_triplet_search
|
from cognee.modules.retrieval.description_to_codepart_search import (
|
||||||
from cognee.shared.utils import render_graph
|
code_description_to_code_part_search,
|
||||||
from evals.eval_utils import download_github_repo, retrieved_edges_to_string
|
)
|
||||||
|
|
||||||
|
|
||||||
def check_install_package(package_name):
|
def check_install_package(package_name):
|
||||||
|
|
@ -32,26 +32,19 @@ def check_install_package(package_name):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
async def generate_patch_with_cognee(instance, llm_client, search_type=SearchType.CHUNKS):
|
async def generate_patch_with_cognee(instance):
|
||||||
repo_path = download_github_repo(instance, "../RAW_GIT_REPOS")
|
"""repo_path = download_github_repo(instance, "../RAW_GIT_REPOS")"""
|
||||||
|
include_docs = True
|
||||||
async for result in run_code_graph_pipeline(repo_path, include_docs=True):
|
|
||||||
print(result)
|
|
||||||
|
|
||||||
print("Here we have the repo under the repo_path")
|
|
||||||
|
|
||||||
await render_graph(None, include_labels=True, include_nodes=True)
|
|
||||||
|
|
||||||
problem_statement = instance["problem_statement"]
|
problem_statement = instance["problem_statement"]
|
||||||
instructions = read_query_prompt("patch_gen_kg_instructions.txt")
|
instructions = read_query_prompt("patch_gen_kg_instructions.txt")
|
||||||
|
|
||||||
retrieved_edges = await brute_force_triplet_search(
|
repo_path = "/Users/laszlohajdu/Documents/GitHub/graph_rag/"
|
||||||
problem_statement,
|
async for result in run_code_graph_pipeline(repo_path, include_docs=include_docs):
|
||||||
top_k=3,
|
print(result)
|
||||||
collections=["code_summary_text"],
|
|
||||||
)
|
|
||||||
|
|
||||||
retrieved_edges_str = retrieved_edges_to_string(retrieved_edges)
|
retrieved_codeparts = await code_description_to_code_part_search(
|
||||||
|
problem_statement, include_docs=include_docs
|
||||||
|
)
|
||||||
|
|
||||||
prompt = "\n".join(
|
prompt = "\n".join(
|
||||||
[
|
[
|
||||||
|
|
@ -59,8 +52,8 @@ async def generate_patch_with_cognee(instance, llm_client, search_type=SearchTyp
|
||||||
"<patch>",
|
"<patch>",
|
||||||
PATCH_EXAMPLE,
|
PATCH_EXAMPLE,
|
||||||
"</patch>",
|
"</patch>",
|
||||||
"These are the retrieved edges:",
|
"This is the additional context to solve the problem (description from documentation together with codeparts):",
|
||||||
retrieved_edges_str,
|
retrieved_codeparts,
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -86,8 +79,6 @@ async def generate_patch_without_cognee(instance, llm_client):
|
||||||
|
|
||||||
|
|
||||||
async def get_preds(dataset, with_cognee=True):
|
async def get_preds(dataset, with_cognee=True):
|
||||||
llm_client = get_llm_client()
|
|
||||||
|
|
||||||
if with_cognee:
|
if with_cognee:
|
||||||
model_name = "with_cognee"
|
model_name = "with_cognee"
|
||||||
pred_func = generate_patch_with_cognee
|
pred_func = generate_patch_with_cognee
|
||||||
|
|
@ -95,17 +86,18 @@ async def get_preds(dataset, with_cognee=True):
|
||||||
model_name = "without_cognee"
|
model_name = "without_cognee"
|
||||||
pred_func = generate_patch_without_cognee
|
pred_func = generate_patch_without_cognee
|
||||||
|
|
||||||
futures = [(instance["instance_id"], pred_func(instance, llm_client)) for instance in dataset]
|
preds = []
|
||||||
model_patches = await asyncio.gather(*[x[1] for x in futures])
|
|
||||||
|
|
||||||
preds = [
|
for instance in dataset:
|
||||||
{
|
instance_id = instance["instance_id"]
|
||||||
"instance_id": instance_id,
|
model_patch = await pred_func(instance) # Sequentially await the async function
|
||||||
"model_patch": model_patch,
|
preds.append(
|
||||||
"model_name_or_path": model_name,
|
{
|
||||||
}
|
"instance_id": instance_id,
|
||||||
for (instance_id, _), model_patch in zip(futures, model_patches)
|
"model_patch": model_patch,
|
||||||
]
|
"model_name_or_path": model_name,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
return preds
|
return preds
|
||||||
|
|
||||||
|
|
@ -135,6 +127,7 @@ async def main():
|
||||||
with open(predictions_path, "w") as file:
|
with open(predictions_path, "w") as file:
|
||||||
json.dump(preds, file)
|
json.dump(preds, file)
|
||||||
|
|
||||||
|
""" This part is for the evaluation
|
||||||
subprocess.run(
|
subprocess.run(
|
||||||
[
|
[
|
||||||
"python",
|
"python",
|
||||||
|
|
@ -152,6 +145,7 @@ async def main():
|
||||||
"test_run",
|
"test_run",
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue