From 26d914eebc68a3c9c6c72df96525e3c15b1510c6 Mon Sep 17 00:00:00 2001 From: Rita Date: Mon, 11 Nov 2024 16:48:06 +0000 Subject: [PATCH 01/29] Loading SWE-bench dataset and converting it to DeepEval dataset --- evals/eval_on_swe_bench.py | 167 +++++++++++++++++++++++++++++++++++++ 1 file changed, 167 insertions(+) create mode 100644 evals/eval_on_swe_bench.py diff --git a/evals/eval_on_swe_bench.py b/evals/eval_on_swe_bench.py new file mode 100644 index 000000000..b5b3f14b6 --- /dev/null +++ b/evals/eval_on_swe_bench.py @@ -0,0 +1,167 @@ +from typing import List, Dict, Type +from swebench.harness.utils import load_swebench_dataset +from deepeval.dataset import EvaluationDataset +from deepeval.test_case import LLMTestCase +from pydantic import BaseModel + +from deepeval.synthesizer import Synthesizer + + +# DeepEval dataset for reference +# synthesizer = Synthesizer() +# synthesizer.generate_goldens_from_docs( +# document_paths=['/app/.data/short_stories/soldiers_home.pdf'], +# include_expected_output=True +# ) + +def convert_swe_to_deepeval(swe_dataset: List[Dict]): + deepeval_dataset = EvaluationDataset() + for datum in swe_dataset: + input = datum["problem_statement"] + expected_output = datum["patch"] + context = [datum["text"]] + # retrieval_context = datum.get(retrieval_context_key_name) + # tools_called = datum.get(tools_called_key_name) + # expected_tools = json_obj.get(expected_tools_key_name) + + deepeval_dataset.add_test_case( + LLMTestCase( + input=input, + actual_output=None, + expected_output=expected_output, + context=context, + # retrieval_context=retrieval_context, + # tools_called=tools_called, + # expected_tools=expected_tools, + ) + ) + return deepeval_dataset + + +from cognee.infrastructure.llm.get_llm_client import get_llm_client + +swe_dataset = load_swebench_dataset('princeton-nlp/SWE-bench_bm25_13K', split='test') +deepeval_dataset = convert_swe_to_deepeval(swe_dataset) + +import logging + +logger = logging.getLogger(__name__) + +class AnswerModel(BaseModel): + response:str + +def get_answer_base(content: str, context:str, response_model: Type[BaseModel]): + llm_client = get_llm_client() + + system_prompt = "THIS IS YOUR CONTEXT:" + str(context) + + return llm_client.create_structured_output(content, system_prompt, response_model) + +def get_answer(content: str,context, model: Type[BaseModel]= AnswerModel): + + try: + return (get_answer_base( + content, + context, + model + )) + except Exception as error: + logger.error("Error extracting cognitive layers from content: %s", error, exc_info = True) + raise error + +async def run_cognify_base_rag(): + from cognee.api.v1.add import add + from cognee.api.v1.prune import prune + from cognee.api.v1.cognify.cognify import cognify + + await prune.prune_system() + + await add("data://test_datasets", "initial_test") + + graph = await cognify("initial_test") + pass + + +import os +from cognee.base_config import get_base_config +from cognee.infrastructure.databases.vector import get_vector_engine + +async def cognify_search_base_rag(content:str, context:str): + base_config = get_base_config() + + cognee_directory_path = os.path.abspath(".cognee_system") + base_config.system_root_directory = cognee_directory_path + + vector_engine = get_vector_engine() + + return_ = await vector_engine.search(collection_name="basic_rag", query_text=content, limit=10) + + print("results", return_) + return return_ + +async def cognify_search_graph(content:str, context:str): + from cognee.api.v1.search import search, SearchType + params = {'query': 
'Donald Trump'} + + results = await search(SearchType.INSIGHTS, params) + print("results", results) + return results + + +def convert_goldens_to_test_cases(test_cases_raw: List[LLMTestCase]) -> List[LLMTestCase]: + test_cases = [] + for case in test_cases_raw: + test_case = LLMTestCase( + input=case.input, + # Generate actual output using the 'input' and 'additional_metadata' + actual_output= str(get_answer(case.input, case.context).model_dump()['response']), + expected_output=case.expected_output, + context=case.context, + retrieval_context=["retrieval_context"], + ) + test_cases.append(test_case) + return test_cases + +def convert_swe_to_deepeval_testcases(swe_dataset: List[Dict]): + deepeval_dataset = EvaluationDataset() + for datum in swe_dataset[:4]: + input = datum["problem_statement"] + expected_output = datum["patch"] + context = [datum["text"]] + # retrieval_context = datum.get(retrieval_context_key_name) + # tools_called = datum.get(tools_called_key_name) + # expected_tools = json_obj.get(expected_tools_key_name) + + deepeval_dataset.add_test_case( + LLMTestCase( + input=input, + actual_output= str(get_answer(input, context).model_dump()['response']), + expected_output=expected_output, + context=context, + # retrieval_context=retrieval_context, + # tools_called=tools_called, + # expected_tools=expected_tools, + ) + ) + return deepeval_dataset + +swe_dataset = load_swebench_dataset('princeton-nlp/SWE-bench_bm25_13K', split='test') +test_dataset = convert_swe_to_deepeval_testcases(swe_dataset) + +if __name__ == "__main__": + + import asyncio + + async def main(): + # await run_cognify_base_rag() + # await cognify_search_base_rag("show_all_processes", "context") + await cognify_search_graph("show_all_processes", "context") + asyncio.run(main()) + # run_cognify_base_rag_and_search() + # # Data preprocessing before setting the dataset test cases + swe_dataset = load_swebench_dataset('princeton-nlp/SWE-bench_bm25_13K', split='test') + test_dataset = convert_swe_to_deepeval_testcases(dataset.test_cases) + from deepeval.metrics import HallucinationMetric + metric = HallucinationMetric() + evalresult = dataset.evaluate([metric]) + pass \ No newline at end of file From aa95aa21afc585f643ae9cd6c732dcb1ba2d3a9b Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Tue, 12 Nov 2024 17:40:42 +0100 Subject: [PATCH 02/29] downloading example repo for eval --- evals/{eval_on_swe_bench.py => deepeval_on_swe_bench.py} | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename evals/{eval_on_swe_bench.py => deepeval_on_swe_bench.py} (97%) diff --git a/evals/eval_on_swe_bench.py b/evals/deepeval_on_swe_bench.py similarity index 97% rename from evals/eval_on_swe_bench.py rename to evals/deepeval_on_swe_bench.py index b5b3f14b6..b191a08d4 100644 --- a/evals/eval_on_swe_bench.py +++ b/evals/deepeval_on_swe_bench.py @@ -160,8 +160,8 @@ if __name__ == "__main__": # run_cognify_base_rag_and_search() # # Data preprocessing before setting the dataset test cases swe_dataset = load_swebench_dataset('princeton-nlp/SWE-bench_bm25_13K', split='test') - test_dataset = convert_swe_to_deepeval_testcases(dataset.test_cases) + test_dataset = convert_swe_to_deepeval_testcases(swe_dataset) from deepeval.metrics import HallucinationMetric metric = HallucinationMetric() - evalresult = dataset.evaluate([metric]) + evalresult = test_dataset.evaluate([metric]) pass \ No newline at end of file From 094ba7233e27583c97baf4b2a87e2c642348a8c6 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Thu, 14 Nov 2024 16:28:03 +0100 
Subject: [PATCH 03/29] Running inference with and without cognee --- evals/eval_swe_bench.py | 81 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 evals/eval_swe_bench.py diff --git a/evals/eval_swe_bench.py b/evals/eval_swe_bench.py new file mode 100644 index 000000000..8e8327fd5 --- /dev/null +++ b/evals/eval_swe_bench.py @@ -0,0 +1,81 @@ +from swebench.harness.utils import load_swebench_dataset +from swebench.inference.make_datasets.create_instance import PATCH_EXAMPLE +from evals.eval_utils import download_instances +import cognee +from cognee.api.v1.cognify.code_graph_pipeline import code_graph_pipeline +from cognee.api.v1.search import SearchType +import os +from pathlib import Path +from cognee.infrastructure.databases.graph import get_graph_engine +from cognee.infrastructure.llm.get_llm_client import get_llm_client +from cognee.shared.data_models import Answer + +async def cognee_and_llm(dataset, search_type = SearchType.CHUNKS): + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata = True) + + dataset_name = "SWE_test_data" + code_text = dataset[0]["text"][:100000] + await cognee.add([code_text], dataset_name) + await cognee.cognify([dataset_name]) + graph_engine = await get_graph_engine() + with open(graph_engine.filename, "r") as f: + graph_str = f.read() + + problem_statement = dataset[0]['problem_statement'] + instructions = ( + f"I need you to solve this issue by looking at the provided knowledge graph and by " + + f"generating a single patch file that I can apply directly to this repository " + + f"using git apply. Please respond with a single patch " + + f"file in the following format." + ) + + prompt = "\n".join([ + instructions, + "", + PATCH_EXAMPLE, + "", + "This is the knowledge graph:", + graph_str + ]) + + llm_client = get_llm_client() + answer_prediction = llm_client.create_structured_output( + text_input = problem_statement, + system_prompt = prompt, + response_model = str, + ) + return answer_prediction + +def llm_on_preprocessed_data(dataset): + problem_statement = dataset[0]['problem_statement'] + prompt = dataset[0]["text"] + + llm_client = get_llm_client() + answer_prediction = llm_client.create_structured_output( + text_input = problem_statement, + system_prompt = prompt, # TODO check if this is correct + response_model = str, + ) + return answer_prediction + + +async def main(): + swe_dataset = load_swebench_dataset('princeton-nlp/SWE-bench', split='test') + swe_dataset_preprocessed = load_swebench_dataset('princeton-nlp/SWE-bench_bm25_13K', split='test') + test_data = swe_dataset[:1] + test_data_preprocessed = swe_dataset_preprocessed[:1] + assert test_data[0]["instance_id"] == test_data_preprocessed[0]["instance_id"] + filepath = Path("SWE-bench_testsample") + if filepath.exists(): + from datasets import Dataset + dataset = Dataset.load_from_disk(filepath) + else: + dataset = download_instances(test_data, filepath) + + llm_output_with_cognee = await cognee_and_llm(dataset) + llm_output_without_cognee = llm_on_preprocessed_data(test_data_preprocessed) + +if __name__ == "__main__": + import asyncio + asyncio.run(main(), debug=True) From 721fde3d602f527fa1944e8c9ef006c4555c9f24 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Fri, 15 Nov 2024 17:14:43 +0100 Subject: [PATCH 04/29] generating testspecs for data --- evals/eval_swe_bench.py | 57 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 52 insertions(+), 5 deletions(-) diff --git a/evals/eval_swe_bench.py 
b/evals/eval_swe_bench.py index 8e8327fd5..3aabfcba3 100644 --- a/evals/eval_swe_bench.py +++ b/evals/eval_swe_bench.py @@ -1,14 +1,17 @@ from swebench.harness.utils import load_swebench_dataset +from swebench.harness.run_evaluation import get_dataset_from_preds +from swebench.harness.run_evaluation import run_instances +from swebench.harness.test_spec import make_test_spec, TestSpec + +import subprocess from swebench.inference.make_datasets.create_instance import PATCH_EXAMPLE from evals.eval_utils import download_instances import cognee from cognee.api.v1.cognify.code_graph_pipeline import code_graph_pipeline from cognee.api.v1.search import SearchType -import os from pathlib import Path from cognee.infrastructure.databases.graph import get_graph_engine from cognee.infrastructure.llm.get_llm_client import get_llm_client -from cognee.shared.data_models import Answer async def cognee_and_llm(dataset, search_type = SearchType.CHUNKS): await cognee.prune.prune_data() @@ -47,7 +50,8 @@ async def cognee_and_llm(dataset, search_type = SearchType.CHUNKS): ) return answer_prediction -def llm_on_preprocessed_data(dataset): + +async def llm_on_preprocessed_data(dataset): problem_statement = dataset[0]['problem_statement'] prompt = dataset[0]["text"] @@ -59,6 +63,47 @@ def llm_on_preprocessed_data(dataset): ) return answer_prediction +async def get_preds(dataset, with_cognee): + if with_cognee: + text_output = await cognee_and_llm(dataset) + model_name = "with_cognee" + else: + text_output = await llm_on_preprocessed_data(dataset) + model_name = "without_cognee" + + preds = {dataset[0]["instance_id"]: + {"instance_id": dataset[0]["instance_id"], + "model_patch": text_output, + "model_name_or_path": model_name}} + + dataset_name = 'princeton-nlp/SWE-bench' if with_cognee else 'princeton-nlp/SWE-bench_bm25_13K' + preds_dataset = get_dataset_from_preds(dataset_name, + "test", + [dataset[0]["instance_id"]], + preds, + model_name) + + return preds, preds_dataset + +async def evaluate(test_specs: list[TestSpec], + preds: dict, + ): + for test_spec in test_specs: + pred = preds[test_spec.instance_id] + log_dir = Path("logs") + log_dir.mkdir(parents=True, exist_ok=True) + + patch_file = Path(log_dir / "patch.diff") + patch_file.write_text(pred["model_patch"] or "") + for command in test_spec.repo_script_list: + if "/testbed" in command: + command = command.replace("/testbed", "./testbed") + result = subprocess.run(command, shell=True, check=True, capture_output=True, text=True) + print(result) + + subprocess.run("git apply --allow-empty -v logs/patch.diff", shell=True, capture_output=True, text=True) + + async def main(): swe_dataset = load_swebench_dataset('princeton-nlp/SWE-bench', split='test') @@ -73,8 +118,10 @@ async def main(): else: dataset = download_instances(test_data, filepath) - llm_output_with_cognee = await cognee_and_llm(dataset) - llm_output_without_cognee = llm_on_preprocessed_data(test_data_preprocessed) + cognee_preds, cognee_preds_dataset = await get_preds(dataset, with_cognee=True) + # nocognee_preds = await get_preds(dataset, with_cognee=False) + test_specs = list(map(make_test_spec, test_data)) + results = await evaluate(test_specs, cognee_preds) if __name__ == "__main__": import asyncio From ed08cdb9f9cff6a3f33d1a5f7f81cc66835a4a1a Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Fri, 15 Nov 2024 17:54:41 +0100 Subject: [PATCH 05/29] using the code graph pipeline instead of cognify --- evals/eval_swe_bench.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/evals/eval_swe_bench.py b/evals/eval_swe_bench.py index 3aabfcba3..9acb176b7 100644 --- a/evals/eval_swe_bench.py +++ b/evals/eval_swe_bench.py @@ -20,7 +20,7 @@ async def cognee_and_llm(dataset, search_type = SearchType.CHUNKS): dataset_name = "SWE_test_data" code_text = dataset[0]["text"][:100000] await cognee.add([code_text], dataset_name) - await cognee.cognify([dataset_name]) + await code_graph_pipeline([dataset_name]) graph_engine = await get_graph_engine() with open(graph_engine.filename, "r") as f: graph_str = f.read() @@ -63,7 +63,7 @@ async def llm_on_preprocessed_data(dataset): ) return answer_prediction -async def get_preds(dataset, with_cognee): +async def get_preds(dataset, with_cognee=True): if with_cognee: text_output = await cognee_and_llm(dataset) model_name = "with_cognee" From 98e3445c2cc815fa2aff7a9046253a34ce4603ed Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Mon, 18 Nov 2024 15:02:16 +0100 Subject: [PATCH 06/29] running swebench evaluation as subprocess --- evals/deepeval_on_swe_bench.py | 59 +++++++++-------- evals/eval_swe_bench.py | 116 ++++++++++++++------------------- evals/eval_utils.py | 107 ++++++++++++++++++++++++++++++ 3 files changed, 190 insertions(+), 92 deletions(-) create mode 100644 evals/eval_utils.py diff --git a/evals/deepeval_on_swe_bench.py b/evals/deepeval_on_swe_bench.py index b191a08d4..8cb94abb3 100644 --- a/evals/deepeval_on_swe_bench.py +++ b/evals/deepeval_on_swe_bench.py @@ -1,3 +1,8 @@ +from cognee.infrastructure.databases.vector import get_vector_engine +from cognee.base_config import get_base_config +import os +import logging +from cognee.infrastructure.llm.get_llm_client import get_llm_client from typing import List, Dict, Type from swebench.harness.utils import load_swebench_dataset from deepeval.dataset import EvaluationDataset @@ -21,8 +26,6 @@ def convert_swe_to_deepeval(swe_dataset: List[Dict]): expected_output = datum["patch"] context = [datum["text"]] # retrieval_context = datum.get(retrieval_context_key_name) - # tools_called = datum.get(tools_called_key_name) - # expected_tools = json_obj.get(expected_tools_key_name) deepeval_dataset.add_test_case( LLMTestCase( @@ -31,33 +34,32 @@ def convert_swe_to_deepeval(swe_dataset: List[Dict]): expected_output=expected_output, context=context, # retrieval_context=retrieval_context, - # tools_called=tools_called, - # expected_tools=expected_tools, ) ) return deepeval_dataset -from cognee.infrastructure.llm.get_llm_client import get_llm_client - -swe_dataset = load_swebench_dataset('princeton-nlp/SWE-bench_bm25_13K', split='test') +swe_dataset = load_swebench_dataset( + 'princeton-nlp/SWE-bench_bm25_13K', split='test') deepeval_dataset = convert_swe_to_deepeval(swe_dataset) -import logging logger = logging.getLogger(__name__) -class AnswerModel(BaseModel): - response:str -def get_answer_base(content: str, context:str, response_model: Type[BaseModel]): +class AnswerModel(BaseModel): + response: str + + +def get_answer_base(content: str, context: str, response_model: Type[BaseModel]): llm_client = get_llm_client() system_prompt = "THIS IS YOUR CONTEXT:" + str(context) - return llm_client.create_structured_output(content, system_prompt, response_model) + return llm_client.create_structured_output(content, system_prompt, response_model) -def get_answer(content: str,context, model: Type[BaseModel]= AnswerModel): + +def get_answer(content: str, context, model: Type[BaseModel] = AnswerModel): try: return (get_answer_base( @@ -66,9 +68,11 @@ def get_answer(content: str,context, model: 
Type[BaseModel]= AnswerModel): model )) except Exception as error: - logger.error("Error extracting cognitive layers from content: %s", error, exc_info = True) + logger.error( + "Error extracting cognitive layers from content: %s", error, exc_info=True) raise error + async def run_cognify_base_rag(): from cognee.api.v1.add import add from cognee.api.v1.prune import prune @@ -82,11 +86,7 @@ async def run_cognify_base_rag(): pass -import os -from cognee.base_config import get_base_config -from cognee.infrastructure.databases.vector import get_vector_engine - -async def cognify_search_base_rag(content:str, context:str): +async def cognify_search_base_rag(content: str, context: str): base_config = get_base_config() cognee_directory_path = os.path.abspath(".cognee_system") @@ -99,7 +99,8 @@ async def cognify_search_base_rag(content:str, context:str): print("results", return_) return return_ -async def cognify_search_graph(content:str, context:str): + +async def cognify_search_graph(content: str, context: str): from cognee.api.v1.search import search, SearchType params = {'query': 'Donald Trump'} @@ -114,7 +115,8 @@ def convert_goldens_to_test_cases(test_cases_raw: List[LLMTestCase]) -> List[LLM test_case = LLMTestCase( input=case.input, # Generate actual output using the 'input' and 'additional_metadata' - actual_output= str(get_answer(case.input, case.context).model_dump()['response']), + actual_output=str(get_answer( + case.input, case.context).model_dump()['response']), expected_output=case.expected_output, context=case.context, retrieval_context=["retrieval_context"], @@ -122,6 +124,7 @@ def convert_goldens_to_test_cases(test_cases_raw: List[LLMTestCase]) -> List[LLM test_cases.append(test_case) return test_cases + def convert_swe_to_deepeval_testcases(swe_dataset: List[Dict]): deepeval_dataset = EvaluationDataset() for datum in swe_dataset[:4]: @@ -135,7 +138,8 @@ def convert_swe_to_deepeval_testcases(swe_dataset: List[Dict]): deepeval_dataset.add_test_case( LLMTestCase( input=input, - actual_output= str(get_answer(input, context).model_dump()['response']), + actual_output=str(get_answer( + input, context).model_dump()['response']), expected_output=expected_output, context=context, # retrieval_context=retrieval_context, @@ -145,9 +149,11 @@ def convert_swe_to_deepeval_testcases(swe_dataset: List[Dict]): ) return deepeval_dataset -swe_dataset = load_swebench_dataset('princeton-nlp/SWE-bench_bm25_13K', split='test') + +swe_dataset = load_swebench_dataset( + 'princeton-nlp/SWE-bench_bm25_13K', split='test') test_dataset = convert_swe_to_deepeval_testcases(swe_dataset) - + if __name__ == "__main__": import asyncio @@ -159,9 +165,10 @@ if __name__ == "__main__": asyncio.run(main()) # run_cognify_base_rag_and_search() # # Data preprocessing before setting the dataset test cases - swe_dataset = load_swebench_dataset('princeton-nlp/SWE-bench_bm25_13K', split='test') + swe_dataset = load_swebench_dataset( + 'princeton-nlp/SWE-bench_bm25_13K', split='test') test_dataset = convert_swe_to_deepeval_testcases(swe_dataset) from deepeval.metrics import HallucinationMetric metric = HallucinationMetric() evalresult = test_dataset.evaluate([metric]) - pass \ No newline at end of file + pass diff --git a/evals/eval_swe_bench.py b/evals/eval_swe_bench.py index 9acb176b7..2cb221576 100644 --- a/evals/eval_swe_bench.py +++ b/evals/eval_swe_bench.py @@ -1,38 +1,38 @@ -from swebench.harness.utils import load_swebench_dataset -from swebench.harness.run_evaluation import get_dataset_from_preds -from 
swebench.harness.run_evaluation import run_instances -from swebench.harness.test_spec import make_test_spec, TestSpec - +import json import subprocess +from pathlib import Path + +from swebench.harness.utils import load_swebench_dataset from swebench.inference.make_datasets.create_instance import PATCH_EXAMPLE -from evals.eval_utils import download_instances + import cognee from cognee.api.v1.cognify.code_graph_pipeline import code_graph_pipeline from cognee.api.v1.search import SearchType -from pathlib import Path from cognee.infrastructure.databases.graph import get_graph_engine from cognee.infrastructure.llm.get_llm_client import get_llm_client +from evals.eval_utils import download_instances -async def cognee_and_llm(dataset, search_type = SearchType.CHUNKS): + +async def cognee_and_llm(dataset, search_type=SearchType.CHUNKS): await cognee.prune.prune_data() - await cognee.prune.prune_system(metadata = True) + await cognee.prune.prune_system(metadata=True) dataset_name = "SWE_test_data" - code_text = dataset[0]["text"][:100000] + code_text = dataset[0]["text"] await cognee.add([code_text], dataset_name) await code_graph_pipeline([dataset_name]) graph_engine = await get_graph_engine() with open(graph_engine.filename, "r") as f: - graph_str = f.read() - + graph_str = f.read() + problem_statement = dataset[0]['problem_statement'] instructions = ( - f"I need you to solve this issue by looking at the provided knowledge graph and by " - + f"generating a single patch file that I can apply directly to this repository " - + f"using git apply. Please respond with a single patch " - + f"file in the following format." + "I need you to solve this issue by looking at the provided knowledge graph and by " + + "generating a single patch file that I can apply directly to this repository " + + "using git apply. Please respond with a single patch " + + "file in the following format." 
) - + prompt = "\n".join([ instructions, "", @@ -41,28 +41,29 @@ async def cognee_and_llm(dataset, search_type = SearchType.CHUNKS): "This is the knowledge graph:", graph_str ]) - + llm_client = get_llm_client() answer_prediction = llm_client.create_structured_output( - text_input = problem_statement, - system_prompt = prompt, - response_model = str, - ) + text_input=problem_statement, + system_prompt=prompt, + response_model=str, + ) return answer_prediction async def llm_on_preprocessed_data(dataset): problem_statement = dataset[0]['problem_statement'] prompt = dataset[0]["text"] - + llm_client = get_llm_client() answer_prediction = llm_client.create_structured_output( - text_input = problem_statement, - system_prompt = prompt, # TODO check if this is correct - response_model = str, - ) + text_input=problem_statement, + system_prompt=prompt, + response_model=str, + ) return answer_prediction + async def get_preds(dataset, with_cognee=True): if with_cognee: text_output = await cognee_and_llm(dataset) @@ -70,46 +71,21 @@ async def get_preds(dataset, with_cognee=True): else: text_output = await llm_on_preprocessed_data(dataset) model_name = "without_cognee" - - preds = {dataset[0]["instance_id"]: - {"instance_id": dataset[0]["instance_id"], - "model_patch": text_output, - "model_name_or_path": model_name}} - - dataset_name = 'princeton-nlp/SWE-bench' if with_cognee else 'princeton-nlp/SWE-bench_bm25_13K' - preds_dataset = get_dataset_from_preds(dataset_name, - "test", - [dataset[0]["instance_id"]], - preds, - model_name) - - return preds, preds_dataset -async def evaluate(test_specs: list[TestSpec], - preds: dict, - ): - for test_spec in test_specs: - pred = preds[test_spec.instance_id] - log_dir = Path("logs") - log_dir.mkdir(parents=True, exist_ok=True) + preds = [{"instance_id": dataset[0]["instance_id"], + "model_patch": text_output, + "model_name_or_path": model_name}] - patch_file = Path(log_dir / "patch.diff") - patch_file.write_text(pred["model_patch"] or "") - for command in test_spec.repo_script_list: - if "/testbed" in command: - command = command.replace("/testbed", "./testbed") - result = subprocess.run(command, shell=True, check=True, capture_output=True, text=True) - print(result) - - subprocess.run("git apply --allow-empty -v logs/patch.diff", shell=True, capture_output=True, text=True) + return preds - async def main(): - swe_dataset = load_swebench_dataset('princeton-nlp/SWE-bench', split='test') - swe_dataset_preprocessed = load_swebench_dataset('princeton-nlp/SWE-bench_bm25_13K', split='test') - test_data = swe_dataset[:1] - test_data_preprocessed = swe_dataset_preprocessed[:1] + swe_dataset = load_swebench_dataset( + 'princeton-nlp/SWE-bench', split='test') + swe_dataset_preprocessed = load_swebench_dataset( + 'princeton-nlp/SWE-bench_bm25_13K', split='test') + test_data = swe_dataset[:1] + test_data_preprocessed = swe_dataset_preprocessed[:1] assert test_data[0]["instance_id"] == test_data_preprocessed[0]["instance_id"] filepath = Path("SWE-bench_testsample") if filepath.exists(): @@ -117,11 +93,19 @@ async def main(): dataset = Dataset.load_from_disk(filepath) else: dataset = download_instances(test_data, filepath) - - cognee_preds, cognee_preds_dataset = await get_preds(dataset, with_cognee=True) + + cognee_preds = await get_preds(dataset, with_cognee=True) # nocognee_preds = await get_preds(dataset, with_cognee=False) - test_specs = list(map(make_test_spec, test_data)) - results = await evaluate(test_specs, cognee_preds) + with open("withcognee.json", "w") as file: 
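+        # Persist the predictions as JSON so the SWE-bench evaluation
+        # harness invoked below can consume them via --predictions_path.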
+ json.dump(cognee_preds, file) + + subprocess.run(["python", "-m", "swebench.harness.run_evaluation", + "--dataset_name", 'princeton-nlp/SWE-bench', + "--split", "test", + "--predictions_path", "withcognee.json", + "--max_workers", "1", + "--instance_ids", test_data[0]["instance_id"], + "--run_id", "with_cognee"]) if __name__ == "__main__": import asyncio diff --git a/evals/eval_utils.py b/evals/eval_utils.py new file mode 100644 index 000000000..1c2785737 --- /dev/null +++ b/evals/eval_utils.py @@ -0,0 +1,107 @@ +import json +import logging +import os +import traceback +from copy import deepcopy +from pathlib import Path +from tempfile import TemporaryDirectory + +import unidiff +from datasets import Dataset +from swebench.inference.make_datasets.create_instance import make_code_text +from swebench.inference.make_datasets.utils import (AutoContextManager, + ingest_directory_contents) +from tqdm.auto import tqdm + + +def ingest_files(filenames): + files_dict = dict() + for filename in filenames: + with open(filename) as f: + content = f.read() + files_dict[filename] = content + return files_dict + + +def ingest_repos(input_instances): + orig_dir = os.getcwd() + with TemporaryDirectory( + dir="/scratch" if os.path.exists("/scratch") else "/tmp" + ) as root_dir: + for instance in tqdm( + input_instances.values(), + total=len(input_instances), + desc="Downloading repos on specific commits", + ): + try: + with AutoContextManager( + instance, root_dir + ) as cm: + readmes = cm.get_readme_files() + instance["readmes"] = ingest_files(readmes) + instance["file_contents"] = ingest_directory_contents( + cm.repo_path + ) + finally: + # if AutoContextManager fails to exit properly future exits will return the wrong directory + os.chdir(orig_dir) + + return input_instances + + +def extract_fields(instance): + readmes_text = make_code_text(instance["readmes"]) + code_text = make_code_text( + instance["file_contents"], add_line_numbers=False) + + text_inputs = "\n".join([readmes_text, code_text]) + text_inputs = text_inputs.strip() + "\n\n" + # text_inputs = code_text + patch = "\n".join([f"", instance["patch"], ""]) + return {**instance, "text": text_inputs, "patch": patch} + + +def create_dataset(input_instances): + columns = [ + "instance_id", + "text", + "repo", + "base_commit", + "problem_statement", + "hints_text", + "created_at", + "patch", + "test_patch", + "version", + "FAIL_TO_PASS", + "PASS_TO_PASS", + "environment_setup_commit", + ] + + data_table = {key: list() for key in columns} + for instance in input_instances.values(): + datum = extract_fields(instance) + for key in columns: + data_table[key].append(datum[key] if key in datum else "") + dataset = Dataset.from_dict(data_table) + + return dataset + + +def download_instances( + input_data, + path=Path("SWE-bench_testsample"), + verbose=False, +): + """Downloads code from github. + + Args: + - input_data: dictionary with unprocessed input instances. 
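+    - path: local directory the processed dataset is saved to (via Dataset.save_to_disk)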
+ - verbose: set ContextManager verbose to True + """ + input_instances = {x["instance_id"]: x for x in input_data} + input_instances_copy = deepcopy(input_instances) + input_instances_with_text = ingest_repos(input_instances_copy) + dataset = create_dataset(input_instances_with_text) + dataset.save_to_disk(path) + return dataset From d986e7c981fc7cb2cfb1534c94b93e28dba0dea1 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Mon, 18 Nov 2024 15:59:18 +0100 Subject: [PATCH 07/29] minor code cleanup --- evals/eval_utils.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/evals/eval_utils.py b/evals/eval_utils.py index 1c2785737..e4f070ffd 100644 --- a/evals/eval_utils.py +++ b/evals/eval_utils.py @@ -1,12 +1,8 @@ -import json -import logging import os -import traceback from copy import deepcopy from pathlib import Path from tempfile import TemporaryDirectory -import unidiff from datasets import Dataset from swebench.inference.make_datasets.create_instance import make_code_text from swebench.inference.make_datasets.utils import (AutoContextManager, From 838d98238a8b4869d1e4aaf24f461902a17d04bc Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Tue, 19 Nov 2024 13:32:35 +0100 Subject: [PATCH 08/29] Code cleanup --- evals/deepeval_on_swe_bench.py | 174 --------------------------------- evals/eval_swe_bench.py | 75 ++++++++------ evals/eval_utils.py | 2 +- 3 files changed, 44 insertions(+), 207 deletions(-) delete mode 100644 evals/deepeval_on_swe_bench.py diff --git a/evals/deepeval_on_swe_bench.py b/evals/deepeval_on_swe_bench.py deleted file mode 100644 index 8cb94abb3..000000000 --- a/evals/deepeval_on_swe_bench.py +++ /dev/null @@ -1,174 +0,0 @@ -from cognee.infrastructure.databases.vector import get_vector_engine -from cognee.base_config import get_base_config -import os -import logging -from cognee.infrastructure.llm.get_llm_client import get_llm_client -from typing import List, Dict, Type -from swebench.harness.utils import load_swebench_dataset -from deepeval.dataset import EvaluationDataset -from deepeval.test_case import LLMTestCase -from pydantic import BaseModel - -from deepeval.synthesizer import Synthesizer - - -# DeepEval dataset for reference -# synthesizer = Synthesizer() -# synthesizer.generate_goldens_from_docs( -# document_paths=['/app/.data/short_stories/soldiers_home.pdf'], -# include_expected_output=True -# ) - -def convert_swe_to_deepeval(swe_dataset: List[Dict]): - deepeval_dataset = EvaluationDataset() - for datum in swe_dataset: - input = datum["problem_statement"] - expected_output = datum["patch"] - context = [datum["text"]] - # retrieval_context = datum.get(retrieval_context_key_name) - - deepeval_dataset.add_test_case( - LLMTestCase( - input=input, - actual_output=None, - expected_output=expected_output, - context=context, - # retrieval_context=retrieval_context, - ) - ) - return deepeval_dataset - - -swe_dataset = load_swebench_dataset( - 'princeton-nlp/SWE-bench_bm25_13K', split='test') -deepeval_dataset = convert_swe_to_deepeval(swe_dataset) - - -logger = logging.getLogger(__name__) - - -class AnswerModel(BaseModel): - response: str - - -def get_answer_base(content: str, context: str, response_model: Type[BaseModel]): - llm_client = get_llm_client() - - system_prompt = "THIS IS YOUR CONTEXT:" + str(context) - - return llm_client.create_structured_output(content, system_prompt, response_model) - - -def get_answer(content: str, context, model: Type[BaseModel] = AnswerModel): - - try: - return (get_answer_base( - content, - context, - model - )) - except 
Exception as error: - logger.error( - "Error extracting cognitive layers from content: %s", error, exc_info=True) - raise error - - -async def run_cognify_base_rag(): - from cognee.api.v1.add import add - from cognee.api.v1.prune import prune - from cognee.api.v1.cognify.cognify import cognify - - await prune.prune_system() - - await add("data://test_datasets", "initial_test") - - graph = await cognify("initial_test") - pass - - -async def cognify_search_base_rag(content: str, context: str): - base_config = get_base_config() - - cognee_directory_path = os.path.abspath(".cognee_system") - base_config.system_root_directory = cognee_directory_path - - vector_engine = get_vector_engine() - - return_ = await vector_engine.search(collection_name="basic_rag", query_text=content, limit=10) - - print("results", return_) - return return_ - - -async def cognify_search_graph(content: str, context: str): - from cognee.api.v1.search import search, SearchType - params = {'query': 'Donald Trump'} - - results = await search(SearchType.INSIGHTS, params) - print("results", results) - return results - - -def convert_goldens_to_test_cases(test_cases_raw: List[LLMTestCase]) -> List[LLMTestCase]: - test_cases = [] - for case in test_cases_raw: - test_case = LLMTestCase( - input=case.input, - # Generate actual output using the 'input' and 'additional_metadata' - actual_output=str(get_answer( - case.input, case.context).model_dump()['response']), - expected_output=case.expected_output, - context=case.context, - retrieval_context=["retrieval_context"], - ) - test_cases.append(test_case) - return test_cases - - -def convert_swe_to_deepeval_testcases(swe_dataset: List[Dict]): - deepeval_dataset = EvaluationDataset() - for datum in swe_dataset[:4]: - input = datum["problem_statement"] - expected_output = datum["patch"] - context = [datum["text"]] - # retrieval_context = datum.get(retrieval_context_key_name) - # tools_called = datum.get(tools_called_key_name) - # expected_tools = json_obj.get(expected_tools_key_name) - - deepeval_dataset.add_test_case( - LLMTestCase( - input=input, - actual_output=str(get_answer( - input, context).model_dump()['response']), - expected_output=expected_output, - context=context, - # retrieval_context=retrieval_context, - # tools_called=tools_called, - # expected_tools=expected_tools, - ) - ) - return deepeval_dataset - - -swe_dataset = load_swebench_dataset( - 'princeton-nlp/SWE-bench_bm25_13K', split='test') -test_dataset = convert_swe_to_deepeval_testcases(swe_dataset) - -if __name__ == "__main__": - - import asyncio - - async def main(): - # await run_cognify_base_rag() - # await cognify_search_base_rag("show_all_processes", "context") - await cognify_search_graph("show_all_processes", "context") - asyncio.run(main()) - # run_cognify_base_rag_and_search() - # # Data preprocessing before setting the dataset test cases - swe_dataset = load_swebench_dataset( - 'princeton-nlp/SWE-bench_bm25_13K', split='test') - test_dataset = convert_swe_to_deepeval_testcases(swe_dataset) - from deepeval.metrics import HallucinationMetric - metric = HallucinationMetric() - evalresult = test_dataset.evaluate([metric]) - pass diff --git a/evals/eval_swe_bench.py b/evals/eval_swe_bench.py index 2cb221576..c0ab6d67e 100644 --- a/evals/eval_swe_bench.py +++ b/evals/eval_swe_bench.py @@ -1,7 +1,9 @@ +import argparse import json import subprocess from pathlib import Path +from datasets import Dataset from swebench.harness.utils import load_swebench_dataset from swebench.inference.make_datasets.create_instance 
import PATCH_EXAMPLE @@ -13,19 +15,20 @@ from cognee.infrastructure.llm.get_llm_client import get_llm_client from evals.eval_utils import download_instances -async def cognee_and_llm(dataset, search_type=SearchType.CHUNKS): +async def generate_patch_with_cognee(instance, search_type=SearchType.CHUNKS): + await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) dataset_name = "SWE_test_data" - code_text = dataset[0]["text"] + code_text = instance["text"] await cognee.add([code_text], dataset_name) await code_graph_pipeline([dataset_name]) graph_engine = await get_graph_engine() with open(graph_engine.filename, "r") as f: graph_str = f.read() - problem_statement = dataset[0]['problem_statement'] + problem_statement = instance['problem_statement'] instructions = ( "I need you to solve this issue by looking at the provided knowledge graph and by " + "generating a single patch file that I can apply directly to this repository " @@ -51,9 +54,9 @@ async def cognee_and_llm(dataset, search_type=SearchType.CHUNKS): return answer_prediction -async def llm_on_preprocessed_data(dataset): - problem_statement = dataset[0]['problem_statement'] - prompt = dataset[0]["text"] +async def generate_patch_without_cognee(instance): + problem_statement = instance['problem_statement'] + prompt = instance["text"] llm_client = get_llm_client() answer_prediction = llm_client.create_structured_output( @@ -66,46 +69,54 @@ async def llm_on_preprocessed_data(dataset): async def get_preds(dataset, with_cognee=True): if with_cognee: - text_output = await cognee_and_llm(dataset) model_name = "with_cognee" + pred_func = generate_patch_with_cognee else: - text_output = await llm_on_preprocessed_data(dataset) model_name = "without_cognee" + pred_func = generate_patch_without_cognee - preds = [{"instance_id": dataset[0]["instance_id"], - "model_patch": text_output, - "model_name_or_path": model_name}] + preds = [{"instance_id": instance["instance_id"], + "model_patch": await pred_func(instance), + "model_name_or_path": model_name} for instance in dataset] return preds async def main(): - swe_dataset = load_swebench_dataset( - 'princeton-nlp/SWE-bench', split='test') - swe_dataset_preprocessed = load_swebench_dataset( - 'princeton-nlp/SWE-bench_bm25_13K', split='test') - test_data = swe_dataset[:1] - test_data_preprocessed = swe_dataset_preprocessed[:1] - assert test_data[0]["instance_id"] == test_data_preprocessed[0]["instance_id"] - filepath = Path("SWE-bench_testsample") - if filepath.exists(): - from datasets import Dataset - dataset = Dataset.load_from_disk(filepath) - else: - dataset = download_instances(test_data, filepath) + parser = argparse.ArgumentParser( + description="Run LLM predictions on SWE-bench dataset") + parser.add_argument('--cognee_off', action='store_true') + args = parser.parse_args() - cognee_preds = await get_preds(dataset, with_cognee=True) - # nocognee_preds = await get_preds(dataset, with_cognee=False) - with open("withcognee.json", "w") as file: - json.dump(cognee_preds, file) + if args.cognee_off: + dataset_name = 'princeton-nlp/SWE-bench_Lite_bm25_13K' + dataset = load_swebench_dataset(dataset_name, split='test') + predictions_path = "preds_nocognee.json" + if Path(predictions_path).exists(): + with open(predictions_path, "r") as file: + preds = json.load(file) + else: + preds = await get_preds(dataset, with_cognee=False) + with open(predictions_path, "w") as file: + json.dump(preds, file) + else: + dataset_name = 'princeton-nlp/SWE-bench_Lite' + swe_dataset = 
load_swebench_dataset( + dataset_name, split='test')[:1] + filepath = Path("SWE-bench_testsample") + if filepath.exists(): + dataset = Dataset.load_from_disk(filepath) + else: + dataset = download_instances(swe_dataset, filepath) + predictions_path = "preds.json" + preds = await get_preds(dataset, with_cognee=not args.cognee_off) subprocess.run(["python", "-m", "swebench.harness.run_evaluation", - "--dataset_name", 'princeton-nlp/SWE-bench', + "--dataset_name", dataset_name, "--split", "test", - "--predictions_path", "withcognee.json", + "--predictions_path", predictions_path, "--max_workers", "1", - "--instance_ids", test_data[0]["instance_id"], - "--run_id", "with_cognee"]) + "--run_id", "test_run"]) if __name__ == "__main__": import asyncio diff --git a/evals/eval_utils.py b/evals/eval_utils.py index e4f070ffd..e95a84cec 100644 --- a/evals/eval_utils.py +++ b/evals/eval_utils.py @@ -53,7 +53,7 @@ def extract_fields(instance): text_inputs = "\n".join([readmes_text, code_text]) text_inputs = text_inputs.strip() + "\n\n" # text_inputs = code_text - patch = "\n".join([f"", instance["patch"], ""]) + patch = "\n".join(["", instance["patch"], ""]) return {**instance, "text": text_inputs, "patch": patch} From 2948089806d4465dc7cae264e0f4951cb6af2230 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Tue, 19 Nov 2024 14:07:53 +0100 Subject: [PATCH 09/29] Read patch generation instructions from file --- .../infrastructure/llm/prompts/patch_gen_instructions.txt | 3 +++ evals/eval_swe_bench.py | 8 ++------ 2 files changed, 5 insertions(+), 6 deletions(-) create mode 100644 cognee/infrastructure/llm/prompts/patch_gen_instructions.txt diff --git a/cognee/infrastructure/llm/prompts/patch_gen_instructions.txt b/cognee/infrastructure/llm/prompts/patch_gen_instructions.txt new file mode 100644 index 000000000..1553753ab --- /dev/null +++ b/cognee/infrastructure/llm/prompts/patch_gen_instructions.txt @@ -0,0 +1,3 @@ +I need you to solve this issue by looking at the provided knowledge graph and +generating a single patch file that I can apply directly to this repository using git apply. +Please respond with a single patch file in the following format. \ No newline at end of file diff --git a/evals/eval_swe_bench.py b/evals/eval_swe_bench.py index c0ab6d67e..e5b8643c1 100644 --- a/evals/eval_swe_bench.py +++ b/evals/eval_swe_bench.py @@ -12,6 +12,7 @@ from cognee.api.v1.cognify.code_graph_pipeline import code_graph_pipeline from cognee.api.v1.search import SearchType from cognee.infrastructure.databases.graph import get_graph_engine from cognee.infrastructure.llm.get_llm_client import get_llm_client +from cognee.infrastructure.llm.prompts import read_query_prompt from evals.eval_utils import download_instances @@ -29,12 +30,7 @@ async def generate_patch_with_cognee(instance, search_type=SearchType.CHUNKS): graph_str = f.read() problem_statement = instance['problem_statement'] - instructions = ( - "I need you to solve this issue by looking at the provided knowledge graph and by " - + "generating a single patch file that I can apply directly to this repository " - + "using git apply. Please respond with a single patch " - + "file in the following format." 
-    )
+    instructions = read_query_prompt("patch_gen_instructions.txt")
 
     prompt = "\n".join([
         instructions,

From 5f144a0f922997fa96efea7c7e5684ea8f381a22 Mon Sep 17 00:00:00 2001
From: Boris
Date: Tue, 19 Nov 2024 15:30:09 +0100
Subject: [PATCH 10/29] fix: make all checks green (#1)

---
 cognee/modules/pipelines/Pipeline.py | 18 ------------------
 docker-compose.yml                   |  2 +-
 notebooks/cognee_llama_index.ipynb   |  2 +-
 3 files changed, 2 insertions(+), 20 deletions(-)
 delete mode 100644 cognee/modules/pipelines/Pipeline.py

diff --git a/cognee/modules/pipelines/Pipeline.py b/cognee/modules/pipelines/Pipeline.py
deleted file mode 100644
index 610394174..000000000
--- a/cognee/modules/pipelines/Pipeline.py
+++ /dev/null
@@ -1,18 +0,0 @@
-from uuid import UUID, uuid4
-from typing import Optional
-from pydantic import BaseModel
-from .models.Task import Task
-
-class PipelineConfig(BaseModel):
-    batch_count: int = 10
-    description: Optional[str] = None
-
-class Pipeline():
-    id: UUID = uuid4()
-    name: str
-    description: str
-    tasks: list[Task] = []
-
-    def __init__(self, name: str, pipeline_config: PipelineConfig):
-        self.name = name
-        self.description = pipeline_config.description
diff --git a/docker-compose.yml b/docker-compose.yml
index 426b178a7..afb216169 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -19,7 +19,7 @@ services:
     deploy:
       resources:
         limits:
-          cpus: '4.0'
+          cpus: '2.0'
           memory: 8GB
 
   frontend:
diff --git a/notebooks/cognee_llama_index.ipynb b/notebooks/cognee_llama_index.ipynb
index 15e17163d..d6f92b2b1 100644
--- a/notebooks/cognee_llama_index.ipynb
+++ b/notebooks/cognee_llama_index.ipynb
@@ -174,7 +174,7 @@
     "\n",
     "# Query cognee for summaries\n",
     "search_results = await cognee.search(\n",
-    "    SearchType.SUMMARIES, query=\"What are the main news discussed in the document?\"\n",
+    "    SearchType.SUMMARIES, query_text=\"What are the main news discussed in the document?\"\n",
     ")\n",
     "# Display search results\n",
     "print(\"\\n Summary of main news discussed:\\n\")\n",

From 4b55354dce0e8ffa3c7ebb6f9dcba95129318c73 Mon Sep 17 00:00:00 2001
From: Igor Ilic <30923996+dexters1@users.noreply.github.com>
Date: Tue, 19 Nov 2024 15:31:26 +0100
Subject: [PATCH 11/29] fix: Resolve issue with pgvector timeout (#3)

By creating PGVector as a singleton, all issues regarding timeouts are
resolved, as there are no more parallel instances trying to communicate
with the database
---
 .../databases/vector/get_vector_engine.py |   2 +
 notebooks/cognee_llama_index.ipynb        |   4 +-
 poetry.lock                               | 101 ++++++++++--------
 pyproject.toml                            |   2 +-
 4 files changed, 60 insertions(+), 49 deletions(-)

diff --git a/cognee/infrastructure/databases/vector/get_vector_engine.py b/cognee/infrastructure/databases/vector/get_vector_engine.py
index 8056c9e36..079a8903f 100644
--- a/cognee/infrastructure/databases/vector/get_vector_engine.py
+++ b/cognee/infrastructure/databases/vector/get_vector_engine.py
@@ -1,6 +1,8 @@
 from .config import get_vectordb_config
 from .embeddings import get_embedding_engine
 from .create_vector_engine import create_vector_engine
+from functools import lru_cache
 
+@lru_cache
 def get_vector_engine():
     return create_vector_engine(get_vectordb_config().to_dict(), get_embedding_engine())
diff --git a/notebooks/cognee_llama_index.ipynb b/notebooks/cognee_llama_index.ipynb
index d6f92b2b1..742c2f51c 100644
--- a/notebooks/cognee_llama_index.ipynb
+++ b/notebooks/cognee_llama_index.ipynb
@@ -52,7 +52,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 5,
+    "execution_count": 9,
     "metadata": {},
     "outputs": [],
"source": [ @@ -71,7 +71,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ diff --git a/poetry.lock b/poetry.lock index 93eb4a4d1..96f9aec27 100644 --- a/poetry.lock +++ b/poetry.lock @@ -404,60 +404,69 @@ files = [ [[package]] name = "asyncpg" -version = "0.29.0" +version = "0.30.0" description = "An asyncio PostgreSQL driver" optional = false python-versions = ">=3.8.0" files = [ - {file = "asyncpg-0.29.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:72fd0ef9f00aeed37179c62282a3d14262dbbafb74ec0ba16e1b1864d8a12169"}, - {file = "asyncpg-0.29.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:52e8f8f9ff6e21f9b39ca9f8e3e33a5fcdceaf5667a8c5c32bee158e313be385"}, - {file = "asyncpg-0.29.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a9e6823a7012be8b68301342ba33b4740e5a166f6bbda0aee32bc01638491a22"}, - {file = "asyncpg-0.29.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:746e80d83ad5d5464cfbf94315eb6744222ab00aa4e522b704322fb182b83610"}, - {file = "asyncpg-0.29.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:ff8e8109cd6a46ff852a5e6bab8b0a047d7ea42fcb7ca5ae6eaae97d8eacf397"}, - {file = "asyncpg-0.29.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:97eb024685b1d7e72b1972863de527c11ff87960837919dac6e34754768098eb"}, - {file = "asyncpg-0.29.0-cp310-cp310-win32.whl", hash = "sha256:5bbb7f2cafd8d1fa3e65431833de2642f4b2124be61a449fa064e1a08d27e449"}, - {file = "asyncpg-0.29.0-cp310-cp310-win_amd64.whl", hash = "sha256:76c3ac6530904838a4b650b2880f8e7af938ee049e769ec2fba7cd66469d7772"}, - {file = "asyncpg-0.29.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d4900ee08e85af01adb207519bb4e14b1cae8fd21e0ccf80fac6aa60b6da37b4"}, - {file = "asyncpg-0.29.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a65c1dcd820d5aea7c7d82a3fdcb70e096f8f70d1a8bf93eb458e49bfad036ac"}, - {file = "asyncpg-0.29.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5b52e46f165585fd6af4863f268566668407c76b2c72d366bb8b522fa66f1870"}, - {file = "asyncpg-0.29.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dc600ee8ef3dd38b8d67421359779f8ccec30b463e7aec7ed481c8346decf99f"}, - {file = "asyncpg-0.29.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:039a261af4f38f949095e1e780bae84a25ffe3e370175193174eb08d3cecab23"}, - {file = "asyncpg-0.29.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:6feaf2d8f9138d190e5ec4390c1715c3e87b37715cd69b2c3dfca616134efd2b"}, - {file = "asyncpg-0.29.0-cp311-cp311-win32.whl", hash = "sha256:1e186427c88225ef730555f5fdda6c1812daa884064bfe6bc462fd3a71c4b675"}, - {file = "asyncpg-0.29.0-cp311-cp311-win_amd64.whl", hash = "sha256:cfe73ffae35f518cfd6e4e5f5abb2618ceb5ef02a2365ce64f132601000587d3"}, - {file = "asyncpg-0.29.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:6011b0dc29886ab424dc042bf9eeb507670a3b40aece3439944006aafe023178"}, - {file = "asyncpg-0.29.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b544ffc66b039d5ec5a7454667f855f7fec08e0dfaf5a5490dfafbb7abbd2cfb"}, - {file = "asyncpg-0.29.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d84156d5fb530b06c493f9e7635aa18f518fa1d1395ef240d211cb563c4e2364"}, - {file = "asyncpg-0.29.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:54858bc25b49d1114178d65a88e48ad50cb2b6f3e475caa0f0c092d5f527c106"}, - {file = 
"asyncpg-0.29.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:bde17a1861cf10d5afce80a36fca736a86769ab3579532c03e45f83ba8a09c59"}, - {file = "asyncpg-0.29.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:37a2ec1b9ff88d8773d3eb6d3784dc7e3fee7756a5317b67f923172a4748a175"}, - {file = "asyncpg-0.29.0-cp312-cp312-win32.whl", hash = "sha256:bb1292d9fad43112a85e98ecdc2e051602bce97c199920586be83254d9dafc02"}, - {file = "asyncpg-0.29.0-cp312-cp312-win_amd64.whl", hash = "sha256:2245be8ec5047a605e0b454c894e54bf2ec787ac04b1cb7e0d3c67aa1e32f0fe"}, - {file = "asyncpg-0.29.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0009a300cae37b8c525e5b449233d59cd9868fd35431abc470a3e364d2b85cb9"}, - {file = "asyncpg-0.29.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5cad1324dbb33f3ca0cd2074d5114354ed3be2b94d48ddfd88af75ebda7c43cc"}, - {file = "asyncpg-0.29.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:012d01df61e009015944ac7543d6ee30c2dc1eb2f6b10b62a3f598beb6531548"}, - {file = "asyncpg-0.29.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:000c996c53c04770798053e1730d34e30cb645ad95a63265aec82da9093d88e7"}, - {file = "asyncpg-0.29.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:e0bfe9c4d3429706cf70d3249089de14d6a01192d617e9093a8e941fea8ee775"}, - {file = "asyncpg-0.29.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:642a36eb41b6313ffa328e8a5c5c2b5bea6ee138546c9c3cf1bffaad8ee36dd9"}, - {file = "asyncpg-0.29.0-cp38-cp38-win32.whl", hash = "sha256:a921372bbd0aa3a5822dd0409da61b4cd50df89ae85150149f8c119f23e8c408"}, - {file = "asyncpg-0.29.0-cp38-cp38-win_amd64.whl", hash = "sha256:103aad2b92d1506700cbf51cd8bb5441e7e72e87a7b3a2ca4e32c840f051a6a3"}, - {file = "asyncpg-0.29.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5340dd515d7e52f4c11ada32171d87c05570479dc01dc66d03ee3e150fb695da"}, - {file = "asyncpg-0.29.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e17b52c6cf83e170d3d865571ba574577ab8e533e7361a2b8ce6157d02c665d3"}, - {file = "asyncpg-0.29.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f100d23f273555f4b19b74a96840aa27b85e99ba4b1f18d4ebff0734e78dc090"}, - {file = "asyncpg-0.29.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:48e7c58b516057126b363cec8ca02b804644fd012ef8e6c7e23386b7d5e6ce83"}, - {file = "asyncpg-0.29.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:f9ea3f24eb4c49a615573724d88a48bd1b7821c890c2effe04f05382ed9e8810"}, - {file = "asyncpg-0.29.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8d36c7f14a22ec9e928f15f92a48207546ffe68bc412f3be718eedccdf10dc5c"}, - {file = "asyncpg-0.29.0-cp39-cp39-win32.whl", hash = "sha256:797ab8123ebaed304a1fad4d7576d5376c3a006a4100380fb9d517f0b59c1ab2"}, - {file = "asyncpg-0.29.0-cp39-cp39-win_amd64.whl", hash = "sha256:cce08a178858b426ae1aa8409b5cc171def45d4293626e7aa6510696d46decd8"}, - {file = "asyncpg-0.29.0.tar.gz", hash = "sha256:d1c49e1f44fffafd9a55e1a9b101590859d881d639ea2922516f5d9c512d354e"}, + {file = "asyncpg-0.30.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bfb4dd5ae0699bad2b233672c8fc5ccbd9ad24b89afded02341786887e37927e"}, + {file = "asyncpg-0.30.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:dc1f62c792752a49f88b7e6f774c26077091b44caceb1983509edc18a2222ec0"}, + {file = "asyncpg-0.30.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3152fef2e265c9c24eec4ee3d22b4f4d2703d30614b0b6753e9ed4115c8a146f"}, + {file = 
"asyncpg-0.30.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c7255812ac85099a0e1ffb81b10dc477b9973345793776b128a23e60148dd1af"}, + {file = "asyncpg-0.30.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:578445f09f45d1ad7abddbff2a3c7f7c291738fdae0abffbeb737d3fc3ab8b75"}, + {file = "asyncpg-0.30.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:c42f6bb65a277ce4d93f3fba46b91a265631c8df7250592dd4f11f8b0152150f"}, + {file = "asyncpg-0.30.0-cp310-cp310-win32.whl", hash = "sha256:aa403147d3e07a267ada2ae34dfc9324e67ccc4cdca35261c8c22792ba2b10cf"}, + {file = "asyncpg-0.30.0-cp310-cp310-win_amd64.whl", hash = "sha256:fb622c94db4e13137c4c7f98834185049cc50ee01d8f657ef898b6407c7b9c50"}, + {file = "asyncpg-0.30.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5e0511ad3dec5f6b4f7a9e063591d407eee66b88c14e2ea636f187da1dcfff6a"}, + {file = "asyncpg-0.30.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:915aeb9f79316b43c3207363af12d0e6fd10776641a7de8a01212afd95bdf0ed"}, + {file = "asyncpg-0.30.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1c198a00cce9506fcd0bf219a799f38ac7a237745e1d27f0e1f66d3707c84a5a"}, + {file = "asyncpg-0.30.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3326e6d7381799e9735ca2ec9fd7be4d5fef5dcbc3cb555d8a463d8460607956"}, + {file = "asyncpg-0.30.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:51da377487e249e35bd0859661f6ee2b81db11ad1f4fc036194bc9cb2ead5056"}, + {file = "asyncpg-0.30.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:bc6d84136f9c4d24d358f3b02be4b6ba358abd09f80737d1ac7c444f36108454"}, + {file = "asyncpg-0.30.0-cp311-cp311-win32.whl", hash = "sha256:574156480df14f64c2d76450a3f3aaaf26105869cad3865041156b38459e935d"}, + {file = "asyncpg-0.30.0-cp311-cp311-win_amd64.whl", hash = "sha256:3356637f0bd830407b5597317b3cb3571387ae52ddc3bca6233682be88bbbc1f"}, + {file = "asyncpg-0.30.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c902a60b52e506d38d7e80e0dd5399f657220f24635fee368117b8b5fce1142e"}, + {file = "asyncpg-0.30.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:aca1548e43bbb9f0f627a04666fedaca23db0a31a84136ad1f868cb15deb6e3a"}, + {file = "asyncpg-0.30.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6c2a2ef565400234a633da0eafdce27e843836256d40705d83ab7ec42074efb3"}, + {file = "asyncpg-0.30.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1292b84ee06ac8a2ad8e51c7475aa309245874b61333d97411aab835c4a2f737"}, + {file = "asyncpg-0.30.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:0f5712350388d0cd0615caec629ad53c81e506b1abaaf8d14c93f54b35e3595a"}, + {file = "asyncpg-0.30.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:db9891e2d76e6f425746c5d2da01921e9a16b5a71a1c905b13f30e12a257c4af"}, + {file = "asyncpg-0.30.0-cp312-cp312-win32.whl", hash = "sha256:68d71a1be3d83d0570049cd1654a9bdfe506e794ecc98ad0873304a9f35e411e"}, + {file = "asyncpg-0.30.0-cp312-cp312-win_amd64.whl", hash = "sha256:9a0292c6af5c500523949155ec17b7fe01a00ace33b68a476d6b5059f9630305"}, + {file = "asyncpg-0.30.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:05b185ebb8083c8568ea8a40e896d5f7af4b8554b64d7719c0eaa1eb5a5c3a70"}, + {file = "asyncpg-0.30.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:c47806b1a8cbb0a0db896f4cd34d89942effe353a5035c62734ab13b9f938da3"}, + {file = "asyncpg-0.30.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:9b6fde867a74e8c76c71e2f64f80c64c0f3163e687f1763cfaf21633ec24ec33"}, + {file = "asyncpg-0.30.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:46973045b567972128a27d40001124fbc821c87a6cade040cfcd4fa8a30bcdc4"}, + {file = "asyncpg-0.30.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9110df111cabc2ed81aad2f35394a00cadf4f2e0635603db6ebbd0fc896f46a4"}, + {file = "asyncpg-0.30.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:04ff0785ae7eed6cc138e73fc67b8e51d54ee7a3ce9b63666ce55a0bf095f7ba"}, + {file = "asyncpg-0.30.0-cp313-cp313-win32.whl", hash = "sha256:ae374585f51c2b444510cdf3595b97ece4f233fde739aa14b50e0d64e8a7a590"}, + {file = "asyncpg-0.30.0-cp313-cp313-win_amd64.whl", hash = "sha256:f59b430b8e27557c3fb9869222559f7417ced18688375825f8f12302c34e915e"}, + {file = "asyncpg-0.30.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:29ff1fc8b5bf724273782ff8b4f57b0f8220a1b2324184846b39d1ab4122031d"}, + {file = "asyncpg-0.30.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:64e899bce0600871b55368b8483e5e3e7f1860c9482e7f12e0a771e747988168"}, + {file = "asyncpg-0.30.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5b290f4726a887f75dcd1b3006f484252db37602313f806e9ffc4e5996cfe5cb"}, + {file = "asyncpg-0.30.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f86b0e2cd3f1249d6fe6fd6cfe0cd4538ba994e2d8249c0491925629b9104d0f"}, + {file = "asyncpg-0.30.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:393af4e3214c8fa4c7b86da6364384c0d1b3298d45803375572f415b6f673f38"}, + {file = "asyncpg-0.30.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:fd4406d09208d5b4a14db9a9dbb311b6d7aeeab57bded7ed2f8ea41aeef39b34"}, + {file = "asyncpg-0.30.0-cp38-cp38-win32.whl", hash = "sha256:0b448f0150e1c3b96cb0438a0d0aa4871f1472e58de14a3ec320dbb2798fb0d4"}, + {file = "asyncpg-0.30.0-cp38-cp38-win_amd64.whl", hash = "sha256:f23b836dd90bea21104f69547923a02b167d999ce053f3d502081acea2fba15b"}, + {file = "asyncpg-0.30.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6f4e83f067b35ab5e6371f8a4c93296e0439857b4569850b178a01385e82e9ad"}, + {file = "asyncpg-0.30.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5df69d55add4efcd25ea2a3b02025b669a285b767bfbf06e356d68dbce4234ff"}, + {file = "asyncpg-0.30.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a3479a0d9a852c7c84e822c073622baca862d1217b10a02dd57ee4a7a081f708"}, + {file = "asyncpg-0.30.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:26683d3b9a62836fad771a18ecf4659a30f348a561279d6227dab96182f46144"}, + {file = "asyncpg-0.30.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:1b982daf2441a0ed314bd10817f1606f1c28b1136abd9e4f11335358c2c631cb"}, + {file = "asyncpg-0.30.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:1c06a3a50d014b303e5f6fc1e5f95eb28d2cee89cf58384b700da621e5d5e547"}, + {file = "asyncpg-0.30.0-cp39-cp39-win32.whl", hash = "sha256:1b11a555a198b08f5c4baa8f8231c74a366d190755aa4f99aacec5970afe929a"}, + {file = "asyncpg-0.30.0-cp39-cp39-win_amd64.whl", hash = "sha256:8b684a3c858a83cd876f05958823b68e8d14ec01bb0c0d14a6704c5bf9711773"}, + {file = "asyncpg-0.30.0.tar.gz", hash = "sha256:c551e9928ab6707602f44811817f82ba3c446e018bfe1d3abecc8ba5f3eac851"}, ] [package.dependencies] -async-timeout = {version = ">=4.0.3", markers = "python_version < \"3.12.0\""} +async-timeout = {version = ">=4.0.3", markers = "python_version < \"3.11.0\""} [package.extras] -docs = ["Sphinx (>=5.3.0,<5.4.0)", "sphinx-rtd-theme 
(>=1.2.2)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)"] -test = ["flake8 (>=6.1,<7.0)", "uvloop (>=0.15.3)"] +docs = ["Sphinx (>=8.1.3,<8.2.0)", "sphinx-rtd-theme (>=1.2.2)"] +gssauth = ["gssapi", "sspilib"] +test = ["distro (>=1.9.0,<1.10.0)", "flake8 (>=6.1,<7.0)", "flake8-pyi (>=24.1.0,<24.2.0)", "gssapi", "k5test", "mypy (>=1.8.0,<1.9.0)", "sspilib", "uvloop (>=0.15.3)"] [[package]] name = "attrs" @@ -7632,4 +7641,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.9.0,<3.12" -content-hash = "f5874af8364839dd2a362b6b3209c4aae108f30dcc27be43d0d07f7b28160eda" +content-hash = "474ae44ef721bf9b2d34d1cd139cddf42542ef9167895960784b6e88214dd1e6" diff --git a/pyproject.toml b/pyproject.toml index 0bab3f615..92b70db63 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -66,7 +66,7 @@ anthropic = "^0.26.1" sentry-sdk = {extras = ["fastapi"], version = "^2.9.0"} fastapi-users = {version = "*", extras = ["sqlalchemy"]} alembic = "^1.13.3" -asyncpg = "^0.29.0" +asyncpg = "0.30.0" pgvector = "^0.3.5" psycopg2 = {version = "^2.9.10", optional = true} llama-index-core = {version = "^0.11.22", optional = true} From 26e2dc852d5461ca314b6099c2e6852664849a30 Mon Sep 17 00:00:00 2001 From: lxobr Date: Fri, 15 Nov 2024 09:26:41 +0100 Subject: [PATCH 12/29] feat: new repo-to-graph task --- cognee/tasks/repo_processor/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 cognee/tasks/repo_processor/__init__.py diff --git a/cognee/tasks/repo_processor/__init__.py b/cognee/tasks/repo_processor/__init__.py new file mode 100644 index 000000000..e69de29bb From ba83d71269d0af19e9b278b5e4b89b55add968f4 Mon Sep 17 00:00:00 2001 From: lxobr Date: Fri, 15 Nov 2024 09:28:13 +0100 Subject: [PATCH 13/29] feat: extract script dependencies --- .../local_script_dependencies.py | 83 +++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 cognee/tasks/repo_processor/local_script_dependencies.py diff --git a/cognee/tasks/repo_processor/local_script_dependencies.py b/cognee/tasks/repo_processor/local_script_dependencies.py new file mode 100644 index 000000000..244431212 --- /dev/null +++ b/cognee/tasks/repo_processor/local_script_dependencies.py @@ -0,0 +1,83 @@ +from typing import List, Dict, Optional +import jedi +import parso +import sys +from pathlib import Path +from parso.tree import BaseNode + + +def get_code_entities(node: parso.tree.NodeOrLeaf) -> List[Dict[str, any]]: + """ + Recursively extract code entities using parso. 
+ """ + code_entity_list = [] + + if not hasattr(node, 'children'): + return code_entity_list + + # Process nodes of type 'name', which correspond to code entities + name_nodes = (child for child in node.children if child.type == 'name') + for name_node in name_nodes: + code_entity = { + 'name': name_node.value, + 'line': name_node.start_pos[0], + 'column': name_node.start_pos[1] + } + code_entity_list.append(code_entity) + + # Recursively process child nodes + for child in node.children: + code_entity_list.extend(get_code_entities(child)) + + return code_entity_list + + +def update_code_entity(script: jedi.Script, code_entity: Dict[str, any]) -> None: + """ + Update a code_entity with (full_name, module_name, module_path) using Jedi + """ + results = script.goto(code_entity["line"], code_entity["column"], follow_imports=True) + if results: + code_entity["full_name"] = getattr(results[0], "full_name", None) + code_entity["module_name"] = getattr(results[0], "module_name", None) + code_entity["module_path"] = getattr(results[0], "module_path", None) + + +def get_local_script_dependencies(script_path: str, repo_path: Optional[str] = None) -> List[str]: + """ + Extract and return a list of unique module paths that the script depends on. + """ + if repo_path: + sys.path.insert(0, str(Path(repo_path).resolve())) + + with open(script_path, "r") as file: + source_code = file.read() + + script = jedi.Script(code=source_code, path=script_path) + + tree = parso.parse(source_code) + code_entities = get_code_entities(tree) + + for code_entity in code_entities: + update_code_entity(script, code_entity) + + module_paths = { + entity.get("module_path") + for entity in code_entities + if entity.get("module_path") + } + if repo_path: + repo_path_resolved = str(Path(repo_path).resolve(strict=False)) + module_paths = {path for path in module_paths if str(path).startswith(repo_path_resolved)} + + return sorted(path for path in module_paths if path) + +if __name__ == "__main__": + # Simple execution example, use absolute paths + script_path = ".../cognee/examples/python/simple_example.py" + repo_path = ".../cognee" + + dependencies = get_local_script_dependencies(script_path, repo_path) + print("Dependencies:") + for dependency in dependencies: + print(dependency) \ No newline at end of file From e148d32c14c19acc8d818b732186aecd48e59dd3 Mon Sep 17 00:00:00 2001 From: lxobr Date: Fri, 15 Nov 2024 17:59:10 +0100 Subject: [PATCH 14/29] refactor: Modify sys.path in context manager --- .../local_script_dependencies.py | 48 ++++++++++++------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/cognee/tasks/repo_processor/local_script_dependencies.py b/cognee/tasks/repo_processor/local_script_dependencies.py index 244431212..4beb584f4 100644 --- a/cognee/tasks/repo_processor/local_script_dependencies.py +++ b/cognee/tasks/repo_processor/local_script_dependencies.py @@ -1,3 +1,4 @@ +from contextlib import contextmanager from typing import List, Dict, Optional import jedi import parso @@ -6,7 +7,17 @@ from pathlib import Path from parso.tree import BaseNode -def get_code_entities(node: parso.tree.NodeOrLeaf) -> List[Dict[str, any]]: +@contextmanager +def add_sys_path(path): + original_sys_path = sys.path.copy() + sys.path.insert(0, path) + try: + yield + finally: + sys.path = original_sys_path + + +def _get_code_entities(node: parso.tree.NodeOrLeaf) -> List[Dict[str, any]]: """ Recursively extract code entities using parso. 
""" @@ -27,12 +38,12 @@ def get_code_entities(node: parso.tree.NodeOrLeaf) -> List[Dict[str, any]]: # Recursively process child nodes for child in node.children: - code_entity_list.extend(get_code_entities(child)) + code_entity_list.extend(_get_code_entities(child)) return code_entity_list -def update_code_entity(script: jedi.Script, code_entity: Dict[str, any]) -> None: +def _update_code_entity(script: jedi.Script, code_entity: Dict[str, any]) -> None: """ Update a code_entity with (full_name, module_name, module_path) using Jedi """ @@ -42,35 +53,38 @@ def update_code_entity(script: jedi.Script, code_entity: Dict[str, any]) -> None code_entity["module_name"] = getattr(results[0], "module_name", None) code_entity["module_path"] = getattr(results[0], "module_path", None) - -def get_local_script_dependencies(script_path: str, repo_path: Optional[str] = None) -> List[str]: - """ - Extract and return a list of unique module paths that the script depends on. - """ - if repo_path: - sys.path.insert(0, str(Path(repo_path).resolve())) - +def _extract_dependencies(script_path: str) -> List[str]: with open(script_path, "r") as file: source_code = file.read() script = jedi.Script(code=source_code, path=script_path) tree = parso.parse(source_code) - code_entities = get_code_entities(tree) + code_entities = _get_code_entities(tree) for code_entity in code_entities: - update_code_entity(script, code_entity) + _update_code_entity(script, code_entity) module_paths = { entity.get("module_path") for entity in code_entities if entity.get("module_path") } - if repo_path: - repo_path_resolved = str(Path(repo_path).resolve(strict=False)) - module_paths = {path for path in module_paths if str(path).startswith(repo_path_resolved)} - return sorted(path for path in module_paths if path) + return sorted(str(path) for path in module_paths) + +def get_local_script_dependencies(script_path: str, repo_path: Optional[str] = None) -> List[str]: + """ + Extract and return a list of unique module paths that the script depends on. 
+ """ + if repo_path: + repo_path_resolved = str(Path(repo_path).resolve()) + with add_sys_path(repo_path_resolved): + dependencies = _extract_dependencies(script_path) + dependencies = [path for path in dependencies if path.startswith(repo_path_resolved)] + else: + dependencies = _extract_dependencies(script_path) + return dependencies if __name__ == "__main__": # Simple execution example, use absolute paths From 2be2b802c0ec3d69e7a900b092570d0e52359dd0 Mon Sep 17 00:00:00 2001 From: lxobr Date: Fri, 15 Nov 2024 17:59:51 +0100 Subject: [PATCH 15/29] feat: Safely handle file read errors --- cognee/tasks/repo_processor/local_script_dependencies.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cognee/tasks/repo_processor/local_script_dependencies.py b/cognee/tasks/repo_processor/local_script_dependencies.py index 4beb584f4..8acdbff0c 100644 --- a/cognee/tasks/repo_processor/local_script_dependencies.py +++ b/cognee/tasks/repo_processor/local_script_dependencies.py @@ -54,8 +54,12 @@ def _update_code_entity(script: jedi.Script, code_entity: Dict[str, any]) -> Non code_entity["module_path"] = getattr(results[0], "module_path", None) def _extract_dependencies(script_path: str) -> List[str]: - with open(script_path, "r") as file: - source_code = file.read() + try: + with open(script_path, "r") as file: + source_code = file.read() + except IOError as e: + print(f"Error opening {script_path}: {e}") + return [] script = jedi.Script(code=source_code, path=script_path) From 742792b6c10fe5c8ef88e1060ff04614baf27cde Mon Sep 17 00:00:00 2001 From: lxobr Date: Fri, 15 Nov 2024 18:02:35 +0100 Subject: [PATCH 16/29] refactor: Remove a comment --- cognee/tasks/repo_processor/local_script_dependencies.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cognee/tasks/repo_processor/local_script_dependencies.py b/cognee/tasks/repo_processor/local_script_dependencies.py index 8acdbff0c..bd856c7d5 100644 --- a/cognee/tasks/repo_processor/local_script_dependencies.py +++ b/cognee/tasks/repo_processor/local_script_dependencies.py @@ -26,7 +26,6 @@ def _get_code_entities(node: parso.tree.NodeOrLeaf) -> List[Dict[str, any]]: if not hasattr(node, 'children'): return code_entity_list - # Process nodes of type 'name', which correspond to code entities name_nodes = (child for child in node.children if child.type == 'name') for name_node in name_nodes: code_entity = { From 4bf2281cd543ea6eb867c696bfbba38848327fac Mon Sep 17 00:00:00 2001 From: lxobr Date: Fri, 15 Nov 2024 18:24:29 +0100 Subject: [PATCH 17/29] feat: Enable async processing --- .../repo_processor/local_script_dependencies.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/cognee/tasks/repo_processor/local_script_dependencies.py b/cognee/tasks/repo_processor/local_script_dependencies.py index bd856c7d5..17120a32a 100644 --- a/cognee/tasks/repo_processor/local_script_dependencies.py +++ b/cognee/tasks/repo_processor/local_script_dependencies.py @@ -1,5 +1,7 @@ from contextlib import contextmanager from typing import List, Dict, Optional +import asyncio +import aiofiles import jedi import parso import sys @@ -52,10 +54,10 @@ def _update_code_entity(script: jedi.Script, code_entity: Dict[str, any]) -> Non code_entity["module_name"] = getattr(results[0], "module_name", None) code_entity["module_path"] = getattr(results[0], "module_path", None) -def _extract_dependencies(script_path: str) -> List[str]: +async def _extract_dependencies(script_path: str) -> List[str]: try: - with open(script_path, 
"r") as file: - source_code = file.read() + async with aiofiles.open(script_path, "r") as file: + source_code = await file.read() except IOError as e: print(f"Error opening {script_path}: {e}") return [] @@ -76,17 +78,17 @@ def _extract_dependencies(script_path: str) -> List[str]: return sorted(str(path) for path in module_paths) -def get_local_script_dependencies(script_path: str, repo_path: Optional[str] = None) -> List[str]: +async def get_local_script_dependencies(script_path: str, repo_path: Optional[str] = None) -> List[str]: """ Extract and return a list of unique module paths that the script depends on. """ if repo_path: repo_path_resolved = str(Path(repo_path).resolve()) with add_sys_path(repo_path_resolved): - dependencies = _extract_dependencies(script_path) + dependencies = await _extract_dependencies(script_path) dependencies = [path for path in dependencies if path.startswith(repo_path_resolved)] else: - dependencies = _extract_dependencies(script_path) + dependencies = await _extract_dependencies(script_path) return dependencies if __name__ == "__main__": @@ -94,7 +96,8 @@ if __name__ == "__main__": script_path = ".../cognee/examples/python/simple_example.py" repo_path = ".../cognee" - dependencies = get_local_script_dependencies(script_path, repo_path) + dependencies = asyncio.run(get_local_script_dependencies(script_path, repo_path)) + print("Dependencies:") for dependency in dependencies: print(dependency) \ No newline at end of file From 3aadda9a890e9249acd70a4224db03d45bcbc179 Mon Sep 17 00:00:00 2001 From: lxobr Date: Mon, 18 Nov 2024 10:05:38 +0100 Subject: [PATCH 18/29] feat: Add argparse for testing purposes --- .../local_script_dependencies.py | 29 +++++++++++++------ 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/cognee/tasks/repo_processor/local_script_dependencies.py b/cognee/tasks/repo_processor/local_script_dependencies.py index 17120a32a..47839f8cf 100644 --- a/cognee/tasks/repo_processor/local_script_dependencies.py +++ b/cognee/tasks/repo_processor/local_script_dependencies.py @@ -1,14 +1,15 @@ -from contextlib import contextmanager -from typing import List, Dict, Optional +import argparse import asyncio +import sys +from contextlib import contextmanager +from pathlib import Path +from typing import List, Dict, Optional + import aiofiles import jedi import parso -import sys -from pathlib import Path from parso.tree import BaseNode - @contextmanager def add_sys_path(path): original_sys_path = sys.path.copy() @@ -91,13 +92,23 @@ async def get_local_script_dependencies(script_path: str, repo_path: Optional[st dependencies = await _extract_dependencies(script_path) return dependencies + if __name__ == "__main__": - # Simple execution example, use absolute paths - script_path = ".../cognee/examples/python/simple_example.py" - repo_path = ".../cognee" + parser = argparse.ArgumentParser(description="Get local script dependencies.") + + # Suggested path: .../cognee/examples/python/simple_example.py + parser.add_argument("script_path", type=str, help="Absolute path to the Python script file") + + # Suggested path: .../cognee + parser.add_argument("repo_path", type=str, help="Absolute path to the repository root") + + args = parser.parse_args() + + script_path = args.script_path + repo_path = args.repo_path dependencies = asyncio.run(get_local_script_dependencies(script_path, repo_path)) print("Dependencies:") for dependency in dependencies: - print(dependency) \ No newline at end of file + print(dependency) From 
From 1a1452e177960de279b92a09617e8ba50501f4ed Mon Sep 17 00:00:00 2001
From: lxobr
Date: Mon, 18 Nov 2024 16:16:59 +0100
Subject: [PATCH 19/29] fix: Add error handling for Jedi analysis, with debug mode

---
 .../repo_processor/local_script_dependencies.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/cognee/tasks/repo_processor/local_script_dependencies.py b/cognee/tasks/repo_processor/local_script_dependencies.py
index 47839f8cf..517a2d43d 100644
--- a/cognee/tasks/repo_processor/local_script_dependencies.py
+++ b/cognee/tasks/repo_processor/local_script_dependencies.py
@@ -49,11 +49,16 @@ def _update_code_entity(script: jedi.Script, code_entity: Dict[str, any]) -> Non
     """
     Update a code_entity with (full_name, module_name, module_path) using Jedi
     """
-    results = script.goto(code_entity["line"], code_entity["column"], follow_imports=True)
-    if results:
-        code_entity["full_name"] = getattr(results[0], "full_name", None)
-        code_entity["module_name"] = getattr(results[0], "module_name", None)
-        code_entity["module_path"] = getattr(results[0], "module_path", None)
+    try:
+        results = script.goto(code_entity["line"], code_entity["column"], follow_imports=True)
+        if results:
+            result = results[0]
+            code_entity["full_name"] = getattr(result, "full_name", None)
+            code_entity["module_name"] = getattr(result, "module_name", None)
+            code_entity["module_path"] = getattr(result, "module_path", None)
+    except Exception as e:
+        # logging.warning(f"Failed to analyze code entity {code_entity['name']}: {e}")
+        print(f"Failed to analyze code entity {code_entity['name']}: {e}")
 
 async def _extract_dependencies(script_path: str) -> List[str]:
     try:
@@ -63,6 +68,7 @@ async def _extract_dependencies(script_path: str) -> List[str]:
         print(f"Error opening {script_path}: {e}")
         return []
 
+    jedi.set_debug_function(lambda color, str_out: None)
     script = jedi.Script(code=source_code, path=script_path)
 
     tree = parso.parse(source_code)

From 2417d18607096622f4437c3299602c2d2dc5d4ba Mon Sep 17 00:00:00 2001
From: lxobr
Date: Tue, 19 Nov 2024 08:34:06 +0100
Subject: [PATCH 20/29] fix: Add logging instead of print

---
 cognee/tasks/repo_processor/__init__.py                  | 3 +++
 cognee/tasks/repo_processor/local_script_dependencies.py | 8 ++++++--
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/cognee/tasks/repo_processor/__init__.py b/cognee/tasks/repo_processor/__init__.py
index e69de29bb..94dab6b3f 100644
--- a/cognee/tasks/repo_processor/__init__.py
+++ b/cognee/tasks/repo_processor/__init__.py
@@ -0,0 +1,3 @@
+import logging
+
+logger = logging.getLogger("task:repo_processor")
diff --git a/cognee/tasks/repo_processor/local_script_dependencies.py b/cognee/tasks/repo_processor/local_script_dependencies.py
index 517a2d43d..18576b359 100644
--- a/cognee/tasks/repo_processor/local_script_dependencies.py
+++ b/cognee/tasks/repo_processor/local_script_dependencies.py
@@ -10,6 +10,9 @@ import jedi
 import parso
 from parso.tree import BaseNode
 
+from cognee.tasks.repo_processor import logger
+
+
 @contextmanager
 def add_sys_path(path):
     original_sys_path = sys.path.copy()
@@ -58,14 +61,15 @@ def _update_code_entity(script: jedi.Script, code_entity: Dict[str, any]) -> Non
         code_entity["module_path"] = getattr(result, "module_path", None)
     except Exception as e:
         # logging.warning(f"Failed to analyze code entity {code_entity['name']}: {e}")
-        print(f"Failed to analyze code entity {code_entity['name']}: {e}")
+        logger.error(f"Failed to analyze code entity {code_entity['name']}: {e}")
+
 
 async def 
_extract_dependencies(script_path: str) -> List[str]: try: async with aiofiles.open(script_path, "r") as file: source_code = await file.read() except IOError as e: - print(f"Error opening {script_path}: {e}") + logger.error(f"Error opening {script_path}: {e}") return [] jedi.set_debug_function(lambda color, str_out: None) From ebb811af8745f1ebaf9915d387beaf234094250e Mon Sep 17 00:00:00 2001 From: lxobr Date: Tue, 19 Nov 2024 09:08:54 +0100 Subject: [PATCH 21/29] fix: Filter out None values in module paths --- cognee/tasks/repo_processor/local_script_dependencies.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cognee/tasks/repo_processor/local_script_dependencies.py b/cognee/tasks/repo_processor/local_script_dependencies.py index 18576b359..a39680521 100644 --- a/cognee/tasks/repo_processor/local_script_dependencies.py +++ b/cognee/tasks/repo_processor/local_script_dependencies.py @@ -84,7 +84,7 @@ async def _extract_dependencies(script_path: str) -> List[str]: module_paths = { entity.get("module_path") for entity in code_entities - if entity.get("module_path") + if entity.get("module_path") is not None } return sorted(str(path) for path in module_paths) From 8bc26bba977470fbfff590bc75b836503358acaf Mon Sep 17 00:00:00 2001 From: lxobr Date: Tue, 19 Nov 2024 09:09:30 +0100 Subject: [PATCH 22/29] fix: Add error handling for path conversion --- .../tasks/repo_processor/local_script_dependencies.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/cognee/tasks/repo_processor/local_script_dependencies.py b/cognee/tasks/repo_processor/local_script_dependencies.py index a39680521..f0c3b246f 100644 --- a/cognee/tasks/repo_processor/local_script_dependencies.py +++ b/cognee/tasks/repo_processor/local_script_dependencies.py @@ -87,7 +87,15 @@ async def _extract_dependencies(script_path: str) -> List[str]: if entity.get("module_path") is not None } - return sorted(str(path) for path in module_paths) + str_paths = [] + for module_path in module_paths: + try: + str_paths.append(str(module_path)) + except Exception as e: + logger.error(f"Error converting path to string: {e}") + + return sorted(str_paths) + async def get_local_script_dependencies(script_path: str, repo_path: Optional[str] = None) -> List[str]: """ From 263ecb914951b22da5431ed4c4844508c21aceff Mon Sep 17 00:00:00 2001 From: lxobr Date: Tue, 19 Nov 2024 09:10:04 +0100 Subject: [PATCH 23/29] fix: Add input validation and error handling for paths --- .../local_script_dependencies.py | 29 ++++++++++++++----- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/cognee/tasks/repo_processor/local_script_dependencies.py b/cognee/tasks/repo_processor/local_script_dependencies.py index f0c3b246f..ed11640da 100644 --- a/cognee/tasks/repo_processor/local_script_dependencies.py +++ b/cognee/tasks/repo_processor/local_script_dependencies.py @@ -101,14 +101,29 @@ async def get_local_script_dependencies(script_path: str, repo_path: Optional[st """ Extract and return a list of unique module paths that the script depends on. 
""" - if repo_path: - repo_path_resolved = str(Path(repo_path).resolve()) - with add_sys_path(repo_path_resolved): - dependencies = await _extract_dependencies(script_path) - dependencies = [path for path in dependencies if path.startswith(repo_path_resolved)] - else: + try: + script_path = Path(script_path).resolve(strict=True) + except (FileNotFoundError, PermissionError) as e: + logger.error(f"Error resolving script path: {e}") + return [] + + if not repo_path: + return await _extract_dependencies(script_path) + + try: + repo_path = Path(repo_path).resolve(strict=True) + except (FileNotFoundError, PermissionError) as e: + logger.warning(f"Error resolving repo path: {e}. Proceeding without repo_path.") + return await _extract_dependencies(script_path) + + if not script_path.is_relative_to(repo_path): + logger.warning(f"Script {script_path} not in repo {repo_path}. Proceeding without repo_path.") + return await _extract_dependencies(script_path) + + with add_sys_path(str(repo_path)): dependencies = await _extract_dependencies(script_path) - return dependencies + + return [path for path in dependencies if path.startswith(str(repo_path))] if __name__ == "__main__": From f27dc0c91a1ee13c193adeedcbd202be25eb3d3f Mon Sep 17 00:00:00 2001 From: lxobr Date: Tue, 19 Nov 2024 14:33:08 +0100 Subject: [PATCH 24/29] fix: Rename, extract checker into a separate script --- .../code/get_local_dependencies_checker.py | 20 ++++++++++++++++++ ...endencies.py => get_local_dependencies.py} | 21 ------------------- 2 files changed, 20 insertions(+), 21 deletions(-) create mode 100644 cognee/tasks/code/get_local_dependencies_checker.py rename cognee/tasks/repo_processor/{local_script_dependencies.py => get_local_dependencies.py} (85%) diff --git a/cognee/tasks/code/get_local_dependencies_checker.py b/cognee/tasks/code/get_local_dependencies_checker.py new file mode 100644 index 000000000..5d465254a --- /dev/null +++ b/cognee/tasks/code/get_local_dependencies_checker.py @@ -0,0 +1,20 @@ +import argparse +import asyncio +from cognee.tasks.repo_processor.get_local_dependencies import get_local_script_dependencies + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Get local script dependencies.") + + # Suggested path: .../cognee/examples/python/simple_example.py + parser.add_argument("script_path", type=str, help="Absolute path to the Python script file") + + # Suggested path: .../cognee + parser.add_argument("repo_path", type=str, help="Absolute path to the repository root") + + args = parser.parse_args() + + dependencies = asyncio.run(get_local_script_dependencies(args.script_path, args.repo_path)) + + print("Dependencies:") + for dependency in dependencies: + print(dependency) diff --git a/cognee/tasks/repo_processor/local_script_dependencies.py b/cognee/tasks/repo_processor/get_local_dependencies.py similarity index 85% rename from cognee/tasks/repo_processor/local_script_dependencies.py rename to cognee/tasks/repo_processor/get_local_dependencies.py index ed11640da..fb4c68710 100644 --- a/cognee/tasks/repo_processor/local_script_dependencies.py +++ b/cognee/tasks/repo_processor/get_local_dependencies.py @@ -124,24 +124,3 @@ async def get_local_script_dependencies(script_path: str, repo_path: Optional[st dependencies = await _extract_dependencies(script_path) return [path for path in dependencies if path.startswith(str(repo_path))] - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Get local script dependencies.") - - # Suggested path: 
From f27dc0c91a1ee13c193adeedcbd202be25eb3d3f Mon Sep 17 00:00:00 2001
From: lxobr
Date: Tue, 19 Nov 2024 14:33:08 +0100
Subject: [PATCH 24/29] fix: Rename, extract checker into a separate script

---
 .../code/get_local_dependencies_checker.py    | 20 ++++++++++++++++++
 ...endencies.py => get_local_dependencies.py} | 21 -------------------
 2 files changed, 20 insertions(+), 21 deletions(-)
 create mode 100644 cognee/tasks/code/get_local_dependencies_checker.py
 rename cognee/tasks/repo_processor/{local_script_dependencies.py => get_local_dependencies.py} (85%)

diff --git a/cognee/tasks/code/get_local_dependencies_checker.py b/cognee/tasks/code/get_local_dependencies_checker.py
new file mode 100644
index 000000000..5d465254a
--- /dev/null
+++ b/cognee/tasks/code/get_local_dependencies_checker.py
@@ -0,0 +1,20 @@
+import argparse
+import asyncio
+from cognee.tasks.repo_processor.get_local_dependencies import get_local_script_dependencies
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Get local script dependencies.")
+
+    # Suggested path: .../cognee/examples/python/simple_example.py
+    parser.add_argument("script_path", type=str, help="Absolute path to the Python script file")
+
+    # Suggested path: .../cognee
+    parser.add_argument("repo_path", type=str, help="Absolute path to the repository root")
+
+    args = parser.parse_args()
+
+    dependencies = asyncio.run(get_local_script_dependencies(args.script_path, args.repo_path))
+
+    print("Dependencies:")
+    for dependency in dependencies:
+        print(dependency)
diff --git a/cognee/tasks/repo_processor/local_script_dependencies.py b/cognee/tasks/repo_processor/get_local_dependencies.py
similarity index 85%
rename from cognee/tasks/repo_processor/local_script_dependencies.py
rename to cognee/tasks/repo_processor/get_local_dependencies.py
index ed11640da..fb4c68710 100644
--- a/cognee/tasks/repo_processor/local_script_dependencies.py
+++ b/cognee/tasks/repo_processor/get_local_dependencies.py
@@ -124,24 +124,3 @@ async def get_local_script_dependencies(script_path: str, repo_path: Optional[st
         dependencies = await _extract_dependencies(script_path)
 
     return [path for path in dependencies if path.startswith(str(repo_path))]
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Get local script dependencies.")
-
-    # Suggested path: .../cognee/examples/python/simple_example.py
-    parser.add_argument("script_path", type=str, help="Absolute path to the Python script file")
-
-    # Suggested path: .../cognee
-    parser.add_argument("repo_path", type=str, help="Absolute path to the repository root")
-
-    args = parser.parse_args()
-
-    script_path = args.script_path
-    repo_path = args.repo_path
-
-    dependencies = asyncio.run(get_local_script_dependencies(script_path, repo_path))
-
-    print("Dependencies:")
-    for dependency in dependencies:
-        print(dependency)

From 15b7b8ef2b10316c1e28799e35fa2cb60e911e0b Mon Sep 17 00:00:00 2001
From: Igor Ilic
Date: Wed, 20 Nov 2024 14:54:35 +0100
Subject: [PATCH 25/29] fix: Resolve issue with table names in SQL commands

Some SQL commands require lowercase characters in table names unless the table name is wrapped in quotes. Renamed all new tables to use lowercase.

Fix COG-677
---
 cognee/infrastructure/engine/models/DataPoint.py | 1 +
 cognee/modules/chunking/models/DocumentChunk.py  | 1 +
 cognee/modules/engine/models/Entity.py           | 1 +
 cognee/modules/engine/models/EntityType.py       | 1 +
 cognee/tasks/chunks/query_chunks.py              | 2 +-
 cognee/tasks/graph/query_graph_connections.py    | 4 ++--
 cognee/tasks/storage/index_data_points.py        | 4 ++--
 cognee/tasks/summarization/models.py             | 1 +
 cognee/tasks/summarization/query_summaries.py    | 2 +-
 cognee/tests/test_pgvector.py                    | 2 +-
 10 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/cognee/infrastructure/engine/models/DataPoint.py b/cognee/infrastructure/engine/models/DataPoint.py
index 337306cb6..f8ea1c9f0 100644
--- a/cognee/infrastructure/engine/models/DataPoint.py
+++ b/cognee/infrastructure/engine/models/DataPoint.py
@@ -8,6 +8,7 @@ class MetaData(TypedDict):
     index_fields: list[str]
 
 class DataPoint(BaseModel):
+    __tablename__ = "data_point"
     id: UUID = Field(default_factory = uuid4)
     updated_at: Optional[datetime] = datetime.now(timezone.utc)
     _metadata: Optional[MetaData] = {
diff --git a/cognee/modules/chunking/models/DocumentChunk.py b/cognee/modules/chunking/models/DocumentChunk.py
index 975edb27e..b5b1cef94 100644
--- a/cognee/modules/chunking/models/DocumentChunk.py
+++ b/cognee/modules/chunking/models/DocumentChunk.py
@@ -3,6 +3,7 @@ from cognee.infrastructure.engine import DataPoint
 from cognee.modules.data.processing.document_types import Document
 
 class DocumentChunk(DataPoint):
+    __tablename__ = "document_chunk"
     text: str
     word_count: int
     chunk_index: int
diff --git a/cognee/modules/engine/models/Entity.py b/cognee/modules/engine/models/Entity.py
index c43774e38..cf946ceb6 100644
--- a/cognee/modules/engine/models/Entity.py
+++ b/cognee/modules/engine/models/Entity.py
@@ -3,6 +3,7 @@ from cognee.modules.chunking.models.DocumentChunk import DocumentChunk
 from .EntityType import EntityType
 
 class Entity(DataPoint):
+    __tablename__ = "entity"
     name: str
     is_a: EntityType
     description: str
diff --git a/cognee/modules/engine/models/EntityType.py b/cognee/modules/engine/models/EntityType.py
index b4f495857..56092f261 100644
--- a/cognee/modules/engine/models/EntityType.py
+++ b/cognee/modules/engine/models/EntityType.py
@@ -2,6 +2,7 @@ from cognee.infrastructure.engine import DataPoint
 from cognee.modules.chunking.models.DocumentChunk import DocumentChunk
 
 class EntityType(DataPoint):
+    __tablename__ = "entity_type"
     name: str
     type: str
     description: str
diff --git a/cognee/tasks/chunks/query_chunks.py b/cognee/tasks/chunks/query_chunks.py
index 93f32a640..399528ee9 100644
--- a/cognee/tasks/chunks/query_chunks.py
+++ b/cognee/tasks/chunks/query_chunks.py
@@ -10,7 
+10,7 @@ async def query_chunks(query: str) -> list[dict]: """ vector_engine = get_vector_engine() - found_chunks = await vector_engine.search("DocumentChunk_text", query, limit = 5) + found_chunks = await vector_engine.search("document_chunk_text", query, limit = 5) chunks = [result.payload for result in found_chunks] diff --git a/cognee/tasks/graph/query_graph_connections.py b/cognee/tasks/graph/query_graph_connections.py index cd4d76a5e..4020ddd13 100644 --- a/cognee/tasks/graph/query_graph_connections.py +++ b/cognee/tasks/graph/query_graph_connections.py @@ -27,8 +27,8 @@ async def query_graph_connections(query: str, exploration_levels = 1) -> list[(s else: vector_engine = get_vector_engine() results = await asyncio.gather( - vector_engine.search("Entity_name", query_text = query, limit = 5), - vector_engine.search("EntityType_name", query_text = query, limit = 5), + vector_engine.search("entity_name", query_text = query, limit = 5), + vector_engine.search("entity_type_name", query_text = query, limit = 5), ) results = [*results[0], *results[1]] relevant_results = [result for result in results if result.score < 0.5][:5] diff --git a/cognee/tasks/storage/index_data_points.py b/cognee/tasks/storage/index_data_points.py index dc74d705d..12903173a 100644 --- a/cognee/tasks/storage/index_data_points.py +++ b/cognee/tasks/storage/index_data_points.py @@ -16,10 +16,10 @@ async def index_data_points(data_points: list[DataPoint]): data_point_type = type(data_point) for field_name in data_point._metadata["index_fields"]: - index_name = f"{data_point_type.__name__}.{field_name}" + index_name = f"{data_point_type.__tablename__}.{field_name}" if index_name not in created_indexes: - await vector_engine.create_vector_index(data_point_type.__name__, field_name) + await vector_engine.create_vector_index(data_point_type.__tablename__, field_name) created_indexes[index_name] = True if index_name not in index_points: diff --git a/cognee/tasks/summarization/models.py b/cognee/tasks/summarization/models.py index c6a932b37..955c0e2fa 100644 --- a/cognee/tasks/summarization/models.py +++ b/cognee/tasks/summarization/models.py @@ -3,6 +3,7 @@ from cognee.modules.chunking.models.DocumentChunk import DocumentChunk from cognee.modules.data.processing.document_types import Document class TextSummary(DataPoint): + __tablename__ = "text_summary" text: str made_from: DocumentChunk diff --git a/cognee/tasks/summarization/query_summaries.py b/cognee/tasks/summarization/query_summaries.py index 896839143..d9ec0fa00 100644 --- a/cognee/tasks/summarization/query_summaries.py +++ b/cognee/tasks/summarization/query_summaries.py @@ -10,7 +10,7 @@ async def query_summaries(query: str) -> list: """ vector_engine = get_vector_engine() - summaries_results = await vector_engine.search("TextSummary_text", query, limit = 5) + summaries_results = await vector_engine.search("text_summary_text", query, limit = 5) summaries = [summary.payload for summary in summaries_results] diff --git a/cognee/tests/test_pgvector.py b/cognee/tests/test_pgvector.py index b5a6fc446..1466e195f 100644 --- a/cognee/tests/test_pgvector.py +++ b/cognee/tests/test_pgvector.py @@ -65,7 +65,7 @@ async def main(): from cognee.infrastructure.databases.vector import get_vector_engine vector_engine = get_vector_engine() - random_node = (await vector_engine.search("Entity_name", "Quantum computer"))[0] + random_node = (await vector_engine.search("entity_name", "Quantum computer"))[0] random_node_name = random_node.payload["text"] search_results = await 
cognee.search(SearchType.INSIGHTS, query_text = random_node_name)

From e4d00403ba5f0c23b467798a5ce936bed53d11ac Mon Sep 17 00:00:00 2001
From: Igor Ilic
Date: Wed, 20 Nov 2024 15:02:22 +0100
Subject: [PATCH 26/29] fix: Update table names in tests

Update table names in tests to accommodate the recent fix

Fix COG-677
---
 cognee/tests/test_library.py  | 2 +-
 cognee/tests/test_neo4j.py    | 2 +-
 cognee/tests/test_qdrant.py   | 2 +-
 cognee/tests/test_weaviate.py | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/cognee/tests/test_library.py b/cognee/tests/test_library.py
index 88c9cdc7b..66d218c3b 100755
--- a/cognee/tests/test_library.py
+++ b/cognee/tests/test_library.py
@@ -32,7 +32,7 @@ async def main():
     from cognee.infrastructure.databases.vector import get_vector_engine
     vector_engine = get_vector_engine()
 
-    random_node = (await vector_engine.search("Entity_name", "AI"))[0]
+    random_node = (await vector_engine.search("entity_name", "AI"))[0]
     random_node_name = random_node.payload["text"]
 
     search_results = await cognee.search(SearchType.INSIGHTS, query_text = random_node_name)
diff --git a/cognee/tests/test_neo4j.py b/cognee/tests/test_neo4j.py
index 44bacd9b5..756b29cc4 100644
--- a/cognee/tests/test_neo4j.py
+++ b/cognee/tests/test_neo4j.py
@@ -36,7 +36,7 @@ async def main():
     from cognee.infrastructure.databases.vector import get_vector_engine
     vector_engine = get_vector_engine()
 
-    random_node = (await vector_engine.search("Entity_name", "Quantum computer"))[0]
+    random_node = (await vector_engine.search("entity_name", "Quantum computer"))[0]
     random_node_name = random_node.payload["text"]
 
     search_results = await cognee.search(SearchType.INSIGHTS, query_text = random_node_name)
diff --git a/cognee/tests/test_qdrant.py b/cognee/tests/test_qdrant.py
index 8ca525f0c..680399e60 100644
--- a/cognee/tests/test_qdrant.py
+++ b/cognee/tests/test_qdrant.py
@@ -37,7 +37,7 @@ async def main():
     from cognee.infrastructure.databases.vector import get_vector_engine
     vector_engine = get_vector_engine()
 
-    random_node = (await vector_engine.search("Entity_name", "Quantum computer"))[0]
+    random_node = (await vector_engine.search("entity_name", "Quantum computer"))[0]
     random_node_name = random_node.payload["text"]
 
     search_results = await cognee.search(SearchType.INSIGHTS, query_text = random_node_name)
diff --git a/cognee/tests/test_weaviate.py b/cognee/tests/test_weaviate.py
index cdb4b9349..c93dc036a 100644
--- a/cognee/tests/test_weaviate.py
+++ b/cognee/tests/test_weaviate.py
@@ -35,7 +35,7 @@ async def main():
     from cognee.infrastructure.databases.vector import get_vector_engine
     vector_engine = get_vector_engine()
 
-    random_node = (await vector_engine.search("Entity_name", "Quantum computer"))[0]
+    random_node = (await vector_engine.search("entity_name", "Quantum computer"))[0]
     random_node_name = random_node.payload["text"]
 
     search_results = await cognee.search(SearchType.INSIGHTS, query_text = random_node_name)

From 70fe6ac54120d05b4947f515b52a10539412be5f Mon Sep 17 00:00:00 2001
From: Igor Ilic
Date: Wed, 20 Nov 2024 15:07:38 +0100
Subject: [PATCH 27/29] fix: Update table name in notebook

Update table name to use latest in notebook

Fix COG-677
---
 notebooks/cognee_demo.ipynb | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/notebooks/cognee_demo.ipynb b/notebooks/cognee_demo.ipynb
index 45f5a618c..3246a5246 100644
--- a/notebooks/cognee_demo.ipynb
+++ b/notebooks/cognee_demo.ipynb
@@ -758,7 +758,7 @@
    "from cognee.infrastructure.databases.vector import get_vector_engine\n",
"vector_engine = get_vector_engine()\n", - "results = await search(vector_engine, \"Entity_name\", \"sarah.nguyen@example.com\")\n", + "results = await search(vector_engine, \"entity_name\", \"sarah.nguyen@example.com\")\n", "for result in results:\n", " print(result)" ] @@ -881,7 +881,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.8" + "version": "3.9.6" } }, "nbformat": 4, From f9353d25faf91407cb823ebf441325bfdc6ebffc Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 20 Nov 2024 15:14:38 +0100 Subject: [PATCH 28/29] fix: Update table name in notebook Update table name in notebook Fix COG-677 --- notebooks/cognee_demo.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/cognee_demo.ipynb b/notebooks/cognee_demo.ipynb index 3246a5246..67bb4e07f 100644 --- a/notebooks/cognee_demo.ipynb +++ b/notebooks/cognee_demo.ipynb @@ -788,7 +788,7 @@ "source": [ "from cognee.api.v1.search import SearchType\n", "\n", - "node = (await vector_engine.search(\"Entity_name\", \"sarah.nguyen@example.com\"))[0]\n", + "node = (await vector_engine.search(\"entity_name\", \"sarah.nguyen@example.com\"))[0]\n", "node_name = node.payload[\"text\"]\n", "\n", "search_results = await cognee.search(SearchType.SUMMARIES, query_text = node_name)\n", From e1d8f3ea8658713ee549cdc215fd2a8bcc34d1bd Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Wed, 20 Nov 2024 16:02:15 +0100 Subject: [PATCH 29/29] use acreate_structured_output instead of create_structured_output in eval script --- evals/eval_swe_bench.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/evals/eval_swe_bench.py b/evals/eval_swe_bench.py index e5b8643c1..ec93bda07 100644 --- a/evals/eval_swe_bench.py +++ b/evals/eval_swe_bench.py @@ -42,7 +42,7 @@ async def generate_patch_with_cognee(instance, search_type=SearchType.CHUNKS): ]) llm_client = get_llm_client() - answer_prediction = llm_client.create_structured_output( + answer_prediction = await llm_client.acreate_structured_output( text_input=problem_statement, system_prompt=prompt, response_model=str, @@ -55,7 +55,7 @@ async def generate_patch_without_cognee(instance): prompt = instance["text"] llm_client = get_llm_client() - answer_prediction = llm_client.create_structured_output( + answer_prediction = await llm_client.acreate_structured_output( text_input=problem_statement, system_prompt=prompt, response_model=str, @@ -88,10 +88,7 @@ async def main(): dataset_name = 'princeton-nlp/SWE-bench_Lite_bm25_13K' dataset = load_swebench_dataset(dataset_name, split='test') predictions_path = "preds_nocognee.json" - if Path(predictions_path).exists(): - with open(predictions_path, "r") as file: - preds = json.load(file) - else: + if not Path(predictions_path).exists(): preds = await get_preds(dataset, with_cognee=False) with open(predictions_path, "w") as file: json.dump(preds, file) @@ -106,6 +103,8 @@ async def main(): dataset = download_instances(swe_dataset, filepath) predictions_path = "preds.json" preds = await get_preds(dataset, with_cognee=not args.cognee_off) + with open(predictions_path, "w") as file: + json.dump(preds, file) subprocess.run(["python", "-m", "swebench.harness.run_evaluation", "--dataset_name", dataset_name,