From 4ca1de266e344ec5b914f80407760b72ebfc76b3 Mon Sep 17 00:00:00 2001 From: Hassan <261925524@formanite.fccollege.edu.pk> Date: Thu, 31 Jul 2025 05:15:41 -0700 Subject: [PATCH 01/17] feat/configurable-path-exclusion --- cognee/api/v1/cognify/code_graph_pipeline.py | 20 +++- .../get_repo_file_dependencies.py | 106 ++++++++---------- cognee/tests/test_repo_processor.py | 45 ++++++++ 3 files changed, 109 insertions(+), 62 deletions(-) create mode 100644 cognee/tests/test_repo_processor.py diff --git a/cognee/api/v1/cognify/code_graph_pipeline.py b/cognee/api/v1/cognify/code_graph_pipeline.py index 00a0d3dc9..d7faab6b5 100644 --- a/cognee/api/v1/cognify/code_graph_pipeline.py +++ b/cognee/api/v1/cognify/code_graph_pipeline.py @@ -28,7 +28,7 @@ logger = get_logger("code_graph_pipeline") @observe -async def run_code_graph_pipeline(repo_path, include_docs=False): +async def run_code_graph_pipeline(repo_path, include_docs=False, excluded_paths=None): import cognee from cognee.low_level import setup @@ -40,14 +40,25 @@ async def run_code_graph_pipeline(repo_path, include_docs=False): user = await get_default_user() detailed_extraction = True + # Default exclusion patterns + if excluded_paths is None: + excluded_paths = [ + ".venv/", "venv/", "__pycache__/", ".pytest_cache/", + "build/", "dist/", "node_modules/", ".npm/", ".git/", + ".svn/", ".idea/", ".vscode/", "tmp/", "temp/", + "*.pyc", "*.pyo", "*.log", "*.tmp" + ] + tasks = [ - Task(get_repo_file_dependencies, detailed_extraction=detailed_extraction), - # Task(summarize_code, task_config={"batch_size": 500}), # This task takes a long time to complete + Task( + get_repo_file_dependencies, + detailed_extraction=detailed_extraction, + excluded_paths=excluded_paths + ), Task(add_data_points, task_config={"batch_size": 30}), ] if include_docs: - # This tasks take a long time to complete non_code_tasks = [ Task(get_non_py_files, task_config={"batch_size": 50}), Task(ingest_data, dataset_name="repo_docs", user=user), @@ -67,7 +78,6 @@ async def run_code_graph_pipeline(repo_path, include_docs=False): dataset_name = "codebase" - # Save dataset to database db_engine = get_relational_engine() async with db_engine.get_async_session() as session: dataset = await create_dataset(dataset_name, user, session) diff --git a/cognee/tasks/repo_processor/get_repo_file_dependencies.py b/cognee/tasks/repo_processor/get_repo_file_dependencies.py index 232850936..2567a44cd 100644 --- a/cognee/tasks/repo_processor/get_repo_file_dependencies.py +++ b/cognee/tasks/repo_processor/get_repo_file_dependencies.py @@ -1,56 +1,68 @@ import asyncio import math import os - -# from concurrent.futures import ProcessPoolExecutor -from typing import AsyncGenerator +import fnmatch +from typing import AsyncGenerator, Optional, List from uuid import NAMESPACE_OID, uuid5 from cognee.infrastructure.engine import DataPoint from cognee.shared.CodeGraphEntities import CodeFile, Repository -async def get_source_code_files(repo_path): +async def get_source_code_files(repo_path: str, excluded_paths: Optional[List[str]] = None): """ - Retrieve Python source code files from the specified repository path. - - This function scans the given repository path for files that have the .py extension - while excluding test files and files within a virtual environment. It returns a list of - absolute paths to the source code files that are not empty. + Retrieve Python source code files from the specified repository path, + excluding paths and file patterns commonly irrelevant to code analysis. 
Parameters: ----------- - - - repo_path: The file path to the repository to search for Python source files. + - repo_path: Root path of the repository to search + - excluded_paths: Optional list of path fragments or glob patterns to exclude Returns: -------- - - A list of absolute paths to .py files that contain source code, excluding empty - files, test files, and files from a virtual environment. + List of absolute file paths for .py files, excluding test files, + empty files, and files under ignored directories or matching ignore patterns. """ - if not os.path.exists(repo_path): - return {} - py_files_paths = ( - os.path.join(root, file) - for root, _, files in os.walk(repo_path) - for file in files - if ( - file.endswith(".py") - and not file.startswith("test_") - and not file.endswith("_test") - and ".venv" not in file - ) - ) + if not os.path.exists(repo_path): + return [] + + # Default exclusions + default_excluded_patterns = [ + ".venv/", "venv/", "__pycache__/", ".pytest_cache/", "build/", "dist/", + "node_modules/", ".npm/", ".git/", ".svn/", ".idea/", ".vscode/", "tmp/", "temp/", + "*.pyc", "*.pyo", "*.log", "*.tmp" + ] + + excluded_patterns = default_excluded_patterns + (excluded_paths or []) + + py_files_paths = [] + for root, _, files in os.walk(repo_path): + for file in files: + full_path = os.path.join(root, file) + rel_path = os.path.relpath(full_path, repo_path) + + # Check for exclusion + should_exclude = any( + pattern in rel_path or fnmatch.fnmatch(rel_path, pattern) + for pattern in excluded_patterns + ) + if should_exclude: + continue + + if ( + file.endswith(".py") + and not file.startswith("test_") + and not file.endswith("_test") + ): + py_files_paths.append(full_path) source_code_files = set() for file_path in py_files_paths: file_path = os.path.abspath(file_path) - if os.path.getsize(file_path) == 0: continue - source_code_files.add(file_path) return list(source_code_files) @@ -62,20 +74,7 @@ def run_coroutine(coroutine_func, *args, **kwargs): This function creates a new asyncio event loop, sets it as the current loop, and executes the given coroutine function with the provided arguments. Once the coroutine - completes, the loop is closed. Intended for use in environments where an existing event - loop is not available or desirable. - - Parameters: - ----------- - - - coroutine_func: The coroutine function to be run. - - *args: Positional arguments to pass to the coroutine function. - - **kwargs: Keyword arguments to pass to the coroutine function. - - Returns: - -------- - - The result returned by the coroutine after completion. + completes, the loop is closed. """ loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) @@ -85,28 +84,24 @@ def run_coroutine(coroutine_func, *args, **kwargs): async def get_repo_file_dependencies( - repo_path: str, detailed_extraction: bool = False + repo_path: str, + detailed_extraction: bool = False, + excluded_paths: Optional[List[str]] = None ) -> AsyncGenerator[DataPoint, None]: """ Generate a dependency graph for Python files in the given repository path. - Check the validity of the repository path and yield a repository object followed by the - dependencies of Python files within that repository. Raise a FileNotFoundError if the - provided path does not exist. The extraction of detailed dependencies can be controlled - via the `detailed_extraction` argument. - Parameters: ----------- - - - repo_path (str): The file path to the repository where Python files are located. 
- - detailed_extraction (bool): A flag indicating whether to perform a detailed - extraction of dependencies (default is False). (default False) + - repo_path: Path to local repository + - detailed_extraction: Whether to extract fine-grained dependencies + - excluded_paths: Optional custom exclusion list """ if not os.path.exists(repo_path): raise FileNotFoundError(f"Repository path {repo_path} does not exist.") - source_code_files = await get_source_code_files(repo_path) + source_code_files = await get_source_code_files(repo_path, excluded_paths=excluded_paths) repo = Repository( id=uuid5(NAMESPACE_OID, repo_path), @@ -125,11 +120,9 @@ async def get_repo_file_dependencies( for chunk_number in range(number_of_chunks) ] - # Codegraph dependencies are not installed by default, so we import where we use them. from cognee.tasks.repo_processor.get_local_dependencies import get_local_script_dependencies for start_range, end_range in chunk_ranges: - # with ProcessPoolExecutor(max_workers=12) as executor: tasks = [ get_local_script_dependencies(repo_path, file_path, detailed_extraction) for file_path in source_code_files[start_range : end_range + 1] @@ -139,5 +132,4 @@ async def get_repo_file_dependencies( for source_code_file in results: source_code_file.part_of = repo - yield source_code_file diff --git a/cognee/tests/test_repo_processor.py b/cognee/tests/test_repo_processor.py new file mode 100644 index 000000000..4de102da6 --- /dev/null +++ b/cognee/tests/test_repo_processor.py @@ -0,0 +1,45 @@ +import os +import shutil +import tempfile +from cognee.tasks.repo_processor.code_graph_repo import get_source_code_files + +def test_get_source_code_files_excludes_common_dirs_and_files(): + # Create a temporary test directory + test_repo = tempfile.mkdtemp() + + # Create files and folders to include/exclude + included_file = os.path.join(test_repo, "main.py") + excluded_dirs = [".venv", "node_modules", "__pycache__", ".git"] + excluded_files = ["ignore.pyc", "temp.log", "junk.tmp"] + + # Create included file + with open(included_file, "w") as f: + f.write("print('Hello world')") + + # Create excluded directories and files inside them + for folder in excluded_dirs: + folder_path = os.path.join(test_repo, folder) + os.makedirs(folder_path) + file_path = os.path.join(folder_path, "ignored.js") + with open(file_path, "w") as f: + f.write("// ignore this") + + # Create excluded files in root + for file_name in excluded_files: + file_path = os.path.join(test_repo, file_name) + with open(file_path, "w") as f: + f.write("dummy") + + # Run function + results = get_source_code_files(test_repo) + + # Assert only included file is present + assert included_file in results + for root, dirs, files in os.walk(test_repo): + for name in files: + full_path = os.path.join(root, name) + if full_path != included_file: + assert full_path not in results, f"{full_path} should have been excluded" + + # Cleanup + shutil.rmtree(test_repo) From c898895f2229f851127a977411abb6b9cc6a4f74 Mon Sep 17 00:00:00 2001 From: Hassan <261925524@formanite.fccollege.edu.pk> Date: Thu, 31 Jul 2025 07:00:11 -0700 Subject: [PATCH 02/17] feat/configurable-path-exclusion --- cognee/tests/test_repo_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cognee/tests/test_repo_processor.py b/cognee/tests/test_repo_processor.py index 4de102da6..fc3c26b05 100644 --- a/cognee/tests/test_repo_processor.py +++ b/cognee/tests/test_repo_processor.py @@ -1,7 +1,7 @@ import os import shutil import tempfile -from 
cognee.tasks.repo_processor.code_graph_repo import get_source_code_files +from cognee.tasks.repo_processor.get_repo_file_dependencies import get_source_code_files def test_get_source_code_files_excludes_common_dirs_and_files(): # Create a temporary test directory From 8f26a01b3ab744a818bfeaeae932a41921f92ccc Mon Sep 17 00:00:00 2001 From: Hassan <261925524@formanite.fccollege.edu.pk> Date: Sat, 2 Aug 2025 10:33:07 -0700 Subject: [PATCH 03/17] style: run ruff format and fix lint issues --- cognee/api/v1/cognify/code_graph_pipeline.py | 24 +++++++++++--- .../get_repo_file_dependencies.py | 31 ++++++++++++------- cognee/tests/test_repo_processor.py | 1 + 3 files changed, 40 insertions(+), 16 deletions(-) diff --git a/cognee/api/v1/cognify/code_graph_pipeline.py b/cognee/api/v1/cognify/code_graph_pipeline.py index d7faab6b5..ae1c8b0ac 100644 --- a/cognee/api/v1/cognify/code_graph_pipeline.py +++ b/cognee/api/v1/cognify/code_graph_pipeline.py @@ -43,17 +43,31 @@ async def run_code_graph_pipeline(repo_path, include_docs=False, excluded_paths= # Default exclusion patterns if excluded_paths is None: excluded_paths = [ - ".venv/", "venv/", "__pycache__/", ".pytest_cache/", - "build/", "dist/", "node_modules/", ".npm/", ".git/", - ".svn/", ".idea/", ".vscode/", "tmp/", "temp/", - "*.pyc", "*.pyo", "*.log", "*.tmp" + ".venv/", + "venv/", + "__pycache__/", + ".pytest_cache/", + "build/", + "dist/", + "node_modules/", + ".npm/", + ".git/", + ".svn/", + ".idea/", + ".vscode/", + "tmp/", + "temp/", + "*.pyc", + "*.pyo", + "*.log", + "*.tmp", ] tasks = [ Task( get_repo_file_dependencies, detailed_extraction=detailed_extraction, - excluded_paths=excluded_paths + excluded_paths=excluded_paths, ), Task(add_data_points, task_config={"batch_size": 30}), ] diff --git a/cognee/tasks/repo_processor/get_repo_file_dependencies.py b/cognee/tasks/repo_processor/get_repo_file_dependencies.py index 2567a44cd..f1435a9e2 100644 --- a/cognee/tasks/repo_processor/get_repo_file_dependencies.py +++ b/cognee/tasks/repo_processor/get_repo_file_dependencies.py @@ -30,9 +30,24 @@ async def get_source_code_files(repo_path: str, excluded_paths: Optional[List[st # Default exclusions default_excluded_patterns = [ - ".venv/", "venv/", "__pycache__/", ".pytest_cache/", "build/", "dist/", - "node_modules/", ".npm/", ".git/", ".svn/", ".idea/", ".vscode/", "tmp/", "temp/", - "*.pyc", "*.pyo", "*.log", "*.tmp" + ".venv/", + "venv/", + "__pycache__/", + ".pytest_cache/", + "build/", + "dist/", + "node_modules/", + ".npm/", + ".git/", + ".svn/", + ".idea/", + ".vscode/", + "tmp/", + "temp/", + "*.pyc", + "*.pyo", + "*.log", + "*.tmp", ] excluded_patterns = default_excluded_patterns + (excluded_paths or []) @@ -51,11 +66,7 @@ async def get_source_code_files(repo_path: str, excluded_paths: Optional[List[st if should_exclude: continue - if ( - file.endswith(".py") - and not file.startswith("test_") - and not file.endswith("_test") - ): + if file.endswith(".py") and not file.startswith("test_") and not file.endswith("_test"): py_files_paths.append(full_path) source_code_files = set() @@ -84,9 +95,7 @@ def run_coroutine(coroutine_func, *args, **kwargs): async def get_repo_file_dependencies( - repo_path: str, - detailed_extraction: bool = False, - excluded_paths: Optional[List[str]] = None + repo_path: str, detailed_extraction: bool = False, excluded_paths: Optional[List[str]] = None ) -> AsyncGenerator[DataPoint, None]: """ Generate a dependency graph for Python files in the given repository path. 
diff --git a/cognee/tests/test_repo_processor.py b/cognee/tests/test_repo_processor.py index fc3c26b05..2d5868f36 100644 --- a/cognee/tests/test_repo_processor.py +++ b/cognee/tests/test_repo_processor.py @@ -3,6 +3,7 @@ import shutil import tempfile from cognee.tasks.repo_processor.get_repo_file_dependencies import get_source_code_files + def test_get_source_code_files_excludes_common_dirs_and_files(): # Create a temporary test directory test_repo = tempfile.mkdtemp() From ac87e62adb55803cc2335889b21bcc3777d3d833 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 28 Aug 2025 10:52:08 +0200 Subject: [PATCH 04/17] feat: Save search flag progress --- .../modules/retrieval/completion_retriever.py | 17 ++++++++++++-- ..._completion_context_extension_retriever.py | 13 ++++++++++- .../graph_completion_cot_retriever.py | 15 +++++++++++-- .../retrieval/graph_completion_retriever.py | 12 +++++++++- cognee/modules/retrieval/utils/completion.py | 22 +++++++++++++------ cognee/modules/search/methods/search.py | 7 +++++- 6 files changed, 72 insertions(+), 14 deletions(-) diff --git a/cognee/modules/retrieval/completion_retriever.py b/cognee/modules/retrieval/completion_retriever.py index 655a9010d..e9c8331a1 100644 --- a/cognee/modules/retrieval/completion_retriever.py +++ b/cognee/modules/retrieval/completion_retriever.py @@ -65,7 +65,14 @@ class CompletionRetriever(BaseRetriever): logger.error("DocumentChunk_text collection not found") raise NoDataError("No data found in the system, please add data first.") from error - async def get_completion(self, query: str, context: Optional[Any] = None) -> Any: + async def get_completion( + self, + query: str, + context: Optional[Any] = None, + user_prompt: str = None, + system_prompt: str = None, + only_context: bool = False, + ) -> Any: """ Generates an LLM completion using the context. 
@@ -88,6 +95,12 @@ class CompletionRetriever(BaseRetriever): context = await self.get_context(query) completion = await generate_completion( - query, context, self.user_prompt_path, self.system_prompt_path + query=query, + context=context, + user_prompt_path=self.user_prompt_path, + system_prompt_path=self.system_prompt_path, + user_prompt=user_prompt, + system_prompt=system_prompt, + only_context=only_context, ) return [completion] diff --git a/cognee/modules/retrieval/graph_completion_context_extension_retriever.py b/cognee/modules/retrieval/graph_completion_context_extension_retriever.py index d05e6b4fa..f25edb4a7 100644 --- a/cognee/modules/retrieval/graph_completion_context_extension_retriever.py +++ b/cognee/modules/retrieval/graph_completion_context_extension_retriever.py @@ -41,7 +41,13 @@ class GraphCompletionContextExtensionRetriever(GraphCompletionRetriever): ) async def get_completion( - self, query: str, context: Optional[Any] = None, context_extension_rounds=4 + self, + query: str, + context: Optional[Any] = None, + user_prompt: str = None, + system_prompt: str = None, + only_context: bool = False, + context_extension_rounds=4, ) -> List[str]: """ Extends the context for a given query by retrieving related triplets and generating new @@ -86,6 +92,8 @@ class GraphCompletionContextExtensionRetriever(GraphCompletionRetriever): context=context, user_prompt_path=self.user_prompt_path, system_prompt_path=self.system_prompt_path, + user_prompt=user_prompt, + system_prompt=system_prompt, ) triplets += await self.get_triplets(completion) @@ -112,6 +120,9 @@ class GraphCompletionContextExtensionRetriever(GraphCompletionRetriever): context=context, user_prompt_path=self.user_prompt_path, system_prompt_path=self.system_prompt_path, + user_prompt=user_prompt, + system_prompt=system_prompt, + only_context=only_context, ) if self.save_interaction and context and triplets and completion: diff --git a/cognee/modules/retrieval/graph_completion_cot_retriever.py b/cognee/modules/retrieval/graph_completion_cot_retriever.py index 032dccf9e..63ab6b3b7 100644 --- a/cognee/modules/retrieval/graph_completion_cot_retriever.py +++ b/cognee/modules/retrieval/graph_completion_cot_retriever.py @@ -51,7 +51,13 @@ class GraphCompletionCotRetriever(GraphCompletionRetriever): self.followup_user_prompt_path = followup_user_prompt_path async def get_completion( - self, query: str, context: Optional[Any] = None, max_iter=4 + self, + query: str, + context: Optional[Any] = None, + user_prompt: str = None, + system_prompt: str = None, + only_context: bool = False, + max_iter=4, ) -> List[str]: """ Generate completion responses based on a user query and contextual information. 
@@ -92,6 +98,8 @@ class GraphCompletionCotRetriever(GraphCompletionRetriever): context=context, user_prompt_path=self.user_prompt_path, system_prompt_path=self.system_prompt_path, + user_prompt=user_prompt, + system_prompt=system_prompt, ) logger.info(f"Chain-of-thought: round {round_idx} - answer: {completion}") if round_idx < max_iter: @@ -128,4 +136,7 @@ class GraphCompletionCotRetriever(GraphCompletionRetriever): question=query, answer=completion, context=context, triplets=triplets ) - return [completion] + if only_context: + return [context] + else: + return [completion] diff --git a/cognee/modules/retrieval/graph_completion_retriever.py b/cognee/modules/retrieval/graph_completion_retriever.py index fb3cf4885..d88252054 100644 --- a/cognee/modules/retrieval/graph_completion_retriever.py +++ b/cognee/modules/retrieval/graph_completion_retriever.py @@ -151,7 +151,14 @@ class GraphCompletionRetriever(BaseRetriever): return context, triplets - async def get_completion(self, query: str, context: Optional[Any] = None) -> Any: + async def get_completion( + self, + query: str, + context: Optional[Any] = None, + user_prompt: str = None, + system_prompt: str = None, + only_context: bool = False, + ) -> Any: """ Generates a completion using graph connections context based on a query. @@ -177,6 +184,9 @@ class GraphCompletionRetriever(BaseRetriever): context=context, user_prompt_path=self.user_prompt_path, system_prompt_path=self.system_prompt_path, + user_prompt=user_prompt, + system_prompt=system_prompt, + only_context=only_context, ) if self.save_interaction and context and triplets and completion: diff --git a/cognee/modules/retrieval/utils/completion.py b/cognee/modules/retrieval/utils/completion.py index ca0b30c18..69381d647 100644 --- a/cognee/modules/retrieval/utils/completion.py +++ b/cognee/modules/retrieval/utils/completion.py @@ -6,18 +6,26 @@ async def generate_completion( context: str, user_prompt_path: str, system_prompt_path: str, + user_prompt: str = None, + system_prompt: str = None, + only_context: bool = False, ) -> str: """Generates a completion using LLM with given context and prompts.""" args = {"question": query, "context": context} - user_prompt = LLMGateway.render_prompt(user_prompt_path, args) - system_prompt = LLMGateway.read_query_prompt(system_prompt_path) - - return await LLMGateway.acreate_structured_output( - text_input=user_prompt, - system_prompt=system_prompt, - response_model=str, + user_prompt = LLMGateway.render_prompt(user_prompt if user_prompt else user_prompt_path, args) + system_prompt = LLMGateway.read_query_prompt( + system_prompt if system_prompt else system_prompt_path ) + if only_context: + return context + else: + return await LLMGateway.acreate_structured_output( + text_input=user_prompt, + system_prompt=system_prompt, + response_model=str, + ) + async def summarize_text( text: str, diff --git a/cognee/modules/search/methods/search.py b/cognee/modules/search/methods/search.py index f5f2a793a..3e5d6ffcd 100644 --- a/cognee/modules/search/methods/search.py +++ b/cognee/modules/search/methods/search.py @@ -101,11 +101,14 @@ async def specific_search( query: str, user: User, system_prompt_path="answer_simple_question.txt", + user_prompt: str = None, + system_prompt: str = None, top_k: int = 10, node_type: Optional[Type] = None, node_name: Optional[List[str]] = None, save_interaction: Optional[bool] = False, last_k: Optional[int] = None, + only_context: bool = None, ) -> list: search_tasks: dict[SearchType, Callable] = { SearchType.SUMMARIES: 
SummariesRetriever(top_k=top_k).get_completion, @@ -159,7 +162,9 @@ async def specific_search( send_telemetry("cognee.search EXECUTION STARTED", user.id) - results = await search_task(query) + results = await search_task( + query=query, system_prompt=system_prompt, user_prompt=user_prompt, only_context=only_context + ) send_telemetry("cognee.search EXECUTION COMPLETED", user.id) From 2915698d601f8ce84d5d63458d0e8da51794fa67 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 28 Aug 2025 13:43:37 +0200 Subject: [PATCH 05/17] feat: Add only_context and system prompt flags for search --- .../v1/search/routers/get_search_router.py | 6 + cognee/api/v1/search/search.py | 4 + .../modules/retrieval/completion_retriever.py | 18 ++- ..._completion_context_extension_retriever.py | 20 +-- .../graph_completion_cot_retriever.py | 12 +- .../retrieval/graph_completion_retriever.py | 12 +- .../graph_summary_completion_retriever.py | 4 +- .../modules/retrieval/summaries_retriever.py | 2 +- cognee/modules/retrieval/utils/completion.py | 18 +-- cognee/modules/search/methods/search.py | 117 +++++++++++++----- 10 files changed, 140 insertions(+), 73 deletions(-) diff --git a/cognee/api/v1/search/routers/get_search_router.py b/cognee/api/v1/search/routers/get_search_router.py index 0ceeb1abb..b141c6bdc 100644 --- a/cognee/api/v1/search/routers/get_search_router.py +++ b/cognee/api/v1/search/routers/get_search_router.py @@ -20,7 +20,9 @@ class SearchPayloadDTO(InDTO): datasets: Optional[list[str]] = Field(default=None) dataset_ids: Optional[list[UUID]] = Field(default=None, examples=[[]]) query: str = Field(default="What is in the document?") + system_prompt: Optional[str] = Field(default=None) top_k: Optional[int] = Field(default=10) + only_context: bool = Field(default=False) def get_search_router() -> APIRouter: @@ -102,7 +104,9 @@ def get_search_router() -> APIRouter: "datasets": payload.datasets, "dataset_ids": [str(dataset_id) for dataset_id in payload.dataset_ids or []], "query": payload.query, + "system_prompt": payload.system_prompt, "top_k": payload.top_k, + "only_context": payload.only_context, }, ) @@ -115,7 +119,9 @@ def get_search_router() -> APIRouter: user=user, datasets=payload.datasets, dataset_ids=payload.dataset_ids, + system_prompt=payload.system_prompt, top_k=payload.top_k, + only_context=payload.only_context, ) return results diff --git a/cognee/api/v1/search/search.py b/cognee/api/v1/search/search.py index f37f8ba6d..113d33557 100644 --- a/cognee/api/v1/search/search.py +++ b/cognee/api/v1/search/search.py @@ -16,11 +16,13 @@ async def search( datasets: Optional[Union[list[str], str]] = None, dataset_ids: Optional[Union[list[UUID], UUID]] = None, system_prompt_path: str = "answer_simple_question.txt", + system_prompt: Optional[str] = None, top_k: int = 10, node_type: Optional[Type] = None, node_name: Optional[List[str]] = None, save_interaction: bool = False, last_k: Optional[int] = None, + only_context: bool = False, ) -> list: """ Search and query the knowledge graph for insights, information, and connections. 
@@ -183,11 +185,13 @@ async def search( dataset_ids=dataset_ids if dataset_ids else datasets, user=user, system_prompt_path=system_prompt_path, + system_prompt=system_prompt, top_k=top_k, node_type=node_type, node_name=node_name, save_interaction=save_interaction, last_k=last_k, + only_context=only_context, ) return filtered_search_results diff --git a/cognee/modules/retrieval/completion_retriever.py b/cognee/modules/retrieval/completion_retriever.py index e9c8331a1..4d34dfdbe 100644 --- a/cognee/modules/retrieval/completion_retriever.py +++ b/cognee/modules/retrieval/completion_retriever.py @@ -23,12 +23,16 @@ class CompletionRetriever(BaseRetriever): self, user_prompt_path: str = "context_for_question.txt", system_prompt_path: str = "answer_simple_question.txt", + system_prompt: str = None, top_k: Optional[int] = 1, + only_context: bool = False, ): """Initialize retriever with optional custom prompt paths.""" self.user_prompt_path = user_prompt_path self.system_prompt_path = system_prompt_path self.top_k = top_k if top_k is not None else 1 + self.system_prompt = system_prompt + self.only_context = only_context async def get_context(self, query: str) -> str: """ @@ -65,14 +69,7 @@ class CompletionRetriever(BaseRetriever): logger.error("DocumentChunk_text collection not found") raise NoDataError("No data found in the system, please add data first.") from error - async def get_completion( - self, - query: str, - context: Optional[Any] = None, - user_prompt: str = None, - system_prompt: str = None, - only_context: bool = False, - ) -> Any: + async def get_completion(self, query: str, context: Optional[Any] = None) -> Any: """ Generates an LLM completion using the context. @@ -99,8 +96,7 @@ class CompletionRetriever(BaseRetriever): context=context, user_prompt_path=self.user_prompt_path, system_prompt_path=self.system_prompt_path, - user_prompt=user_prompt, - system_prompt=system_prompt, - only_context=only_context, + system_prompt=self.system_prompt, + only_context=self.only_context, ) return [completion] diff --git a/cognee/modules/retrieval/graph_completion_context_extension_retriever.py b/cognee/modules/retrieval/graph_completion_context_extension_retriever.py index f25edb4a7..8bdf5f1a0 100644 --- a/cognee/modules/retrieval/graph_completion_context_extension_retriever.py +++ b/cognee/modules/retrieval/graph_completion_context_extension_retriever.py @@ -26,10 +26,12 @@ class GraphCompletionContextExtensionRetriever(GraphCompletionRetriever): self, user_prompt_path: str = "graph_context_for_question.txt", system_prompt_path: str = "answer_simple_question.txt", + system_prompt: Optional[str] = None, top_k: Optional[int] = 5, node_type: Optional[Type] = None, node_name: Optional[List[str]] = None, save_interaction: bool = False, + only_context: bool = False, ): super().__init__( user_prompt_path=user_prompt_path, @@ -38,15 +40,14 @@ class GraphCompletionContextExtensionRetriever(GraphCompletionRetriever): node_type=node_type, node_name=node_name, save_interaction=save_interaction, + system_prompt=system_prompt, + only_context=only_context, ) async def get_completion( self, query: str, context: Optional[Any] = None, - user_prompt: str = None, - system_prompt: str = None, - only_context: bool = False, context_extension_rounds=4, ) -> List[str]: """ @@ -92,8 +93,7 @@ class GraphCompletionContextExtensionRetriever(GraphCompletionRetriever): context=context, user_prompt_path=self.user_prompt_path, system_prompt_path=self.system_prompt_path, - user_prompt=user_prompt, - system_prompt=system_prompt, 
+ system_prompt=self.system_prompt, ) triplets += await self.get_triplets(completion) @@ -120,9 +120,8 @@ class GraphCompletionContextExtensionRetriever(GraphCompletionRetriever): context=context, user_prompt_path=self.user_prompt_path, system_prompt_path=self.system_prompt_path, - user_prompt=user_prompt, - system_prompt=system_prompt, - only_context=only_context, + system_prompt=self.system_prompt, + only_context=self.only_context, ) if self.save_interaction and context and triplets and completion: @@ -130,4 +129,7 @@ class GraphCompletionContextExtensionRetriever(GraphCompletionRetriever): question=query, answer=completion, context=context, triplets=triplets ) - return [completion] + if self.only_context: + return [context] + else: + return [completion] diff --git a/cognee/modules/retrieval/graph_completion_cot_retriever.py b/cognee/modules/retrieval/graph_completion_cot_retriever.py index 63ab6b3b7..86ff8555b 100644 --- a/cognee/modules/retrieval/graph_completion_cot_retriever.py +++ b/cognee/modules/retrieval/graph_completion_cot_retriever.py @@ -32,14 +32,18 @@ class GraphCompletionCotRetriever(GraphCompletionRetriever): validation_system_prompt_path: str = "cot_validation_system_prompt.txt", followup_system_prompt_path: str = "cot_followup_system_prompt.txt", followup_user_prompt_path: str = "cot_followup_user_prompt.txt", + system_prompt: str = None, top_k: Optional[int] = 5, node_type: Optional[Type] = None, node_name: Optional[List[str]] = None, save_interaction: bool = False, + only_context: bool = False, ): super().__init__( user_prompt_path=user_prompt_path, system_prompt_path=system_prompt_path, + system_prompt=system_prompt, + only_context=only_context, top_k=top_k, node_type=node_type, node_name=node_name, @@ -54,9 +58,6 @@ class GraphCompletionCotRetriever(GraphCompletionRetriever): self, query: str, context: Optional[Any] = None, - user_prompt: str = None, - system_prompt: str = None, - only_context: bool = False, max_iter=4, ) -> List[str]: """ @@ -98,8 +99,7 @@ class GraphCompletionCotRetriever(GraphCompletionRetriever): context=context, user_prompt_path=self.user_prompt_path, system_prompt_path=self.system_prompt_path, - user_prompt=user_prompt, - system_prompt=system_prompt, + system_prompt=self.system_prompt, ) logger.info(f"Chain-of-thought: round {round_idx} - answer: {completion}") if round_idx < max_iter: @@ -136,7 +136,7 @@ class GraphCompletionCotRetriever(GraphCompletionRetriever): question=query, answer=completion, context=context, triplets=triplets ) - if only_context: + if self.only_context: return [context] else: return [completion] diff --git a/cognee/modules/retrieval/graph_completion_retriever.py b/cognee/modules/retrieval/graph_completion_retriever.py index d88252054..6a5193c56 100644 --- a/cognee/modules/retrieval/graph_completion_retriever.py +++ b/cognee/modules/retrieval/graph_completion_retriever.py @@ -36,15 +36,19 @@ class GraphCompletionRetriever(BaseRetriever): self, user_prompt_path: str = "graph_context_for_question.txt", system_prompt_path: str = "answer_simple_question.txt", + system_prompt: str = None, top_k: Optional[int] = 5, node_type: Optional[Type] = None, node_name: Optional[List[str]] = None, save_interaction: bool = False, + only_context: bool = False, ): """Initialize retriever with prompt paths and search parameters.""" self.save_interaction = save_interaction self.user_prompt_path = user_prompt_path self.system_prompt_path = system_prompt_path + self.system_prompt = system_prompt + self.only_context = only_context self.top_k = 
top_k if top_k is not None else 5 self.node_type = node_type self.node_name = node_name @@ -155,9 +159,6 @@ class GraphCompletionRetriever(BaseRetriever): self, query: str, context: Optional[Any] = None, - user_prompt: str = None, - system_prompt: str = None, - only_context: bool = False, ) -> Any: """ Generates a completion using graph connections context based on a query. @@ -184,9 +185,8 @@ class GraphCompletionRetriever(BaseRetriever): context=context, user_prompt_path=self.user_prompt_path, system_prompt_path=self.system_prompt_path, - user_prompt=user_prompt, - system_prompt=system_prompt, - only_context=only_context, + system_prompt=self.system_prompt, + only_context=self.only_context, ) if self.save_interaction and context and triplets and completion: diff --git a/cognee/modules/retrieval/graph_summary_completion_retriever.py b/cognee/modules/retrieval/graph_summary_completion_retriever.py index d344ebd26..051f39b22 100644 --- a/cognee/modules/retrieval/graph_summary_completion_retriever.py +++ b/cognee/modules/retrieval/graph_summary_completion_retriever.py @@ -21,6 +21,7 @@ class GraphSummaryCompletionRetriever(GraphCompletionRetriever): user_prompt_path: str = "graph_context_for_question.txt", system_prompt_path: str = "answer_simple_question.txt", summarize_prompt_path: str = "summarize_search_results.txt", + system_prompt: Optional[str] = None, top_k: Optional[int] = 5, node_type: Optional[Type] = None, node_name: Optional[List[str]] = None, @@ -34,6 +35,7 @@ class GraphSummaryCompletionRetriever(GraphCompletionRetriever): node_type=node_type, node_name=node_name, save_interaction=save_interaction, + system_prompt=system_prompt, ) self.summarize_prompt_path = summarize_prompt_path @@ -57,4 +59,4 @@ class GraphSummaryCompletionRetriever(GraphCompletionRetriever): - str: A summary string representing the content of the retrieved edges. """ direct_text = await super().resolve_edges_to_text(retrieved_edges) - return await summarize_text(direct_text, self.summarize_prompt_path) + return await summarize_text(direct_text, self.summarize_prompt_path, self.system_prompt) diff --git a/cognee/modules/retrieval/summaries_retriever.py b/cognee/modules/retrieval/summaries_retriever.py index 56f414013..df35cdc51 100644 --- a/cognee/modules/retrieval/summaries_retriever.py +++ b/cognee/modules/retrieval/summaries_retriever.py @@ -62,7 +62,7 @@ class SummariesRetriever(BaseRetriever): logger.info(f"Returning {len(summary_payloads)} summary payloads") return summary_payloads - async def get_completion(self, query: str, context: Optional[Any] = None) -> Any: + async def get_completion(self, query: str, context: Optional[Any] = None, **kwargs) -> Any: """ Generates a completion using summaries context. 
diff --git a/cognee/modules/retrieval/utils/completion.py b/cognee/modules/retrieval/utils/completion.py index 69381d647..4c2639517 100644 --- a/cognee/modules/retrieval/utils/completion.py +++ b/cognee/modules/retrieval/utils/completion.py @@ -1,3 +1,4 @@ +from typing import Optional from cognee.infrastructure.llm.LLMGateway import LLMGateway @@ -6,15 +7,15 @@ async def generate_completion( context: str, user_prompt_path: str, system_prompt_path: str, - user_prompt: str = None, - system_prompt: str = None, + user_prompt: Optional[str] = None, + system_prompt: Optional[str] = None, only_context: bool = False, ) -> str: """Generates a completion using LLM with given context and prompts.""" args = {"question": query, "context": context} - user_prompt = LLMGateway.render_prompt(user_prompt if user_prompt else user_prompt_path, args) - system_prompt = LLMGateway.read_query_prompt( - system_prompt if system_prompt else system_prompt_path + user_prompt = user_prompt if user_prompt else LLMGateway.render_prompt(user_prompt_path, args) + system_prompt = ( + system_prompt if system_prompt else LLMGateway.read_query_prompt(system_prompt_path) ) if only_context: @@ -29,10 +30,13 @@ async def generate_completion( async def summarize_text( text: str, - prompt_path: str = "summarize_search_results.txt", + system_prompt_path: str = "summarize_search_results.txt", + system_prompt: str = None, ) -> str: """Summarizes text using LLM with the specified prompt.""" - system_prompt = LLMGateway.read_query_prompt(prompt_path) + system_prompt = ( + system_prompt if system_prompt else LLMGateway.read_query_prompt(system_prompt_path) + ) return await LLMGateway.acreate_structured_output( text_input=text, diff --git a/cognee/modules/search/methods/search.py b/cognee/modules/search/methods/search.py index 3e5d6ffcd..465d0cbb3 100644 --- a/cognee/modules/search/methods/search.py +++ b/cognee/modules/search/methods/search.py @@ -37,11 +37,13 @@ async def search( dataset_ids: Union[list[UUID], None], user: User, system_prompt_path="answer_simple_question.txt", + system_prompt: Optional[str] = None, top_k: int = 10, node_type: Optional[Type] = None, node_name: Optional[List[str]] = None, save_interaction: Optional[bool] = False, last_k: Optional[int] = None, + only_context: bool = False, ): """ @@ -61,28 +63,34 @@ async def search( # Use search function filtered by permissions if access control is enabled if os.getenv("ENABLE_BACKEND_ACCESS_CONTROL", "false").lower() == "true": return await authorized_search( - query_text=query_text, query_type=query_type, + query_text=query_text, user=user, dataset_ids=dataset_ids, system_prompt_path=system_prompt_path, + system_prompt=system_prompt, top_k=top_k, + node_type=node_type, + node_name=node_name, save_interaction=save_interaction, last_k=last_k, + only_context=only_context, ) query = await log_query(query_text, query_type.value, user.id) search_results = await specific_search( - query_type, - query_text, - user, + query_type=query_type, + query_text=query_text, + user=user, system_prompt_path=system_prompt_path, + system_prompt=system_prompt, top_k=top_k, node_type=node_type, node_name=node_name, save_interaction=save_interaction, last_k=last_k, + only_context=only_context, ) await log_result( @@ -98,11 +106,10 @@ async def search( async def specific_search( query_type: SearchType, - query: str, + query_text: str, user: User, - system_prompt_path="answer_simple_question.txt", - user_prompt: str = None, - system_prompt: str = None, + system_prompt_path: str = 
"answer_simple_question.txt", + system_prompt: Optional[str] = None, top_k: int = 10, node_type: Optional[Type] = None, node_name: Optional[List[str]] = None, @@ -115,7 +122,10 @@ async def specific_search( SearchType.INSIGHTS: InsightsRetriever(top_k=top_k).get_completion, SearchType.CHUNKS: ChunksRetriever(top_k=top_k).get_completion, SearchType.RAG_COMPLETION: CompletionRetriever( - system_prompt_path=system_prompt_path, top_k=top_k + system_prompt_path=system_prompt_path, + top_k=top_k, + system_prompt=system_prompt, + only_context=only_context, ).get_completion, SearchType.GRAPH_COMPLETION: GraphCompletionRetriever( system_prompt_path=system_prompt_path, @@ -123,6 +133,8 @@ async def specific_search( node_type=node_type, node_name=node_name, save_interaction=save_interaction, + system_prompt=system_prompt, + only_context=only_context, ).get_completion, SearchType.GRAPH_COMPLETION_COT: GraphCompletionCotRetriever( system_prompt_path=system_prompt_path, @@ -130,6 +142,8 @@ async def specific_search( node_type=node_type, node_name=node_name, save_interaction=save_interaction, + system_prompt=system_prompt, + only_context=only_context, ).get_completion, SearchType.GRAPH_COMPLETION_CONTEXT_EXTENSION: GraphCompletionContextExtensionRetriever( system_prompt_path=system_prompt_path, @@ -137,6 +151,8 @@ async def specific_search( node_type=node_type, node_name=node_name, save_interaction=save_interaction, + system_prompt=system_prompt, + only_context=only_context, ).get_completion, SearchType.GRAPH_SUMMARY_COMPLETION: GraphSummaryCompletionRetriever( system_prompt_path=system_prompt_path, @@ -144,6 +160,7 @@ async def specific_search( node_type=node_type, node_name=node_name, save_interaction=save_interaction, + system_prompt=system_prompt, ).get_completion, SearchType.CODE: CodeRetriever(top_k=top_k).get_completion, SearchType.CYPHER: CypherSearchRetriever().get_completion, @@ -153,7 +170,7 @@ async def specific_search( # If the query type is FEELING_LUCKY, select the search type intelligently if query_type is SearchType.FEELING_LUCKY: - query_type = await select_search_type(query) + query_type = await select_search_type(query_text) search_task = search_tasks.get(query_type) @@ -162,9 +179,7 @@ async def specific_search( send_telemetry("cognee.search EXECUTION STARTED", user.id) - results = await search_task( - query=query, system_prompt=system_prompt, user_prompt=user_prompt, only_context=only_context - ) + results = await search_task(query=query_text) send_telemetry("cognee.search EXECUTION COMPLETED", user.id) @@ -172,14 +187,18 @@ async def specific_search( async def authorized_search( - query_text: str, query_type: SearchType, - user: User = None, + query_text: str, + user: User, dataset_ids: Optional[list[UUID]] = None, system_prompt_path: str = "answer_simple_question.txt", + system_prompt: Optional[str] = None, top_k: int = 10, - save_interaction: bool = False, + node_type: Optional[Type] = None, + node_name: Optional[List[str]] = None, + save_interaction: Optional[bool] = False, last_k: Optional[int] = None, + only_context: bool = None, ) -> list: """ Verifies access for provided datasets or uses all datasets user has read access for and performs search per dataset. 
@@ -193,14 +212,18 @@ async def authorized_search( # Searches all provided datasets and handles setting up of appropriate database context based on permissions search_results = await specific_search_by_context( - search_datasets, - query_text, - query_type, - user, - system_prompt_path, - top_k, - save_interaction, + search_datasets=search_datasets, + query_type=query_type, + query_text=query_text, + user=user, + system_prompt_path=system_prompt_path, + system_prompt=system_prompt, + top_k=top_k, + node_type=node_type, + node_name=node_name, + save_interaction=save_interaction, last_k=last_k, + only_context=only_context, ) await log_result(query.id, json.dumps(search_results, cls=JSONEncoder), user.id) @@ -210,13 +233,17 @@ async def authorized_search( async def specific_search_by_context( search_datasets: list[Dataset], - query_text: str, query_type: SearchType, + query_text: str, user: User, - system_prompt_path: str, - top_k: int, - save_interaction: bool = False, + system_prompt_path: str = "answer_simple_question.txt", + system_prompt: Optional[str] = None, + top_k: int = 10, + node_type: Optional[Type] = None, + node_name: Optional[List[str]] = None, + save_interaction: Optional[bool] = False, last_k: Optional[int] = None, + only_context: bool = None, ): """ Searches all provided datasets and handles setting up of appropriate database context based on permissions. @@ -224,18 +251,33 @@ async def specific_search_by_context( """ async def _search_by_context( - dataset, user, query_type, query_text, system_prompt_path, top_k, last_k + dataset: Dataset, + query_type: SearchType, + query_text: str, + user: User, + system_prompt_path: str = "answer_simple_question.txt", + system_prompt: Optional[str] = None, + top_k: int = 10, + node_type: Optional[Type] = None, + node_name: Optional[List[str]] = None, + save_interaction: Optional[bool] = False, + last_k: Optional[int] = None, + only_context: bool = None, ): # Set database configuration in async context for each dataset user has access for await set_database_global_context_variables(dataset.id, dataset.owner_id) search_results = await specific_search( - query_type, - query_text, - user, + query_type=query_type, + query_text=query_text, + user=user, system_prompt_path=system_prompt_path, + system_prompt=system_prompt, top_k=top_k, + node_type=node_type, + node_name=node_name, save_interaction=save_interaction, last_k=last_k, + only_context=only_context, ) return { "search_result": search_results, @@ -248,7 +290,18 @@ async def specific_search_by_context( for dataset in search_datasets: tasks.append( _search_by_context( - dataset, user, query_type, query_text, system_prompt_path, top_k, last_k + dataset=dataset, + query_type=query_type, + query_text=query_text, + user=user, + system_prompt_path=system_prompt_path, + system_prompt=system_prompt, + top_k=top_k, + node_type=node_type, + node_name=node_name, + save_interaction=save_interaction, + last_k=last_k, + only_context=only_context, ) ) From 7fd5e1e0104c061e056c5e97a4b0ea04effa45dd Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 28 Aug 2025 13:53:08 +0200 Subject: [PATCH 06/17] fix: Make custom_prompt be None by default --- cognee/api/v1/cognify/routers/get_cognify_router.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cognee/api/v1/cognify/routers/get_cognify_router.py b/cognee/api/v1/cognify/routers/get_cognify_router.py index 6809f089a..d40345f8e 100644 --- a/cognee/api/v1/cognify/routers/get_cognify_router.py +++ 
b/cognee/api/v1/cognify/routers/get_cognify_router.py @@ -38,7 +38,7 @@ class CognifyPayloadDTO(InDTO): dataset_ids: Optional[List[UUID]] = Field(default=None, examples=[[]]) run_in_background: Optional[bool] = Field(default=False) custom_prompt: Optional[str] = Field( - default=None, description="Custom prompt for entity extraction and graph generation" + default="", description="Custom prompt for entity extraction and graph generation" ) From 966e676d610a38b1607ce415ec8b9d620cf5cec2 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 28 Aug 2025 17:23:15 +0200 Subject: [PATCH 07/17] refactor: Have search prompt be empty string by default --- cognee/api/v1/search/routers/get_search_router.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cognee/api/v1/search/routers/get_search_router.py b/cognee/api/v1/search/routers/get_search_router.py index b141c6bdc..39a896dd8 100644 --- a/cognee/api/v1/search/routers/get_search_router.py +++ b/cognee/api/v1/search/routers/get_search_router.py @@ -20,7 +20,7 @@ class SearchPayloadDTO(InDTO): datasets: Optional[list[str]] = Field(default=None) dataset_ids: Optional[list[UUID]] = Field(default=None, examples=[[]]) query: str = Field(default="What is in the document?") - system_prompt: Optional[str] = Field(default=None) + system_prompt: Optional[str] = Field(default="") top_k: Optional[int] = Field(default=10) only_context: bool = Field(default=False) From cf636ba77f08665ce075235c5571eabc45c559be Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 28 Aug 2025 18:37:44 +0200 Subject: [PATCH 08/17] feat: Enable nodesets on backend --- cognee/api/v1/add/routers/get_add_router.py | 38 ++++--------------- .../v1/search/routers/get_search_router.py | 3 ++ 2 files changed, 11 insertions(+), 30 deletions(-) diff --git a/cognee/api/v1/add/routers/get_add_router.py b/cognee/api/v1/add/routers/get_add_router.py index 66b165a38..8424a4fb5 100644 --- a/cognee/api/v1/add/routers/get_add_router.py +++ b/cognee/api/v1/add/routers/get_add_router.py @@ -25,6 +25,7 @@ def get_add_router() -> APIRouter: data: List[UploadFile] = File(default=None), datasetName: Optional[str] = Form(default=None), datasetId: Union[UUID, Literal[""], None] = Form(default=None, examples=[""]), + node_set: Optional[List[str]] = Form(default=[""], example=[""]), user: User = Depends(get_authenticated_user), ): """ @@ -65,9 +66,7 @@ def get_add_router() -> APIRouter: send_telemetry( "Add API Endpoint Invoked", user.id, - additional_properties={ - "endpoint": "POST /v1/add", - }, + additional_properties={"endpoint": "POST /v1/add", "node_set": node_set}, ) from cognee.api.v1.add import add as cognee_add @@ -76,34 +75,13 @@ def get_add_router() -> APIRouter: raise ValueError("Either datasetId or datasetName must be provided.") try: - if ( - isinstance(data, str) - and data.startswith("http") - and (os.getenv("ALLOW_HTTP_REQUESTS", "true").lower() == "true") - ): - if "github" in data: - # Perform git clone if the URL is from GitHub - repo_name = data.split("/")[-1].replace(".git", "") - subprocess.run(["git", "clone", data, f".data/{repo_name}"], check=True) - # TODO: Update add call with dataset info - await cognee_add( - "data://.data/", - f"{repo_name}", - ) - else: - # Fetch and store the data from other types of URL using curl - response = requests.get(data) - response.raise_for_status() + add_run = await cognee_add( + data, datasetName, user=user, dataset_id=datasetId, node_set=node_set + ) - file_data = await response.content() - # TODO: Update add call with dataset info - 
return await cognee_add(file_data) - else: - add_run = await cognee_add(data, datasetName, user=user, dataset_id=datasetId) - - if isinstance(add_run, PipelineRunErrored): - return JSONResponse(status_code=420, content=add_run.model_dump(mode="json")) - return add_run.model_dump() + if isinstance(add_run, PipelineRunErrored): + return JSONResponse(status_code=420, content=add_run.model_dump(mode="json")) + return add_run.model_dump() except Exception as error: return JSONResponse(status_code=409, content={"error": str(error)}) diff --git a/cognee/api/v1/search/routers/get_search_router.py b/cognee/api/v1/search/routers/get_search_router.py index 0ceeb1abb..961532a06 100644 --- a/cognee/api/v1/search/routers/get_search_router.py +++ b/cognee/api/v1/search/routers/get_search_router.py @@ -20,6 +20,7 @@ class SearchPayloadDTO(InDTO): datasets: Optional[list[str]] = Field(default=None) dataset_ids: Optional[list[UUID]] = Field(default=None, examples=[[]]) query: str = Field(default="What is in the document?") + node_name: Optional[list[str]] = Field(default=None, example=[]) top_k: Optional[int] = Field(default=10) @@ -102,6 +103,7 @@ def get_search_router() -> APIRouter: "datasets": payload.datasets, "dataset_ids": [str(dataset_id) for dataset_id in payload.dataset_ids or []], "query": payload.query, + "node_name": payload.node_name, "top_k": payload.top_k, }, ) @@ -115,6 +117,7 @@ def get_search_router() -> APIRouter: user=user, datasets=payload.datasets, dataset_ids=payload.dataset_ids, + node_name=payload.node_name, top_k=payload.top_k, ) From 5bfae7a36b10b746c167a4895d108130f9a62a2a Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Fri, 29 Aug 2025 10:30:49 +0200 Subject: [PATCH 09/17] refactor: Resolve unit tests failing for search --- cognee/modules/search/methods/search.py | 2 +- .../unit/modules/search/search_methods_test.py | 13 +++++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/cognee/modules/search/methods/search.py b/cognee/modules/search/methods/search.py index 465d0cbb3..2db105d71 100644 --- a/cognee/modules/search/methods/search.py +++ b/cognee/modules/search/methods/search.py @@ -179,7 +179,7 @@ async def specific_search( send_telemetry("cognee.search EXECUTION STARTED", user.id) - results = await search_task(query=query_text) + results = await search_task(query_text) send_telemetry("cognee.search EXECUTION COMPLETED", user.id) diff --git a/cognee/tests/unit/modules/search/search_methods_test.py b/cognee/tests/unit/modules/search/search_methods_test.py index 46995d087..9833a770b 100644 --- a/cognee/tests/unit/modules/search/search_methods_test.py +++ b/cognee/tests/unit/modules/search/search_methods_test.py @@ -58,15 +58,17 @@ async def test_search( # Verify mock_log_query.assert_called_once_with(query_text, query_type.value, mock_user.id) mock_specific_search.assert_called_once_with( - query_type, - query_text, - mock_user, + query_type=query_type, + query_text=query_text, + user=mock_user, system_prompt_path="answer_simple_question.txt", + system_prompt=None, top_k=10, node_type=None, node_name=None, save_interaction=False, last_k=None, + only_context=False, ) # Verify result logging @@ -201,7 +203,10 @@ async def test_specific_search_feeling_lucky( if retriever_name == "CompletionRetriever": mock_retriever_class.assert_called_once_with( - system_prompt_path="answer_simple_question.txt", top_k=top_k + system_prompt_path="answer_simple_question.txt", + top_k=top_k, + system_prompt=None, + only_context=None, ) else: 
mock_retriever_class.assert_called_once_with(top_k=top_k)

From c3f5840bff1a9623066718d3a6ab14994bd4b0fe Mon Sep 17 00:00:00 2001
From: Igor Ilic
Date: Fri, 29 Aug 2025 12:24:15 +0200
Subject: [PATCH 10/17] refactor: Remove unused argument

---
 cognee/modules/retrieval/utils/completion.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/cognee/modules/retrieval/utils/completion.py b/cognee/modules/retrieval/utils/completion.py
index 4c2639517..81e636aad 100644
--- a/cognee/modules/retrieval/utils/completion.py
+++ b/cognee/modules/retrieval/utils/completion.py
@@ -7,13 +7,12 @@ async def generate_completion(
     context: str,
     user_prompt_path: str,
     system_prompt_path: str,
-    user_prompt: Optional[str] = None,
     system_prompt: Optional[str] = None,
     only_context: bool = False,
 ) -> str:
     """Generates a completion using LLM with given context and prompts."""
     args = {"question": query, "context": context}
-    user_prompt = user_prompt if user_prompt else LLMGateway.render_prompt(user_prompt_path, args)
+    user_prompt = LLMGateway.render_prompt(user_prompt_path, args)
     system_prompt = (
         system_prompt if system_prompt else LLMGateway.read_query_prompt(system_prompt_path)
     )

From 21f688385b16cc3bc50d355b32eb4b7610df2053 Mon Sep 17 00:00:00 2001
From: Igor Ilic
Date: Fri, 29 Aug 2025 12:53:29 +0200
Subject: [PATCH 11/17] feat: Add NodeSet as default node type

---
 cognee/api/v1/search/search.py            |  3 ++-
 cognee/modules/search/methods/search.py   | 27 ++++++++++++++++---
 .../modules/search/search_methods_test.py |  4 +--
 3 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/cognee/api/v1/search/search.py b/cognee/api/v1/search/search.py
index f37f8ba6d..344e763ae 100644
--- a/cognee/api/v1/search/search.py
+++ b/cognee/api/v1/search/search.py
@@ -1,6 +1,7 @@
 from uuid import UUID
 from typing import Union, Optional, List, Type
 
+from cognee.modules.engine.models.node_set import NodeSet
 from cognee.modules.users.models import User
 from cognee.modules.search.types import SearchType
 from cognee.modules.users.methods import get_default_user
@@ -17,7 +18,7 @@ async def search(
     dataset_ids: Optional[Union[list[UUID], UUID]] = None,
     system_prompt_path: str = "answer_simple_question.txt",
     top_k: int = 10,
-    node_type: Optional[Type] = None,
+    node_type: Optional[Type] = NodeSet,
     node_name: Optional[List[str]] = None,
     save_interaction: bool = False,
     last_k: Optional[int] = None,
diff --git a/cognee/modules/search/methods/search.py b/cognee/modules/search/methods/search.py
index f5f2a793a..8e38e63c3 100644
--- a/cognee/modules/search/methods/search.py
+++ b/cognee/modules/search/methods/search.py
@@ -4,6 +4,7 @@ import asyncio
 from uuid import UUID
 from typing import Callable, List, Optional, Type, Union
 
+from cognee.modules.engine.models.node_set import NodeSet
 from cognee.modules.retrieval.user_qa_feedback import UserQAFeedback
 from cognee.modules.search.exceptions import UnsupportedSearchTypeError
 from cognee.context_global_variables import set_database_global_context_variables
@@ -38,7 +39,7 @@ async def search(
     user: User,
     system_prompt_path="answer_simple_question.txt",
     top_k: int = 10,
-    node_type: Optional[Type] = None,
+    node_type: Optional[Type] = NodeSet,
     node_name: Optional[List[str]] = None,
     save_interaction: Optional[bool] = False,
     last_k: Optional[int] = None,
@@ -67,6 +68,8 @@ async def search(
         dataset_ids=dataset_ids,
         system_prompt_path=system_prompt_path,
         top_k=top_k,
+        node_type=node_type,
+        node_name=node_name,
         save_interaction=save_interaction,
         last_k=last_k,
     )
@@ -102,7 +105,7 @@ async def specific_search(
     user: User,
     system_prompt_path="answer_simple_question.txt",
     top_k: int = 10,
-    node_type: Optional[Type] = None,
+    node_type: Optional[Type] = NodeSet,
     node_name: Optional[List[str]] = None,
     save_interaction: Optional[bool] = False,
     last_k: Optional[int] = None,
@@ -173,6 +176,8 @@ async def authorized_search(
     dataset_ids: Optional[list[UUID]] = None,
     system_prompt_path: str = "answer_simple_question.txt",
     top_k: int = 10,
+    node_type: Optional[Type] = NodeSet,
+    node_name: Optional[List[str]] = None,
     save_interaction: bool = False,
     last_k: Optional[int] = None,
 ) -> list:
@@ -194,7 +199,9 @@ async def authorized_search(
         user,
         system_prompt_path,
         top_k,
-        save_interaction,
+        node_type=node_type,
+        node_name=node_name,
+        save_interaction=save_interaction,
         last_k=last_k,
     )
 
@@ -210,6 +217,8 @@ async def specific_search_by_context(
     user: User,
     system_prompt_path: str,
     top_k: int,
+    node_type: Optional[Type] = NodeSet,
+    node_name: Optional[List[str]] = None,
     save_interaction: bool = False,
     last_k: Optional[int] = None,
 ):
@@ -229,6 +238,8 @@ async def specific_search_by_context(
         user,
         system_prompt_path=system_prompt_path,
         top_k=top_k,
+        node_type=node_type,
+        node_name=node_name,
         save_interaction=save_interaction,
         last_k=last_k,
     )
@@ -243,7 +254,15 @@ async def specific_search_by_context(
     for dataset in search_datasets:
         tasks.append(
             _search_by_context(
-                dataset, user, query_type, query_text, system_prompt_path, top_k, last_k
+                dataset,
+                user,
+                query_type,
+                query_text,
+                system_prompt_path,
+                top_k,
+                node_type=node_type,
+                node_name=node_name,
+                last_k=last_k,
             )
         )
 
diff --git a/cognee/tests/unit/modules/search/search_methods_test.py b/cognee/tests/unit/modules/search/search_methods_test.py
index 46995d087..004e1fca3 100644
--- a/cognee/tests/unit/modules/search/search_methods_test.py
+++ b/cognee/tests/unit/modules/search/search_methods_test.py
@@ -3,8 +3,8 @@ import uuid
 from unittest.mock import AsyncMock, MagicMock, patch
 
 import pytest
-from pylint.checkers.utils import node_type
 
+from cognee.modules.engine.models.node_set import NodeSet
 from cognee.modules.search.exceptions import UnsupportedSearchTypeError
 from cognee.modules.search.methods.search import search, specific_search
 from cognee.modules.search.types import SearchType
@@ -63,7 +63,7 @@ async def test_search(
         mock_user,
         system_prompt_path="answer_simple_question.txt",
         top_k=10,
-        node_type=None,
+        node_type=NodeSet,
         node_name=None,
         save_interaction=False,
         last_k=None,

From e6ee182d789b43e056ce71400367c04683fc2e8a Mon Sep 17 00:00:00 2001
From: Igor Ilic
Date: Fri, 29 Aug 2025 13:03:06 +0200
Subject: [PATCH 12/17] fix: Handle [] node_name case

---
 cognee/modules/graph/cognee_graph/CogneeGraph.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cognee/modules/graph/cognee_graph/CogneeGraph.py b/cognee/modules/graph/cognee_graph/CogneeGraph.py
index ed867ae24..924532ce0 100644
--- a/cognee/modules/graph/cognee_graph/CogneeGraph.py
+++ b/cognee/modules/graph/cognee_graph/CogneeGraph.py
@@ -76,7 +76,7 @@ class CogneeGraph(CogneeAbstractGraph):
         start_time = time.time()
 
         # Determine projection strategy
-        if node_type is not None and node_name is not None:
+        if node_type is not None and node_name not in [None, []]:
             nodes_data, edges_data = await adapter.get_nodeset_subgraph(
                 node_type=node_type, node_name=node_name
             )

From b9fa285c1ac9a1c98dac414a3f8dc62e57305c42 Mon Sep 17 00:00:00 2001
From: Igor Ilic
Date: Fri, 29 Aug 2025 13:38:52 +0200
Subject: [PATCH 13/17] fix: Add node_name and node_type to context search

---
 cognee/modules/search/methods/search.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/cognee/modules/search/methods/search.py b/cognee/modules/search/methods/search.py
index 8e38e63c3..74ef2a6ad 100644
--- a/cognee/modules/search/methods/search.py
+++ b/cognee/modules/search/methods/search.py
@@ -228,7 +228,15 @@ async def specific_search_by_context(
     """
 
     async def _search_by_context(
-        dataset, user, query_type, query_text, system_prompt_path, top_k, last_k
+        dataset,
+        user,
+        query_type,
+        query_text,
+        system_prompt_path,
+        top_k,
+        node_type: Optional[Type] = NodeSet,
+        node_name: Optional[List[str]] = None,
+        last_k: Optional[int] = None,
     ):
         # Set database configuration in async context for each dataset user has access for
         await set_database_global_context_variables(dataset.id, dataset.owner_id)

From 614055c850661fcbb816a9bf77b2e61324a83f69 Mon Sep 17 00:00:00 2001
From: Igor Ilic
Date: Fri, 29 Aug 2025 14:16:18 +0200
Subject: [PATCH 14/17] refactor: Add docs for new search arguments

---
 cognee/api/v1/search/routers/get_search_router.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/cognee/api/v1/search/routers/get_search_router.py b/cognee/api/v1/search/routers/get_search_router.py
index 39a896dd8..f9f4e4764 100644
--- a/cognee/api/v1/search/routers/get_search_router.py
+++ b/cognee/api/v1/search/routers/get_search_router.py
@@ -1,9 +1,11 @@
 from uuid import UUID
+import pathlib
 from typing import Optional
 from datetime import datetime
 from pydantic import Field
 from fastapi import Depends, APIRouter
 from fastapi.responses import JSONResponse
+
 from cognee.modules.search.types import SearchType
 from cognee.api.DTO import InDTO, OutDTO
 from cognee.modules.users.exceptions.exceptions import PermissionDeniedError
@@ -20,7 +22,9 @@ class SearchPayloadDTO(InDTO):
     datasets: Optional[list[str]] = Field(default=None)
     dataset_ids: Optional[list[UUID]] = Field(default=None, examples=[[]])
     query: str = Field(default="What is in the document?")
-    system_prompt: Optional[str] = Field(default="")
+    system_prompt: Optional[str] = Field(
+        default="Answer the question using the provided context. Be as brief as possible."
+    )
     top_k: Optional[int] = Field(default=10)
     only_context: bool = Field(default=False)
 
@@ -81,7 +85,9 @@ def get_search_router() -> APIRouter:
     - **datasets** (Optional[List[str]]): List of dataset names to search within
     - **dataset_ids** (Optional[List[UUID]]): List of dataset UUIDs to search within
     - **query** (str): The search query string
+    - **system_prompt** (Optional[str]): System prompt used for completion-type searches in Cognee
     - **top_k** (Optional[int]): Maximum number of results to return (default: 10)
+    - **only_context** (bool): Set to true to return only the context Cognee would send to the LLM in completion-type searches, instead of the LLM-generated answer
 
     ## Response
     Returns a list of search results containing relevant nodes from the graph.
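Patches 10-14 make NodeSet the default node_type and thread node_type, node_name, system_prompt, and only_context through the search path. A minimal usage sketch of the resulting API, assuming the public cognee.search entry point keeps its query_type/query_text parameters and that a node set named "repo_docs" exists; the dataset and node set names are illustrative, not taken from these patches:

import asyncio

import cognee
from cognee.modules.engine.models.node_set import NodeSet
from cognee.modules.search.types import SearchType


async def main():
    # node_type now defaults to NodeSet (PATCH 11), so passing it is optional;
    # node_name narrows the graph projection to the listed node sets, while
    # None or [] falls back to the full graph projection (PATCH 12).
    results = await cognee.search(
        query_type=SearchType.GRAPH_COMPLETION,
        query_text="What does the repo processor exclude?",
        node_type=NodeSet,
        node_name=["repo_docs"],  # illustrative node set name
        system_prompt_path="answer_simple_question.txt",
        top_k=10,
    )
    print(results)


asyncio.run(main())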
From 978815586cee1c0809c4fc3df57b88cebfc8c2e0 Mon Sep 17 00:00:00 2001
From: Igor Ilic
Date: Fri, 29 Aug 2025 14:21:42 +0200
Subject: [PATCH 15/17] docs: Add docstring for node usage in backend

---
 cognee/api/v1/add/routers/get_add_router.py       | 2 ++
 cognee/api/v1/search/routers/get_search_router.py | 1 +
 2 files changed, 3 insertions(+)

diff --git a/cognee/api/v1/add/routers/get_add_router.py b/cognee/api/v1/add/routers/get_add_router.py
index 8424a4fb5..1703d9931 100644
--- a/cognee/api/v1/add/routers/get_add_router.py
+++ b/cognee/api/v1/add/routers/get_add_router.py
@@ -42,6 +42,8 @@ def get_add_router() -> APIRouter:
       - Regular file uploads
     - **datasetName** (Optional[str]): Name of the dataset to add data to
     - **datasetId** (Optional[UUID]): UUID of an already existing dataset
+    - **node_set** (Optional[list[str]]): List of node identifiers for graph organization and access control.
+      Used for grouping related data points in the knowledge graph.
 
     Either datasetName or datasetId must be provided.
 
diff --git a/cognee/api/v1/search/routers/get_search_router.py b/cognee/api/v1/search/routers/get_search_router.py
index 961532a06..003df7cd4 100644
--- a/cognee/api/v1/search/routers/get_search_router.py
+++ b/cognee/api/v1/search/routers/get_search_router.py
@@ -80,6 +80,7 @@ def get_search_router() -> APIRouter:
     - **datasets** (Optional[List[str]]): List of dataset names to search within
     - **dataset_ids** (Optional[List[UUID]]): List of dataset UUIDs to search within
     - **query** (str): The search query string
+    - **node_name** (Optional[list[str]]): Filter results to specific node_sets defined in the add pipeline (for targeted search)
     - **top_k** (Optional[int]): Maximum number of results to return (default: 10)
 
     ## Response

From 4159846bb39c2197b460f28d28b205953bf8ed39 Mon Sep 17 00:00:00 2001
From: Igor Ilic
Date: Fri, 29 Aug 2025 16:04:14 +0200
Subject: [PATCH 16/17] fix: Make excluded paths use absolute path

---
 cognee/api/v1/cognify/code_graph_pipeline.py | 12 ++++++++----
 cognee/modules/retrieval/code_retriever.py   |  8 ++++++++
 .../get_repo_file_dependencies.py            | 17 +++++++++++++----
 3 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/cognee/api/v1/cognify/code_graph_pipeline.py b/cognee/api/v1/cognify/code_graph_pipeline.py
index 66b8568fa..fb3612857 100644
--- a/cognee/api/v1/cognify/code_graph_pipeline.py
+++ b/cognee/api/v1/cognify/code_graph_pipeline.py
@@ -1,6 +1,7 @@
 import os
 import pathlib
 import asyncio
+from typing import Optional
 
 from cognee.shared.logging_utils import get_logger, setup_logging
 from cognee.modules.observability.get_observe import get_observe
@@ -28,7 +29,12 @@ logger = get_logger("code_graph_pipeline")
 
 
 @observe
-async def run_code_graph_pipeline(repo_path, include_docs=False, excluded_paths=None):
+async def run_code_graph_pipeline(
+    repo_path,
+    include_docs=False,
+    excluded_paths: Optional[list[str]] = None,
+    supported_languages: Optional[list[str]] = None,
+):
     import cognee
     from cognee.low_level import setup
 
@@ -40,8 +46,6 @@ async def run_code_graph_pipeline(repo_path, include_docs=False, excluded_paths=
     user = await get_default_user()
 
     detailed_extraction = True
-    # Multi-language support: allow passing supported_languages
-    supported_languages = None  # defer to task defaults
     tasks = [
         Task(
             get_repo_file_dependencies,
@@ -95,7 +99,7 @@ async def run_code_graph_pipeline(repo_path, include_docs=False, excluded_paths=
 if __name__ == "__main__":
 
     async def main():
-        async for run_status in run_code_graph_pipeline("/Users/igorilic/Desktop/cognee/examples"):
+        async for run_status in run_code_graph_pipeline("REPO_PATH"):
             print(f"{run_status.pipeline_run_id}: {run_status.status}")
 
     file_path = os.path.join(
diff --git a/cognee/modules/retrieval/code_retriever.py b/cognee/modules/retrieval/code_retriever.py
index 6e819d8a7..76b5e758c 100644
--- a/cognee/modules/retrieval/code_retriever.py
+++ b/cognee/modules/retrieval/code_retriever.py
@@ -94,7 +94,15 @@ class CodeRetriever(BaseRetriever):
                     {"id": res.id, "score": res.score, "payload": res.payload}
                 )
 
+        existing_collection = []
         for collection in self.classes_and_functions_collections:
+            if await vector_engine.has_collection(collection):
+                existing_collection.append(collection)
+
+        if not existing_collection:
+            raise RuntimeError("No collection found for code retriever")
+
+        for collection in existing_collection:
             logger.debug(f"Searching {collection} collection with general query")
             search_results_code = await vector_engine.search(
                 collection, query, limit=self.top_k
diff --git a/cognee/tasks/repo_processor/get_repo_file_dependencies.py b/cognee/tasks/repo_processor/get_repo_file_dependencies.py
index 3ebf1fcb1..06cc3bddb 100644
--- a/cognee/tasks/repo_processor/get_repo_file_dependencies.py
+++ b/cognee/tasks/repo_processor/get_repo_file_dependencies.py
@@ -1,6 +1,7 @@
 import asyncio
 import math
 import os
+from pathlib import Path
 from typing import Set
 from typing import AsyncGenerator, Optional, List
 from uuid import NAMESPACE_OID, uuid5
@@ -78,15 +79,22 @@ async def get_source_code_files(
             if lang is None:
                 continue
             # Exclude tests, common build/venv directories and files provided in exclude_paths
-            excluded_dirs = EXCLUDED_DIRS | set(excluded_paths or [])
-            root_parts = set(os.path.normpath(root).split(os.sep))
+            excluded_dirs = EXCLUDED_DIRS
+            excluded_paths = {Path(p).resolve() for p in (excluded_paths or [])}  # full paths
+
+            root_path = Path(root).resolve()
+            root_parts = set(root_path.parts)  # same as before
             base_name, _ext = os.path.splitext(file)
             if (
                 base_name.startswith("test_")
-                or base_name.endswith("_test")  # catches Go's *_test.go and similar
+                or base_name.endswith("_test")
                 or ".test." in file
                 or ".spec." in file
-                or (excluded_dirs & root_parts)
+                or (excluded_dirs & root_parts)  # name match
+                or any(
+                    root_path.is_relative_to(p)  # full-path match
+                    for p in excluded_paths
+                )
             ):
                 continue
             file_path = os.path.abspath(os.path.join(root, file))
@@ -164,6 +172,7 @@ async def get_repo_file_dependencies(
         "go": [".go"],
         "rust": [".rs"],
         "cpp": [".cpp", ".c", ".h", ".hpp"],
+        "c": [".c", ".h"],
     }
     if supported_languages is not None:
         language_config = {

From 0ecea42c2ccc0a12cf69b5dc23b51ae5196f0da5 Mon Sep 17 00:00:00 2001
From: Igor Ilic
Date: Fri, 29 Aug 2025 16:12:25 +0200
Subject: [PATCH 17/17] test: Remove repo path test

---
 cognee/tests/test_repo_processor.py | 46 -----------------------------
 1 file changed, 46 deletions(-)
 delete mode 100644 cognee/tests/test_repo_processor.py

diff --git a/cognee/tests/test_repo_processor.py b/cognee/tests/test_repo_processor.py
deleted file mode 100644
index 2d5868f36..000000000
--- a/cognee/tests/test_repo_processor.py
+++ /dev/null
@@ -1,46 +0,0 @@
-import os
-import shutil
-import tempfile
-from cognee.tasks.repo_processor.get_repo_file_dependencies import get_source_code_files
-
-
-def test_get_source_code_files_excludes_common_dirs_and_files():
-    # Create a temporary test directory
-    test_repo = tempfile.mkdtemp()
-
-    # Create files and folders to include/exclude
-    included_file = os.path.join(test_repo, "main.py")
-    excluded_dirs = [".venv", "node_modules", "__pycache__", ".git"]
-    excluded_files = ["ignore.pyc", "temp.log", "junk.tmp"]
-
-    # Create included file
-    with open(included_file, "w") as f:
-        f.write("print('Hello world')")
-
-    # Create excluded directories and files inside them
-    for folder in excluded_dirs:
-        folder_path = os.path.join(test_repo, folder)
-        os.makedirs(folder_path)
-        file_path = os.path.join(folder_path, "ignored.js")
-        with open(file_path, "w") as f:
-            f.write("// ignore this")
-
-    # Create excluded files in root
-    for file_name in excluded_files:
-        file_path = os.path.join(test_repo, file_name)
-        with open(file_path, "w") as f:
-            f.write("dummy")
-
-    # Run function
-    results = get_source_code_files(test_repo)
-
-    # Assert only included file is present
-    assert included_file in results
-    for root, dirs, files in os.walk(test_repo):
-        for name in files:
-            full_path = os.path.join(root, name)
-            if full_path != included_file:
-                assert full_path not in results, f"{full_path} should have been excluded"
-
-    # Cleanup
-    shutil.rmtree(test_repo)
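For readers following the exclusion change in PATCH 16: the walk now combines the old directory-name check with a full-path check against resolved excluded paths. A condensed, self-contained sketch of that rule; the EXCLUDED_DIRS subset and the sample paths below are illustrative, and Path.is_relative_to requires Python 3.9+:

from pathlib import Path

# Illustrative subset of the real EXCLUDED_DIRS constant.
EXCLUDED_DIRS = {".venv", "venv", "__pycache__", ".git", "node_modules"}


def is_excluded(root: str, excluded_paths: list[str]) -> bool:
    """Return True when a walked directory should be skipped.

    Mirrors the two checks used in get_source_code_files: a match on any
    path component name, or containment inside an explicitly excluded path.
    """
    root_path = Path(root).resolve()
    resolved = {Path(p).resolve() for p in excluded_paths}

    name_match = bool(EXCLUDED_DIRS & set(root_path.parts))          # name match
    path_match = any(root_path.is_relative_to(p) for p in resolved)  # full-path match
    return name_match or path_match


if __name__ == "__main__":
    print(is_excluded("/repo/.venv/lib", []))                   # True: name component match
    print(is_excluded("/repo/generated", ["/repo/generated"]))  # True: full-path match
    print(is_excluded("/repo/src", []))                         # False

Resolving both sides to absolute paths before comparing is what makes the check robust to relative excluded_paths being passed into run_code_graph_pipeline, which is the point of this patch.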
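The code_retriever change in the same patch follows a defensive pattern worth noting: probe for collections before querying and fail fast when none exist. A sketch under the assumption of a vector engine exposing has_collection and search coroutines; the engine interface and return shape here are placeholders rather than the exact cognee API:

from typing import Any, List


async def search_existing_collections(
    vector_engine: Any, collections: List[str], query: str, top_k: int
) -> list:
    """Search only the collections that actually exist; raise if none do."""
    existing = []
    for collection in collections:
        if await vector_engine.has_collection(collection):
            existing.append(collection)

    if not existing:
        raise RuntimeError("No collection found for code retriever")

    results = []
    for collection in existing:
        # Assumes search() returns a list of scored results per collection.
        results += await vector_engine.search(collection, query, limit=top_k)
    return results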