From 4ca1de266e344ec5b914f80407760b72ebfc76b3 Mon Sep 17 00:00:00 2001 From: Hassan <261925524@formanite.fccollege.edu.pk> Date: Thu, 31 Jul 2025 05:15:41 -0700 Subject: [PATCH 01/17] feat/configurable-path-exclusion --- cognee/api/v1/cognify/code_graph_pipeline.py | 20 +++- .../get_repo_file_dependencies.py | 106 ++++++++---------- cognee/tests/test_repo_processor.py | 45 ++++++++ 3 files changed, 109 insertions(+), 62 deletions(-) create mode 100644 cognee/tests/test_repo_processor.py diff --git a/cognee/api/v1/cognify/code_graph_pipeline.py b/cognee/api/v1/cognify/code_graph_pipeline.py index 00a0d3dc9..d7faab6b5 100644 --- a/cognee/api/v1/cognify/code_graph_pipeline.py +++ b/cognee/api/v1/cognify/code_graph_pipeline.py @@ -28,7 +28,7 @@ logger = get_logger("code_graph_pipeline") @observe -async def run_code_graph_pipeline(repo_path, include_docs=False): +async def run_code_graph_pipeline(repo_path, include_docs=False, excluded_paths=None): import cognee from cognee.low_level import setup @@ -40,14 +40,25 @@ async def run_code_graph_pipeline(repo_path, include_docs=False): user = await get_default_user() detailed_extraction = True + # Default exclusion patterns + if excluded_paths is None: + excluded_paths = [ + ".venv/", "venv/", "__pycache__/", ".pytest_cache/", + "build/", "dist/", "node_modules/", ".npm/", ".git/", + ".svn/", ".idea/", ".vscode/", "tmp/", "temp/", + "*.pyc", "*.pyo", "*.log", "*.tmp" + ] + tasks = [ - Task(get_repo_file_dependencies, detailed_extraction=detailed_extraction), - # Task(summarize_code, task_config={"batch_size": 500}), # This task takes a long time to complete + Task( + get_repo_file_dependencies, + detailed_extraction=detailed_extraction, + excluded_paths=excluded_paths + ), Task(add_data_points, task_config={"batch_size": 30}), ] if include_docs: - # This tasks take a long time to complete non_code_tasks = [ Task(get_non_py_files, task_config={"batch_size": 50}), Task(ingest_data, dataset_name="repo_docs", user=user), @@ -67,7 +78,6 @@ async def run_code_graph_pipeline(repo_path, include_docs=False): dataset_name = "codebase" - # Save dataset to database db_engine = get_relational_engine() async with db_engine.get_async_session() as session: dataset = await create_dataset(dataset_name, user, session) diff --git a/cognee/tasks/repo_processor/get_repo_file_dependencies.py b/cognee/tasks/repo_processor/get_repo_file_dependencies.py index 232850936..2567a44cd 100644 --- a/cognee/tasks/repo_processor/get_repo_file_dependencies.py +++ b/cognee/tasks/repo_processor/get_repo_file_dependencies.py @@ -1,56 +1,68 @@ import asyncio import math import os - -# from concurrent.futures import ProcessPoolExecutor -from typing import AsyncGenerator +import fnmatch +from typing import AsyncGenerator, Optional, List from uuid import NAMESPACE_OID, uuid5 from cognee.infrastructure.engine import DataPoint from cognee.shared.CodeGraphEntities import CodeFile, Repository -async def get_source_code_files(repo_path): +async def get_source_code_files(repo_path: str, excluded_paths: Optional[List[str]] = None): """ - Retrieve Python source code files from the specified repository path. - - This function scans the given repository path for files that have the .py extension - while excluding test files and files within a virtual environment. It returns a list of - absolute paths to the source code files that are not empty. + Retrieve Python source code files from the specified repository path, + excluding paths and file patterns commonly irrelevant to code analysis. 
Parameters: ----------- - - - repo_path: The file path to the repository to search for Python source files. + - repo_path: Root path of the repository to search + - excluded_paths: Optional list of path fragments or glob patterns to exclude Returns: -------- - - A list of absolute paths to .py files that contain source code, excluding empty - files, test files, and files from a virtual environment. + List of absolute file paths for .py files, excluding test files, + empty files, and files under ignored directories or matching ignore patterns. """ - if not os.path.exists(repo_path): - return {} - py_files_paths = ( - os.path.join(root, file) - for root, _, files in os.walk(repo_path) - for file in files - if ( - file.endswith(".py") - and not file.startswith("test_") - and not file.endswith("_test") - and ".venv" not in file - ) - ) + if not os.path.exists(repo_path): + return [] + + # Default exclusions + default_excluded_patterns = [ + ".venv/", "venv/", "__pycache__/", ".pytest_cache/", "build/", "dist/", + "node_modules/", ".npm/", ".git/", ".svn/", ".idea/", ".vscode/", "tmp/", "temp/", + "*.pyc", "*.pyo", "*.log", "*.tmp" + ] + + excluded_patterns = default_excluded_patterns + (excluded_paths or []) + + py_files_paths = [] + for root, _, files in os.walk(repo_path): + for file in files: + full_path = os.path.join(root, file) + rel_path = os.path.relpath(full_path, repo_path) + + # Check for exclusion + should_exclude = any( + pattern in rel_path or fnmatch.fnmatch(rel_path, pattern) + for pattern in excluded_patterns + ) + if should_exclude: + continue + + if ( + file.endswith(".py") + and not file.startswith("test_") + and not file.endswith("_test") + ): + py_files_paths.append(full_path) source_code_files = set() for file_path in py_files_paths: file_path = os.path.abspath(file_path) - if os.path.getsize(file_path) == 0: continue - source_code_files.add(file_path) return list(source_code_files) @@ -62,20 +74,7 @@ def run_coroutine(coroutine_func, *args, **kwargs): This function creates a new asyncio event loop, sets it as the current loop, and executes the given coroutine function with the provided arguments. Once the coroutine - completes, the loop is closed. Intended for use in environments where an existing event - loop is not available or desirable. - - Parameters: - ----------- - - - coroutine_func: The coroutine function to be run. - - *args: Positional arguments to pass to the coroutine function. - - **kwargs: Keyword arguments to pass to the coroutine function. - - Returns: - -------- - - The result returned by the coroutine after completion. + completes, the loop is closed. """ loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) @@ -85,28 +84,24 @@ def run_coroutine(coroutine_func, *args, **kwargs): async def get_repo_file_dependencies( - repo_path: str, detailed_extraction: bool = False + repo_path: str, + detailed_extraction: bool = False, + excluded_paths: Optional[List[str]] = None ) -> AsyncGenerator[DataPoint, None]: """ Generate a dependency graph for Python files in the given repository path. - Check the validity of the repository path and yield a repository object followed by the - dependencies of Python files within that repository. Raise a FileNotFoundError if the - provided path does not exist. The extraction of detailed dependencies can be controlled - via the `detailed_extraction` argument. - Parameters: ----------- - - - repo_path (str): The file path to the repository where Python files are located. 
- - detailed_extraction (bool): A flag indicating whether to perform a detailed - extraction of dependencies (default is False). (default False) + - repo_path: Path to local repository + - detailed_extraction: Whether to extract fine-grained dependencies + - excluded_paths: Optional custom exclusion list """ if not os.path.exists(repo_path): raise FileNotFoundError(f"Repository path {repo_path} does not exist.") - source_code_files = await get_source_code_files(repo_path) + source_code_files = await get_source_code_files(repo_path, excluded_paths=excluded_paths) repo = Repository( id=uuid5(NAMESPACE_OID, repo_path), @@ -125,11 +120,9 @@ async def get_repo_file_dependencies( for chunk_number in range(number_of_chunks) ] - # Codegraph dependencies are not installed by default, so we import where we use them. from cognee.tasks.repo_processor.get_local_dependencies import get_local_script_dependencies for start_range, end_range in chunk_ranges: - # with ProcessPoolExecutor(max_workers=12) as executor: tasks = [ get_local_script_dependencies(repo_path, file_path, detailed_extraction) for file_path in source_code_files[start_range : end_range + 1] @@ -139,5 +132,4 @@ async def get_repo_file_dependencies( for source_code_file in results: source_code_file.part_of = repo - yield source_code_file diff --git a/cognee/tests/test_repo_processor.py b/cognee/tests/test_repo_processor.py new file mode 100644 index 000000000..4de102da6 --- /dev/null +++ b/cognee/tests/test_repo_processor.py @@ -0,0 +1,45 @@ +import os +import shutil +import tempfile +from cognee.tasks.repo_processor.code_graph_repo import get_source_code_files + +def test_get_source_code_files_excludes_common_dirs_and_files(): + # Create a temporary test directory + test_repo = tempfile.mkdtemp() + + # Create files and folders to include/exclude + included_file = os.path.join(test_repo, "main.py") + excluded_dirs = [".venv", "node_modules", "__pycache__", ".git"] + excluded_files = ["ignore.pyc", "temp.log", "junk.tmp"] + + # Create included file + with open(included_file, "w") as f: + f.write("print('Hello world')") + + # Create excluded directories and files inside them + for folder in excluded_dirs: + folder_path = os.path.join(test_repo, folder) + os.makedirs(folder_path) + file_path = os.path.join(folder_path, "ignored.js") + with open(file_path, "w") as f: + f.write("// ignore this") + + # Create excluded files in root + for file_name in excluded_files: + file_path = os.path.join(test_repo, file_name) + with open(file_path, "w") as f: + f.write("dummy") + + # Run function + results = get_source_code_files(test_repo) + + # Assert only included file is present + assert included_file in results + for root, dirs, files in os.walk(test_repo): + for name in files: + full_path = os.path.join(root, name) + if full_path != included_file: + assert full_path not in results, f"{full_path} should have been excluded" + + # Cleanup + shutil.rmtree(test_repo) From c898895f2229f851127a977411abb6b9cc6a4f74 Mon Sep 17 00:00:00 2001 From: Hassan <261925524@formanite.fccollege.edu.pk> Date: Thu, 31 Jul 2025 07:00:11 -0700 Subject: [PATCH 02/17] feat/configurable-path-exclusion --- cognee/tests/test_repo_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cognee/tests/test_repo_processor.py b/cognee/tests/test_repo_processor.py index 4de102da6..fc3c26b05 100644 --- a/cognee/tests/test_repo_processor.py +++ b/cognee/tests/test_repo_processor.py @@ -1,7 +1,7 @@ import os import shutil import tempfile -from 
cognee.tasks.repo_processor.code_graph_repo import get_source_code_files +from cognee.tasks.repo_processor.get_repo_file_dependencies import get_source_code_files def test_get_source_code_files_excludes_common_dirs_and_files(): # Create a temporary test directory From 8f26a01b3ab744a818bfeaeae932a41921f92ccc Mon Sep 17 00:00:00 2001 From: Hassan <261925524@formanite.fccollege.edu.pk> Date: Sat, 2 Aug 2025 10:33:07 -0700 Subject: [PATCH 03/17] style: run ruff format and fix lint issues --- cognee/api/v1/cognify/code_graph_pipeline.py | 24 +++++++++++--- .../get_repo_file_dependencies.py | 31 ++++++++++++------- cognee/tests/test_repo_processor.py | 1 + 3 files changed, 40 insertions(+), 16 deletions(-) diff --git a/cognee/api/v1/cognify/code_graph_pipeline.py b/cognee/api/v1/cognify/code_graph_pipeline.py index d7faab6b5..ae1c8b0ac 100644 --- a/cognee/api/v1/cognify/code_graph_pipeline.py +++ b/cognee/api/v1/cognify/code_graph_pipeline.py @@ -43,17 +43,31 @@ async def run_code_graph_pipeline(repo_path, include_docs=False, excluded_paths= # Default exclusion patterns if excluded_paths is None: excluded_paths = [ - ".venv/", "venv/", "__pycache__/", ".pytest_cache/", - "build/", "dist/", "node_modules/", ".npm/", ".git/", - ".svn/", ".idea/", ".vscode/", "tmp/", "temp/", - "*.pyc", "*.pyo", "*.log", "*.tmp" + ".venv/", + "venv/", + "__pycache__/", + ".pytest_cache/", + "build/", + "dist/", + "node_modules/", + ".npm/", + ".git/", + ".svn/", + ".idea/", + ".vscode/", + "tmp/", + "temp/", + "*.pyc", + "*.pyo", + "*.log", + "*.tmp", ] tasks = [ Task( get_repo_file_dependencies, detailed_extraction=detailed_extraction, - excluded_paths=excluded_paths + excluded_paths=excluded_paths, ), Task(add_data_points, task_config={"batch_size": 30}), ] diff --git a/cognee/tasks/repo_processor/get_repo_file_dependencies.py b/cognee/tasks/repo_processor/get_repo_file_dependencies.py index 2567a44cd..f1435a9e2 100644 --- a/cognee/tasks/repo_processor/get_repo_file_dependencies.py +++ b/cognee/tasks/repo_processor/get_repo_file_dependencies.py @@ -30,9 +30,24 @@ async def get_source_code_files(repo_path: str, excluded_paths: Optional[List[st # Default exclusions default_excluded_patterns = [ - ".venv/", "venv/", "__pycache__/", ".pytest_cache/", "build/", "dist/", - "node_modules/", ".npm/", ".git/", ".svn/", ".idea/", ".vscode/", "tmp/", "temp/", - "*.pyc", "*.pyo", "*.log", "*.tmp" + ".venv/", + "venv/", + "__pycache__/", + ".pytest_cache/", + "build/", + "dist/", + "node_modules/", + ".npm/", + ".git/", + ".svn/", + ".idea/", + ".vscode/", + "tmp/", + "temp/", + "*.pyc", + "*.pyo", + "*.log", + "*.tmp", ] excluded_patterns = default_excluded_patterns + (excluded_paths or []) @@ -51,11 +66,7 @@ async def get_source_code_files(repo_path: str, excluded_paths: Optional[List[st if should_exclude: continue - if ( - file.endswith(".py") - and not file.startswith("test_") - and not file.endswith("_test") - ): + if file.endswith(".py") and not file.startswith("test_") and not file.endswith("_test"): py_files_paths.append(full_path) source_code_files = set() @@ -84,9 +95,7 @@ def run_coroutine(coroutine_func, *args, **kwargs): async def get_repo_file_dependencies( - repo_path: str, - detailed_extraction: bool = False, - excluded_paths: Optional[List[str]] = None + repo_path: str, detailed_extraction: bool = False, excluded_paths: Optional[List[str]] = None ) -> AsyncGenerator[DataPoint, None]: """ Generate a dependency graph for Python files in the given repository path. 
diff --git a/cognee/tests/test_repo_processor.py b/cognee/tests/test_repo_processor.py index fc3c26b05..2d5868f36 100644 --- a/cognee/tests/test_repo_processor.py +++ b/cognee/tests/test_repo_processor.py @@ -3,6 +3,7 @@ import shutil import tempfile from cognee.tasks.repo_processor.get_repo_file_dependencies import get_source_code_files + def test_get_source_code_files_excludes_common_dirs_and_files(): # Create a temporary test directory test_repo = tempfile.mkdtemp() From ac87e62adb55803cc2335889b21bcc3777d3d833 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 28 Aug 2025 10:52:08 +0200 Subject: [PATCH 04/17] feat: Save search flag progress --- .../modules/retrieval/completion_retriever.py | 17 ++++++++++++-- ..._completion_context_extension_retriever.py | 13 ++++++++++- .../graph_completion_cot_retriever.py | 15 +++++++++++-- .../retrieval/graph_completion_retriever.py | 12 +++++++++- cognee/modules/retrieval/utils/completion.py | 22 +++++++++++++------ cognee/modules/search/methods/search.py | 7 +++++- 6 files changed, 72 insertions(+), 14 deletions(-) diff --git a/cognee/modules/retrieval/completion_retriever.py b/cognee/modules/retrieval/completion_retriever.py index 655a9010d..e9c8331a1 100644 --- a/cognee/modules/retrieval/completion_retriever.py +++ b/cognee/modules/retrieval/completion_retriever.py @@ -65,7 +65,14 @@ class CompletionRetriever(BaseRetriever): logger.error("DocumentChunk_text collection not found") raise NoDataError("No data found in the system, please add data first.") from error - async def get_completion(self, query: str, context: Optional[Any] = None) -> Any: + async def get_completion( + self, + query: str, + context: Optional[Any] = None, + user_prompt: str = None, + system_prompt: str = None, + only_context: bool = False, + ) -> Any: """ Generates an LLM completion using the context. 
@@ -88,6 +95,12 @@ class CompletionRetriever(BaseRetriever): context = await self.get_context(query) completion = await generate_completion( - query, context, self.user_prompt_path, self.system_prompt_path + query=query, + context=context, + user_prompt_path=self.user_prompt_path, + system_prompt_path=self.system_prompt_path, + user_prompt=user_prompt, + system_prompt=system_prompt, + only_context=only_context, ) return [completion] diff --git a/cognee/modules/retrieval/graph_completion_context_extension_retriever.py b/cognee/modules/retrieval/graph_completion_context_extension_retriever.py index d05e6b4fa..f25edb4a7 100644 --- a/cognee/modules/retrieval/graph_completion_context_extension_retriever.py +++ b/cognee/modules/retrieval/graph_completion_context_extension_retriever.py @@ -41,7 +41,13 @@ class GraphCompletionContextExtensionRetriever(GraphCompletionRetriever): ) async def get_completion( - self, query: str, context: Optional[Any] = None, context_extension_rounds=4 + self, + query: str, + context: Optional[Any] = None, + user_prompt: str = None, + system_prompt: str = None, + only_context: bool = False, + context_extension_rounds=4, ) -> List[str]: """ Extends the context for a given query by retrieving related triplets and generating new @@ -86,6 +92,8 @@ class GraphCompletionContextExtensionRetriever(GraphCompletionRetriever): context=context, user_prompt_path=self.user_prompt_path, system_prompt_path=self.system_prompt_path, + user_prompt=user_prompt, + system_prompt=system_prompt, ) triplets += await self.get_triplets(completion) @@ -112,6 +120,9 @@ class GraphCompletionContextExtensionRetriever(GraphCompletionRetriever): context=context, user_prompt_path=self.user_prompt_path, system_prompt_path=self.system_prompt_path, + user_prompt=user_prompt, + system_prompt=system_prompt, + only_context=only_context, ) if self.save_interaction and context and triplets and completion: diff --git a/cognee/modules/retrieval/graph_completion_cot_retriever.py b/cognee/modules/retrieval/graph_completion_cot_retriever.py index 032dccf9e..63ab6b3b7 100644 --- a/cognee/modules/retrieval/graph_completion_cot_retriever.py +++ b/cognee/modules/retrieval/graph_completion_cot_retriever.py @@ -51,7 +51,13 @@ class GraphCompletionCotRetriever(GraphCompletionRetriever): self.followup_user_prompt_path = followup_user_prompt_path async def get_completion( - self, query: str, context: Optional[Any] = None, max_iter=4 + self, + query: str, + context: Optional[Any] = None, + user_prompt: str = None, + system_prompt: str = None, + only_context: bool = False, + max_iter=4, ) -> List[str]: """ Generate completion responses based on a user query and contextual information. 
@@ -92,6 +98,8 @@ class GraphCompletionCotRetriever(GraphCompletionRetriever): context=context, user_prompt_path=self.user_prompt_path, system_prompt_path=self.system_prompt_path, + user_prompt=user_prompt, + system_prompt=system_prompt, ) logger.info(f"Chain-of-thought: round {round_idx} - answer: {completion}") if round_idx < max_iter: @@ -128,4 +136,7 @@ class GraphCompletionCotRetriever(GraphCompletionRetriever): question=query, answer=completion, context=context, triplets=triplets ) - return [completion] + if only_context: + return [context] + else: + return [completion] diff --git a/cognee/modules/retrieval/graph_completion_retriever.py b/cognee/modules/retrieval/graph_completion_retriever.py index fb3cf4885..d88252054 100644 --- a/cognee/modules/retrieval/graph_completion_retriever.py +++ b/cognee/modules/retrieval/graph_completion_retriever.py @@ -151,7 +151,14 @@ class GraphCompletionRetriever(BaseRetriever): return context, triplets - async def get_completion(self, query: str, context: Optional[Any] = None) -> Any: + async def get_completion( + self, + query: str, + context: Optional[Any] = None, + user_prompt: str = None, + system_prompt: str = None, + only_context: bool = False, + ) -> Any: """ Generates a completion using graph connections context based on a query. @@ -177,6 +184,9 @@ class GraphCompletionRetriever(BaseRetriever): context=context, user_prompt_path=self.user_prompt_path, system_prompt_path=self.system_prompt_path, + user_prompt=user_prompt, + system_prompt=system_prompt, + only_context=only_context, ) if self.save_interaction and context and triplets and completion: diff --git a/cognee/modules/retrieval/utils/completion.py b/cognee/modules/retrieval/utils/completion.py index ca0b30c18..69381d647 100644 --- a/cognee/modules/retrieval/utils/completion.py +++ b/cognee/modules/retrieval/utils/completion.py @@ -6,18 +6,26 @@ async def generate_completion( context: str, user_prompt_path: str, system_prompt_path: str, + user_prompt: str = None, + system_prompt: str = None, + only_context: bool = False, ) -> str: """Generates a completion using LLM with given context and prompts.""" args = {"question": query, "context": context} - user_prompt = LLMGateway.render_prompt(user_prompt_path, args) - system_prompt = LLMGateway.read_query_prompt(system_prompt_path) - - return await LLMGateway.acreate_structured_output( - text_input=user_prompt, - system_prompt=system_prompt, - response_model=str, + user_prompt = LLMGateway.render_prompt(user_prompt if user_prompt else user_prompt_path, args) + system_prompt = LLMGateway.read_query_prompt( + system_prompt if system_prompt else system_prompt_path ) + if only_context: + return context + else: + return await LLMGateway.acreate_structured_output( + text_input=user_prompt, + system_prompt=system_prompt, + response_model=str, + ) + async def summarize_text( text: str, diff --git a/cognee/modules/search/methods/search.py b/cognee/modules/search/methods/search.py index f5f2a793a..3e5d6ffcd 100644 --- a/cognee/modules/search/methods/search.py +++ b/cognee/modules/search/methods/search.py @@ -101,11 +101,14 @@ async def specific_search( query: str, user: User, system_prompt_path="answer_simple_question.txt", + user_prompt: str = None, + system_prompt: str = None, top_k: int = 10, node_type: Optional[Type] = None, node_name: Optional[List[str]] = None, save_interaction: Optional[bool] = False, last_k: Optional[int] = None, + only_context: bool = None, ) -> list: search_tasks: dict[SearchType, Callable] = { SearchType.SUMMARIES: 
SummariesRetriever(top_k=top_k).get_completion, @@ -159,7 +162,9 @@ async def specific_search( send_telemetry("cognee.search EXECUTION STARTED", user.id) - results = await search_task(query) + results = await search_task( + query=query, system_prompt=system_prompt, user_prompt=user_prompt, only_context=only_context + ) send_telemetry("cognee.search EXECUTION COMPLETED", user.id) From 2915698d601f8ce84d5d63458d0e8da51794fa67 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 28 Aug 2025 13:43:37 +0200 Subject: [PATCH 05/17] feat: Add only_context and system prompt flags for search --- .../v1/search/routers/get_search_router.py | 6 + cognee/api/v1/search/search.py | 4 + .../modules/retrieval/completion_retriever.py | 18 ++- ..._completion_context_extension_retriever.py | 20 +-- .../graph_completion_cot_retriever.py | 12 +- .../retrieval/graph_completion_retriever.py | 12 +- .../graph_summary_completion_retriever.py | 4 +- .../modules/retrieval/summaries_retriever.py | 2 +- cognee/modules/retrieval/utils/completion.py | 18 +-- cognee/modules/search/methods/search.py | 117 +++++++++++++----- 10 files changed, 140 insertions(+), 73 deletions(-) diff --git a/cognee/api/v1/search/routers/get_search_router.py b/cognee/api/v1/search/routers/get_search_router.py index 0ceeb1abb..b141c6bdc 100644 --- a/cognee/api/v1/search/routers/get_search_router.py +++ b/cognee/api/v1/search/routers/get_search_router.py @@ -20,7 +20,9 @@ class SearchPayloadDTO(InDTO): datasets: Optional[list[str]] = Field(default=None) dataset_ids: Optional[list[UUID]] = Field(default=None, examples=[[]]) query: str = Field(default="What is in the document?") + system_prompt: Optional[str] = Field(default=None) top_k: Optional[int] = Field(default=10) + only_context: bool = Field(default=False) def get_search_router() -> APIRouter: @@ -102,7 +104,9 @@ def get_search_router() -> APIRouter: "datasets": payload.datasets, "dataset_ids": [str(dataset_id) for dataset_id in payload.dataset_ids or []], "query": payload.query, + "system_prompt": payload.system_prompt, "top_k": payload.top_k, + "only_context": payload.only_context, }, ) @@ -115,7 +119,9 @@ def get_search_router() -> APIRouter: user=user, datasets=payload.datasets, dataset_ids=payload.dataset_ids, + system_prompt=payload.system_prompt, top_k=payload.top_k, + only_context=payload.only_context, ) return results diff --git a/cognee/api/v1/search/search.py b/cognee/api/v1/search/search.py index f37f8ba6d..113d33557 100644 --- a/cognee/api/v1/search/search.py +++ b/cognee/api/v1/search/search.py @@ -16,11 +16,13 @@ async def search( datasets: Optional[Union[list[str], str]] = None, dataset_ids: Optional[Union[list[UUID], UUID]] = None, system_prompt_path: str = "answer_simple_question.txt", + system_prompt: Optional[str] = None, top_k: int = 10, node_type: Optional[Type] = None, node_name: Optional[List[str]] = None, save_interaction: bool = False, last_k: Optional[int] = None, + only_context: bool = False, ) -> list: """ Search and query the knowledge graph for insights, information, and connections. 
@@ -183,11 +185,13 @@ async def search( dataset_ids=dataset_ids if dataset_ids else datasets, user=user, system_prompt_path=system_prompt_path, + system_prompt=system_prompt, top_k=top_k, node_type=node_type, node_name=node_name, save_interaction=save_interaction, last_k=last_k, + only_context=only_context, ) return filtered_search_results diff --git a/cognee/modules/retrieval/completion_retriever.py b/cognee/modules/retrieval/completion_retriever.py index e9c8331a1..4d34dfdbe 100644 --- a/cognee/modules/retrieval/completion_retriever.py +++ b/cognee/modules/retrieval/completion_retriever.py @@ -23,12 +23,16 @@ class CompletionRetriever(BaseRetriever): self, user_prompt_path: str = "context_for_question.txt", system_prompt_path: str = "answer_simple_question.txt", + system_prompt: str = None, top_k: Optional[int] = 1, + only_context: bool = False, ): """Initialize retriever with optional custom prompt paths.""" self.user_prompt_path = user_prompt_path self.system_prompt_path = system_prompt_path self.top_k = top_k if top_k is not None else 1 + self.system_prompt = system_prompt + self.only_context = only_context async def get_context(self, query: str) -> str: """ @@ -65,14 +69,7 @@ class CompletionRetriever(BaseRetriever): logger.error("DocumentChunk_text collection not found") raise NoDataError("No data found in the system, please add data first.") from error - async def get_completion( - self, - query: str, - context: Optional[Any] = None, - user_prompt: str = None, - system_prompt: str = None, - only_context: bool = False, - ) -> Any: + async def get_completion(self, query: str, context: Optional[Any] = None) -> Any: """ Generates an LLM completion using the context. @@ -99,8 +96,7 @@ class CompletionRetriever(BaseRetriever): context=context, user_prompt_path=self.user_prompt_path, system_prompt_path=self.system_prompt_path, - user_prompt=user_prompt, - system_prompt=system_prompt, - only_context=only_context, + system_prompt=self.system_prompt, + only_context=self.only_context, ) return [completion] diff --git a/cognee/modules/retrieval/graph_completion_context_extension_retriever.py b/cognee/modules/retrieval/graph_completion_context_extension_retriever.py index f25edb4a7..8bdf5f1a0 100644 --- a/cognee/modules/retrieval/graph_completion_context_extension_retriever.py +++ b/cognee/modules/retrieval/graph_completion_context_extension_retriever.py @@ -26,10 +26,12 @@ class GraphCompletionContextExtensionRetriever(GraphCompletionRetriever): self, user_prompt_path: str = "graph_context_for_question.txt", system_prompt_path: str = "answer_simple_question.txt", + system_prompt: Optional[str] = None, top_k: Optional[int] = 5, node_type: Optional[Type] = None, node_name: Optional[List[str]] = None, save_interaction: bool = False, + only_context: bool = False, ): super().__init__( user_prompt_path=user_prompt_path, @@ -38,15 +40,14 @@ class GraphCompletionContextExtensionRetriever(GraphCompletionRetriever): node_type=node_type, node_name=node_name, save_interaction=save_interaction, + system_prompt=system_prompt, + only_context=only_context, ) async def get_completion( self, query: str, context: Optional[Any] = None, - user_prompt: str = None, - system_prompt: str = None, - only_context: bool = False, context_extension_rounds=4, ) -> List[str]: """ @@ -92,8 +93,7 @@ class GraphCompletionContextExtensionRetriever(GraphCompletionRetriever): context=context, user_prompt_path=self.user_prompt_path, system_prompt_path=self.system_prompt_path, - user_prompt=user_prompt, - system_prompt=system_prompt, 
+ system_prompt=self.system_prompt, ) triplets += await self.get_triplets(completion) @@ -120,9 +120,8 @@ class GraphCompletionContextExtensionRetriever(GraphCompletionRetriever): context=context, user_prompt_path=self.user_prompt_path, system_prompt_path=self.system_prompt_path, - user_prompt=user_prompt, - system_prompt=system_prompt, - only_context=only_context, + system_prompt=self.system_prompt, + only_context=self.only_context, ) if self.save_interaction and context and triplets and completion: @@ -130,4 +129,7 @@ class GraphCompletionContextExtensionRetriever(GraphCompletionRetriever): question=query, answer=completion, context=context, triplets=triplets ) - return [completion] + if self.only_context: + return [context] + else: + return [completion] diff --git a/cognee/modules/retrieval/graph_completion_cot_retriever.py b/cognee/modules/retrieval/graph_completion_cot_retriever.py index 63ab6b3b7..86ff8555b 100644 --- a/cognee/modules/retrieval/graph_completion_cot_retriever.py +++ b/cognee/modules/retrieval/graph_completion_cot_retriever.py @@ -32,14 +32,18 @@ class GraphCompletionCotRetriever(GraphCompletionRetriever): validation_system_prompt_path: str = "cot_validation_system_prompt.txt", followup_system_prompt_path: str = "cot_followup_system_prompt.txt", followup_user_prompt_path: str = "cot_followup_user_prompt.txt", + system_prompt: str = None, top_k: Optional[int] = 5, node_type: Optional[Type] = None, node_name: Optional[List[str]] = None, save_interaction: bool = False, + only_context: bool = False, ): super().__init__( user_prompt_path=user_prompt_path, system_prompt_path=system_prompt_path, + system_prompt=system_prompt, + only_context=only_context, top_k=top_k, node_type=node_type, node_name=node_name, @@ -54,9 +58,6 @@ class GraphCompletionCotRetriever(GraphCompletionRetriever): self, query: str, context: Optional[Any] = None, - user_prompt: str = None, - system_prompt: str = None, - only_context: bool = False, max_iter=4, ) -> List[str]: """ @@ -98,8 +99,7 @@ class GraphCompletionCotRetriever(GraphCompletionRetriever): context=context, user_prompt_path=self.user_prompt_path, system_prompt_path=self.system_prompt_path, - user_prompt=user_prompt, - system_prompt=system_prompt, + system_prompt=self.system_prompt, ) logger.info(f"Chain-of-thought: round {round_idx} - answer: {completion}") if round_idx < max_iter: @@ -136,7 +136,7 @@ class GraphCompletionCotRetriever(GraphCompletionRetriever): question=query, answer=completion, context=context, triplets=triplets ) - if only_context: + if self.only_context: return [context] else: return [completion] diff --git a/cognee/modules/retrieval/graph_completion_retriever.py b/cognee/modules/retrieval/graph_completion_retriever.py index d88252054..6a5193c56 100644 --- a/cognee/modules/retrieval/graph_completion_retriever.py +++ b/cognee/modules/retrieval/graph_completion_retriever.py @@ -36,15 +36,19 @@ class GraphCompletionRetriever(BaseRetriever): self, user_prompt_path: str = "graph_context_for_question.txt", system_prompt_path: str = "answer_simple_question.txt", + system_prompt: str = None, top_k: Optional[int] = 5, node_type: Optional[Type] = None, node_name: Optional[List[str]] = None, save_interaction: bool = False, + only_context: bool = False, ): """Initialize retriever with prompt paths and search parameters.""" self.save_interaction = save_interaction self.user_prompt_path = user_prompt_path self.system_prompt_path = system_prompt_path + self.system_prompt = system_prompt + self.only_context = only_context self.top_k = 
top_k if top_k is not None else 5 self.node_type = node_type self.node_name = node_name @@ -155,9 +159,6 @@ class GraphCompletionRetriever(BaseRetriever): self, query: str, context: Optional[Any] = None, - user_prompt: str = None, - system_prompt: str = None, - only_context: bool = False, ) -> Any: """ Generates a completion using graph connections context based on a query. @@ -184,9 +185,8 @@ class GraphCompletionRetriever(BaseRetriever): context=context, user_prompt_path=self.user_prompt_path, system_prompt_path=self.system_prompt_path, - user_prompt=user_prompt, - system_prompt=system_prompt, - only_context=only_context, + system_prompt=self.system_prompt, + only_context=self.only_context, ) if self.save_interaction and context and triplets and completion: diff --git a/cognee/modules/retrieval/graph_summary_completion_retriever.py b/cognee/modules/retrieval/graph_summary_completion_retriever.py index d344ebd26..051f39b22 100644 --- a/cognee/modules/retrieval/graph_summary_completion_retriever.py +++ b/cognee/modules/retrieval/graph_summary_completion_retriever.py @@ -21,6 +21,7 @@ class GraphSummaryCompletionRetriever(GraphCompletionRetriever): user_prompt_path: str = "graph_context_for_question.txt", system_prompt_path: str = "answer_simple_question.txt", summarize_prompt_path: str = "summarize_search_results.txt", + system_prompt: Optional[str] = None, top_k: Optional[int] = 5, node_type: Optional[Type] = None, node_name: Optional[List[str]] = None, @@ -34,6 +35,7 @@ class GraphSummaryCompletionRetriever(GraphCompletionRetriever): node_type=node_type, node_name=node_name, save_interaction=save_interaction, + system_prompt=system_prompt, ) self.summarize_prompt_path = summarize_prompt_path @@ -57,4 +59,4 @@ class GraphSummaryCompletionRetriever(GraphCompletionRetriever): - str: A summary string representing the content of the retrieved edges. """ direct_text = await super().resolve_edges_to_text(retrieved_edges) - return await summarize_text(direct_text, self.summarize_prompt_path) + return await summarize_text(direct_text, self.summarize_prompt_path, self.system_prompt) diff --git a/cognee/modules/retrieval/summaries_retriever.py b/cognee/modules/retrieval/summaries_retriever.py index 56f414013..df35cdc51 100644 --- a/cognee/modules/retrieval/summaries_retriever.py +++ b/cognee/modules/retrieval/summaries_retriever.py @@ -62,7 +62,7 @@ class SummariesRetriever(BaseRetriever): logger.info(f"Returning {len(summary_payloads)} summary payloads") return summary_payloads - async def get_completion(self, query: str, context: Optional[Any] = None) -> Any: + async def get_completion(self, query: str, context: Optional[Any] = None, **kwargs) -> Any: """ Generates a completion using summaries context. 
diff --git a/cognee/modules/retrieval/utils/completion.py b/cognee/modules/retrieval/utils/completion.py index 69381d647..4c2639517 100644 --- a/cognee/modules/retrieval/utils/completion.py +++ b/cognee/modules/retrieval/utils/completion.py @@ -1,3 +1,4 @@ +from typing import Optional from cognee.infrastructure.llm.LLMGateway import LLMGateway @@ -6,15 +7,15 @@ async def generate_completion( context: str, user_prompt_path: str, system_prompt_path: str, - user_prompt: str = None, - system_prompt: str = None, + user_prompt: Optional[str] = None, + system_prompt: Optional[str] = None, only_context: bool = False, ) -> str: """Generates a completion using LLM with given context and prompts.""" args = {"question": query, "context": context} - user_prompt = LLMGateway.render_prompt(user_prompt if user_prompt else user_prompt_path, args) - system_prompt = LLMGateway.read_query_prompt( - system_prompt if system_prompt else system_prompt_path + user_prompt = user_prompt if user_prompt else LLMGateway.render_prompt(user_prompt_path, args) + system_prompt = ( + system_prompt if system_prompt else LLMGateway.read_query_prompt(system_prompt_path) ) if only_context: @@ -29,10 +30,13 @@ async def generate_completion( async def summarize_text( text: str, - prompt_path: str = "summarize_search_results.txt", + system_prompt_path: str = "summarize_search_results.txt", + system_prompt: str = None, ) -> str: """Summarizes text using LLM with the specified prompt.""" - system_prompt = LLMGateway.read_query_prompt(prompt_path) + system_prompt = ( + system_prompt if system_prompt else LLMGateway.read_query_prompt(system_prompt_path) + ) return await LLMGateway.acreate_structured_output( text_input=text, diff --git a/cognee/modules/search/methods/search.py b/cognee/modules/search/methods/search.py index 3e5d6ffcd..465d0cbb3 100644 --- a/cognee/modules/search/methods/search.py +++ b/cognee/modules/search/methods/search.py @@ -37,11 +37,13 @@ async def search( dataset_ids: Union[list[UUID], None], user: User, system_prompt_path="answer_simple_question.txt", + system_prompt: Optional[str] = None, top_k: int = 10, node_type: Optional[Type] = None, node_name: Optional[List[str]] = None, save_interaction: Optional[bool] = False, last_k: Optional[int] = None, + only_context: bool = False, ): """ @@ -61,28 +63,34 @@ async def search( # Use search function filtered by permissions if access control is enabled if os.getenv("ENABLE_BACKEND_ACCESS_CONTROL", "false").lower() == "true": return await authorized_search( - query_text=query_text, query_type=query_type, + query_text=query_text, user=user, dataset_ids=dataset_ids, system_prompt_path=system_prompt_path, + system_prompt=system_prompt, top_k=top_k, + node_type=node_type, + node_name=node_name, save_interaction=save_interaction, last_k=last_k, + only_context=only_context, ) query = await log_query(query_text, query_type.value, user.id) search_results = await specific_search( - query_type, - query_text, - user, + query_type=query_type, + query_text=query_text, + user=user, system_prompt_path=system_prompt_path, + system_prompt=system_prompt, top_k=top_k, node_type=node_type, node_name=node_name, save_interaction=save_interaction, last_k=last_k, + only_context=only_context, ) await log_result( @@ -98,11 +106,10 @@ async def search( async def specific_search( query_type: SearchType, - query: str, + query_text: str, user: User, - system_prompt_path="answer_simple_question.txt", - user_prompt: str = None, - system_prompt: str = None, + system_prompt_path: str = 
"answer_simple_question.txt", + system_prompt: Optional[str] = None, top_k: int = 10, node_type: Optional[Type] = None, node_name: Optional[List[str]] = None, @@ -115,7 +122,10 @@ async def specific_search( SearchType.INSIGHTS: InsightsRetriever(top_k=top_k).get_completion, SearchType.CHUNKS: ChunksRetriever(top_k=top_k).get_completion, SearchType.RAG_COMPLETION: CompletionRetriever( - system_prompt_path=system_prompt_path, top_k=top_k + system_prompt_path=system_prompt_path, + top_k=top_k, + system_prompt=system_prompt, + only_context=only_context, ).get_completion, SearchType.GRAPH_COMPLETION: GraphCompletionRetriever( system_prompt_path=system_prompt_path, @@ -123,6 +133,8 @@ async def specific_search( node_type=node_type, node_name=node_name, save_interaction=save_interaction, + system_prompt=system_prompt, + only_context=only_context, ).get_completion, SearchType.GRAPH_COMPLETION_COT: GraphCompletionCotRetriever( system_prompt_path=system_prompt_path, @@ -130,6 +142,8 @@ async def specific_search( node_type=node_type, node_name=node_name, save_interaction=save_interaction, + system_prompt=system_prompt, + only_context=only_context, ).get_completion, SearchType.GRAPH_COMPLETION_CONTEXT_EXTENSION: GraphCompletionContextExtensionRetriever( system_prompt_path=system_prompt_path, @@ -137,6 +151,8 @@ async def specific_search( node_type=node_type, node_name=node_name, save_interaction=save_interaction, + system_prompt=system_prompt, + only_context=only_context, ).get_completion, SearchType.GRAPH_SUMMARY_COMPLETION: GraphSummaryCompletionRetriever( system_prompt_path=system_prompt_path, @@ -144,6 +160,7 @@ async def specific_search( node_type=node_type, node_name=node_name, save_interaction=save_interaction, + system_prompt=system_prompt, ).get_completion, SearchType.CODE: CodeRetriever(top_k=top_k).get_completion, SearchType.CYPHER: CypherSearchRetriever().get_completion, @@ -153,7 +170,7 @@ async def specific_search( # If the query type is FEELING_LUCKY, select the search type intelligently if query_type is SearchType.FEELING_LUCKY: - query_type = await select_search_type(query) + query_type = await select_search_type(query_text) search_task = search_tasks.get(query_type) @@ -162,9 +179,7 @@ async def specific_search( send_telemetry("cognee.search EXECUTION STARTED", user.id) - results = await search_task( - query=query, system_prompt=system_prompt, user_prompt=user_prompt, only_context=only_context - ) + results = await search_task(query=query_text) send_telemetry("cognee.search EXECUTION COMPLETED", user.id) @@ -172,14 +187,18 @@ async def specific_search( async def authorized_search( - query_text: str, query_type: SearchType, - user: User = None, + query_text: str, + user: User, dataset_ids: Optional[list[UUID]] = None, system_prompt_path: str = "answer_simple_question.txt", + system_prompt: Optional[str] = None, top_k: int = 10, - save_interaction: bool = False, + node_type: Optional[Type] = None, + node_name: Optional[List[str]] = None, + save_interaction: Optional[bool] = False, last_k: Optional[int] = None, + only_context: bool = None, ) -> list: """ Verifies access for provided datasets or uses all datasets user has read access for and performs search per dataset. 
@@ -193,14 +212,18 @@ async def authorized_search( # Searches all provided datasets and handles setting up of appropriate database context based on permissions search_results = await specific_search_by_context( - search_datasets, - query_text, - query_type, - user, - system_prompt_path, - top_k, - save_interaction, + search_datasets=search_datasets, + query_type=query_type, + query_text=query_text, + user=user, + system_prompt_path=system_prompt_path, + system_prompt=system_prompt, + top_k=top_k, + node_type=node_type, + node_name=node_name, + save_interaction=save_interaction, last_k=last_k, + only_context=only_context, ) await log_result(query.id, json.dumps(search_results, cls=JSONEncoder), user.id) @@ -210,13 +233,17 @@ async def authorized_search( async def specific_search_by_context( search_datasets: list[Dataset], - query_text: str, query_type: SearchType, + query_text: str, user: User, - system_prompt_path: str, - top_k: int, - save_interaction: bool = False, + system_prompt_path: str = "answer_simple_question.txt", + system_prompt: Optional[str] = None, + top_k: int = 10, + node_type: Optional[Type] = None, + node_name: Optional[List[str]] = None, + save_interaction: Optional[bool] = False, last_k: Optional[int] = None, + only_context: bool = None, ): """ Searches all provided datasets and handles setting up of appropriate database context based on permissions. @@ -224,18 +251,33 @@ async def specific_search_by_context( """ async def _search_by_context( - dataset, user, query_type, query_text, system_prompt_path, top_k, last_k + dataset: Dataset, + query_type: SearchType, + query_text: str, + user: User, + system_prompt_path: str = "answer_simple_question.txt", + system_prompt: Optional[str] = None, + top_k: int = 10, + node_type: Optional[Type] = None, + node_name: Optional[List[str]] = None, + save_interaction: Optional[bool] = False, + last_k: Optional[int] = None, + only_context: bool = None, ): # Set database configuration in async context for each dataset user has access for await set_database_global_context_variables(dataset.id, dataset.owner_id) search_results = await specific_search( - query_type, - query_text, - user, + query_type=query_type, + query_text=query_text, + user=user, system_prompt_path=system_prompt_path, + system_prompt=system_prompt, top_k=top_k, + node_type=node_type, + node_name=node_name, save_interaction=save_interaction, last_k=last_k, + only_context=only_context, ) return { "search_result": search_results, @@ -248,7 +290,18 @@ async def specific_search_by_context( for dataset in search_datasets: tasks.append( _search_by_context( - dataset, user, query_type, query_text, system_prompt_path, top_k, last_k + dataset=dataset, + query_type=query_type, + query_text=query_text, + user=user, + system_prompt_path=system_prompt_path, + system_prompt=system_prompt, + top_k=top_k, + node_type=node_type, + node_name=node_name, + save_interaction=save_interaction, + last_k=last_k, + only_context=only_context, ) ) From 7fd5e1e0104c061e056c5e97a4b0ea04effa45dd Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 28 Aug 2025 13:53:08 +0200 Subject: [PATCH 06/17] fix: Make custom_prompt be None by default --- cognee/api/v1/cognify/routers/get_cognify_router.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cognee/api/v1/cognify/routers/get_cognify_router.py b/cognee/api/v1/cognify/routers/get_cognify_router.py index 6809f089a..d40345f8e 100644 --- a/cognee/api/v1/cognify/routers/get_cognify_router.py +++ 
b/cognee/api/v1/cognify/routers/get_cognify_router.py @@ -38,7 +38,7 @@ class CognifyPayloadDTO(InDTO): dataset_ids: Optional[List[UUID]] = Field(default=None, examples=[[]]) run_in_background: Optional[bool] = Field(default=False) custom_prompt: Optional[str] = Field( - default=None, description="Custom prompt for entity extraction and graph generation" + default="", description="Custom prompt for entity extraction and graph generation" ) From 966e676d610a38b1607ce415ec8b9d620cf5cec2 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 28 Aug 2025 17:23:15 +0200 Subject: [PATCH 07/17] refactor: Have search prompt be empty string by default --- cognee/api/v1/search/routers/get_search_router.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cognee/api/v1/search/routers/get_search_router.py b/cognee/api/v1/search/routers/get_search_router.py index b141c6bdc..39a896dd8 100644 --- a/cognee/api/v1/search/routers/get_search_router.py +++ b/cognee/api/v1/search/routers/get_search_router.py @@ -20,7 +20,7 @@ class SearchPayloadDTO(InDTO): datasets: Optional[list[str]] = Field(default=None) dataset_ids: Optional[list[UUID]] = Field(default=None, examples=[[]]) query: str = Field(default="What is in the document?") - system_prompt: Optional[str] = Field(default=None) + system_prompt: Optional[str] = Field(default="") top_k: Optional[int] = Field(default=10) only_context: bool = Field(default=False) From cf636ba77f08665ce075235c5571eabc45c559be Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 28 Aug 2025 18:37:44 +0200 Subject: [PATCH 08/17] feat: Enable nodesets on backend --- cognee/api/v1/add/routers/get_add_router.py | 38 ++++--------------- .../v1/search/routers/get_search_router.py | 3 ++ 2 files changed, 11 insertions(+), 30 deletions(-) diff --git a/cognee/api/v1/add/routers/get_add_router.py b/cognee/api/v1/add/routers/get_add_router.py index 66b165a38..8424a4fb5 100644 --- a/cognee/api/v1/add/routers/get_add_router.py +++ b/cognee/api/v1/add/routers/get_add_router.py @@ -25,6 +25,7 @@ def get_add_router() -> APIRouter: data: List[UploadFile] = File(default=None), datasetName: Optional[str] = Form(default=None), datasetId: Union[UUID, Literal[""], None] = Form(default=None, examples=[""]), + node_set: Optional[List[str]] = Form(default=[""], example=[""]), user: User = Depends(get_authenticated_user), ): """ @@ -65,9 +66,7 @@ def get_add_router() -> APIRouter: send_telemetry( "Add API Endpoint Invoked", user.id, - additional_properties={ - "endpoint": "POST /v1/add", - }, + additional_properties={"endpoint": "POST /v1/add", "node_set": node_set}, ) from cognee.api.v1.add import add as cognee_add @@ -76,34 +75,13 @@ def get_add_router() -> APIRouter: raise ValueError("Either datasetId or datasetName must be provided.") try: - if ( - isinstance(data, str) - and data.startswith("http") - and (os.getenv("ALLOW_HTTP_REQUESTS", "true").lower() == "true") - ): - if "github" in data: - # Perform git clone if the URL is from GitHub - repo_name = data.split("/")[-1].replace(".git", "") - subprocess.run(["git", "clone", data, f".data/{repo_name}"], check=True) - # TODO: Update add call with dataset info - await cognee_add( - "data://.data/", - f"{repo_name}", - ) - else: - # Fetch and store the data from other types of URL using curl - response = requests.get(data) - response.raise_for_status() + add_run = await cognee_add( + data, datasetName, user=user, dataset_id=datasetId, node_set=node_set + ) - file_data = await response.content() - # TODO: Update add call with dataset info - 
return await cognee_add(file_data) - else: - add_run = await cognee_add(data, datasetName, user=user, dataset_id=datasetId) - - if isinstance(add_run, PipelineRunErrored): - return JSONResponse(status_code=420, content=add_run.model_dump(mode="json")) - return add_run.model_dump() + if isinstance(add_run, PipelineRunErrored): + return JSONResponse(status_code=420, content=add_run.model_dump(mode="json")) + return add_run.model_dump() except Exception as error: return JSONResponse(status_code=409, content={"error": str(error)}) diff --git a/cognee/api/v1/search/routers/get_search_router.py b/cognee/api/v1/search/routers/get_search_router.py index 0ceeb1abb..961532a06 100644 --- a/cognee/api/v1/search/routers/get_search_router.py +++ b/cognee/api/v1/search/routers/get_search_router.py @@ -20,6 +20,7 @@ class SearchPayloadDTO(InDTO): datasets: Optional[list[str]] = Field(default=None) dataset_ids: Optional[list[UUID]] = Field(default=None, examples=[[]]) query: str = Field(default="What is in the document?") + node_name: Optional[list[str]] = Field(default=None, example=[]) top_k: Optional[int] = Field(default=10) @@ -102,6 +103,7 @@ def get_search_router() -> APIRouter: "datasets": payload.datasets, "dataset_ids": [str(dataset_id) for dataset_id in payload.dataset_ids or []], "query": payload.query, + "node_name": payload.node_name, "top_k": payload.top_k, }, ) @@ -115,6 +117,7 @@ def get_search_router() -> APIRouter: user=user, datasets=payload.datasets, dataset_ids=payload.dataset_ids, + node_name=payload.node_name, top_k=payload.top_k, ) From 5bfae7a36b10b746c167a4895d108130f9a62a2a Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Fri, 29 Aug 2025 10:30:49 +0200 Subject: [PATCH 09/17] refactor: Resolve unit tests failing for search --- cognee/modules/search/methods/search.py | 2 +- .../unit/modules/search/search_methods_test.py | 13 +++++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/cognee/modules/search/methods/search.py b/cognee/modules/search/methods/search.py index 465d0cbb3..2db105d71 100644 --- a/cognee/modules/search/methods/search.py +++ b/cognee/modules/search/methods/search.py @@ -179,7 +179,7 @@ async def specific_search( send_telemetry("cognee.search EXECUTION STARTED", user.id) - results = await search_task(query=query_text) + results = await search_task(query_text) send_telemetry("cognee.search EXECUTION COMPLETED", user.id) diff --git a/cognee/tests/unit/modules/search/search_methods_test.py b/cognee/tests/unit/modules/search/search_methods_test.py index 46995d087..9833a770b 100644 --- a/cognee/tests/unit/modules/search/search_methods_test.py +++ b/cognee/tests/unit/modules/search/search_methods_test.py @@ -58,15 +58,17 @@ async def test_search( # Verify mock_log_query.assert_called_once_with(query_text, query_type.value, mock_user.id) mock_specific_search.assert_called_once_with( - query_type, - query_text, - mock_user, + query_type=query_type, + query_text=query_text, + user=mock_user, system_prompt_path="answer_simple_question.txt", + system_prompt=None, top_k=10, node_type=None, node_name=None, save_interaction=False, last_k=None, + only_context=False, ) # Verify result logging @@ -201,7 +203,10 @@ async def test_specific_search_feeling_lucky( if retriever_name == "CompletionRetriever": mock_retriever_class.assert_called_once_with( - system_prompt_path="answer_simple_question.txt", top_k=top_k + system_prompt_path="answer_simple_question.txt", + top_k=top_k, + system_prompt=None, + only_context=None, ) else: 
mock_retriever_class.assert_called_once_with(top_k=top_k)

From c3f5840bff1a9623066718d3a6ab14994bd4b0fe Mon Sep 17 00:00:00 2001
From: Igor Ilic
Date: Fri, 29 Aug 2025 12:24:15 +0200
Subject: [PATCH 10/17] refactor: Remove unused argument

---
 cognee/modules/retrieval/utils/completion.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/cognee/modules/retrieval/utils/completion.py b/cognee/modules/retrieval/utils/completion.py
index 4c2639517..81e636aad 100644
--- a/cognee/modules/retrieval/utils/completion.py
+++ b/cognee/modules/retrieval/utils/completion.py
@@ -7,13 +7,12 @@ async def generate_completion(
     context: str,
     user_prompt_path: str,
     system_prompt_path: str,
-    user_prompt: Optional[str] = None,
     system_prompt: Optional[str] = None,
     only_context: bool = False,
 ) -> str:
     """Generates a completion using LLM with given context and prompts."""
     args = {"question": query, "context": context}
-    user_prompt = user_prompt if user_prompt else LLMGateway.render_prompt(user_prompt_path, args)
+    user_prompt = LLMGateway.render_prompt(user_prompt_path, args)
     system_prompt = (
         system_prompt if system_prompt else LLMGateway.read_query_prompt(system_prompt_path)
     )

From 21f688385b16cc3bc50d355b32eb4b7610df2053 Mon Sep 17 00:00:00 2001
From: Igor Ilic
Date: Fri, 29 Aug 2025 12:53:29 +0200
Subject: [PATCH 11/17] feat: Add NodeSet as default node type

---
 cognee/api/v1/search/search.py            |  3 ++-
 cognee/modules/search/methods/search.py   | 27 ++++++++++++++++---
 .../modules/search/search_methods_test.py |  4 +--
 3 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/cognee/api/v1/search/search.py b/cognee/api/v1/search/search.py
index f37f8ba6d..344e763ae 100644
--- a/cognee/api/v1/search/search.py
+++ b/cognee/api/v1/search/search.py
@@ -1,6 +1,7 @@
 from uuid import UUID
 from typing import Union, Optional, List, Type
 
+from cognee.modules.engine.models.node_set import NodeSet
 from cognee.modules.users.models import User
 from cognee.modules.search.types import SearchType
 from cognee.modules.users.methods import get_default_user
@@ -17,7 +18,7 @@ async def search(
     dataset_ids: Optional[Union[list[UUID], UUID]] = None,
     system_prompt_path: str = "answer_simple_question.txt",
     top_k: int = 10,
-    node_type: Optional[Type] = None,
+    node_type: Optional[Type] = NodeSet,
     node_name: Optional[List[str]] = None,
     save_interaction: bool = False,
     last_k: Optional[int] = None,
diff --git a/cognee/modules/search/methods/search.py b/cognee/modules/search/methods/search.py
index f5f2a793a..8e38e63c3 100644
--- a/cognee/modules/search/methods/search.py
+++ b/cognee/modules/search/methods/search.py
@@ -4,6 +4,7 @@ import asyncio
 from uuid import UUID
 from typing import Callable, List, Optional, Type, Union
 
+from cognee.modules.engine.models.node_set import NodeSet
 from cognee.modules.retrieval.user_qa_feedback import UserQAFeedback
 from cognee.modules.search.exceptions import UnsupportedSearchTypeError
 from cognee.context_global_variables import set_database_global_context_variables
@@ -38,7 +39,7 @@ async def search(
     user: User,
     system_prompt_path="answer_simple_question.txt",
     top_k: int = 10,
-    node_type: Optional[Type] = None,
+    node_type: Optional[Type] = NodeSet,
     node_name: Optional[List[str]] = None,
     save_interaction: Optional[bool] = False,
     last_k: Optional[int] = None,
@@ -67,6 +68,8 @@ async def search(
         dataset_ids=dataset_ids,
         system_prompt_path=system_prompt_path,
         top_k=top_k,
+        node_type=node_type,
+        node_name=node_name,
         save_interaction=save_interaction,
         last_k=last_k,
     )
@@ -102,7 +105,7 @@ async def specific_search(
     user: User,
     system_prompt_path="answer_simple_question.txt",
     top_k: int = 10,
-    node_type: Optional[Type] = None,
+    node_type: Optional[Type] = NodeSet,
     node_name: Optional[List[str]] = None,
     save_interaction: Optional[bool] = False,
     last_k: Optional[int] = None,
@@ -173,6 +176,8 @@ async def authorized_search(
     dataset_ids: Optional[list[UUID]] = None,
     system_prompt_path: str = "answer_simple_question.txt",
     top_k: int = 10,
+    node_type: Optional[Type] = NodeSet,
+    node_name: Optional[List[str]] = None,
     save_interaction: bool = False,
     last_k: Optional[int] = None,
 ) -> list:
@@ -194,7 +199,9 @@ async def authorized_search(
         user,
         system_prompt_path,
         top_k,
-        save_interaction,
+        node_type=node_type,
+        node_name=node_name,
+        save_interaction=save_interaction,
         last_k=last_k,
     )
 
@@ -210,6 +217,8 @@ async def specific_search_by_context(
     user: User,
     system_prompt_path: str,
     top_k: int,
+    node_type: Optional[Type] = NodeSet,
+    node_name: Optional[List[str]] = None,
     save_interaction: bool = False,
     last_k: Optional[int] = None,
 ):
@@ -229,6 +238,8 @@ async def specific_search_by_context(
         user,
         system_prompt_path=system_prompt_path,
         top_k=top_k,
+        node_type=node_type,
+        node_name=node_name,
         save_interaction=save_interaction,
         last_k=last_k,
     )
@@ -243,7 +254,15 @@ async def specific_search_by_context(
     for dataset in search_datasets:
         tasks.append(
             _search_by_context(
-                dataset, user, query_type, query_text, system_prompt_path, top_k, last_k
+                dataset,
+                user,
+                query_type,
+                query_text,
+                system_prompt_path,
+                top_k,
+                node_type=node_type,
+                node_name=node_name,
+                last_k=last_k,
             )
         )
 
diff --git a/cognee/tests/unit/modules/search/search_methods_test.py b/cognee/tests/unit/modules/search/search_methods_test.py
index 46995d087..004e1fca3 100644
--- a/cognee/tests/unit/modules/search/search_methods_test.py
+++ b/cognee/tests/unit/modules/search/search_methods_test.py
@@ -3,8 +3,8 @@ import uuid
 from unittest.mock import AsyncMock, MagicMock, patch
 
 import pytest
-from pylint.checkers.utils import node_type
 
+from cognee.modules.engine.models.node_set import NodeSet
 from cognee.modules.search.exceptions import UnsupportedSearchTypeError
 from cognee.modules.search.methods.search import search, specific_search
 from cognee.modules.search.types import SearchType
@@ -63,7 +63,7 @@ async def test_search(
         mock_user,
         system_prompt_path="answer_simple_question.txt",
         top_k=10,
-        node_type=None,
+        node_type=NodeSet,
         node_name=None,
         save_interaction=False,
         last_k=None,

From e6ee182d789b43e056ce71400367c04683fc2e8a Mon Sep 17 00:00:00 2001
From: Igor Ilic
Date: Fri, 29 Aug 2025 13:03:06 +0200
Subject: [PATCH 12/17] fix: Handle [] node_name case

---
 cognee/modules/graph/cognee_graph/CogneeGraph.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cognee/modules/graph/cognee_graph/CogneeGraph.py b/cognee/modules/graph/cognee_graph/CogneeGraph.py
index ed867ae24..924532ce0 100644
--- a/cognee/modules/graph/cognee_graph/CogneeGraph.py
+++ b/cognee/modules/graph/cognee_graph/CogneeGraph.py
@@ -76,7 +76,7 @@ class CogneeGraph(CogneeAbstractGraph):
         start_time = time.time()
 
         # Determine projection strategy
-        if node_type is not None and node_name is not None:
+        if node_type is not None and node_name not in [None, []]:
             nodes_data, edges_data = await adapter.get_nodeset_subgraph(
                 node_type=node_type, node_name=node_name
             )

From b9fa285c1ac9a1c98dac414a3f8dc62e57305c42 Mon Sep 17 00:00:00 2001
From: Igor Ilic
Date: Fri, 29 Aug 2025 13:38:52 +0200
Subject: [PATCH 13/17] fix: Add node_name and node_type to context search

---
 cognee/modules/search/methods/search.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/cognee/modules/search/methods/search.py b/cognee/modules/search/methods/search.py
index 8e38e63c3..74ef2a6ad 100644
--- a/cognee/modules/search/methods/search.py
+++ b/cognee/modules/search/methods/search.py
@@ -228,7 +228,15 @@ async def specific_search_by_context(
     """
 
     async def _search_by_context(
-        dataset, user, query_type, query_text, system_prompt_path, top_k, last_k
+        dataset,
+        user,
+        query_type,
+        query_text,
+        system_prompt_path,
+        top_k,
+        node_type: Optional[Type] = NodeSet,
+        node_name: Optional[List[str]] = None,
+        last_k: Optional[int] = None,
     ):
         # Set database configuration in async context for each dataset user has access for
         await set_database_global_context_variables(dataset.id, dataset.owner_id)

From 614055c850661fcbb816a9bf77b2e61324a83f69 Mon Sep 17 00:00:00 2001
From: Igor Ilic
Date: Fri, 29 Aug 2025 14:16:18 +0200
Subject: [PATCH 14/17] refactor: Add docs for new search arguments

---
 cognee/api/v1/search/routers/get_search_router.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/cognee/api/v1/search/routers/get_search_router.py b/cognee/api/v1/search/routers/get_search_router.py
index 39a896dd8..f9f4e4764 100644
--- a/cognee/api/v1/search/routers/get_search_router.py
+++ b/cognee/api/v1/search/routers/get_search_router.py
@@ -1,9 +1,11 @@
 from uuid import UUID
+import pathlib
 from typing import Optional
 from datetime import datetime
 from pydantic import Field
 from fastapi import Depends, APIRouter
 from fastapi.responses import JSONResponse
+
 from cognee.modules.search.types import SearchType
 from cognee.api.DTO import InDTO, OutDTO
 from cognee.modules.users.exceptions.exceptions import PermissionDeniedError
@@ -20,7 +22,9 @@ class SearchPayloadDTO(InDTO):
     datasets: Optional[list[str]] = Field(default=None)
     dataset_ids: Optional[list[UUID]] = Field(default=None, examples=[[]])
     query: str = Field(default="What is in the document?")
-    system_prompt: Optional[str] = Field(default="")
+    system_prompt: Optional[str] = Field(
+        default="Answer the question using the provided context. Be as brief as possible."
+    )
     top_k: Optional[int] = Field(default=10)
     only_context: bool = Field(default=False)
 
@@ -81,7 +85,9 @@ def get_search_router() -> APIRouter:
     - **datasets** (Optional[List[str]]): List of dataset names to search within
     - **dataset_ids** (Optional[List[UUID]]): List of dataset UUIDs to search within
     - **query** (str): The search query string
+    - **system_prompt** (Optional[str]): System prompt used for completion-type searches in Cognee
     - **top_k** (Optional[int]): Maximum number of results to return (default: 10)
+    - **only_context** (bool): Set to true to return only the context Cognee would send to the LLM in completion-type searches, instead of the LLM-generated answer
 
     ## Response
     Returns a list of search results containing relevant nodes from the graph.
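Patches 10-14 make NodeSet the default node_type and thread node_type, node_name, system_prompt, and only_context through the search path. A minimal usage sketch of the resulting API, assuming the public cognee.search entry point keeps its query_type/query_text parameters and that a node set named "repo_docs" exists; the dataset and node set names are illustrative, not taken from these patches:

import asyncio

import cognee
from cognee.modules.engine.models.node_set import NodeSet
from cognee.modules.search.types import SearchType


async def main():
    # node_type now defaults to NodeSet (PATCH 11), so passing it is optional;
    # node_name narrows the graph projection to the listed node sets, while
    # None or [] falls back to the full graph projection (PATCH 12).
    results = await cognee.search(
        query_type=SearchType.GRAPH_COMPLETION,
        query_text="What does the repo processor exclude?",
        node_type=NodeSet,
        node_name=["repo_docs"],  # illustrative node set name
        system_prompt_path="answer_simple_question.txt",
        top_k=10,
    )
    print(results)


asyncio.run(main())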
From 978815586cee1c0809c4fc3df57b88cebfc8c2e0 Mon Sep 17 00:00:00 2001
From: Igor Ilic
Date: Fri, 29 Aug 2025 14:21:42 +0200
Subject: [PATCH 15/17] docs: Add docstring for node usage in backend

---
 cognee/api/v1/add/routers/get_add_router.py       | 2 ++
 cognee/api/v1/search/routers/get_search_router.py | 1 +
 2 files changed, 3 insertions(+)

diff --git a/cognee/api/v1/add/routers/get_add_router.py b/cognee/api/v1/add/routers/get_add_router.py
index 8424a4fb5..1703d9931 100644
--- a/cognee/api/v1/add/routers/get_add_router.py
+++ b/cognee/api/v1/add/routers/get_add_router.py
@@ -42,6 +42,8 @@ def get_add_router() -> APIRouter:
       - Regular file uploads
     - **datasetName** (Optional[str]): Name of the dataset to add data to
     - **datasetId** (Optional[UUID]): UUID of an already existing dataset
+    - **node_set** (Optional[list[str]]): List of node identifiers for graph organization and access control.
+      Used for grouping related data points in the knowledge graph.
 
     Either datasetName or datasetId must be provided.
 
diff --git a/cognee/api/v1/search/routers/get_search_router.py b/cognee/api/v1/search/routers/get_search_router.py
index 961532a06..003df7cd4 100644
--- a/cognee/api/v1/search/routers/get_search_router.py
+++ b/cognee/api/v1/search/routers/get_search_router.py
@@ -80,6 +80,7 @@ def get_search_router() -> APIRouter:
     - **datasets** (Optional[List[str]]): List of dataset names to search within
     - **dataset_ids** (Optional[List[UUID]]): List of dataset UUIDs to search within
     - **query** (str): The search query string
+    - **node_name** (Optional[list[str]]): Filter results to specific node_sets defined in the add pipeline (for targeted search)
     - **top_k** (Optional[int]): Maximum number of results to return (default: 10)
 
     ## Response

From 4159846bb39c2197b460f28d28b205953bf8ed39 Mon Sep 17 00:00:00 2001
From: Igor Ilic
Date: Fri, 29 Aug 2025 16:04:14 +0200
Subject: [PATCH 16/17] fix: Make excluded paths use absolute path

---
 cognee/api/v1/cognify/code_graph_pipeline.py | 12 ++++++++----
 cognee/modules/retrieval/code_retriever.py   |  8 ++++++++
 .../get_repo_file_dependencies.py            | 17 +++++++++++++----
 3 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/cognee/api/v1/cognify/code_graph_pipeline.py b/cognee/api/v1/cognify/code_graph_pipeline.py
index 66b8568fa..fb3612857 100644
--- a/cognee/api/v1/cognify/code_graph_pipeline.py
+++ b/cognee/api/v1/cognify/code_graph_pipeline.py
@@ -1,6 +1,7 @@
 import os
 import pathlib
 import asyncio
+from typing import Optional
 
 from cognee.shared.logging_utils import get_logger, setup_logging
 from cognee.modules.observability.get_observe import get_observe
@@ -28,7 +29,12 @@ logger = get_logger("code_graph_pipeline")
 
 
 @observe
-async def run_code_graph_pipeline(repo_path, include_docs=False, excluded_paths=None):
+async def run_code_graph_pipeline(
+    repo_path,
+    include_docs=False,
+    excluded_paths: Optional[list[str]] = None,
+    supported_languages: Optional[list[str]] = None,
+):
     import cognee
     from cognee.low_level import setup
 
@@ -40,8 +46,6 @@ async def run_code_graph_pipeline(repo_path, include_docs=False, excluded_paths=
     user = await get_default_user()
 
     detailed_extraction = True
-    # Multi-language support: allow passing supported_languages
-    supported_languages = None  # defer to task defaults
     tasks = [
         Task(
             get_repo_file_dependencies,
@@ -95,7 +99,7 @@ async def run_code_graph_pipeline(repo_path, include_docs=False, excluded_paths=
 if __name__ == "__main__":
 
     async def main():
-        async for run_status in run_code_graph_pipeline("/Users/igorilic/Desktop/cognee/examples"):
+        async for run_status in run_code_graph_pipeline("REPO_PATH"):
             print(f"{run_status.pipeline_run_id}: {run_status.status}")
 
     file_path = os.path.join(
diff --git a/cognee/modules/retrieval/code_retriever.py b/cognee/modules/retrieval/code_retriever.py
index 6e819d8a7..76b5e758c 100644
--- a/cognee/modules/retrieval/code_retriever.py
+++ b/cognee/modules/retrieval/code_retriever.py
@@ -94,7 +94,15 @@ class CodeRetriever(BaseRetriever):
                     {"id": res.id, "score": res.score, "payload": res.payload}
                 )
 
+        existing_collection = []
         for collection in self.classes_and_functions_collections:
+            if await vector_engine.has_collection(collection):
+                existing_collection.append(collection)
+
+        if not existing_collection:
+            raise RuntimeError("No collection found for code retriever")
+
+        for collection in existing_collection:
             logger.debug(f"Searching {collection} collection with general query")
             search_results_code = await vector_engine.search(
                 collection, query, limit=self.top_k
diff --git a/cognee/tasks/repo_processor/get_repo_file_dependencies.py b/cognee/tasks/repo_processor/get_repo_file_dependencies.py
index 3ebf1fcb1..06cc3bddb 100644
--- a/cognee/tasks/repo_processor/get_repo_file_dependencies.py
+++ b/cognee/tasks/repo_processor/get_repo_file_dependencies.py
@@ -1,6 +1,7 @@
 import asyncio
 import math
 import os
+from pathlib import Path
 from typing import Set
 from typing import AsyncGenerator, Optional, List
 from uuid import NAMESPACE_OID, uuid5
@@ -78,15 +79,22 @@ async def get_source_code_files(
             if lang is None:
                 continue
             # Exclude tests, common build/venv directories and files provided in exclude_paths
-            excluded_dirs = EXCLUDED_DIRS | set(excluded_paths or [])
-            root_parts = set(os.path.normpath(root).split(os.sep))
+            excluded_dirs = EXCLUDED_DIRS
+            excluded_paths = {Path(p).resolve() for p in (excluded_paths or [])}  # full paths
+
+            root_path = Path(root).resolve()
+            root_parts = set(root_path.parts)  # same as before
             base_name, _ext = os.path.splitext(file)
             if (
                 base_name.startswith("test_")
-                or base_name.endswith("_test")  # catches Go's *_test.go and similar
+                or base_name.endswith("_test")
                 or ".test." in file
                 or ".spec." in file
-                or (excluded_dirs & root_parts)
+                or (excluded_dirs & root_parts)  # name match
+                or any(
+                    root_path.is_relative_to(p)  # full-path match
+                    for p in excluded_paths
+                )
             ):
                 continue
             file_path = os.path.abspath(os.path.join(root, file))
@@ -164,6 +172,7 @@ async def get_repo_file_dependencies(
         "go": [".go"],
         "rust": [".rs"],
         "cpp": [".cpp", ".c", ".h", ".hpp"],
+        "c": [".c", ".h"],
     }
     if supported_languages is not None:
         language_config = {

From 0ecea42c2ccc0a12cf69b5dc23b51ae5196f0da5 Mon Sep 17 00:00:00 2001
From: Igor Ilic
Date: Fri, 29 Aug 2025 16:12:25 +0200
Subject: [PATCH 17/17] test: Remove repo path test

---
 cognee/tests/test_repo_processor.py | 46 -----------------------------
 1 file changed, 46 deletions(-)
 delete mode 100644 cognee/tests/test_repo_processor.py

diff --git a/cognee/tests/test_repo_processor.py b/cognee/tests/test_repo_processor.py
deleted file mode 100644
index 2d5868f36..000000000
--- a/cognee/tests/test_repo_processor.py
+++ /dev/null
@@ -1,46 +0,0 @@
-import os
-import shutil
-import tempfile
-from cognee.tasks.repo_processor.get_repo_file_dependencies import get_source_code_files
-
-
-def test_get_source_code_files_excludes_common_dirs_and_files():
-    # Create a temporary test directory
-    test_repo = tempfile.mkdtemp()
-
-    # Create files and folders to include/exclude
-    included_file = os.path.join(test_repo, "main.py")
-    excluded_dirs = [".venv", "node_modules", "__pycache__", ".git"]
-    excluded_files = ["ignore.pyc", "temp.log", "junk.tmp"]
-
-    # Create included file
-    with open(included_file, "w") as f:
-        f.write("print('Hello world')")
-
-    # Create excluded directories and files inside them
-    for folder in excluded_dirs:
-        folder_path = os.path.join(test_repo, folder)
-        os.makedirs(folder_path)
-        file_path = os.path.join(folder_path, "ignored.js")
-        with open(file_path, "w") as f:
-            f.write("// ignore this")
-
-    # Create excluded files in root
-    for file_name in excluded_files:
-        file_path = os.path.join(test_repo, file_name)
-        with open(file_path, "w") as f:
-            f.write("dummy")
-
-    # Run function
-    results = get_source_code_files(test_repo)
-
-    # Assert only included file is present
-    assert included_file in results
-    for root, dirs, files in os.walk(test_repo):
-        for name in files:
-            full_path = os.path.join(root, name)
-            if full_path != included_file:
-                assert full_path not in results, f"{full_path} should have been excluded"
-
-    # Cleanup
-    shutil.rmtree(test_repo)
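For readers following the exclusion change in PATCH 16: the walk now combines the old directory-name check with a full-path check against resolved excluded paths. A condensed, self-contained sketch of that rule; the EXCLUDED_DIRS subset and the sample paths below are illustrative, and Path.is_relative_to requires Python 3.9+:

from pathlib import Path

# Illustrative subset of the real EXCLUDED_DIRS constant.
EXCLUDED_DIRS = {".venv", "venv", "__pycache__", ".git", "node_modules"}


def is_excluded(root: str, excluded_paths: list[str]) -> bool:
    """Return True when a walked directory should be skipped.

    Mirrors the two checks used in get_source_code_files: a match on any
    path component name, or containment inside an explicitly excluded path.
    """
    root_path = Path(root).resolve()
    resolved = {Path(p).resolve() for p in excluded_paths}

    name_match = bool(EXCLUDED_DIRS & set(root_path.parts))          # name match
    path_match = any(root_path.is_relative_to(p) for p in resolved)  # full-path match
    return name_match or path_match


if __name__ == "__main__":
    print(is_excluded("/repo/.venv/lib", []))                   # True: name component match
    print(is_excluded("/repo/generated", ["/repo/generated"]))  # True: full-path match
    print(is_excluded("/repo/src", []))                         # False

Resolving both sides to absolute paths before comparing is what makes the check robust to relative excluded_paths being passed into run_code_graph_pipeline, which is the point of this patch.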
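The code_retriever change in the same patch follows a defensive pattern worth noting: probe for collections before querying and fail fast when none exist. A sketch under the assumption of a vector engine exposing has_collection and search coroutines; the engine interface and return shape here are placeholders rather than the exact cognee API:

from typing import Any, List


async def search_existing_collections(
    vector_engine: Any, collections: List[str], query: str, top_k: int
) -> list:
    """Search only the collections that actually exist; raise if none do."""
    existing = []
    for collection in collections:
        if await vector_engine.has_collection(collection):
            existing.append(collection)

    if not existing:
        raise RuntimeError("No collection found for code retriever")

    results = []
    for collection in existing:
        # Assumes search() returns a list of scored results per collection.
        results += await vector_engine.search(collection, query, limit=top_k)
    return results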