From b3bb4e82df704f58b181164d32c42a15a7a62439 Mon Sep 17 00:00:00 2001
From: Vasilije <8619304+Vasilije1990@users.noreply.github.com>
Date: Sat, 14 Dec 2024 13:02:16 +0100
Subject: [PATCH 01/10] Create dockerhub.yml

---
 .github/workflows/dockerhub.yml | 46 +++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)
 create mode 100644 .github/workflows/dockerhub.yml

diff --git a/.github/workflows/dockerhub.yml b/.github/workflows/dockerhub.yml
new file mode 100644
index 000000000..40830dc65
--- /dev/null
+++ b/.github/workflows/dockerhub.yml
@@ -0,0 +1,46 @@
+name: build | Build and Push Docker Image to DockerHub
+
+on:
+  push:
+    branches:
+      - main
+
+jobs:
+  docker-build-and-push:
+    runs-on: ubuntu-latest
+
+    steps:
+    - name: Checkout repository
+      uses: actions/checkout@v3
+
+    - name: Set up Docker Buildx
+      uses: docker/setup-buildx-action@v3
+
+    - name: Log in to Docker Hub
+      uses: docker/login-action@v2
+      with:
+        username: ${{ secrets.DOCKER_USERNAME }}
+        password: ${{ secrets.DOCKER_PASSWORD }}
+
+    - name: Extract Git information
+      id: git-info
+      run: |
+        echo "BRANCH_NAME=$(echo ${GITHUB_REF#refs/heads/} | sed 's/\\//-/g')" >> $GITHUB_ENV
+        echo "COMMIT_SHA=$(git rev-parse --short HEAD)" >> $GITHUB_ENV
+
+    - name: Build and Push Docker Image
+      run: |
+        IMAGE_NAME=cognee/cognee
+        TAG_VERSION="${BRANCH_NAME}-${COMMIT_SHA}"
+        
+        echo "Building image: ${IMAGE_NAME}:${TAG_VERSION}"
+        docker buildx build \
+          --platform linux/amd64,linux/arm64 \
+          --push \
+          --tag "${IMAGE_NAME}:${TAG_VERSION}" \
+          --tag "${IMAGE_NAME}:latest" \
+          .
+
+    - name: Verify pushed Docker images
+      run: |
+        echo "Successfully pushed images to Docker Hub"

From 630ab556dbc24d8002745bc4ee082121032b1eb4 Mon Sep 17 00:00:00 2001
From: Igor Ilic <igorilic03@gmail.com>
Date: Tue, 17 Dec 2024 11:20:22 +0100
Subject: [PATCH 02/10] feat: Add search by dataset for cognee

Added ability to search by datasets for cognee users

Feature COG-912
---
 cognee/api/v1/search/search_v2.py             | 11 +++++--
 cognee/modules/data/models/__init__.py        |  1 +
 .../methods/get_document_ids_for_user.py      | 31 ++++++++++++++++++-
 3 files changed, 39 insertions(+), 4 deletions(-)

diff --git a/cognee/api/v1/search/search_v2.py b/cognee/api/v1/search/search_v2.py
index 6a5da4648..222ec6791 100644
--- a/cognee/api/v1/search/search_v2.py
+++ b/cognee/api/v1/search/search_v2.py
@@ -1,7 +1,7 @@
 import json
 from uuid import UUID
 from enum import Enum
-from typing import Callable, Dict
+from typing import Callable, Dict, Union
 
 from cognee.exceptions import InvalidValueError
 from cognee.modules.search.operations import log_query, log_result
@@ -22,7 +22,12 @@ class SearchType(Enum):
     CHUNKS = "CHUNKS"
     COMPLETION = "COMPLETION"
 
-async def search(query_type: SearchType, query_text: str, user: User = None) -> list:
+async def search(query_type: SearchType, query_text: str, user: User = None,
+                 datasets: Union[list[str], str, None] = None) -> list:
+    # We use lists from now on for datasets
+    if isinstance(datasets, str):
+        datasets = [datasets]
+
     if user is None:
         user = await get_default_user()
 
@@ -31,7 +36,7 @@ async def search(query_type: SearchType, query_text: str, user: User = None) ->
 
     query = await log_query(query_text, str(query_type), user.id)
 
-    own_document_ids = await get_document_ids_for_user(user.id)
+    own_document_ids = await get_document_ids_for_user(user.id, datasets)
     search_results = await specific_search(query_type, query_text, user)
 
     filtered_search_results = []
diff --git a/cognee/modules/data/models/__init__.py b/cognee/modules/data/models/__init__.py
index 5d79dbd40..bd5774f88 100644
--- a/cognee/modules/data/models/__init__.py
+++ b/cognee/modules/data/models/__init__.py
@@ -1,2 +1,3 @@
 from .Data import Data
 from .Dataset import Dataset
+from .DatasetData import DatasetData
diff --git a/cognee/modules/users/permissions/methods/get_document_ids_for_user.py b/cognee/modules/users/permissions/methods/get_document_ids_for_user.py
index 79736db0f..7e052ebc9 100644
--- a/cognee/modules/users/permissions/methods/get_document_ids_for_user.py
+++ b/cognee/modules/users/permissions/methods/get_document_ids_for_user.py
@@ -1,9 +1,11 @@
 from uuid import UUID
 from sqlalchemy import select
 from cognee.infrastructure.databases.relational import get_relational_engine
+from cognee.modules.data.models import Dataset, DatasetData
 from ...models import ACL, Resource, Permission
 
-async def get_document_ids_for_user(user_id: UUID) -> list[str]:
+
+async def get_document_ids_for_user(user_id: UUID, datasets: list[str] = None) -> list[str]:
     db_engine = get_relational_engine()
 
     async with db_engine.get_async_session() as session:
@@ -18,4 +20,31 @@ async def get_document_ids_for_user(user_id: UUID) -> list[str]:
                 )
             )).all()
 
+            if datasets:
+                documnets_ids_in_dataset = set()
+                # If datasets are specified filter out documents that aren't part of the specified datasets
+                for dataset in datasets:
+                    # Find dataset id for dataset element
+                    dataset_id = (await session.scalars(
+                        select(Dataset.id)
+                        .where(
+                            Dataset.name == dataset,
+                            Dataset.owner_id == user_id,
+                        )
+                    )).one()
+
+                    # Check which documents are connected to this dataset
+                    for document_id in document_ids:
+                        data_id = (await session.scalars(
+                            select(DatasetData.data_id)
+                            .where(
+                                DatasetData.dataset_id == dataset_id,
+                                DatasetData.data_id == document_id,
+                            )
+                        )).one()
+
+                        # If document is related to dataset added it to return value
+                        if data_id:
+                            documnets_ids_in_dataset.add(document_id)
+                return list(documnets_ids_in_dataset)
             return document_ids

From c2d2b2631967d18e79615ff35a121636a4b95b80 Mon Sep 17 00:00:00 2001
From: Igor Ilic <igorilic03@gmail.com>
Date: Tue, 17 Dec 2024 11:52:30 +0100
Subject: [PATCH 03/10] fix: Remove backend lock from UI

Removed lock that prevented using multiple datasets in cognify

Fix COG-912
---
 cognee/api/v1/cognify/cognify_v2.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/cognee/api/v1/cognify/cognify_v2.py b/cognee/api/v1/cognify/cognify_v2.py
index 2c45774ee..c14f00978 100644
--- a/cognee/api/v1/cognify/cognify_v2.py
+++ b/cognee/api/v1/cognify/cognify_v2.py
@@ -69,17 +69,18 @@ async def run_cognify_pipeline(dataset: Dataset, user: User, graph_model: BaseMo
 
     send_telemetry("cognee.cognify EXECUTION STARTED", user.id)
 
-    async with update_status_lock:
-        task_status = await get_pipeline_status([dataset_id])
+    # TODO: Add UI lock to prevent multiple backend requests. The former "async with update_status_lock:" guard was removed here, so concurrent calls may both pass the status check below.
+    task_status = await get_pipeline_status([dataset_id])
 
-        if dataset_id in task_status and task_status[dataset_id] == PipelineRunStatus.DATASET_PROCESSING_STARTED:
-            logger.info("Dataset %s is already being processed.", dataset_name)
-            return
+    if dataset_id in task_status and task_status[dataset_id] == PipelineRunStatus.DATASET_PROCESSING_STARTED:
+        logger.info("Dataset %s is already being processed.", dataset_name)
+        return
+
+    await log_pipeline_status(dataset_id, PipelineRunStatus.DATASET_PROCESSING_STARTED, {
+        "dataset_name": dataset_name,
+        "files": document_ids_str,
+    })
 
-        await log_pipeline_status(dataset_id, PipelineRunStatus.DATASET_PROCESSING_STARTED, {
-            "dataset_name": dataset_name,
-            "files": document_ids_str,
-        })
     try:
         cognee_config = get_cognify_config()
 

From af335fafe36c2332fcd5a1e18a2e44ea0fa46b4e Mon Sep 17 00:00:00 2001
From: Igor Ilic <igorilic03@gmail.com>
Date: Tue, 17 Dec 2024 12:11:24 +0100
Subject: [PATCH 04/10] test: Added test for getting of documents for search

Added test to verify getting documents related to datasets intended for search

Test COG-912
---
 .../methods/get_document_ids_for_user.py      |  4 ++--
 cognee/tests/test_pgvector.py                 | 24 +++++++++++++++----
 2 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/cognee/modules/users/permissions/methods/get_document_ids_for_user.py b/cognee/modules/users/permissions/methods/get_document_ids_for_user.py
index 7e052ebc9..d726e9002 100644
--- a/cognee/modules/users/permissions/methods/get_document_ids_for_user.py
+++ b/cognee/modules/users/permissions/methods/get_document_ids_for_user.py
@@ -31,7 +31,7 @@ async def get_document_ids_for_user(user_id: UUID, datasets: list[str] = None) -
                             Dataset.name == dataset,
                             Dataset.owner_id == user_id,
                         )
-                    )).one()
+                    )).one_or_none()
 
                     # Check which documents are connected to this dataset
                     for document_id in document_ids:
@@ -41,7 +41,7 @@ async def get_document_ids_for_user(user_id: UUID, datasets: list[str] = None) -
                                 DatasetData.dataset_id == dataset_id,
                                 DatasetData.data_id == document_id,
                             )
-                        )).one()
+                        )).one_or_none()
 
                         # If document is related to dataset added it to return value
                         if data_id:
diff --git a/cognee/tests/test_pgvector.py b/cognee/tests/test_pgvector.py
index 3b4fa19c5..3aaed016a 100644
--- a/cognee/tests/test_pgvector.py
+++ b/cognee/tests/test_pgvector.py
@@ -4,6 +4,7 @@ import pathlib
 import cognee
 from cognee.api.v1.search import SearchType
 from cognee.modules.retrieval.brute_force_triplet_search import brute_force_triplet_search
+from cognee.modules.users.methods import get_default_user
 
 logging.basicConfig(level=logging.DEBUG)
 
@@ -44,12 +45,13 @@ async def main():
     await cognee.prune.prune_data()
     await cognee.prune.prune_system(metadata = True)
 
-    dataset_name = "cs_explanations"
+    dataset_name_1 = "natural_language"
+    dataset_name_2 = "quantum"
 
     explanation_file_path = os.path.join(
         pathlib.Path(__file__).parent, "test_data/Natural_language_processing.txt"
     )
-    await cognee.add([explanation_file_path], dataset_name)
+    await cognee.add([explanation_file_path], dataset_name_1)
 
     text = """A quantum computer is a computer that takes advantage of quantum mechanical phenomena.
     At small scales, physical matter exhibits properties of both particles and waves, and quantum computing leverages this behavior, specifically quantum superposition and entanglement, using specialized hardware that supports the preparation and manipulation of quantum states.
@@ -59,12 +61,24 @@ async def main():
     In principle, a non-quantum (classical) computer can solve the same computational problems as a quantum computer, given enough time. Quantum advantage comes in the form of time complexity rather than computability, and quantum complexity theory shows that some quantum algorithms for carefully selected tasks require exponentially fewer computational steps than the best known non-quantum algorithms. Such tasks can in theory be solved on a large-scale quantum computer whereas classical computers would not finish computations in any reasonable amount of time. However, quantum speedup is not universal or even typical across computational tasks, since basic tasks such as sorting are proven to not allow any asymptotic quantum speedup. Claims of quantum supremacy have drawn significant attention to the discipline, but are demonstrated on contrived tasks, while near-term practical use cases remain limited.
     """
 
-    await cognee.add([text], dataset_name)
+    await cognee.add([text], dataset_name_2)
 
-    await cognee.cognify([dataset_name])
+    await cognee.cognify([dataset_name_2, dataset_name_1])
 
     from cognee.infrastructure.databases.vector import get_vector_engine
 
+    # Test getting of documents for search per dataset
+    from cognee.modules.users.permissions.methods import get_document_ids_for_user
+    user = await get_default_user()
+    document_ids = await get_document_ids_for_user(user.id, [dataset_name_1])
+    assert len(document_ids) == 1, f"Number of expected documents doesn't match {len(document_ids)} != 1"
+
+    # Test getting of documents for search when no dataset is provided
+    from cognee.modules.users.permissions.methods import get_document_ids_for_user
+    user = await get_default_user()
+    document_ids = await get_document_ids_for_user(user.id)
+    assert len(document_ids) == 2, f"Number of expected documents doesn't match {len(document_ids)} != 2"
+
     vector_engine = get_vector_engine()
     random_node = (await vector_engine.search("entity_name", "Quantum computer"))[0]
     random_node_name = random_node.payload["text"]
@@ -75,7 +89,7 @@ async def main():
     for result in search_results:
         print(f"{result}\n")
 
-    search_results = await cognee.search(SearchType.CHUNKS, query_text = random_node_name)
+    search_results = await cognee.search(SearchType.CHUNKS, query_text = random_node_name, datasets=[dataset_name_2])
     assert len(search_results) != 0, "The search results list is empty."
     print("\n\nExtracted chunks are:\n")
     for result in search_results:

From 63c3dceec6c76956bea3bf5216235ecea06c03a5 Mon Sep 17 00:00:00 2001
From: Igor Ilic <igorilic03@gmail.com>
Date: Tue, 17 Dec 2024 14:07:50 +0100
Subject: [PATCH 05/10] fix: Resolve issue with cognify router graph model
 default value

Resolve issue with default value for graph model in cognify endpoint

Fix
---
 cognee/api/v1/cognify/routers/get_cognify_router.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cognee/api/v1/cognify/routers/get_cognify_router.py b/cognee/api/v1/cognify/routers/get_cognify_router.py
index a56dbd7a5..257ac994f 100644
--- a/cognee/api/v1/cognify/routers/get_cognify_router.py
+++ b/cognee/api/v1/cognify/routers/get_cognify_router.py
@@ -5,11 +5,11 @@ from cognee.modules.users.models import User
 from fastapi.responses import JSONResponse
 from cognee.modules.users.methods import get_authenticated_user
 from fastapi import Depends
-
+from cognee.shared.data_models import KnowledgeGraph
 
 class CognifyPayloadDTO(BaseModel):
     datasets: List[str]
-    graph_model: Optional[BaseModel] = None
+    graph_model: Optional[BaseModel] = KnowledgeGraph
 
 def get_cognify_router() -> APIRouter:
     router = APIRouter()

From 48825d0d84c9cb346bca66ffd428fba1c1678501 Mon Sep 17 00:00:00 2001
From: Igor Ilic <igorilic03@gmail.com>
Date: Tue, 17 Dec 2024 14:22:51 +0100
Subject: [PATCH 06/10] chore: Resolve typo in getting documents code

Resolve typo in code

chore COG-912
---
 .../users/permissions/methods/get_document_ids_for_user.py  | 6 +++---
 cognee/tests/test_pgvector.py                               | 1 -
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/cognee/modules/users/permissions/methods/get_document_ids_for_user.py b/cognee/modules/users/permissions/methods/get_document_ids_for_user.py
index d726e9002..d439fb4f5 100644
--- a/cognee/modules/users/permissions/methods/get_document_ids_for_user.py
+++ b/cognee/modules/users/permissions/methods/get_document_ids_for_user.py
@@ -21,7 +21,7 @@ async def get_document_ids_for_user(user_id: UUID, datasets: list[str] = None) -
             )).all()
 
             if datasets:
-                documnets_ids_in_dataset = set()
+                documents_ids_in_dataset = set()
                 # If datasets are specified filter out documents that aren't part of the specified datasets
                 for dataset in datasets:
                     # Find dataset id for dataset element
@@ -45,6 +45,6 @@ async def get_document_ids_for_user(user_id: UUID, datasets: list[str] = None) -
 
                         # If document is related to dataset added it to return value
                         if data_id:
-                            documnets_ids_in_dataset.add(document_id)
-                return list(documnets_ids_in_dataset)
+                            documents_ids_in_dataset.add(document_id)
+                return list(documents_ids_in_dataset)
             return document_ids
diff --git a/cognee/tests/test_pgvector.py b/cognee/tests/test_pgvector.py
index 3aaed016a..9554a3f9d 100644
--- a/cognee/tests/test_pgvector.py
+++ b/cognee/tests/test_pgvector.py
@@ -74,7 +74,6 @@ async def main():
     assert len(document_ids) == 1, f"Number of expected documents doesn't match {len(document_ids)} != 1"
 
     # Test getting of documents for search when no dataset is provided
-    from cognee.modules.users.permissions.methods import get_document_ids_for_user
     user = await get_default_user()
     document_ids = await get_document_ids_for_user(user.id)
     assert len(document_ids) == 2, f"Number of expected documents doesn't match {len(document_ids)} != 2"

From bc6f371ff972e104f027d27f6b3510c638ee238d Mon Sep 17 00:00:00 2001
From: Boris <boris@topoteretes.com>
Date: Tue, 17 Dec 2024 14:30:19 +0100
Subject: [PATCH 07/10] Update .github/workflows/dockerhub.yml

Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
---
 .github/workflows/dockerhub.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/dockerhub.yml b/.github/workflows/dockerhub.yml
index 40830dc65..20d800c7e 100644
--- a/.github/workflows/dockerhub.yml
+++ b/.github/workflows/dockerhub.yml
@@ -25,8 +25,8 @@ jobs:
     - name: Extract Git information
       id: git-info
       run: |
-        echo "BRANCH_NAME=$(echo ${GITHUB_REF#refs/heads/} | sed 's/\\//-/g')" >> $GITHUB_ENV
-        echo "COMMIT_SHA=$(git rev-parse --short HEAD)" >> $GITHUB_ENV
+        echo "BRANCH_NAME=${GITHUB_REF_NAME}" >> "$GITHUB_ENV"
+        echo "COMMIT_SHA=${GITHUB_SHA::7}" >> "$GITHUB_ENV"
 
     - name: Build and Push Docker Image
       run: |

From e3a5f0ae0ca9a1b9307117e2c72c47bfc34ad259 Mon Sep 17 00:00:00 2001
From: Boris <boris@topoteretes.com>
Date: Tue, 17 Dec 2024 15:01:36 +0100
Subject: [PATCH 08/10] Update .github/workflows/dockerhub.yml

---
 .github/workflows/dockerhub.yml | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/dockerhub.yml b/.github/workflows/dockerhub.yml
index 20d800c7e..e01ddd0a5 100644
--- a/.github/workflows/dockerhub.yml
+++ b/.github/workflows/dockerhub.yml
@@ -43,4 +43,9 @@ jobs:
 
     - name: Verify pushed Docker images
       run: |
-        echo "Successfully pushed images to Docker Hub"
+       # IMAGE_NAME/TAG_VERSION were local to the build step's shell; rebuild the tag from the $GITHUB_ENV values
+       MANIFEST="$(docker buildx imagetools inspect "cognee/cognee:${BRANCH_NAME}-${COMMIT_SHA}")"
+       for PLATFORM in "linux/amd64" "linux/arm64"; do
+         grep -q "${PLATFORM}" <<< "${MANIFEST}" || { echo "Missing platform ${PLATFORM}"; exit 1; }
+       done
+       echo "Successfully verified images in Docker Hub"

From f3f3a0515e9d64a8ec7ab74b336244127125c7c9 Mon Sep 17 00:00:00 2001
From: Boris <boris@topoteretes.com>
Date: Tue, 17 Dec 2024 15:03:25 +0100
Subject: [PATCH 09/10] Update .github/workflows/dockerhub.yml

---
 .github/workflows/dockerhub.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/dockerhub.yml b/.github/workflows/dockerhub.yml
index e01ddd0a5..009042e0c 100644
--- a/.github/workflows/dockerhub.yml
+++ b/.github/workflows/dockerhub.yml
@@ -11,7 +11,7 @@ jobs:
 
     steps:
     - name: Checkout repository
-      uses: actions/checkout@v3
+      uses: actions/checkout@v4
 
     - name: Set up Docker Buildx
       uses: docker/setup-buildx-action@v3

From 0014ebe67c82ad92f7db0f73f992998ba650ab1d Mon Sep 17 00:00:00 2001
From: Boris <boris@topoteretes.com>
Date: Tue, 17 Dec 2024 15:03:30 +0100
Subject: [PATCH 10/10] Update .github/workflows/dockerhub.yml

---
 .github/workflows/dockerhub.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/dockerhub.yml b/.github/workflows/dockerhub.yml
index 009042e0c..a80f1f442 100644
--- a/.github/workflows/dockerhub.yml
+++ b/.github/workflows/dockerhub.yml
@@ -17,7 +17,7 @@ jobs:
       uses: docker/setup-buildx-action@v3
 
     - name: Log in to Docker Hub
-      uses: docker/login-action@v2
+      uses: docker/login-action@v3
       with:
         username: ${{ secrets.DOCKER_USERNAME }}
         password: ${{ secrets.DOCKER_PASSWORD }}