Merge branch 'dev' into COG-650-replace-pylint

commit ad8f2a59d4
Igor Ilic, 2024-12-18 14:27:56 +01:00 (committed by GitHub)
6 changed files with 118 additions and 18 deletions

.github/workflows/dockerhub.yml (new file)

@@ -0,0 +1,51 @@
name: build | Build and Push Docker Image to DockerHub

on:
  push:
    branches:
      - main

jobs:
  docker-build-and-push:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Log in to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}

      - name: Extract Git information
        id: git-info
        run: |
          echo "BRANCH_NAME=${GITHUB_REF_NAME}" >> "$GITHUB_ENV"
          echo "COMMIT_SHA=${GITHUB_SHA::7}" >> "$GITHUB_ENV"

      - name: Build and Push Docker Image
        run: |
          IMAGE_NAME=cognee/cognee
          TAG_VERSION="${BRANCH_NAME}-${COMMIT_SHA}"
          echo "Building image: ${IMAGE_NAME}:${TAG_VERSION}"
          docker buildx build \
            --platform linux/amd64,linux/arm64 \
            --push \
            --tag "${IMAGE_NAME}:${TAG_VERSION}" \
            --tag "${IMAGE_NAME}:latest" \
            .
      - name: Verify pushed Docker images
        run: |
          # Re-derive the image coordinates; plain shell variables from the previous step do not carry over between steps
          IMAGE_NAME=cognee/cognee
          TAG_VERSION="${BRANCH_NAME}-${COMMIT_SHA}"
          # Verify both platform variants
          for PLATFORM in "linux/amd64" "linux/arm64"; do
            echo "Verifying image for $PLATFORM..."
            docker buildx imagetools inspect "${IMAGE_NAME}:${TAG_VERSION}" --format "{{.Manifest.$PLATFORM.Digest}}"
          done
          echo "Successfully verified images in Docker Hub"


@@ -69,17 +69,18 @@ async def run_cognify_pipeline(dataset: Dataset, user: User, graph_model: BaseMo
    send_telemetry("cognee.cognify EXECUTION STARTED", user.id)

-    async with update_status_lock:
-        task_status = await get_pipeline_status([dataset_id])
+    #async with update_status_lock: TODO: Add UI lock to prevent multiple backend requests
+    task_status = await get_pipeline_status([dataset_id])

-        if dataset_id in task_status and task_status[dataset_id] == PipelineRunStatus.DATASET_PROCESSING_STARTED:
-            logger.info("Dataset %s is already being processed.", dataset_name)
-            return
+    if dataset_id in task_status and task_status[dataset_id] == PipelineRunStatus.DATASET_PROCESSING_STARTED:
+        logger.info("Dataset %s is already being processed.", dataset_name)
+        return

-        await log_pipeline_status(dataset_id, PipelineRunStatus.DATASET_PROCESSING_STARTED, {
-            "dataset_name": dataset_name,
-            "files": document_ids_str,
-        })
+    await log_pipeline_status(dataset_id, PipelineRunStatus.DATASET_PROCESSING_STARTED, {
+        "dataset_name": dataset_name,
+        "files": document_ids_str,
+    })

    try:
        cognee_config = get_cognify_config()


@@ -1,7 +1,7 @@
import json
from uuid import UUID
from enum import Enum
-from typing import Callable, Dict
+from typing import Callable, Dict, Union

from cognee.exceptions import InvalidValueError
from cognee.modules.search.operations import log_query, log_result
@@ -22,7 +22,12 @@ class SearchType(Enum):
    CHUNKS = "CHUNKS"
    COMPLETION = "COMPLETION"

-async def search(query_type: SearchType, query_text: str, user: User = None) -> list:
+async def search(query_type: SearchType, query_text: str, user: User = None,
+                 datasets: Union[list[str], str, None] = None) -> list:
+    # We use lists for datasets from here on; a single string is wrapped in a list
+    if isinstance(datasets, str):
+        datasets = [datasets]

    if user is None:
        user = await get_default_user()
@@ -31,7 +36,7 @@ async def search(query_type: SearchType, query_text: str, user: User = None) ->
    query = await log_query(query_text, str(query_type), user.id)

-    own_document_ids = await get_document_ids_for_user(user.id)
+    own_document_ids = await get_document_ids_for_user(user.id, datasets)
    search_results = await specific_search(query_type, query_text, user)

    filtered_search_results = []
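For reference, a minimal usage sketch of the new datasets filter (the dataset name is illustrative); a plain string is wrapped into a one-element list by the isinstance check above, and omitting datasets keeps the old behaviour of searching across all of the user's documents:

    # inside an async context, calling the search() defined in this file
    results = await search(SearchType.CHUNKS, "quantum computers", datasets="quantum")
    # equivalent to passing datasets=["quantum"]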


@@ -1,2 +1,3 @@
from .Data import Data
from .Dataset import Dataset
+from .DatasetData import DatasetData
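Judging from the dataset_id and data_id columns queried in get_document_ids_for_user below, DatasetData is the association model linking a Dataset to the Data rows it contains.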


@@ -1,9 +1,11 @@
from uuid import UUID
from sqlalchemy import select

from cognee.infrastructure.databases.relational import get_relational_engine
+from cognee.modules.data.models import Dataset, DatasetData

from ...models import ACL, Resource, Permission

-async def get_document_ids_for_user(user_id: UUID) -> list[str]:
+async def get_document_ids_for_user(user_id: UUID, datasets: list[str] = None) -> list[str]:
    db_engine = get_relational_engine()

    async with db_engine.get_async_session() as session:
@@ -18,4 +20,31 @@ async def get_document_ids_for_user(user_id: UUID) -> list[str]:
            )
        )).all()

+        if datasets:
+            documents_ids_in_dataset = set()

+            # If datasets are specified, filter out documents that aren't part of them
+            for dataset in datasets:
+                # Find the dataset id for this dataset name
+                dataset_id = (await session.scalars(
+                    select(Dataset.id)
+                    .where(
+                        Dataset.name == dataset,
+                        Dataset.owner_id == user_id,
+                    )
+                )).one_or_none()

+                # Check which documents are connected to this dataset
+                for document_id in document_ids:
+                    data_id = (await session.scalars(
+                        select(DatasetData.data_id)
+                        .where(
+                            DatasetData.dataset_id == dataset_id,
+                            DatasetData.data_id == document_id,
+                        )
+                    )).one_or_none()

+                    # If the document is related to the dataset, add it to the return value
+                    if data_id:
+                        documents_ids_in_dataset.add(document_id)

+            return list(documents_ids_in_dataset)

        return document_ids
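The nested loops above run one query per dataset name plus one query per (dataset, document) pair. For comparison, a sketch of the same filter expressed as two IN queries, assuming the same Dataset and DatasetData models and the same session block as above (an alternative shape, not what this commit implements):

        dataset_ids = (await session.scalars(
            select(Dataset.id).where(Dataset.name.in_(datasets), Dataset.owner_id == user_id)
        )).all()
        matching_data_ids = (await session.scalars(
            select(DatasetData.data_id).where(
                DatasetData.dataset_id.in_(dataset_ids),
                DatasetData.data_id.in_(document_ids),
            )
        )).all()
        return list(set(matching_data_ids))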


@@ -4,6 +4,7 @@ import pathlib
import cognee
from cognee.api.v1.search import SearchType
from cognee.modules.retrieval.brute_force_triplet_search import brute_force_triplet_search
+from cognee.modules.users.methods import get_default_user

logging.basicConfig(level=logging.DEBUG)
@@ -44,12 +45,13 @@ async def main():
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata = True)

-    dataset_name = "cs_explanations"
+    dataset_name_1 = "natural_language"
+    dataset_name_2 = "quantum"

    explanation_file_path = os.path.join(
        pathlib.Path(__file__).parent, "test_data/Natural_language_processing.txt"
    )
-    await cognee.add([explanation_file_path], dataset_name)
+    await cognee.add([explanation_file_path], dataset_name_1)

    text = """A quantum computer is a computer that takes advantage of quantum mechanical phenomena.
At small scales, physical matter exhibits properties of both particles and waves, and quantum computing leverages this behavior, specifically quantum superposition and entanglement, using specialized hardware that supports the preparation and manipulation of quantum states.
@@ -59,12 +61,23 @@ async def main():
In principle, a non-quantum (classical) computer can solve the same computational problems as a quantum computer, given enough time. Quantum advantage comes in the form of time complexity rather than computability, and quantum complexity theory shows that some quantum algorithms for carefully selected tasks require exponentially fewer computational steps than the best known non-quantum algorithms. Such tasks can in theory be solved on a large-scale quantum computer whereas classical computers would not finish computations in any reasonable amount of time. However, quantum speedup is not universal or even typical across computational tasks, since basic tasks such as sorting are proven to not allow any asymptotic quantum speedup. Claims of quantum supremacy have drawn significant attention to the discipline, but are demonstrated on contrived tasks, while near-term practical use cases remain limited.
"""
-    await cognee.add([text], dataset_name)
+    await cognee.add([text], dataset_name_2)

-    await cognee.cognify([dataset_name])
+    await cognee.cognify([dataset_name_2, dataset_name_1])

    from cognee.infrastructure.databases.vector import get_vector_engine

+    # Test getting documents for search, scoped to a single dataset
+    from cognee.modules.users.permissions.methods import get_document_ids_for_user
+    user = await get_default_user()
+    document_ids = await get_document_ids_for_user(user.id, [dataset_name_1])
+    assert len(document_ids) == 1, f"Number of expected documents doesn't match {len(document_ids)} != 1"

+    # Test getting documents for search when no dataset is provided
+    user = await get_default_user()
+    document_ids = await get_document_ids_for_user(user.id)
+    assert len(document_ids) == 2, f"Number of expected documents doesn't match {len(document_ids)} != 2"

    vector_engine = get_vector_engine()
    random_node = (await vector_engine.search("entity_name", "Quantum computer"))[0]
    random_node_name = random_node.payload["text"]
@@ -75,7 +88,7 @@ async def main():
    for result in search_results:
        print(f"{result}\n")

-    search_results = await cognee.search(SearchType.CHUNKS, query_text = random_node_name)
+    search_results = await cognee.search(SearchType.CHUNKS, query_text = random_node_name, datasets=[dataset_name_2])
    assert len(search_results) != 0, "The search results list is empty."

    print("\n\nExtracted chunks are:\n")
    for result in search_results: