Merge branch 'dev' into COG-650-replace-pylint

commit ad8f2a59d4
Igor Ilic, 2024-12-18 14:27:56 +01:00 (committed by GitHub)
6 changed files with 118 additions and 18 deletions

.github/workflows/dockerhub.yml (new file)

@@ -0,0 +1,51 @@
name: build | Build and Push Docker Image to DockerHub

on:
  push:
    branches:
      - main

jobs:
  docker-build-and-push:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Log in to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}

      - name: Extract Git information
        id: git-info
        run: |
          echo "BRANCH_NAME=${GITHUB_REF_NAME}" >> "$GITHUB_ENV"
          echo "COMMIT_SHA=${GITHUB_SHA::7}" >> "$GITHUB_ENV"

      - name: Build and Push Docker Image
        run: |
          IMAGE_NAME=cognee/cognee
          TAG_VERSION="${BRANCH_NAME}-${COMMIT_SHA}"
          echo "Building image: ${IMAGE_NAME}:${TAG_VERSION}"
          docker buildx build \
            --platform linux/amd64,linux/arm64 \
            --push \
            --tag "${IMAGE_NAME}:${TAG_VERSION}" \
            --tag "${IMAGE_NAME}:latest" \
            .
      - name: Verify pushed Docker images
        run: |
          # Re-derive the image coordinates; plain shell variables from the previous step do not carry over between steps
          IMAGE_NAME=cognee/cognee
          TAG_VERSION="${BRANCH_NAME}-${COMMIT_SHA}"
          # Verify both platform variants
          for PLATFORM in "linux/amd64" "linux/arm64"; do
            echo "Verifying image for $PLATFORM..."
            docker buildx imagetools inspect "${IMAGE_NAME}:${TAG_VERSION}" --format "{{.Manifest.$PLATFORM.Digest}}"
          done
          echo "Successfully verified images in Docker Hub"


@@ -69,17 +69,18 @@ async def run_cognify_pipeline(dataset: Dataset, user: User, graph_model: BaseMo
    send_telemetry("cognee.cognify EXECUTION STARTED", user.id)

-    async with update_status_lock:
-        task_status = await get_pipeline_status([dataset_id])
+    #async with update_status_lock: TODO: Add UI lock to prevent multiple backend requests
+    task_status = await get_pipeline_status([dataset_id])

-        if dataset_id in task_status and task_status[dataset_id] == PipelineRunStatus.DATASET_PROCESSING_STARTED:
-            logger.info("Dataset %s is already being processed.", dataset_name)
-            return
+    if dataset_id in task_status and task_status[dataset_id] == PipelineRunStatus.DATASET_PROCESSING_STARTED:
+        logger.info("Dataset %s is already being processed.", dataset_name)
+        return

-        await log_pipeline_status(dataset_id, PipelineRunStatus.DATASET_PROCESSING_STARTED, {
-            "dataset_name": dataset_name,
-            "files": document_ids_str,
-        })
+    await log_pipeline_status(dataset_id, PipelineRunStatus.DATASET_PROCESSING_STARTED, {
+        "dataset_name": dataset_name,
+        "files": document_ids_str,
+    })

    try:
        cognee_config = get_cognify_config()


@@ -1,7 +1,7 @@
import json
from uuid import UUID
from enum import Enum
-from typing import Callable, Dict
+from typing import Callable, Dict, Union

from cognee.exceptions import InvalidValueError
from cognee.modules.search.operations import log_query, log_result
@@ -22,7 +22,12 @@ class SearchType(Enum):
    CHUNKS = "CHUNKS"
    COMPLETION = "COMPLETION"

-async def search(query_type: SearchType, query_text: str, user: User = None) -> list:
+async def search(query_type: SearchType, query_text: str, user: User = None,
+                 datasets: Union[list[str], str, None] = None) -> list:
+    # We use lists for datasets from here on; a single string is wrapped in a list
+    if isinstance(datasets, str):
+        datasets = [datasets]

    if user is None:
        user = await get_default_user()
@@ -31,7 +36,7 @@ async def search(query_type: SearchType, query_text: str, user: User = None) ->
    query = await log_query(query_text, str(query_type), user.id)

-    own_document_ids = await get_document_ids_for_user(user.id)
+    own_document_ids = await get_document_ids_for_user(user.id, datasets)
    search_results = await specific_search(query_type, query_text, user)

    filtered_search_results = []
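For reference, a minimal usage sketch of the new datasets filter (the dataset name is illustrative); a plain string is wrapped into a one-element list by the isinstance check above, and omitting datasets keeps the old behaviour of searching across all of the user's documents:

    # inside an async context, calling the search() defined in this file
    results = await search(SearchType.CHUNKS, "quantum computers", datasets="quantum")
    # equivalent to passing datasets=["quantum"]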


@@ -1,2 +1,3 @@
from .Data import Data
from .Dataset import Dataset
+from .DatasetData import DatasetData
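Judging from the dataset_id and data_id columns queried in get_document_ids_for_user below, DatasetData is the association model linking a Dataset to the Data rows it contains.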


@@ -1,9 +1,11 @@
from uuid import UUID
from sqlalchemy import select

from cognee.infrastructure.databases.relational import get_relational_engine
+from cognee.modules.data.models import Dataset, DatasetData

from ...models import ACL, Resource, Permission

-async def get_document_ids_for_user(user_id: UUID) -> list[str]:
+async def get_document_ids_for_user(user_id: UUID, datasets: list[str] = None) -> list[str]:
    db_engine = get_relational_engine()

    async with db_engine.get_async_session() as session:
@@ -18,4 +20,31 @@ async def get_document_ids_for_user(user_id: UUID) -> list[str]:
            )
        )).all()

+        if datasets:
+            documents_ids_in_dataset = set()

+            # If datasets are specified, filter out documents that aren't part of them
+            for dataset in datasets:
+                # Find the dataset id for this dataset name
+                dataset_id = (await session.scalars(
+                    select(Dataset.id)
+                    .where(
+                        Dataset.name == dataset,
+                        Dataset.owner_id == user_id,
+                    )
+                )).one_or_none()

+                # Check which documents are connected to this dataset
+                for document_id in document_ids:
+                    data_id = (await session.scalars(
+                        select(DatasetData.data_id)
+                        .where(
+                            DatasetData.dataset_id == dataset_id,
+                            DatasetData.data_id == document_id,
+                        )
+                    )).one_or_none()

+                    # If the document is related to the dataset, add it to the return value
+                    if data_id:
+                        documents_ids_in_dataset.add(document_id)

+            return list(documents_ids_in_dataset)

        return document_ids
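The nested loops above run one query per dataset name plus one query per (dataset, document) pair. For comparison, a sketch of the same filter expressed as two IN queries, assuming the same Dataset and DatasetData models and the same session block as above (an alternative shape, not what this commit implements):

        dataset_ids = (await session.scalars(
            select(Dataset.id).where(Dataset.name.in_(datasets), Dataset.owner_id == user_id)
        )).all()
        matching_data_ids = (await session.scalars(
            select(DatasetData.data_id).where(
                DatasetData.dataset_id.in_(dataset_ids),
                DatasetData.data_id.in_(document_ids),
            )
        )).all()
        return list(set(matching_data_ids))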


@@ -4,6 +4,7 @@ import pathlib
import cognee
from cognee.api.v1.search import SearchType
from cognee.modules.retrieval.brute_force_triplet_search import brute_force_triplet_search
+from cognee.modules.users.methods import get_default_user

logging.basicConfig(level=logging.DEBUG)
@@ -44,12 +45,13 @@ async def main():
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata = True)

-    dataset_name = "cs_explanations"
+    dataset_name_1 = "natural_language"
+    dataset_name_2 = "quantum"

    explanation_file_path = os.path.join(
        pathlib.Path(__file__).parent, "test_data/Natural_language_processing.txt"
    )
-    await cognee.add([explanation_file_path], dataset_name)
+    await cognee.add([explanation_file_path], dataset_name_1)

    text = """A quantum computer is a computer that takes advantage of quantum mechanical phenomena.
At small scales, physical matter exhibits properties of both particles and waves, and quantum computing leverages this behavior, specifically quantum superposition and entanglement, using specialized hardware that supports the preparation and manipulation of quantum states.
@@ -59,12 +61,23 @@ async def main():
In principle, a non-quantum (classical) computer can solve the same computational problems as a quantum computer, given enough time. Quantum advantage comes in the form of time complexity rather than computability, and quantum complexity theory shows that some quantum algorithms for carefully selected tasks require exponentially fewer computational steps than the best known non-quantum algorithms. Such tasks can in theory be solved on a large-scale quantum computer whereas classical computers would not finish computations in any reasonable amount of time. However, quantum speedup is not universal or even typical across computational tasks, since basic tasks such as sorting are proven to not allow any asymptotic quantum speedup. Claims of quantum supremacy have drawn significant attention to the discipline, but are demonstrated on contrived tasks, while near-term practical use cases remain limited.
"""
-    await cognee.add([text], dataset_name)
+    await cognee.add([text], dataset_name_2)

-    await cognee.cognify([dataset_name])
+    await cognee.cognify([dataset_name_2, dataset_name_1])

    from cognee.infrastructure.databases.vector import get_vector_engine

+    # Test getting documents for search, scoped to a single dataset
+    from cognee.modules.users.permissions.methods import get_document_ids_for_user
+    user = await get_default_user()
+    document_ids = await get_document_ids_for_user(user.id, [dataset_name_1])
+    assert len(document_ids) == 1, f"Number of expected documents doesn't match {len(document_ids)} != 1"

+    # Test getting documents for search when no dataset is provided
+    user = await get_default_user()
+    document_ids = await get_document_ids_for_user(user.id)
+    assert len(document_ids) == 2, f"Number of expected documents doesn't match {len(document_ids)} != 2"

    vector_engine = get_vector_engine()
    random_node = (await vector_engine.search("entity_name", "Quantum computer"))[0]
    random_node_name = random_node.payload["text"]
@@ -75,7 +88,7 @@ async def main():
    for result in search_results:
        print(f"{result}\n")

-    search_results = await cognee.search(SearchType.CHUNKS, query_text = random_node_name)
+    search_results = await cognee.search(SearchType.CHUNKS, query_text = random_node_name, datasets=[dataset_name_2])
    assert len(search_results) != 0, "The search results list is empty."

    print("\n\nExtracted chunks are:\n")
    for result in search_results: