From 62c84dde5e18591c9a4623eaa0729dfc60a19615 Mon Sep 17 00:00:00 2001 From: Vasilije <8619304+Vasilije1990@users.noreply.github.com> Date: Sat, 8 Mar 2025 08:51:57 -0800 Subject: [PATCH 01/14] feat: added helm clean push (#606) ## Description ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin ## Summary by CodeRabbit - **New Features** - Introduced a Helm-based deployment package that streamlines setup for the backend application and PostgreSQL database on Kubernetes. - Added orchestration support via Docker Compose for managing multi-container deployments. - Added new Kubernetes resources including Deployments, Services, and PersistentVolumeClaims for both the backend and PostgreSQL. - **Documentation** - Provided comprehensive infrastructure and deployment instructions for Kubernetes environments. - **Chores** - Established a standardized container build process for the Python application. - Introduced configuration settings for service ports, resource limits, and environment variables. 
--------- Co-authored-by: Daniel Molnar Co-authored-by: Boris --- helm/Chart.yaml | 24 ++++++++++ helm/Dockerfile | 59 +++++++++++++++++++++++++ helm/README.md | 25 +++++++++++ helm/docker-compose-helm.yml | 46 +++++++++++++++++++ helm/templates/cognee_deployment.yaml | 32 ++++++++++++++ helm/templates/cognee_service.yaml | 13 ++++++ helm/templates/postgres_deployment.yaml | 35 +++++++++++++++ helm/templates/postgres_pvc.yaml | 10 +++++ helm/templates/postgres_service.yaml | 14 ++++++ helm/values.yaml | 22 +++++++++ 10 files changed, 280 insertions(+) create mode 100644 helm/Chart.yaml create mode 100644 helm/Dockerfile create mode 100644 helm/README.md create mode 100644 helm/docker-compose-helm.yml create mode 100644 helm/templates/cognee_deployment.yaml create mode 100644 helm/templates/cognee_service.yaml create mode 100644 helm/templates/postgres_deployment.yaml create mode 100644 helm/templates/postgres_pvc.yaml create mode 100644 helm/templates/postgres_service.yaml create mode 100644 helm/values.yaml diff --git a/helm/Chart.yaml b/helm/Chart.yaml new file mode 100644 index 000000000..ab9e087df --- /dev/null +++ b/helm/Chart.yaml @@ -0,0 +1,24 @@ +apiVersion: v2 +name: cognee-chart +description: A helm chart of the cognee backend deployment on Kubernetes environment + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. 
+# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.1.0 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. +appVersion: "1.16.0" diff --git a/helm/Dockerfile b/helm/Dockerfile new file mode 100644 index 000000000..153834cd6 --- /dev/null +++ b/helm/Dockerfile @@ -0,0 +1,59 @@ +FROM python:3.11-slim + +# Define Poetry extras to install +ARG POETRY_EXTRAS="\ +# Storage & Databases \ +filesystem postgres weaviate qdrant neo4j falkordb milvus \ +# Notebooks & Interactive Environments \ +notebook \ +# LLM & AI Frameworks \ +langchain llama-index gemini huggingface ollama mistral groq \ +# Evaluation & Monitoring \ +deepeval evals posthog \ +# Graph Processing & Code Analysis \ +codegraph graphiti \ +# Document Processing \ +docs" + +# Set build argument +ARG DEBUG + +# Set environment variable based on the build argument +ENV DEBUG=${DEBUG} +ENV PIP_NO_CACHE_DIR=true +ENV PATH="${PATH}:/root/.poetry/bin" + + +# Slim base images ship without apt package lists: refresh them in the same layer as the install, then purge them +RUN apt-get update && apt-get install -y gcc libpq-dev \ + && rm -rf /var/lib/apt/lists/* + + +WORKDIR /app +COPY pyproject.toml poetry.lock /app/ + + +RUN pip install poetry + +# Don't create virtualenv since docker is already isolated +RUN poetry config virtualenvs.create false + +# Install the dependencies +RUN poetry install --extras "${POETRY_EXTRAS}" --no-root --without dev + + +# Set the PYTHONPATH environment variable to include the /app directory +ENV PYTHONPATH=/app + +COPY cognee/ /app/cognee + +# Copy Alembic configuration +COPY alembic.ini /app/alembic.ini +COPY alembic/ /app/alembic + +COPY entrypoint.sh /app/entrypoint.sh +RUN chmod +x /app/entrypoint.sh + +RUN sed -i 's/\r$//' /app/entrypoint.sh + +ENTRYPOINT ["/app/entrypoint.sh"] diff --git a/helm/README.md b/helm/README.md new file mode 
100644 index 000000000..b7aaa6325 --- /dev/null +++ b/helm/README.md @@ -0,0 +1,25 @@ + +# cognee-infra-helm +General infrastructure setup for Cognee on Kubernetes using a Helm chart. + +## Prerequisites +Before deploying the Helm chart, ensure the following prerequisites are met: + +**Kubernetes Cluster**: A running Kubernetes cluster (e.g., Minikube, GKE, EKS). + +**Helm**: Installed and configured for your Kubernetes cluster. You can install Helm by following the [official guide](https://helm.sh/docs/intro/install/). + +**kubectl**: Installed and configured to interact with your cluster. Follow the instructions [here](https://kubernetes.io/docs/tasks/tools/install-kubectl/). + +**Clone the Repository**: Clone this repository to your local machine and navigate to its root directory. + +## Deploy Helm Chart + + ```bash + helm install cognee ./helm + ``` + +## Uninstall Helm Release + ```bash + helm uninstall cognee + ``` diff --git a/helm/docker-compose-helm.yml b/helm/docker-compose-helm.yml new file mode 100644 index 000000000..8aaa63816 --- /dev/null +++ b/helm/docker-compose-helm.yml @@ -0,0 +1,46 @@ +services: + cognee: + image : cognee-backend:latest + container_name: cognee-backend + networks: + - cognee-network + build: + context: . + dockerfile: Dockerfile + volumes: + - .:/app + - /app/cognee-frontend/ # Ignore frontend code + environment: + - HOST=0.0.0.0 + - ENVIRONMENT=local + - PYTHONPATH=. 
+ ports: + - 8000:8000 + # - 5678:5678 # Debugging + deploy: + resources: + limits: + cpus: '4.0' + memory: 8GB + + postgres: + image: pgvector/pgvector:pg17 + container_name: postgres + environment: + POSTGRES_USER: cognee + POSTGRES_PASSWORD: cognee + POSTGRES_DB: cognee_db + volumes: + - postgres_data:/var/lib/postgresql/data + ports: + - 5432:5432 + networks: + - cognee-network + +networks: + cognee-network: + name: cognee-network + +volumes: + postgres_data: + diff --git a/helm/templates/cognee_deployment.yaml b/helm/templates/cognee_deployment.yaml new file mode 100644 index 000000000..f16a475ec --- /dev/null +++ b/helm/templates/cognee_deployment.yaml @@ -0,0 +1,32 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ .Release.Name }}-cognee + labels: + app: {{ .Release.Name }}-cognee +spec: + replicas: 1 + selector: + matchLabels: + app: {{ .Release.Name }}-cognee + template: + metadata: + labels: + app: {{ .Release.Name }}-cognee + spec: + containers: + - name: cognee + image: {{ .Values.cognee.image }} + ports: + - containerPort: {{ .Values.cognee.port }} + env: # values are quoted so numeric- or boolean-looking overrides still render as YAML strings + - name: HOST + value: {{ .Values.cognee.env.HOST | quote }} + - name: ENVIRONMENT + value: {{ .Values.cognee.env.ENVIRONMENT | quote }} + - name: PYTHONPATH + value: {{ .Values.cognee.env.PYTHONPATH | quote }} + resources: + limits: + cpu: {{ .Values.cognee.resources.cpu }} + memory: {{ .Values.cognee.resources.memory }} diff --git a/helm/templates/cognee_service.yaml b/helm/templates/cognee_service.yaml new file mode 100644 index 000000000..21e9e470e --- /dev/null +++ b/helm/templates/cognee_service.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ .Release.Name }}-cognee + labels: + app: {{ .Release.Name }}-cognee +spec: + type: NodePort + ports: + - port: {{ .Values.cognee.port }} + targetPort: {{ .Values.cognee.port }} + selector: + app: {{ .Release.Name }}-cognee diff --git a/helm/templates/postgres_deployment.yaml b/helm/templates/postgres_deployment.yaml new file mode 100644 
index 000000000..fc47647a2 --- /dev/null +++ b/helm/templates/postgres_deployment.yaml @@ -0,0 +1,35 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ .Release.Name }}-postgres + labels: + app: {{ .Release.Name }}-postgres +spec: + replicas: 1 + selector: + matchLabels: + app: {{ .Release.Name }}-postgres + template: + metadata: + labels: + app: {{ .Release.Name }}-postgres + spec: + containers: + - name: postgres + image: {{ .Values.postgres.image }} + ports: + - containerPort: {{ .Values.postgres.port }} + env: # values are quoted so an all-digit user, password or database name still renders as a YAML string + - name: POSTGRES_USER + value: {{ .Values.postgres.env.POSTGRES_USER | quote }} + - name: POSTGRES_PASSWORD + value: {{ .Values.postgres.env.POSTGRES_PASSWORD | quote }} + - name: POSTGRES_DB + value: {{ .Values.postgres.env.POSTGRES_DB | quote }} + volumeMounts: + - name: postgres-storage + mountPath: /var/lib/postgresql/data + volumes: + - name: postgres-storage + persistentVolumeClaim: + claimName: {{ .Release.Name }}-postgres-pvc diff --git a/helm/templates/postgres_pvc.yaml b/helm/templates/postgres_pvc.yaml new file mode 100644 index 000000000..7d7661b16 --- /dev/null +++ b/helm/templates/postgres_pvc.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ .Release.Name }}-postgres-pvc +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: {{ .Values.postgres.storage }} diff --git a/helm/templates/postgres_service.yaml b/helm/templates/postgres_service.yaml new file mode 100644 index 000000000..7a944a128 --- /dev/null +++ b/helm/templates/postgres_service.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ .Release.Name }}-postgres + labels: + app: {{ .Release.Name }}-postgres +spec: + type: ClusterIP + ports: + - port: {{ .Values.postgres.port }} + targetPort: {{ .Values.postgres.port }} + selector: + app: {{ .Release.Name }}-postgres + diff --git a/helm/values.yaml b/helm/values.yaml new file mode 100644 index 000000000..fb2d3f7e3 --- /dev/null +++ b/helm/values.yaml @@ -0,0 +1,22 
@@ +# Configuration for the 'cognee' application service +cognee: + # Image name (using the local image we’ll build in Minikube) + image: "hajdul1988/cognee-backend:latest" + port: 8000 + env: + HOST: "0.0.0.0" + ENVIRONMENT: "local" + PYTHONPATH: "." + resources: + cpu: "4.0" + memory: "8Gi" + +# Configuration for the 'postgres' database service +postgres: + image: "pgvector/pgvector:pg17" + port: 5432 + env: + POSTGRES_USER: "cognee" + POSTGRES_PASSWORD: "cognee" + POSTGRES_DB: "cognee_db" + storage: "8Gi" From 56427f287ef85e1548e77d650e648a378c8410ab Mon Sep 17 00:00:00 2001 From: hibajamal <35984866+hibajamal@users.noreply.github.com> Date: Sat, 8 Mar 2025 20:33:42 +0100 Subject: [PATCH 02/14] Demo for relational db with cognee (#620) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Description This demo uses pydantic models and dlt to pull data from the Pokémon API and structure it into a relational format. By feeding this structured data into cognee, it makes searching across multiple tables easier and more intuitive, thanks to the relational model. ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin ## Summary by CodeRabbit - **New Features** - Introduced a comprehensive Pokémon data processing pipeline, available as both a Python script and an interactive Jupyter Notebook. - Enabled asynchronous operations for efficient data collection and querying, including an integrated search functionality. - Improved error handling and data validation during the data fetching and processing stages for a smoother user experience. 
Co-authored-by: Vasilije <8619304+Vasilije1990@users.noreply.github.com> --- examples/python/pokemon_datapoints_example.py | 190 +++++++ notebooks/pokemon_datapoints_notebook.ipynb | 536 ++++++++++++++++++ 2 files changed, 726 insertions(+) create mode 100644 examples/python/pokemon_datapoints_example.py create mode 100644 notebooks/pokemon_datapoints_notebook.ipynb diff --git a/examples/python/pokemon_datapoints_example.py b/examples/python/pokemon_datapoints_example.py new file mode 100644 index 000000000..058492e63 --- /dev/null +++ b/examples/python/pokemon_datapoints_example.py @@ -0,0 +1,190 @@ +# Standard library imports +import os +import json +import asyncio +import pathlib +from uuid import uuid5, NAMESPACE_OID +from typing import List, Optional +from pathlib import Path + +import dlt +import requests +import cognee +from cognee.low_level import DataPoint, setup as cognee_setup +from cognee.api.v1.search import SearchType +from cognee.tasks.storage import add_data_points +from cognee.modules.pipelines.tasks.Task import Task +from cognee.modules.pipelines import run_tasks + + +BASE_URL = "https://pokeapi.co/api/v2/" +os.environ["BUCKET_URL"] = "./.data_storage" +os.environ["DATA_WRITER__DISABLE_COMPRESSION"] = "true" + +# Data Models +class Abilities(DataPoint): + name: str = "Abilities" + metadata: dict = {"index_fields": ["name"]} + +class PokemonAbility(DataPoint): + name: str + ability__name: str + ability__url: str + is_hidden: bool + slot: int + _dlt_load_id: str + _dlt_id: str + _dlt_parent_id: str + _dlt_list_idx: str + is_type: Abilities + metadata: dict = {"index_fields": ["ability__name"]} + +class Pokemons(DataPoint): + name: str = "Pokemons" + have: Abilities + metadata: dict = {"index_fields": ["name"]} + +class Pokemon(DataPoint): + name: str + base_experience: int + height: int + weight: int + is_default: bool + order: int + location_area_encounters: str + species__name: str + species__url: str + cries__latest: str + cries__legacy: str + 
+ sprites__front_default: str + sprites__front_shiny: str + sprites__back_default: Optional[str] + sprites__back_shiny: Optional[str] + _dlt_load_id: str + _dlt_id: str + is_type: Pokemons + abilities: List[PokemonAbility] + metadata: dict = {"index_fields": ["name"]} + +# Data Collection Functions +@dlt.resource(write_disposition="replace") +def pokemon_list(limit: int = 50): + response = requests.get(f"{BASE_URL}pokemon", params={"limit": limit}, timeout=30) # bounded wait: requests never times out by default + response.raise_for_status() + yield response.json()["results"] + +@dlt.transformer(data_from=pokemon_list) +def pokemon_details(pokemons): + """Fetches detailed info for each Pokémon""" + for pokemon in pokemons: + response = requests.get(pokemon["url"], timeout=30) # bounded wait so one stalled detail request cannot hang the pipeline + response.raise_for_status() + yield response.json() + +# Data Loading Functions +def load_abilities_data(jsonl_abilities): + abilities_root = Abilities() + pokemon_abilities = [] + + for jsonl_ability in jsonl_abilities: + with open(jsonl_ability, "r", encoding="utf-8") as f: # explicit encoding: do not depend on the platform default + for line in f: + ability = json.loads(line) + ability["id"] = uuid5(NAMESPACE_OID, ability["_dlt_id"]) + ability["name"] = ability["ability__name"] + ability["is_type"] = abilities_root + pokemon_abilities.append(ability) + + return abilities_root, pokemon_abilities + +def load_pokemon_data(jsonl_pokemons, pokemon_abilities, pokemon_root): + pokemons = [] + + for jsonl_pokemon in jsonl_pokemons: + with open(jsonl_pokemon, "r", encoding="utf-8") as f: # explicit encoding: do not depend on the platform default + for line in f: + pokemon_data = json.loads(line) + abilities = [ + ability for ability in pokemon_abilities + if ability["_dlt_parent_id"] == pokemon_data["_dlt_id"] + ] + pokemon_data["external_id"] = pokemon_data["id"] + pokemon_data["id"] = uuid5(NAMESPACE_OID, str(pokemon_data["id"])) + pokemon_data["abilities"] = [PokemonAbility(**ability) for ability in abilities] + pokemon_data["is_type"] = pokemon_root + pokemons.append(Pokemon(**pokemon_data)) + + return pokemons + +# Main Application Logic +async def setup_and_process_data(): + """Setup configuration and process Pokemon 
data""" + # Setup configuration + data_directory_path = str(pathlib.Path(os.path.join(pathlib.Path(__file__).parent, ".data_storage")).resolve()) + cognee_directory_path = str(pathlib.Path(os.path.join(pathlib.Path(__file__).parent, ".cognee_system")).resolve()) + + cognee.config.data_root_directory(data_directory_path) + cognee.config.system_root_directory(cognee_directory_path) + + # Initialize pipeline and collect data + pipeline = dlt.pipeline( + pipeline_name="pokemon_pipeline", + destination="filesystem", + dataset_name="pokemon_data", + ) + info = pipeline.run([pokemon_list, pokemon_details]) + print(info) + + # Load and process data + STORAGE_PATH = Path(".data_storage/pokemon_data/pokemon_details") + jsonl_pokemons = sorted(STORAGE_PATH.glob("*.jsonl")) + if not jsonl_pokemons: + raise FileNotFoundError("No JSONL files found in the storage directory.") + + ABILITIES_PATH = Path(".data_storage/pokemon_data/pokemon_details__abilities") + jsonl_abilities = sorted(ABILITIES_PATH.glob("*.jsonl")) + if not jsonl_abilities: + raise FileNotFoundError("No JSONL files found in the storage directory.") + + # Process data + abilities_root, pokemon_abilities = load_abilities_data(jsonl_abilities) + pokemon_root = Pokemons(have=abilities_root) + pokemons = load_pokemon_data(jsonl_pokemons, pokemon_abilities, pokemon_root) + + return pokemons + +async def pokemon_cognify(pokemons): + """Process Pokemon data with Cognee and perform search""" + # Setup and run Cognee tasks + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + await cognee_setup() + + tasks = [Task(add_data_points, task_config={"batch_size": 50})] + results = run_tasks( + tasks=tasks, + data=pokemons, + dataset_id=uuid5(NAMESPACE_OID, "Pokemon"), + pipeline_name='pokemon_pipeline', + ) + + async for result in results: + print(result) + print("Done") + + # Perform search + search_results = await cognee.search( + query_type=SearchType.GRAPH_COMPLETION, + query_text="pokemons?" 
+ ) + + print("Search results:") + for result_text in search_results: + print(result_text) + +async def main(): + pokemons = await setup_and_process_data() + await pokemon_cognify(pokemons) + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/notebooks/pokemon_datapoints_notebook.ipynb b/notebooks/pokemon_datapoints_notebook.ipynb new file mode 100644 index 000000000..9fbc34bc1 --- /dev/null +++ b/notebooks/pokemon_datapoints_notebook.ipynb @@ -0,0 +1,536 @@ +{ + "cells": [ + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-03-04T11:58:00.193158Z", + "start_time": "2025-03-04T11:58:00.190238Z" + } + }, + "cell_type": "code", + "source": [ + "import nest_asyncio\n", + "nest_asyncio.apply()" + ], + "id": "2efba278d106bb5f", + "outputs": [], + "execution_count": 2 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "### Environment Configuration\n", + "#### Setup required directories and environment variables.\n" + ], + "id": "ccbb2bc23aa456ee" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-03-04T11:59:33.879188Z", + "start_time": "2025-03-04T11:59:33.873682Z" + } + }, + "cell_type": "code", + "source": [ + "import pathlib\n", + "import os\n", + "import cognee\n", + "\n", + "notebook_dir = pathlib.Path().resolve()\n", + "data_directory_path = str(notebook_dir / \".data_storage\")\n", + "cognee_directory_path = str(notebook_dir / \".cognee_system\")\n", + "\n", + "cognee.config.data_root_directory(data_directory_path)\n", + "cognee.config.system_root_directory(cognee_directory_path)\n", + "\n", + "BASE_URL = \"https://pokeapi.co/api/v2/\"\n", + "os.environ[\"BUCKET_URL\"] = data_directory_path\n", + "os.environ[\"DATA_WRITER__DISABLE_COMPRESSION\"] = \"true\"\n" + ], + "id": "662d554f96f211d9", + "outputs": [], + "execution_count": 8 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "## Initialize DLT Pipeline\n", + "### Create the DLT pipeline to fetch Pokémon data.\n" + ], + "id": 
"36ae0be71f6e9167" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-03-04T11:58:03.982939Z", + "start_time": "2025-03-04T11:58:03.819676Z" + } + }, + "cell_type": "code", + "source": [ + "import dlt\n", + "from pathlib import Path\n", + "\n", + "pipeline = dlt.pipeline(\n", + " pipeline_name=\"pokemon_pipeline\",\n", + " destination=\"filesystem\",\n", + " dataset_name=\"pokemon_data\",\n", + ")\n" + ], + "id": "25101ae5f016ce0c", + "outputs": [], + "execution_count": 4 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "## Fetch Pokémon List\n", + "### Retrieve a list of Pokémon from the API.\n" + ], + "id": "9a87ce05a072c48b" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-03-04T11:58:03.990076Z", + "start_time": "2025-03-04T11:58:03.987199Z" + } + }, + "cell_type": "code", + "source": [ + "@dlt.resource(write_disposition=\"replace\")\n", + "def pokemon_list(limit: int = 50):\n", + " import requests\n", + " response = requests.get(f\"{BASE_URL}pokemon\", params={\"limit\": limit})\n", + " response.raise_for_status()\n", + " yield response.json()[\"results\"]\n" + ], + "id": "3b6e60778c61e24a", + "outputs": [], + "execution_count": 5 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "## Fetch Pokémon Details\n", + "### Fetch detailed information about each Pokémon.\n" + ], + "id": "9952767846194e97" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-03-04T11:58:03.996394Z", + "start_time": "2025-03-04T11:58:03.994122Z" + } + }, + "cell_type": "code", + "source": [ + "@dlt.transformer(data_from=pokemon_list)\n", + "def pokemon_details(pokemons):\n", + " \"\"\"Fetches detailed info for each Pokémon\"\"\"\n", + " import requests\n", + " for pokemon in pokemons:\n", + " response = requests.get(pokemon[\"url\"])\n", + " response.raise_for_status()\n", + " yield response.json()\n" + ], + "id": "79ec9fef12267485", + "outputs": [], + "execution_count": 6 + }, + { + "metadata": {}, + 
"cell_type": "markdown", + "source": [ + "## Run Data Pipeline\n", + "### Execute the pipeline and store Pokémon data.\n" + ], + "id": "41e05f660bf9e9d2" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-03-04T11:59:41.571015Z", + "start_time": "2025-03-04T11:59:36.840744Z" + } + }, + "cell_type": "code", + "source": [ + "info = pipeline.run([pokemon_list, pokemon_details])\n", + "print(info)\n" + ], + "id": "20a3b2c7f404677f", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Pipeline pokemon_pipeline load step completed in 0.06 seconds\n", + "1 load package(s) were loaded to destination filesystem and into dataset pokemon_data\n", + "The filesystem destination used file:///Users/lazar/PycharmProjects/cognee/.data_storage location to store data\n", + "Load package 1741089576.860229 is LOADED and contains no failed jobs\n" + ] + } + ], + "execution_count": 9 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "## Load Pokémon Abilities\n", + "### Load Pokémon ability data from stored files.\n" + ], + "id": "937f10b8d1037743" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-03-04T11:59:44.377719Z", + "start_time": "2025-03-04T11:59:44.363718Z" + } + }, + "cell_type": "code", + "source": [ + "import json\n", + "from cognee.low_level import DataPoint\n", + "from uuid import uuid5, NAMESPACE_OID\n", + "\n", + "class Abilities(DataPoint):\n", + " name: str = \"Abilities\"\n", + " metadata: dict = {\"index_fields\": [\"name\"]}\n", + "\n", + "def load_abilities_data(jsonl_abilities):\n", + " abilities_root = Abilities()\n", + " pokemon_abilities = []\n", + "\n", + " for jsonl_ability in jsonl_abilities:\n", + " with open(jsonl_ability, \"r\") as f:\n", + " for line in f:\n", + " ability = json.loads(line)\n", + " ability[\"id\"] = uuid5(NAMESPACE_OID, ability[\"_dlt_id\"])\n", + " ability[\"name\"] = ability[\"ability__name\"]\n", + " ability[\"is_type\"] = abilities_root\n", + " 
pokemon_abilities.append(ability)\n", + "\n", + " return abilities_root, pokemon_abilities\n" + ], + "id": "be73050036439ea1", + "outputs": [], + "execution_count": 10 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "## Load Pokémon Data\n", + "### Load Pokémon details and associate them with abilities.\n" + ], + "id": "98c97f799f73df77" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-03-04T11:59:46.251306Z", + "start_time": "2025-03-04T11:59:46.238283Z" + } + }, + "cell_type": "code", + "source": [ + "from typing import List, Optional\n", + "\n", + "class Pokemons(DataPoint):\n", + " name: str = \"Pokemons\"\n", + " have: Abilities\n", + " metadata: dict = {\"index_fields\": [\"name\"]}\n", + "\n", + "class PokemonAbility(DataPoint):\n", + " name: str\n", + " ability__name: str\n", + " ability__url: str\n", + " is_hidden: bool\n", + " slot: int\n", + " _dlt_load_id: str\n", + " _dlt_id: str\n", + " _dlt_parent_id: str\n", + " _dlt_list_idx: str\n", + " is_type: Abilities\n", + " metadata: dict = {\"index_fields\": [\"ability__name\"]}\n", + "\n", + "class Pokemon(DataPoint):\n", + " name: str\n", + " base_experience: int\n", + " height: int\n", + " weight: int\n", + " is_default: bool\n", + " order: int\n", + " location_area_encounters: str\n", + " species__name: str\n", + " species__url: str\n", + " cries__latest: str\n", + " cries__legacy: str\n", + " sprites__front_default: str\n", + " sprites__front_shiny: str\n", + " sprites__back_default: Optional[str]\n", + " sprites__back_shiny: Optional[str]\n", + " _dlt_load_id: str\n", + " _dlt_id: str\n", + " is_type: Pokemons\n", + " abilities: List[PokemonAbility]\n", + " metadata: dict = {\"index_fields\": [\"name\"]}\n", + "\n", + "def load_pokemon_data(jsonl_pokemons, pokemon_abilities, pokemon_root):\n", + " pokemons = []\n", + "\n", + " for jsonl_pokemon in jsonl_pokemons:\n", + " with open(jsonl_pokemon, \"r\") as f:\n", + " for line in f:\n", + " pokemon_data = 
json.loads(line)\n", + " abilities = [\n", + " ability for ability in pokemon_abilities\n", + " if ability[\"_dlt_parent_id\"] == pokemon_data[\"_dlt_id\"]\n", + " ]\n", + " pokemon_data[\"external_id\"] = pokemon_data[\"id\"]\n", + " pokemon_data[\"id\"] = uuid5(NAMESPACE_OID, str(pokemon_data[\"id\"]))\n", + " pokemon_data[\"abilities\"] = [PokemonAbility(**ability) for ability in abilities]\n", + " pokemon_data[\"is_type\"] = pokemon_root\n", + " pokemons.append(Pokemon(**pokemon_data))\n", + "\n", + " return pokemons\n" + ], + "id": "7862951248df0bf5", + "outputs": [], + "execution_count": 11 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "## Process Pokémon Data\n", + "### Load and associate Pokémon abilities.\n" + ], + "id": "676fa5a2b61c2107" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-03-04T11:59:47.365226Z", + "start_time": "2025-03-04T11:59:47.356722Z" + } + }, + "cell_type": "code", + "source": [ + "STORAGE_PATH = Path(\".data_storage/pokemon_data/pokemon_details\")\n", + "jsonl_pokemons = sorted(STORAGE_PATH.glob(\"*.jsonl\"))\n", + "\n", + "ABILITIES_PATH = Path(\".data_storage/pokemon_data/pokemon_details__abilities\")\n", + "jsonl_abilities = sorted(ABILITIES_PATH.glob(\"*.jsonl\"))\n", + "\n", + "abilities_root, pokemon_abilities = load_abilities_data(jsonl_abilities)\n", + "pokemon_root = Pokemons(have=abilities_root)\n", + "pokemons = load_pokemon_data(jsonl_pokemons, pokemon_abilities, pokemon_root)\n" + ], + "id": "ad14cdecdccd71bb", + "outputs": [], + "execution_count": 12 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "## Initialize Cognee\n", + "### Setup Cognee for data processing.\n" + ], + "id": "59dec67b2ae50f0f" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-03-04T11:59:49.244577Z", + "start_time": "2025-03-04T11:59:48.618261Z" + } + }, + "cell_type": "code", + "source": [ + "import asyncio\n", + "from cognee.low_level import setup as cognee_setup\n", + 
"\n", + "async def initialize_cognee():\n", + " await cognee.prune.prune_data()\n", + " await cognee.prune.prune_system(metadata=True)\n", + " await cognee_setup()\n", + "\n", + "await initialize_cognee()\n" + ], + "id": "d2e095ae576a02c1", + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:cognee.infrastructure.databases.relational.sqlalchemy.SqlAlchemyAdapter:Database deleted successfully.INFO:cognee.infrastructure.databases.relational.sqlalchemy.SqlAlchemyAdapter:Database deleted successfully." + ] + } + ], + "execution_count": 13 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "## Process Pokémon Data\n", + "### Add Pokémon data points to Cognee.\n" + ], + "id": "5f0b8090bc7b1fe6" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-03-04T11:59:57.744035Z", + "start_time": "2025-03-04T11:59:50.574033Z" + } + }, + "cell_type": "code", + "source": [ + "from cognee.modules.pipelines.tasks.Task import Task\n", + "from cognee.tasks.storage import add_data_points\n", + "from cognee.modules.pipelines import run_tasks\n", + "\n", + "tasks = [Task(add_data_points, task_config={\"batch_size\": 50})]\n", + "results = run_tasks(\n", + " tasks=tasks,\n", + " data=pokemons,\n", + " dataset_id=uuid5(NAMESPACE_OID, \"Pokemon\"),\n", + " pipeline_name='pokemon_pipeline',\n", + ")\n", + "\n", + "async for result in results:\n", + " print(result)\n", + "print(\"Done\")\n" + ], + "id": "ffa12fc1f5350d95", + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:run_tasks(tasks: [Task], data):Pipeline run started: `fd2ed59d-b550-5b05-bbe6-7b708fe12483`INFO:run_tasks(tasks: [Task], data):Coroutine task started: `add_data_points`" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "User d347ea85-e512-4cae-b9d7-496fe1745424 has registered.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"/Users/lazar/PycharmProjects/cognee/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py:79: SAWarning: This declarative base already contains a class with the same class name and module name as cognee.infrastructure.databases.vector.pgvector.PGVectorAdapter.PGVectorDataPoint, and will be replaced in the string-lookup table.\n", + " class PGVectorDataPoint(Base):\n", + "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"/Users/lazar/PycharmProjects/cognee/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py:113: SAWarning: This declarative base already contains a class with the same class name and module name as cognee.infrastructure.databases.vector.pgvector.PGVectorAdapter.PGVectorDataPoint, and will be replaced in the string-lookup table.\n", + " class PGVectorDataPoint(Base):\n", + "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"WARNING:neo4j.notifications:Received notification from DBMS server: {severity: WARNING} {code: Neo.ClientNotification.Statement.FeatureDeprecationWarning} {category: DEPRECATION} {title: This feature is deprecated and will be removed in future versions.} {description: The query used a deprecated function: `id`.} {position: line: 8, column: 16, offset: 335} for query: '\\n UNWIND $nodes AS node\\n MERGE (n {id: node.node_id})\\n ON CREATE SET n += node.properties, n.updated_at = timestamp()\\n ON MATCH SET n += node.properties, n.updated_at = timestamp()\\n WITH n, node.node_id AS label\\n CALL apoc.create.addLabels(n, [label]) YIELD node AS labeledNode\\n RETURN ID(labeledNode) AS internal_id, labeledNode.id AS nodeId\\n 'WARNING:neo4j.notifications:Received notification from DBMS server: {severity: WARNING} {code: 
Neo.ClientNotification.Statement.FeatureDeprecationWarning} {category: DEPRECATION} {title: This feature is deprecated and will be removed in future versions.} {description: The query used a deprecated function: `id`.} {position: line: 1, column: 18, offset: 17} for query: 'MATCH (n) RETURN ID(n) AS id, labels(n) AS labels, properties(n) AS properties'WARNING:neo4j.notifications:Received notification from DBMS server: {severity: WARNING} {code: Neo.ClientNotification.Statement.FeatureDeprecationWarning} {category: DEPRECATION} {title: This feature is deprecated and will be removed in future versions.} {description: The query used a deprecated function: `id`.} {position: line: 3, column: 16, offset: 43} for query: '\\n MATCH (n)-[r]->(m)\\n RETURN ID(n) AS source, ID(m) AS target, TYPE(r) AS type, properties(r) AS properties\\n 'WARNING:neo4j.notifications:Received notification from DBMS server: {severity: WARNING} {code: Neo.ClientNotification.Statement.FeatureDeprecationWarning} {category: DEPRECATION} {title: This feature is deprecated and will be removed in future versions.} {description: The query used a deprecated function: `id`.} {position: line: 3, column: 33, offset: 60} for query: '\\n MATCH (n)-[r]->(m)\\n RETURN ID(n) AS source, ID(m) AS target, TYPE(r) AS type, properties(r) AS properties\\n 'INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:run_tasks(tasks: [Task], data):Coroutine task completed: `add_data_points`INFO:run_tasks(tasks: [Task], data):Pipeline run completed: `fd2ed59d-b550-5b05-bbe6-7b708fe12483`" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Done\n" + ] + } + ], + "execution_count": 14 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "## Search Pokémon Data\n", + "### Execute a search query using Cognee.\n" + ], + "id": "e0d98d9832a2797a" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-03-04T12:00:02.878871Z", + 
"start_time": "2025-03-04T11:59:59.571965Z" + } + }, + "cell_type": "code", + "source": [ + "from cognee.api.v1.search import SearchType\n", + "\n", + "search_results = await cognee.search(\n", + " query_type=SearchType.GRAPH_COMPLETION,\n", + " query_text=\"pokemons?\"\n", + ")\n", + "\n", + "print(\"Search results:\")\n", + "for result_text in search_results:\n", + " print(result_text)" + ], + "id": "bb2476b6b0c2aff", + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:httpx:HTTP Request: POST 
https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"WARNING:neo4j.notifications:Received notification from DBMS server: {severity: WARNING} {code: Neo.ClientNotification.Statement.FeatureDeprecationWarning} {category: DEPRECATION} {title: This feature is deprecated and will be removed in future versions.} {description: The query used a deprecated function: `id`.} {position: line: 1, column: 18, offset: 17} for query: 'MATCH (n) RETURN ID(n) AS id, labels(n) AS labels, properties(n) AS properties'WARNING:neo4j.notifications:Received notification from DBMS server: {severity: WARNING} {code: Neo.ClientNotification.Statement.FeatureDeprecationWarning} {category: DEPRECATION} {title: This feature is deprecated and will be removed in future versions.} {description: The query used a deprecated function: `id`.} {position: line: 3, column: 16, offset: 43} for query: '\\n MATCH (n)-[r]->(m)\\n RETURN ID(n) AS source, ID(m) AS target, TYPE(r) AS type, properties(r) AS properties\\n 'WARNING:neo4j.notifications:Received notification from DBMS server: {severity: WARNING} {code: Neo.ClientNotification.Statement.FeatureDeprecationWarning} {category: DEPRECATION} {title: This feature is deprecated and will be removed in future versions.} {description: The query used a deprecated function: `id`.} {position: line: 3, column: 33, offset: 60} for query: '\\n MATCH (n)-[r]->(m)\\n RETURN ID(n) AS source, ID(m) AS target, TYPE(r) AS type, properties(r) AS properties\\n 'INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"\u001B[92m13:00:02 - LiteLLM:INFO\u001B[0m: utils.py:2784 - \n", + "LiteLLM completion() 
model= gpt-4o-mini; provider = openaiINFO:LiteLLM:\n", + "LiteLLM completion() model= gpt-4o-mini; provider = openai" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Search results:\n", + "The Pokemons mentioned are: golbat, jigglypuff, raichu, vulpix, and pikachu.\n" + ] + } + ], + "execution_count": 15 + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": "", + "id": "a4c2d3e9c15b017" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From ac0156514d6dbc237559ac44a937fef330a679c3 Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Mon, 10 Mar 2025 10:55:31 +0100 Subject: [PATCH 03/14] feat: COG-1523 add top_k in run_question_answering (#625) ## Description - Expose top_k as an optional argument of run_question_answering - Update retrievers to handle the parameters ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin ## Summary by CodeRabbit - **New Features** - Enhanced answer generation and document retrieval capabilities by introducing an optional parameter that allows users to specify the number of top results. This improvement adds flexibility when retrieving question responses and associated context, adapting the output based on user preference. 
--- .../answer_generation/run_question_answering_module.py | 8 +++++--- cognee/modules/retrieval/completion_retriever.py | 4 +++- cognee/modules/retrieval/graph_completion_retriever.py | 4 ++-- .../retrieval/graph_summary_completion_retriever.py | 2 +- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/cognee/eval_framework/answer_generation/run_question_answering_module.py b/cognee/eval_framework/answer_generation/run_question_answering_module.py index 1d3686efb..70938a451 100644 --- a/cognee/eval_framework/answer_generation/run_question_answering_module.py +++ b/cognee/eval_framework/answer_generation/run_question_answering_module.py @@ -1,6 +1,6 @@ import logging import json -from typing import List +from typing import List, Optional from cognee.eval_framework.answer_generation.answer_generation_executor import ( AnswerGeneratorExecutor, retriever_options, @@ -32,7 +32,7 @@ async def create_and_insert_answers_table(questions_payload): async def run_question_answering( - params: dict, system_prompt="answer_simple_question.txt" + params: dict, system_prompt="answer_simple_question.txt", top_k: Optional[int] = None ) -> List[dict]: if params.get("answering_questions"): logging.info("Question answering started...") @@ -48,7 +48,9 @@ async def run_question_answering( answer_generator = AnswerGeneratorExecutor() answers = await answer_generator.question_answering_non_parallel( questions=questions, - retriever=retriever_options[params["qa_engine"]](system_prompt_path=system_prompt), + retriever=retriever_options[params["qa_engine"]]( + system_prompt_path=system_prompt, top_k=top_k + ), ) with open(params["answers_path"], "w", encoding="utf-8") as f: json.dump(answers, f, ensure_ascii=False, indent=4) diff --git a/cognee/modules/retrieval/completion_retriever.py b/cognee/modules/retrieval/completion_retriever.py index f2427f062..cf8600f27 100644 --- a/cognee/modules/retrieval/completion_retriever.py +++ b/cognee/modules/retrieval/completion_retriever.py @@ 
-13,15 +13,17 @@ class CompletionRetriever(BaseRetriever): self, user_prompt_path: str = "context_for_question.txt", system_prompt_path: str = "answer_simple_question.txt", + top_k: Optional[int] = 1, ): """Initialize retriever with optional custom prompt paths.""" self.user_prompt_path = user_prompt_path self.system_prompt_path = system_prompt_path + self.top_k = top_k if top_k is not None else 1 async def get_context(self, query: str) -> Any: """Retrieves relevant document chunks as context.""" vector_engine = get_vector_engine() - found_chunks = await vector_engine.search("DocumentChunk_text", query, limit=1) + found_chunks = await vector_engine.search("DocumentChunk_text", query, limit=self.top_k) if len(found_chunks) == 0: raise NoRelevantDataFound return found_chunks[0].payload["text"] diff --git a/cognee/modules/retrieval/graph_completion_retriever.py b/cognee/modules/retrieval/graph_completion_retriever.py index 709415fa7..80b8855d1 100644 --- a/cognee/modules/retrieval/graph_completion_retriever.py +++ b/cognee/modules/retrieval/graph_completion_retriever.py @@ -15,12 +15,12 @@ class GraphCompletionRetriever(BaseRetriever): self, user_prompt_path: str = "graph_context_for_question.txt", system_prompt_path: str = "answer_simple_question.txt", - top_k: int = 5, + top_k: Optional[int] = 5, ): """Initialize retriever with prompt paths and search parameters.""" self.user_prompt_path = user_prompt_path self.system_prompt_path = system_prompt_path - self.top_k = top_k + self.top_k = top_k if top_k is not None else 5 async def resolve_edges_to_text(self, retrieved_edges: list) -> str: """Converts retrieved graph edges into a human-readable string format.""" diff --git a/cognee/modules/retrieval/graph_summary_completion_retriever.py b/cognee/modules/retrieval/graph_summary_completion_retriever.py index 536bafe5d..76ed5f5d4 100644 --- a/cognee/modules/retrieval/graph_summary_completion_retriever.py +++ b/cognee/modules/retrieval/graph_summary_completion_retriever.py 
@@ -12,7 +12,7 @@ class GraphSummaryCompletionRetriever(GraphCompletionRetriever): user_prompt_path: str = "graph_context_for_question.txt", system_prompt_path: str = "answer_simple_question.txt", summarize_prompt_path: str = "summarize_search_results.txt", - top_k: int = 5, + top_k: Optional[int] = 5, ): """Initialize retriever with default prompt paths and search parameters.""" super().__init__( From 7b5bd7897f23d657fe670e802f44ba35664ab4ad Mon Sep 17 00:00:00 2001 From: alekszievr <44192193+alekszievr@users.noreply.github.com> Date: Mon, 10 Mar 2025 15:27:48 +0100 Subject: [PATCH 04/14] Feat: evaluate retrieved context against golden context [cog-1481] (#619) ## Description - Compare retrieved context to golden context using deepeval's summarization metric - Display relevant fields to each metric on metrics dashboard Example output: ![image](https://github.com/user-attachments/assets/9facf716-b2ab-4573-bfdf-7b343d2a57c5) ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin ## Summary by CodeRabbit - **New Features** - Enhanced context handling in answer generation and corpus building to include extended details. - Introduced a new context coverage metric for deeper evaluation insights. - Upgraded the evaluation dashboard with dynamic presentation of metric details. - Added a new parameter to support loading golden context in corpus loading methods. - **Bug Fixes** - Improved clarity in how answers are structured and appended in the answer generation process. 
--- .../answer_generation_executor.py | 19 ++++--- .../benchmark_adapters/dummy_adapter.py | 19 ++++--- .../corpus_builder/corpus_builder_executor.py | 16 ++++-- .../corpus_builder/run_corpus_builder.py | 5 +- .../evaluation/deep_eval_adapter.py | 3 ++ .../evaluation/evaluation_executor.py | 1 + .../evaluation/metrics/context_coverage.py | 50 +++++++++++++++++++ cognee/eval_framework/metrics_dashboard.py | 36 +++++++------ .../eval_framework/deepeval_adapter_test.py | 7 ++- 9 files changed, 115 insertions(+), 41 deletions(-) create mode 100644 cognee/eval_framework/evaluation/metrics/context_coverage.py diff --git a/cognee/eval_framework/answer_generation/answer_generation_executor.py b/cognee/eval_framework/answer_generation/answer_generation_executor.py index 1b984d465..67eb02578 100644 --- a/cognee/eval_framework/answer_generation/answer_generation_executor.py +++ b/cognee/eval_framework/answer_generation/answer_generation_executor.py @@ -29,13 +29,16 @@ class AnswerGeneratorExecutor: retrieval_context = await retriever.get_context(query_text) search_results = await retriever.get_completion(query_text, retrieval_context) - answers.append( - { - "question": query_text, - "answer": search_results[0], - "golden_answer": correct_answer, - "retrieval_context": retrieval_context, - } - ) + answer = { + "question": query_text, + "answer": search_results[0], + "golden_answer": correct_answer, + "retrieval_context": retrieval_context, + } + + if "golden_context" in instance: + answer["golden_context"] = instance["golden_context"] + + answers.append(answer) return answers diff --git a/cognee/eval_framework/benchmark_adapters/dummy_adapter.py b/cognee/eval_framework/benchmark_adapters/dummy_adapter.py index 69cc6e518..9bf945d06 100644 --- a/cognee/eval_framework/benchmark_adapters/dummy_adapter.py +++ b/cognee/eval_framework/benchmark_adapters/dummy_adapter.py @@ -5,18 +5,21 @@ from cognee.eval_framework.benchmark_adapters.base_benchmark_adapter import Base class 
DummyAdapter(BaseBenchmarkAdapter): def load_corpus( - self, limit: Optional[int] = None, seed: int = 42 + self, limit: Optional[int] = None, seed: int = 42, load_golden_context: bool = False ) -> tuple[list[str], list[dict[str, Any]]]: corpus_list = [ "The cognee is an AI memory engine that supports different vector and graph databases", "Neo4j is a graph database supported by cognee", ] - question_answer_pairs = [ - { - "answer": "Yes", - "question": "Is Neo4j supported by cognee?", - "type": "dummy", - } - ] + qa_pair = { + "answer": "Yes", + "question": "Is Neo4j supported by cognee?", + "type": "dummy", + } + + if load_golden_context: + qa_pair["golden_context"] = "Cognee supports Neo4j and NetworkX" + + question_answer_pairs = [qa_pair] return corpus_list, question_answer_pairs diff --git a/cognee/eval_framework/corpus_builder/corpus_builder_executor.py b/cognee/eval_framework/corpus_builder/corpus_builder_executor.py index 2e4a7fd3d..1d2b31e41 100644 --- a/cognee/eval_framework/corpus_builder/corpus_builder_executor.py +++ b/cognee/eval_framework/corpus_builder/corpus_builder_executor.py @@ -28,14 +28,22 @@ class CorpusBuilderExecutor: self.questions = None self.task_getter = task_getter - def load_corpus(self, limit: Optional[int] = None) -> Tuple[List[Dict], List[str]]: - self.raw_corpus, self.questions = self.adapter.load_corpus(limit=limit) + def load_corpus( + self, limit: Optional[int] = None, load_golden_context: bool = False + ) -> Tuple[List[Dict], List[str]]: + self.raw_corpus, self.questions = self.adapter.load_corpus( + limit=limit, load_golden_context=load_golden_context + ) return self.raw_corpus, self.questions async def build_corpus( - self, limit: Optional[int] = None, chunk_size=1024, chunker=TextChunker + self, + limit: Optional[int] = None, + chunk_size=1024, + chunker=TextChunker, + load_golden_context: bool = False, ) -> List[str]: - self.load_corpus(limit=limit) + self.load_corpus(limit=limit, load_golden_context=load_golden_context) 
await self.run_cognee(chunk_size=chunk_size, chunker=chunker) return self.questions diff --git a/cognee/eval_framework/corpus_builder/run_corpus_builder.py b/cognee/eval_framework/corpus_builder/run_corpus_builder.py index 2aff21249..6054688d2 100644 --- a/cognee/eval_framework/corpus_builder/run_corpus_builder.py +++ b/cognee/eval_framework/corpus_builder/run_corpus_builder.py @@ -47,7 +47,10 @@ async def run_corpus_builder(params: dict, chunk_size=1024, chunker=TextChunker) task_getter=task_getter, ) questions = await corpus_builder.build_corpus( - limit=params.get("number_of_samples_in_corpus"), chunk_size=chunk_size, chunker=chunker + limit=params.get("number_of_samples_in_corpus"), + chunk_size=chunk_size, + chunker=chunker, + load_golden_context=params.get("evaluating_contexts"), ) with open(params["questions_path"], "w", encoding="utf-8") as f: json.dump(questions, f, ensure_ascii=False, indent=4) diff --git a/cognee/eval_framework/evaluation/deep_eval_adapter.py b/cognee/eval_framework/evaluation/deep_eval_adapter.py index 11f33571b..761d66e05 100644 --- a/cognee/eval_framework/evaluation/deep_eval_adapter.py +++ b/cognee/eval_framework/evaluation/deep_eval_adapter.py @@ -4,6 +4,7 @@ from cognee.eval_framework.eval_config import EvalConfig from cognee.eval_framework.evaluation.base_eval_adapter import BaseEvalAdapter from cognee.eval_framework.evaluation.metrics.exact_match import ExactMatchMetric from cognee.eval_framework.evaluation.metrics.f1 import F1ScoreMetric +from cognee.eval_framework.evaluation.metrics.context_coverage import ContextCoverageMetric from typing import Any, Dict, List from deepeval.metrics import ContextualRelevancyMetric @@ -15,6 +16,7 @@ class DeepEvalAdapter(BaseEvalAdapter): "EM": ExactMatchMetric(), "f1": F1ScoreMetric(), "contextual_relevancy": ContextualRelevancyMetric(), + "context_coverage": ContextCoverageMetric(), } async def evaluate_answers( @@ -32,6 +34,7 @@ class DeepEvalAdapter(BaseEvalAdapter): 
actual_output=answer["answer"], expected_output=answer["golden_answer"], retrieval_context=[answer["retrieval_context"]], + context=[answer["golden_context"]] if "golden_context" in answer else None, ) metric_results = {} for metric in evaluator_metrics: diff --git a/cognee/eval_framework/evaluation/evaluation_executor.py b/cognee/eval_framework/evaluation/evaluation_executor.py index 5e56b50c7..1de01f101 100644 --- a/cognee/eval_framework/evaluation/evaluation_executor.py +++ b/cognee/eval_framework/evaluation/evaluation_executor.py @@ -23,5 +23,6 @@ class EvaluationExecutor: async def execute(self, answers: List[Dict[str, str]], evaluator_metrics: Any) -> Any: if self.evaluate_contexts: evaluator_metrics.append("contextual_relevancy") + evaluator_metrics.append("context_coverage") metrics = await self.eval_adapter.evaluate_answers(answers, evaluator_metrics) return metrics diff --git a/cognee/eval_framework/evaluation/metrics/context_coverage.py b/cognee/eval_framework/evaluation/metrics/context_coverage.py new file mode 100644 index 000000000..9fdd5e14e --- /dev/null +++ b/cognee/eval_framework/evaluation/metrics/context_coverage.py @@ -0,0 +1,50 @@ +from deepeval.metrics import SummarizationMetric +from deepeval.test_case import LLMTestCase +from deepeval.metrics.summarization.schema import ScoreType +from deepeval.metrics.indicator import metric_progress_indicator +from deepeval.utils import get_or_create_event_loop + + +class ContextCoverageMetric(SummarizationMetric): + def measure( + self, + test_case, + _show_indicator: bool = True, + ) -> float: + mapped_test_case = LLMTestCase( + input=test_case.context[0], + actual_output=test_case.retrieval_context[0], + ) + self.assessment_questions = None + self.evaluation_cost = 0 if self.using_native_model else None + with metric_progress_indicator(self, _show_indicator=_show_indicator): + if self.async_mode: + loop = get_or_create_event_loop() + return loop.run_until_complete( + self.a_measure(mapped_test_case, 
_show_indicator=False) + ) + else: + self.coverage_verdicts = self._generate_coverage_verdicts(mapped_test_case) + self.alignment_verdicts = [] + self.score = self._calculate_score(ScoreType.COVERAGE) + self.reason = self._generate_reason() + self.success = self.score >= self.threshold + return self.score + + async def a_measure( + self, + test_case, + _show_indicator: bool = True, + ) -> float: + self.evaluation_cost = 0 if self.using_native_model else None + with metric_progress_indicator( + self, + async_mode=True, + _show_indicator=_show_indicator, + ): + self.coverage_verdicts = await self._a_generate_coverage_verdicts(test_case) + self.alignment_verdicts = [] + self.score = self._calculate_score(ScoreType.COVERAGE) + self.reason = await self._a_generate_reason() + self.success = self.score >= self.threshold + return self.score diff --git a/cognee/eval_framework/metrics_dashboard.py b/cognee/eval_framework/metrics_dashboard.py index 2c917740a..eb4d2ed8e 100644 --- a/cognee/eval_framework/metrics_dashboard.py +++ b/cognee/eval_framework/metrics_dashboard.py @@ -3,6 +3,12 @@ import plotly.graph_objects as go from typing import Dict, List, Tuple from collections import defaultdict +metrics_fields = { + "contextual_relevancy": ["question", "retrieval_context"], + "context_coverage": ["question", "retrieval_context", "golden_context"], +} +default_metrics_fields = ["question", "answer", "golden_answer"] + def create_distribution_plots(metrics_data: Dict[str, List[float]]) -> List[str]: """Create distribution histogram plots for each metric.""" @@ -59,38 +65,30 @@ def generate_details_html(metrics_data: List[Dict]) -> List[str]: for metric, values in entry["metrics"].items(): if metric not in metric_details: metric_details[metric] = [] + current_metrics_fields = metrics_fields.get(metric, default_metrics_fields) metric_details[metric].append( - { - "question": entry["question"], - "answer": entry["answer"], - "golden_answer": entry["golden_answer"], + {key: 
entry[key] for key in current_metrics_fields} + | { "reason": values.get("reason", ""), "score": values["score"], } ) for metric, details in metric_details.items(): + formatted_column_names = [key.replace("_", " ").title() for key in details[0].keys()] details_html.append(f"

{metric} Details

") - details_html.append(""" + details_html.append(f""" - - - - - + {"".join(f"" for col in formatted_column_names)} """) for item in details: - details_html.append( - f"" - f"" - f"" - f"" - f"" - f"" - f"" - ) + details_html.append(f""" + + {"".join(f"" for value in item.values())} + + """) details_html.append("
QuestionAnswerGolden AnswerReasonScore{col}
{item['question']}{item['answer']}{item['golden_answer']}{item['reason']}{item['score']}
{value}
") return details_html diff --git a/cognee/tests/unit/eval_framework/deepeval_adapter_test.py b/cognee/tests/unit/eval_framework/deepeval_adapter_test.py index eda9f0b66..48375221c 100644 --- a/cognee/tests/unit/eval_framework/deepeval_adapter_test.py +++ b/cognee/tests/unit/eval_framework/deepeval_adapter_test.py @@ -5,7 +5,12 @@ import sys with patch.dict( sys.modules, - {"deepeval": MagicMock(), "deepeval.metrics": MagicMock(), "deepeval.test_case": MagicMock()}, + { + "deepeval": MagicMock(), + "deepeval.metrics": MagicMock(), + "deepeval.test_case": MagicMock(), + "cognee.eval_framework.evaluation.metrics.context_coverage": MagicMock(), + }, ): from cognee.eval_framework.evaluation.deep_eval_adapter import DeepEvalAdapter From 819e411149248e7ab4ddc839e3a619d6a2e8d59a Mon Sep 17 00:00:00 2001 From: Daniel Molnar Date: Mon, 10 Mar 2025 16:07:36 +0100 Subject: [PATCH 05/14] Small clarifications. (#624) ## Description Small clarifications in README.md. ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin ## Summary by CodeRabbit - **Documentation** - Updated documentation to feature a single, centrally positioned demo link for clearer navigation. - Clarified setup instructions to indicate that default configurations are applied when custom environment variables are not provided. --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index ef37fd157..7af33c120 100644 --- a/README.md +++ b/README.md @@ -8,11 +8,11 @@ cognee - memory layer for AI apps and Agents

+ Demo + . Learn more · Join Discord - · - Demo

@@ -89,7 +89,7 @@ Add LLM_API_KEY to .env using the command bellow. ``` echo "LLM_API_KEY=YOUR_OPENAI_API_KEY" > .env ``` -You can see available env variables in the repository `.env.template` file. +You can see available env variables in the repository `.env.template` file. If you don't specify it otherwise, like in this example, SQLite (relational database), LanceDB (vector database) and NetworkX (graph store) will be used as default components. This script will run the default pipeline: From 1d4d54c1f58af0a223bc8ee1ae6a5ccd1a1c8af7 Mon Sep 17 00:00:00 2001 From: Vasilije <8619304+Vasilije1990@users.noreply.github.com> Date: Tue, 11 Mar 2025 03:01:06 +0100 Subject: [PATCH 06/14] Update CONTRIBUTING.md --- CONTRIBUTING.md | 111 +++++++++++++++++++++++------------------------- 1 file changed, 52 insertions(+), 59 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ba9c3df25..734ae62fe 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,97 +1,90 @@ -# 🚀 How to Contribute to **cognee** +# 🎉 Welcome to **cognee**! -Thank you for investing time in contributing to our project! Here's a guide to get you started. +We're excited that you're interested in contributing to our project! This guide will help you get started and ensure your contributions can be efficiently integrated into the project. -## 1. 🚀 Getting Started +## 🌟 Quick Links -### 🍴 Fork the Repository +- [Code of Conduct](CODE_OF_CONDUCT.md) +- [Discord Community](https://discord.gg/bcy8xFAtfd) +- [Issue Tracker](https://github.com/topoteretes/cognee/issues) -To start your journey, you'll need your very own copy of **cognee**. Think of it as your own innovation lab. 🧪 +## 1. 🚀 Ways to Contribute -1. Navigate to the [**cognee**](https://github.com/topoteretes/cognee) repository on GitHub. -2. In the upper-right corner, click the **'Fork'** button. 
+You can contribute to **cognee** in many ways: -### 🚀 Clone the Repository +- 📝 Submitting bug reports or feature requests +- 💡 Improving documentation +- 🔍 Reviewing pull requests +- 🛠️ Contributing code or tests +- 🌐 Helping other users -Next, let's bring your newly forked repository to your local machine. +## 2. 🛠️ Development Setup +### Fork and Clone + +1. Fork the [**cognee**](https://github.com/topoteretes/cognee) repository +2. Clone your fork: ```shell git clone https://github.com//cognee.git +cd cognee ``` -## 2. 🛠️ Making Changes - -### 🌟 Create a Branch - -Get ready to channel your creativity. Begin by creating a new branch for your incredible features. 🧞‍♂️ +### Create a Branch +Create a new branch for your work: ```shell git checkout -b feature/your-feature-name ``` -### ✏️ Make Your Changes +## 3. 🎯 Making Changes -Now's your chance to shine! Dive in and make your contributions. 🌠 - -## 3. 🚀 Submitting Changes - -After making your changes, follow these steps: - -### ✅ Run the Tests - -Ensure your changes do not break the existing codebase: +1. **Code Style**: Follow the project's coding standards +2. **Documentation**: Update relevant documentation +3. **Tests**: Add tests for new features +4. **Commits**: Write clear commit messages +### Running Tests ```shell python cognee/cognee/tests/test_library.py ``` -### 🚢 Push Your Feature Branch +## 4. 📤 Submitting Changes +1. Push your changes: ```shell -# Add your changes to the staging area: git add . - -# Commit changes with an adequate description: -git commit -m "Describe your changes here" - -# Push your feature branch to your forked repository: +git commit -s -m "Description of your changes" git push origin feature/your-feature-name ``` -### 🚀 Create a Pull Request +2. Create a Pull Request: + - Go to the [**cognee** repository](https://github.com/topoteretes/cognee) + - Click "Compare & Pull Request" + - Fill in the PR template with details about your changes -You're on the verge of completion! 
It's time to showcase your hard work. 🌐 +## 5. 📜 Developer Certificate of Origin (DCO) -1. Go to [**cognee**](https://github.com/topoteretes/cognee) on GitHub. -2. Hit the **"Compare & Pull Request"** button. -3. Select the base branch (main) and the compare branch (the one with your features). -4. Craft a **compelling title** and provide a **detailed description** of your contributions. 🎩 +All contributions must be signed-off to indicate agreement with our DCO: -## 4. 🔍 Review and Approval +```shell +git config alias.cos "commit -s" # Create alias for signed commits +``` -The project maintainers will review your work, possibly suggest improvements, or request further details. Once you receive approval, your contributions will become part of **cognee**! +When your PR is ready, please include: +> "I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin" +## 6. 🤝 Community Guidelines -## 5. Developer Certificate of Origin -All contributions to the topoteretes codebase must be signed-off to indicate you have read and agreed to the Developer Certificate of Origin (DCO), which is in the root directory under name DCO. To sign the DCO, simply add -s after all commits that you make, to do this easily you can make a git alias from the command line, for example: +- Be respectful and inclusive +- Help others learn and grow +- Follow our [Code of Conduct](CODE_OF_CONDUCT.md) +- Provide constructive feedback +- Ask questions when unsure -$ git config alias.cos "commit -s" +## 7. 📫 Getting Help -Will allow you to write git cos which will automatically sign-off your commit. By signing a commit you are agreeing to the DCO and agree that you will be banned from the topoteretes GitHub organisation and Discord server if you violate the DCO. 
+- Open an [issue](https://github.com/topoteretes/cognee/issues) +- Join our Discord community +- Check existing documentation -"When a commit is ready to be merged please use the following template to agree to our developer certificate of origin: - 'I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin' - -We consider the following as violations to the DCO: - -Signing the DCO with a fake name or pseudonym, if you are registered on GitHub or another platform with a fake name then you will not be able to contribute to topoteretes before updating your name; -Submitting a contribution that you did not have the right to submit whether due to licensing, copyright, or any other restrictions. - -## 6. 📜 Code of Conduct -Ensure you adhere to the project's [Code of Conduct](https://github.com/topoteretes/cognee/blob/main/CODE_OF_CONDUCT.md) throughout your participation. - -## 7. 📫 Contact - -If you need assistance or simply wish to connect, we're here for you. Contact us by filing an issue on the GitHub repository or by messaging us on our Discord server. - -Thanks for helping to evolve **cognee**! +Thank you for contributing to **cognee**! 🌟 From a74c96609f31886c24b8b92e8716aec2a1ad9913 Mon Sep 17 00:00:00 2001 From: Vasilije <8619304+Vasilije1990@users.noreply.github.com> Date: Tue, 11 Mar 2025 03:07:25 +0100 Subject: [PATCH 07/14] Update CONTRIBUTING.md --- CONTRIBUTING.md | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 734ae62fe..9e97d0d23 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,6 +1,8 @@ # 🎉 Welcome to **cognee**! -We're excited that you're interested in contributing to our project! This guide will help you get started and ensure your contributions can be efficiently integrated into the project. +We're excited that you're interested in contributing to our project! 
+We want to ensure that every user and contributor feels welcome, included and supported to participate in cognee community. +This guide will help you get started and ensure your contributions can be efficiently integrated into the project. ## 🌟 Quick Links @@ -18,6 +20,42 @@ You can contribute to **cognee** in many ways: - 🛠️ Contributing code or tests - 🌐 Helping other users +## 📫 Get in Touch + +There are several ways to connect with the **cognee** team and community: + +### GitHub Collaboration +- [Open an issue](https://github.com/topoteretes/cognee/issues) for bug reports, feature requests, or discussions +- Submit pull requests to contribute code or documentation +- Join ongoing discussions in existing issues and PRs + +### Community Channels +- Join our [Discord community](https://discord.gg/bcy8xFAtfd) for real-time discussions +- Participate in community events and discussions +- Get help from other community members + +### Direct Contact +- Email: vasilije@cognee.ai +- For business inquiries or sensitive matters, please reach out via email +- For general questions, prefer public channels like GitHub issues or Discord + +We aim to respond to all communications within 2 business days. For faster responses, consider using our Discord channel where the whole community can help! + +## Issue Labels + +To help you find the most appropriate issues to work on, we use the following labels: + +- `good first issue` - Perfect for newcomers to the project +- `bug` - Something isn't working as expected +- `documentation` - Improvements or additions to documentation +- `enhancement` - New features or improvements +- `help wanted` - Extra attention or assistance needed +- `question` - Further information is requested +- `wontfix` - This will not be worked on + +Looking for a place to start? Try filtering for [good first issues](https://github.com/topoteretes/cognee/labels/good%20first%20issue)! + + ## 2. 
🛠️ Development Setup ### Fork and Clone From 3f69234776d5f814d0ec41d2b9d857c73d6a7cbb Mon Sep 17 00:00:00 2001 From: Boris Arzentar Date: Tue, 11 Mar 2025 16:41:12 +0100 Subject: [PATCH 08/14] fix: remove double install step from Dockerfile --- cognee-mcp/Dockerfile | 11 ++++++----- cognee-mcp/README.md | 2 +- cognee-mcp/pyproject.toml | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/cognee-mcp/Dockerfile b/cognee-mcp/Dockerfile index 45e601e0f..e229fb08a 100644 --- a/cognee-mcp/Dockerfile +++ b/cognee-mcp/Dockerfile @@ -20,15 +20,16 @@ RUN apt-get update && apt-get install -y \ gcc \ libpq-dev -RUN apt-get install -y \ - gcc \ - libpq-dev - COPY . /app RUN uv sync --reinstall # Place executables in the environment at the front of the path -ENV PATH="/app/:/app/.venv/bin:$PATH" +ENV PATH="/app:/app/.venv/bin:$PATH" + +# Set environment variables for MCP server +ENV PYTHONUNBUFFERED=1 +ENV MCP_LOG_LEVEL=DEBUG +ENV PYTHONPATH=/app ENTRYPOINT ["cognee"] diff --git a/cognee-mcp/README.md b/cognee-mcp/README.md index c9926270c..fa8888f29 100644 --- a/cognee-mcp/README.md +++ b/cognee-mcp/README.md @@ -82,5 +82,5 @@ http://localhost:5173?timeout=120000 To apply new changes while developing cognee you need to do: 1. `poetry lock` in cognee folder -2. `uv sync --dev --all-extras --reinstall ` +2. `uv sync --dev --all-extras --reinstall` 3. 
`mcp dev src/server.py` diff --git a/cognee-mcp/pyproject.toml b/cognee-mcp/pyproject.toml index 972b44b05..1f9bae195 100644 --- a/cognee-mcp/pyproject.toml +++ b/cognee-mcp/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "cognee-mcp" -version = "0.1.0" +version = "0.2.0" description = "A MCP server project" readme = "README.md" requires-python = ">=3.10" From deb3e0cce146e950c052b7030b7d9bbd362b0e18 Mon Sep 17 00:00:00 2001 From: Boris Arzentar Date: Tue, 11 Mar 2025 16:41:38 +0100 Subject: [PATCH 09/14] version: v0.1.33 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 147ffe798..7aac688db 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "cognee" -version = "0.1.32" +version = "0.1.33" description = "Cognee - is a library for enriching LLM context with a semantic layer for better understanding and reasoning." authors = ["Vasilije Markovic", "Boris Arzentar"] readme = "README.md" From 40c0015f0d339e75fd0a7ae146455e9feffa7f1f Mon Sep 17 00:00:00 2001 From: Boris Arzentar Date: Tue, 11 Mar 2025 16:43:22 +0100 Subject: [PATCH 10/14] chore: update uv.lock --- cognee-mcp/uv.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cognee-mcp/uv.lock b/cognee-mcp/uv.lock index 182f35103..88f66e626 100644 --- a/cognee-mcp/uv.lock +++ b/cognee-mcp/uv.lock @@ -547,7 +547,7 @@ huggingface = [ [[package]] name = "cognee-mcp" -version = "0.1.0" +version = "0.2.0" source = { editable = "." 
} dependencies = [ { name = "cognee", extra = ["codegraph", "gemini", "huggingface"] }, From 2e4aab9a9ad53cc76e97e9a786627f079922ec77 Mon Sep 17 00:00:00 2001 From: Boris Arzentar Date: Tue, 11 Mar 2025 16:44:00 +0100 Subject: [PATCH 11/14] fix: example ruff errors --- examples/python/pokemon_datapoints_example.py | 28 +++++++++++++++---- 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/examples/python/pokemon_datapoints_example.py b/examples/python/pokemon_datapoints_example.py index 058492e63..83179cf9f 100644 --- a/examples/python/pokemon_datapoints_example.py +++ b/examples/python/pokemon_datapoints_example.py @@ -21,11 +21,13 @@ BASE_URL = "https://pokeapi.co/api/v2/" os.environ["BUCKET_URL"] = "./.data_storage" os.environ["DATA_WRITER__DISABLE_COMPRESSION"] = "true" + # Data Models class Abilities(DataPoint): name: str = "Abilities" metadata: dict = {"index_fields": ["name"]} + class PokemonAbility(DataPoint): name: str ability__name: str @@ -39,11 +41,13 @@ class PokemonAbility(DataPoint): is_type: Abilities metadata: dict = {"index_fields": ["ability__name"]} + class Pokemons(DataPoint): name: str = "Pokemons" have: Abilities metadata: dict = {"index_fields": ["name"]} + class Pokemon(DataPoint): name: str base_experience: int @@ -66,6 +70,7 @@ class Pokemon(DataPoint): abilities: List[PokemonAbility] metadata: dict = {"index_fields": ["name"]} + # Data Collection Functions @dlt.resource(write_disposition="replace") def pokemon_list(limit: int = 50): @@ -73,6 +78,7 @@ def pokemon_list(limit: int = 50): response.raise_for_status() yield response.json()["results"] + @dlt.transformer(data_from=pokemon_list) def pokemon_details(pokemons): """Fetches detailed info for each Pokémon""" @@ -81,6 +87,7 @@ def pokemon_details(pokemons): response.raise_for_status() yield response.json() + # Data Loading Functions def load_abilities_data(jsonl_abilities): abilities_root = Abilities() @@ -97,6 +104,7 @@ def load_abilities_data(jsonl_abilities): return 
abilities_root, pokemon_abilities + def load_pokemon_data(jsonl_pokemons, pokemon_abilities, pokemon_root): pokemons = [] @@ -105,7 +113,8 @@ def load_pokemon_data(jsonl_pokemons, pokemon_abilities, pokemon_root): for line in f: pokemon_data = json.loads(line) abilities = [ - ability for ability in pokemon_abilities + ability + for ability in pokemon_abilities if ability["_dlt_parent_id"] == pokemon_data["_dlt_id"] ] pokemon_data["external_id"] = pokemon_data["id"] @@ -116,12 +125,17 @@ def load_pokemon_data(jsonl_pokemons, pokemon_abilities, pokemon_root): return pokemons + # Main Application Logic async def setup_and_process_data(): """Setup configuration and process Pokemon data""" # Setup configuration - data_directory_path = str(pathlib.Path(os.path.join(pathlib.Path(__file__).parent, ".data_storage")).resolve()) - cognee_directory_path = str(pathlib.Path(os.path.join(pathlib.Path(__file__).parent, ".cognee_system")).resolve()) + data_directory_path = str( + pathlib.Path(os.path.join(pathlib.Path(__file__).parent, ".data_storage")).resolve() + ) + cognee_directory_path = str( + pathlib.Path(os.path.join(pathlib.Path(__file__).parent, ".cognee_system")).resolve() + ) cognee.config.data_root_directory(data_directory_path) cognee.config.system_root_directory(cognee_directory_path) @@ -153,6 +167,7 @@ async def setup_and_process_data(): return pokemons + async def pokemon_cognify(pokemons): """Process Pokemon data with Cognee and perform search""" # Setup and run Cognee tasks @@ -165,7 +180,7 @@ async def pokemon_cognify(pokemons): tasks=tasks, data=pokemons, dataset_id=uuid5(NAMESPACE_OID, "Pokemon"), - pipeline_name='pokemon_pipeline', + pipeline_name="pokemon_pipeline", ) async for result in results: @@ -174,17 +189,18 @@ async def pokemon_cognify(pokemons): # Perform search search_results = await cognee.search( - query_type=SearchType.GRAPH_COMPLETION, - query_text="pokemons?" + query_type=SearchType.GRAPH_COMPLETION, query_text="pokemons?" 
) print("Search results:") for result_text in search_results: print(result_text) + async def main(): pokemons = await setup_and_process_data() await pokemon_cognify(pokemons) + if __name__ == "__main__": asyncio.run(main()) From d5d01109a2df9b077fc6a496ec82e304681267af Mon Sep 17 00:00:00 2001 From: Boris Arzentar Date: Tue, 11 Mar 2025 18:02:43 +0100 Subject: [PATCH 12/14] fix: use new Dockerfile for mcp server --- cognee-mcp/Dockerfile | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/cognee-mcp/Dockerfile b/cognee-mcp/Dockerfile index e229fb08a..4d7d455ee 100644 --- a/cognee-mcp/Dockerfile +++ b/cognee-mcp/Dockerfile @@ -1,13 +1,7 @@ # Use a Python image with uv pre-installed FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim AS uv -# Set build argument -ARG DEBUG - -# Set environment variable based on the build argument -ENV DEBUG=${DEBUG} -ENV PIP_NO_CACHE_DIR=true - +# Install the project into `/app` WORKDIR /app # Enable bytecode compilation @@ -16,20 +10,27 @@ ENV UV_COMPILE_BYTECODE=1 # Copy from the cache instead of linking since it's a mounted volume ENV UV_LINK_MODE=copy -RUN apt-get update && apt-get install -y \ - gcc \ - libpq-dev +# Install the project's dependencies using the lockfile and settings +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,source=uv.lock,target=uv.lock \ + --mount=type=bind,source=pyproject.toml,target=pyproject.toml \ + uv sync --frozen --no-install-project --no-dev --no-editable -COPY . /app +# Then, add the rest of the project source code and install it +# Installing separately from its dependencies allows optimal layer caching +ADD . 
/app +RUN --mount=type=cache,target=/root/.cache/uv \ + uv sync --frozen --no-dev --no-editable -RUN uv sync --reinstall +FROM python:3.12-slim-bookworm + +WORKDIR /app + +COPY --from=uv /root/.local /root/.local +COPY --from=uv --chown=app:app /app/.venv /app/.venv +COPY --from=uv --chown=app:app /app/src /app/src # Place executables in the environment at the front of the path -ENV PATH="/app:/app/.venv/bin:$PATH" - -# Set environment variables for MCP server -ENV PYTHONUNBUFFERED=1 -ENV MCP_LOG_LEVEL=DEBUG -ENV PYTHONPATH=/app +ENV PATH="/app/.venv/bin:$PATH" ENTRYPOINT ["cognee"] From 4719b82c562ab0a95421b9b53d40655092609a5d Mon Sep 17 00:00:00 2001 From: Boris Arzentar Date: Tue, 11 Mar 2025 18:34:35 +0100 Subject: [PATCH 13/14] fix: don't compile python to bytecode in Dockerfile --- cognee-mcp/Dockerfile | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cognee-mcp/Dockerfile b/cognee-mcp/Dockerfile index 4d7d455ee..8efb7c46e 100644 --- a/cognee-mcp/Dockerfile +++ b/cognee-mcp/Dockerfile @@ -5,7 +5,7 @@ FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim AS uv WORKDIR /app # Enable bytecode compilation -ENV UV_COMPILE_BYTECODE=1 +# ENV UV_COMPILE_BYTECODE=1 # Copy from the cache instead of linking since it's a mounted volume ENV UV_LINK_MODE=copy @@ -27,8 +27,7 @@ FROM python:3.12-slim-bookworm WORKDIR /app COPY --from=uv /root/.local /root/.local -COPY --from=uv --chown=app:app /app/.venv /app/.venv -COPY --from=uv --chown=app:app /app/src /app/src +COPY --from=uv --chown=app:app /app /app # Place executables in the environment at the front of the path ENV PATH="/app/.venv/bin:$PATH" From 68b337f0b60492d982005944fb1f32a4676fc86a Mon Sep 17 00:00:00 2001 From: Daniel Molnar Date: Tue, 11 Mar 2025 18:44:56 +0100 Subject: [PATCH 14/14] Cline for VSCode demo runs. (#631) ## Description Missing dependency. 
## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin ## Summary by CodeRabbit - **New Features** - Enabled PostgreSQL integration, expanding support for additional database options and enhancing overall functionality. --- cognee-mcp/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cognee-mcp/pyproject.toml b/cognee-mcp/pyproject.toml index 1f9bae195..7ccb826f4 100644 --- a/cognee-mcp/pyproject.toml +++ b/cognee-mcp/pyproject.toml @@ -6,7 +6,7 @@ readme = "README.md" requires-python = ">=3.10" dependencies = [ - "cognee[codegraph,gemini,huggingface]", + "cognee[postgres,codegraph,gemini,huggingface]", "mcp==1.2.1", "uv>=0.6.3", ]