Merge remote-tracking branch 'origin/main'

2024-11-23 16:58:29 +01:00 · 2024-11-23 16:58:29 +01:00 · 74fb3e8028
commit 74fb3e8028
parent a2fa25fb60 9d6081c7f7
8 changed files with 354 additions and 36 deletions
--- a/.data/multimedia/example.png
+++ b/.data/multimedia/example.png
--- a/.data/multimedia/text_to_speech.mp3
+++ b/.data/multimedia/text_to_speech.mp3
--- a/.github/workflows/test_cognee_multimedia_notebook.yml
+++ b/.github/workflows/test_cognee_multimedia_notebook.yml
@ -0,0 +1,63 @@
+# Workflow that executes notebooks/cognee_multimedia_demo.ipynb on GitHub Actions.
+name: test | multimedia notebook
+
+# Runs on manual dispatch, and on pull requests targeting main whenever a
+# label is added or new commits are pushed.
+on:
+  workflow_dispatch:
+  pull_request:
+    branches:
+      - main
+    types: [labeled, synchronize]
+
+
+# One active run per workflow and pull request (or per ref when there is no
+# pull request); starting a new run cancels the one still in progress.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+env:
+  RUNTIME__LOG_LEVEL: ERROR
+
+jobs:
+  # Calls the reusable get_docs_changes workflow; run_notebook_test reads its
+  # changes_outside_docs output.
+  get_docs_changes:
+    name: docs changes
+    uses: ./.github/workflows/get_docs_changes.yml
+  # Installs the project with Poetry and executes the multimedia demo notebook.
+  run_notebook_test:
+    name: test
+    needs: get_docs_changes
+    # The whole condition sits inside a single ${{ }}. Mixing bare expression
+    # text with an embedded ${{ }} makes the value an interpolated, non-empty
+    # string, which GitHub Actions evaluates as always true.
+    # NOTE(review): github.event.label is only present on `labeled` events, so
+    # `synchronize` and `workflow_dispatch` runs skip this job - confirm intended.
+    if: ${{ needs.get_docs_changes.outputs.changes_outside_docs == 'true' && github.event.label.name == 'run-checks' }}
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        shell: bash
+    steps:
+      - name: Check out
+        uses: actions/checkout@master
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11.x'
+
+      - name: Install Poetry
+        uses: snok/install-poetry@v1.3.2
+        with:
+          virtualenvs-create: true
+          virtualenvs-in-project: true
+          installer-parallel: true
+
+      # jupyter is added on top of the project dependencies because the next
+      # step runs `jupyter nbconvert`.
+      - name: Install dependencies
+        run: |
+          poetry install --no-interaction
+          poetry add jupyter --no-interaction
+
+      - name: Execute Jupyter Notebook
+        env:
+          ENV: 'dev'
+          LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }}
+          GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }}
+        # ExecutePreprocessor.timeout is nbconvert's per-cell limit, in seconds.
+        run: |
+          poetry run jupyter nbconvert \
+          --to notebook \
+          --execute notebooks/cognee_multimedia_demo.ipynb \
+          --output executed_notebook.ipynb \
+          --ExecutePreprocessor.timeout=1200
--- a/cognee/tasks/documents/classify_documents.py
+++ b/cognee/tasks/documents/classify_documents.py
@ -1,16 +1,51 @@
 from cognee.modules.data.models import Data
-from cognee.modules.data.processing.document_types import Document, PdfDocument, AudioDocument, ImageDocument, TextDocument
+from cognee.modules.data.processing.document_types import (
+    Document,
+    PdfDocument,
+    AudioDocument,
+    ImageDocument,
+    TextDocument,
+)

+# Lookup table from a file extension to the Document subclass that
+# classify_documents instantiates for it. Keys are matched exactly against
+# Data.extension, so an extension that is not listed here is not supported.
 EXTENSION_TO_DOCUMENT_CLASS = {
-    "pdf": PdfDocument,
-    "audio": AudioDocument,
-    "image": ImageDocument,
-    "txt": TextDocument
+    "pdf": PdfDocument,  # Text documents
+    "txt": TextDocument,
+    "png": ImageDocument,  # Image documents
+    "dwg": ImageDocument,
+    "xcf": ImageDocument,
+    "jpg": ImageDocument,
+    "jpx": ImageDocument,
+    "apng": ImageDocument,
+    "gif": ImageDocument,
+    "webp": ImageDocument,
+    "cr2": ImageDocument,
+    "tif": ImageDocument,
+    "bmp": ImageDocument,
+    "jxr": ImageDocument,
+    "psd": ImageDocument,
+    "ico": ImageDocument,
+    "heic": ImageDocument,
+    "avif": ImageDocument,
+    "aac": AudioDocument,  # Audio documents
+    "mid": AudioDocument,
+    "mp3": AudioDocument,
+    "m4a": AudioDocument,
+    "ogg": AudioDocument,
+    "flac": AudioDocument,
+    "wav": AudioDocument,
+    "amr": AudioDocument,
+    "aiff": AudioDocument,
 }

+
 def classify_documents(data_documents: list[Data]) -> list[Document]:
+    """Build a Document for every Data item, choosing the class by file extension.
+
+    Each item's ``extension`` is looked up in EXTENSION_TO_DOCUMENT_CLASS and the
+    matching class is instantiated with the item's id, name and raw data
+    location, plus a title of the form "<name>.<extension>".
+
+    Raises:
+        KeyError: if an item's extension has no entry in the mapping.
+    """
     documents = [
-        EXTENSION_TO_DOCUMENT_CLASS[data_item.extension](id = data_item.id, title=f"{data_item.name}.{data_item.extension}", raw_data_location=data_item.raw_data_location, name=data_item.name)
+        EXTENSION_TO_DOCUMENT_CLASS[data_item.extension](
+            id=data_item.id,
+            title=f"{data_item.name}.{data_item.extension}",
+            raw_data_location=data_item.raw_data_location,
+            name=data_item.name,
+        )
         for data_item in data_documents
     ]
     return documents
--- a/examples/python/multimedia_example.py
+++ b/examples/python/multimedia_example.py
@ -0,0 +1,48 @@
+import os
+import asyncio
+import pathlib
+
+import cognee
+from cognee.api.v1.search import SearchType
+
+# Prerequisites:
+# 1. Copy `.env.template` and rename it to `.env`.
+# 2. Add your OpenAI API key to the `.env` file in the `LLM_API_KEY` field:
+#    LLM_API_KEY = "your_key_here"
+
+
+async def main():
+    # Create a clean slate for cognee -- reset data and system state
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+
+    # cognee knowledge graph will be created based on the text
+    # and description of these files
+    mp3_file_path = os.path.join(
+        pathlib.Path(__file__).parent.parent.parent,
+        ".data/multimedia/text_to_speech.mp3",
+    )
+    png_file_path = os.path.join(
+        pathlib.Path(__file__).parent.parent.parent,
+        ".data/multimedia/example.png",
+    )
+
+    # Add the files, and make it available for cognify
+    await cognee.add([mp3_file_path, png_file_path])
+
+    # Use LLMs and cognee to create knowledge graph
+    await cognee.cognify()
+
+    # Query cognee for summaries of the data in the multimedia files
+    search_results = await cognee.search(
+        SearchType.SUMMARIES,
+        query_text="What is in the multimedia files?",
+    )
+
+    # Display search results
+    for result_text in search_results:
+        print(result_text)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/notebooks/cognee_demo.ipynb
+++ b/notebooks/cognee_demo.ipynb
@ -265,7 +265,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
   "id": "df16431d0f48b006",
   "metadata": {
    "ExecuteTime": {
@ -304,7 +304,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
   "id": "9086abf3af077ab4",
   "metadata": {
    "ExecuteTime": {
@ -349,7 +349,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
   "id": "a9de0cc07f798b7f",
   "metadata": {
    "ExecuteTime": {
@ -393,7 +393,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
   "id": "185ff1c102d06111",
   "metadata": {
    "ExecuteTime": {
@ -437,7 +437,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
   "id": "d55ce4c58f8efb67",
   "metadata": {
    "ExecuteTime": {
@ -479,7 +479,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
   "id": "ca4ecc32721ad332",
   "metadata": {
    "ExecuteTime": {
@ -529,14 +529,14 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
   "id": "bce39dc6",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
-    "# # Setting environment variables\n",
+    "# Setting environment variables\n",
    "if \"GRAPHISTRY_USERNAME\" not in os.environ: \n",
    "    os.environ[\"GRAPHISTRY_USERNAME\"] = \"\"\n",
    "\n",
@ -546,24 +546,26 @@
    "if \"LLM_API_KEY\" not in os.environ:\n",
    "    os.environ[\"LLM_API_KEY\"] = \"\"\n",
    "\n",
-    "os.environ[\"GRAPH_DATABASE_PROVIDER\"]=\"networkx\" # \"neo4j\" or \"networkx\"\n",
+    "# \"neo4j\" or \"networkx\"\n",
+    "os.environ[\"GRAPH_DATABASE_PROVIDER\"]=\"networkx\" \n",
    "# Not needed if using networkx\n",
-    "#GRAPH_DATABASE_URL=\"\"\n",
-    "#GRAPH_DATABASE_USERNAME=\"\"\n",
-    "#GRAPH_DATABASE_PASSWORD=\"\"\n",
+    "#os.environ[\"GRAPH_DATABASE_URL\"]=\"\"\n",
+    "#os.environ[\"GRAPH_DATABASE_USERNAME\"]=\"\"\n",
+    "#os.environ[\"GRAPH_DATABASE_PASSWORD\"]=\"\"\n",
    "\n",
-    "os.environ[\"VECTOR_DB_PROVIDER\"]=\"lancedb\" # \"qdrant\", \"weaviate\" or \"lancedb\"\n",
-    "# Not needed if using \"lancedb\"\n",
+    "# \"pgvector\", \"qdrant\", \"weaviate\" or \"lancedb\"\n",
+    "os.environ[\"VECTOR_DB_PROVIDER\"]=\"lancedb\" \n",
+    "# Not needed if using \"lancedb\" or \"pgvector\"\n",
    "# os.environ[\"VECTOR_DB_URL\"]=\"\"\n",
    "# os.environ[\"VECTOR_DB_KEY\"]=\"\"\n",
    "\n",
-    "# Database provider\n",
-    "os.environ[\"DB_PROVIDER\"]=\"sqlite\" # or \"postgres\"\n",
+    "# Relational Database provider \"sqlite\" or \"postgres\"\n",
+    "os.environ[\"DB_PROVIDER\"]=\"sqlite\"\n",
    "\n",
    "# Database name\n",
    "os.environ[\"DB_NAME\"]=\"cognee_db\"\n",
    "\n",
-    "# Postgres specific parameters (Only if Postgres is run)\n",
+    "# Postgres specific parameters (Only if Postgres or PGVector is used)\n",
    "# os.environ[\"DB_HOST\"]=\"127.0.0.1\"\n",
    "# os.environ[\"DB_PORT\"]=\"5432\"\n",
    "# os.environ[\"DB_USERNAME\"]=\"cognee\"\n",
@ -620,7 +622,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 10,
   "id": "7c431fdef4921ae0",
   "metadata": {
    "ExecuteTime": {
--- a/notebooks/cognee_llama_index.ipynb
+++ b/notebooks/cognee_llama_index.ipynb
@ -52,7 +52,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
@ -71,7 +71,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
@ -90,23 +90,23 @@
    "# \"neo4j\" or \"networkx\"\n",
    "os.environ[\"GRAPH_DATABASE_PROVIDER\"]=\"networkx\" \n",
    "# Not needed if using networkx\n",
-    "#GRAPH_DATABASE_URL=\"\"\n",
-    "#GRAPH_DATABASE_USERNAME=\"\"\n",
-    "#GRAPH_DATABASE_PASSWORD=\"\"\n",
+    "#os.environ[\"GRAPH_DATABASE_URL\"]=\"\"\n",
+    "#os.environ[\"GRAPH_DATABASE_USERNAME\"]=\"\"\n",
+    "#os.environ[\"GRAPH_DATABASE_PASSWORD\"]=\"\"\n",
    "\n",
-    "# \"qdrant\", \"weaviate\" or \"lancedb\"\n",
+    "# \"pgvector\", \"qdrant\", \"weaviate\" or \"lancedb\"\n",
    "os.environ[\"VECTOR_DB_PROVIDER\"]=\"lancedb\" \n",
-    "# Not needed if using \"lancedb\"\n",
+    "# Not needed if using \"lancedb\" or \"pgvector\"\n",
    "# os.environ[\"VECTOR_DB_URL\"]=\"\"\n",
    "# os.environ[\"VECTOR_DB_KEY\"]=\"\"\n",
    "\n",
-    "# Database provider\n",
-    "os.environ[\"DB_PROVIDER\"]=\"sqlite\" # or \"postgres\"\n",
+    "# Relational Database provider \"sqlite\" or \"postgres\"\n",
+    "os.environ[\"DB_PROVIDER\"]=\"sqlite\"\n",
    "\n",
    "# Database name\n",
    "os.environ[\"DB_NAME\"]=\"cognee_db\"\n",
    "\n",
-    "# Postgres specific parameters (Only if Postgres is run)\n",
+    "# Postgres specific parameters (Only if Postgres or PGVector is used)\n",
    "# os.environ[\"DB_HOST\"]=\"127.0.0.1\"\n",
    "# os.environ[\"DB_PORT\"]=\"5432\"\n",
    "# os.environ[\"DB_USERNAME\"]=\"cognee\"\n",
@ -130,8 +130,6 @@
    "\n",
    "from cognee.infrastructure.databases.vector.pgvector import create_db_and_tables as create_pgvector_db_and_tables\n",
    "from cognee.infrastructure.databases.relational import create_db_and_tables as create_relational_db_and_tables\n",
-    "from cognee.infrastructure.databases.graph import get_graph_engine\n",
-    "from cognee.shared.utils import render_graph\n",
    "from cognee.modules.users.models import User\n",
    "from cognee.modules.users.methods import get_default_user\n",
    "from cognee.tasks.ingestion.ingest_data_with_metadata import ingest_data_with_metadata\n",
@ -196,6 +194,9 @@
   "source": [
    "import graphistry\n",
    "\n",
+    "from cognee.infrastructure.databases.graph import get_graph_engine\n",
+    "from cognee.shared.utils import render_graph\n",
+    "\n",
    "# Get graph\n",
    "graphistry.login(username=os.getenv(\"GRAPHISTRY_USERNAME\"), password=os.getenv(\"GRAPHISTRY_PASSWORD\"))\n",
    "graph_engine = await get_graph_engine()\n",
--- a/notebooks/cognee_multimedia_demo.ipynb
+++ b/notebooks/cognee_multimedia_demo.ipynb
@ -0,0 +1,169 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Cognee GraphRAG with Multimedia files"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "vscode": {
+     "languageId": "plaintext"
+    }
+   },
+   "source": [
+    "## Load Data\n",
+    "\n",
+    "We will use a few sample multimedia files which we have on GitHub for easy access."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import pathlib\n",
+    "\n",
+    "# cognee knowledge graph will be created based on the text\n",
+    "# and description of these files\n",
+    "mp3_file_path = os.path.join(\n",
+    "    os.path.abspath(''), \"../\",\n",
+    "    \".data/multimedia/text_to_speech.mp3\",\n",
+    ")\n",
+    "png_file_path = os.path.join(\n",
+    "    os.path.abspath(''), \"../\",\n",
+    "    \".data/multimedia/example.png\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Set environment variables"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "# Setting environment variables\n",
+    "if \"GRAPHISTRY_USERNAME\" not in os.environ: \n",
+    "    os.environ[\"GRAPHISTRY_USERNAME\"] = \"\"\n",
+    "\n",
+    "if \"GRAPHISTRY_PASSWORD\" not in os.environ: \n",
+    "    os.environ[\"GRAPHISTRY_PASSWORD\"] = \"\"\n",
+    "\n",
+    "if \"LLM_API_KEY\" not in os.environ:\n",
+    "    os.environ[\"LLM_API_KEY\"] = \"\"\n",
+    "\n",
+    "# \"neo4j\" or \"networkx\"\n",
+    "os.environ[\"GRAPH_DATABASE_PROVIDER\"]=\"networkx\" \n",
+    "# Not needed if using networkx\n",
+    "#os.environ[\"GRAPH_DATABASE_URL\"]=\"\"\n",
+    "#os.environ[\"GRAPH_DATABASE_USERNAME\"]=\"\"\n",
+    "#os.environ[\"GRAPH_DATABASE_PASSWORD\"]=\"\"\n",
+    "\n",
+    "# \"pgvector\", \"qdrant\", \"weaviate\" or \"lancedb\"\n",
+    "os.environ[\"VECTOR_DB_PROVIDER\"]=\"lancedb\" \n",
+    "# Not needed if using \"lancedb\" or \"pgvector\"\n",
+    "# os.environ[\"VECTOR_DB_URL\"]=\"\"\n",
+    "# os.environ[\"VECTOR_DB_KEY\"]=\"\"\n",
+    "\n",
+    "# Relational Database provider \"sqlite\" or \"postgres\"\n",
+    "os.environ[\"DB_PROVIDER\"]=\"sqlite\"\n",
+    "\n",
+    "# Database name\n",
+    "os.environ[\"DB_NAME\"]=\"cognee_db\"\n",
+    "\n",
+    "# Postgres specific parameters (Only if Postgres or PGVector is used)\n",
+    "# os.environ[\"DB_HOST\"]=\"127.0.0.1\"\n",
+    "# os.environ[\"DB_PORT\"]=\"5432\"\n",
+    "# os.environ[\"DB_USERNAME\"]=\"cognee\"\n",
+    "# os.environ[\"DB_PASSWORD\"]=\"cognee\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Run Cognee with multimedia files"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import cognee\n",
+    "\n",
+    "# Create a clean slate for cognee -- reset data and system state\n",
+    "await cognee.prune.prune_data()\n",
+    "await cognee.prune.prune_system(metadata=True)\n",
+    "\n",
+    "# Add multimedia files and make them available for cognify\n",
+    "await cognee.add([mp3_file_path, png_file_path])\n",
+    "\n",
+    "# Create knowledge graph with cognee\n",
+    "await cognee.cognify()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Query Cognee for summaries related to multimedia files"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from cognee.api.v1.search import SearchType\n",
+    "\n",
+    "# Query cognee for summaries of the data in the multimedia files\n",
+    "search_results = await cognee.search(\n",
+    "    SearchType.SUMMARIES,\n",
+    "    query_text=\"What is in the multimedia files?\",\n",
+    ")\n",
+    "\n",
+    "# Display search results\n",
+    "for result_text in search_results:\n",
+    "    print(result_text)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}