Merge remote-tracking branch 'origin/main'

This commit is contained in:
Boris Arzentar 2024-11-23 16:58:29 +01:00
commit 74fb3e8028
8 changed files with 354 additions and 36 deletions

Binary file not shown.

After

Width:  |  Height:  |  Size: 10 KiB

Binary file not shown.

View file

@ -0,0 +1,63 @@
# Executes notebooks/cognee_multimedia_demo.ipynb end-to-end as a smoke test.
name: test | multimedia notebook

on:
  workflow_dispatch:
  pull_request:
    branches:
      - main
    types: [labeled, synchronize]

# One run per workflow and PR (or ref); a newer run cancels the older one.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

env:
  RUNTIME__LOG_LEVEL: ERROR

jobs:
  get_docs_changes:
    name: docs changes
    uses: ./.github/workflows/get_docs_changes.yml

  run_notebook_test:
    name: test
    needs: get_docs_changes
    # The condition must be ONE bare expression. Mixing plain text with an
    # embedded ${{ }} turns the whole value into a non-empty string, which is
    # always truthy, so the job would run unconditionally.
    # The label is checked on the pull request itself (not on
    # github.event.label, which is only set for `labeled` events) so that
    # `synchronize` pushes to an already-labelled PR still run; manual
    # dispatches carry no PR and are let through explicitly.
    if: >-
      needs.get_docs_changes.outputs.changes_outside_docs == 'true' &&
      (github.event_name == 'workflow_dispatch' ||
      contains(github.event.pull_request.labels.*.name, 'run-checks'))
    runs-on: ubuntu-latest
    defaults:
      run:
        shell: bash
    steps:
      - name: Check out
        # Pinned to a released major version instead of the moving `master` ref.
        uses: actions/checkout@v4

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11.x'

      - name: Install Poetry
        uses: snok/install-poetry@v1.3.2
        with:
          virtualenvs-create: true
          virtualenvs-in-project: true
          installer-parallel: true

      - name: Install dependencies
        run: |
          poetry install --no-interaction
          poetry add jupyter --no-interaction

      - name: Execute Jupyter Notebook
        env:
          ENV: 'dev'
          LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }}
          GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }}
        run: |
          poetry run jupyter nbconvert \
          --to notebook \
          --execute notebooks/cognee_multimedia_demo.ipynb \
          --output executed_notebook.ipynb \
          --ExecutePreprocessor.timeout=1200

View file

@ -1,16 +1,51 @@
from cognee.modules.data.models import Data
from cognee.modules.data.processing.document_types import Document, PdfDocument, AudioDocument, ImageDocument, TextDocument
from cognee.modules.data.processing.document_types import (
Document,
PdfDocument,
AudioDocument,
ImageDocument,
TextDocument,
)
# Maps a file extension (no leading dot) to the Document subclass used for it.
# classify_documents indexes this mapping with `data_item.extension`.
# Only the per-extension entries are kept: the earlier category-style keys
# ("audio", "image") and the duplicated "pdf"/"txt" entries belonged to the
# superseded version of this mapping and made the literal unparsable.
EXTENSION_TO_DOCUMENT_CLASS = {
    # Text documents
    "pdf": PdfDocument,
    "txt": TextDocument,
    # Image documents
    "png": ImageDocument,
    "dwg": ImageDocument,
    "xcf": ImageDocument,
    "jpg": ImageDocument,
    "jpx": ImageDocument,
    "apng": ImageDocument,
    "gif": ImageDocument,
    "webp": ImageDocument,
    "cr2": ImageDocument,
    "tif": ImageDocument,
    "bmp": ImageDocument,
    "jxr": ImageDocument,
    "psd": ImageDocument,
    "ico": ImageDocument,
    "heic": ImageDocument,
    "avif": ImageDocument,
    # Audio documents
    "aac": AudioDocument,
    "mid": AudioDocument,
    "mp3": AudioDocument,
    "m4a": AudioDocument,
    "ogg": AudioDocument,
    "flac": AudioDocument,
    "wav": AudioDocument,
    "amr": AudioDocument,
    "aiff": AudioDocument,
}
def classify_documents(data_documents: list[Data]) -> list[Document]:
    """Create one Document instance per Data item.

    The concrete class is looked up in EXTENSION_TO_DOCUMENT_CLASS using
    ``data_item.extension`` and is constructed from the item's id, name and
    raw data location; the title is ``"<name>.<extension>"``.

    Args:
        data_documents: Data items to convert.

    Returns:
        The constructed documents, in the same order as ``data_documents``.

    Raises:
        KeyError: If an item's extension is not a key of
            EXTENSION_TO_DOCUMENT_CLASS.
    """
    documents = [
        # Exactly one constructor call per item; a second, stale copy of this
        # call inside the comprehension would be a syntax error.
        EXTENSION_TO_DOCUMENT_CLASS[data_item.extension](
            id=data_item.id,
            title=f"{data_item.name}.{data_item.extension}",
            raw_data_location=data_item.raw_data_location,
            name=data_item.name,
        )
        for data_item in data_documents
    ]
    return documents

View file

@ -0,0 +1,48 @@
import os
import asyncio
import pathlib
import cognee
from cognee.api.v1.search import SearchType
# Prerequisites:
# 1. Copy `.env.template` and rename it to `.env`.
# 2. Add your OpenAI API key to the `.env` file in the `LLM_API_KEY` field:
# LLM_API_KEY = "your_key_here"
async def main():
    """Run the multimedia example: reset cognee, ingest two files, print summaries."""
    # Start from a clean slate -- wipe stored data, then system state.
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    # The knowledge graph is built from the text and description of these
    # files; both paths are resolved three directories above this script.
    base_dir = pathlib.Path(__file__).parent.parent.parent
    audio_path = os.path.join(base_dir, ".data/multimedia/text_to_speech.mp3")
    image_path = os.path.join(base_dir, ".data/multimedia/example.png")
    media_paths = [audio_path, image_path]

    # Register the files with cognee so cognify can process them.
    await cognee.add(media_paths)

    # Build the knowledge graph with LLMs and cognee.
    await cognee.cognify()

    # Ask cognee for summaries of what the multimedia files contain.
    summaries = await cognee.search(
        SearchType.SUMMARIES,
        query_text="What is in the multimedia files?",
    )

    # Print each returned result on its own line.
    for summary in summaries:
        print(summary)


if __name__ == "__main__":
    asyncio.run(main())

View file

@ -265,7 +265,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"id": "df16431d0f48b006",
"metadata": {
"ExecuteTime": {
@ -304,7 +304,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"id": "9086abf3af077ab4",
"metadata": {
"ExecuteTime": {
@ -349,7 +349,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"id": "a9de0cc07f798b7f",
"metadata": {
"ExecuteTime": {
@ -393,7 +393,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"id": "185ff1c102d06111",
"metadata": {
"ExecuteTime": {
@ -437,7 +437,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"id": "d55ce4c58f8efb67",
"metadata": {
"ExecuteTime": {
@ -479,7 +479,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"id": "ca4ecc32721ad332",
"metadata": {
"ExecuteTime": {
@ -529,14 +529,14 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"id": "bce39dc6",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"# # Setting environment variables\n",
"# Setting environment variables\n",
"if \"GRAPHISTRY_USERNAME\" not in os.environ: \n",
" os.environ[\"GRAPHISTRY_USERNAME\"] = \"\"\n",
"\n",
@ -546,24 +546,26 @@
"if \"LLM_API_KEY\" not in os.environ:\n",
" os.environ[\"LLM_API_KEY\"] = \"\"\n",
"\n",
"os.environ[\"GRAPH_DATABASE_PROVIDER\"]=\"networkx\" # \"neo4j\" or \"networkx\"\n",
"# \"neo4j\" or \"networkx\"\n",
"os.environ[\"GRAPH_DATABASE_PROVIDER\"]=\"networkx\" \n",
"# Not needed if using networkx\n",
"#GRAPH_DATABASE_URL=\"\"\n",
"#GRAPH_DATABASE_USERNAME=\"\"\n",
"#GRAPH_DATABASE_PASSWORD=\"\"\n",
"#os.environ[\"GRAPH_DATABASE_URL\"]=\"\"\n",
"#os.environ[\"GRAPH_DATABASE_USERNAME\"]=\"\"\n",
"#os.environ[\"GRAPH_DATABASE_PASSWORD\"]=\"\"\n",
"\n",
"os.environ[\"VECTOR_DB_PROVIDER\"]=\"lancedb\" # \"qdrant\", \"weaviate\" or \"lancedb\"\n",
"# Not needed if using \"lancedb\"\n",
"# \"pgvector\", \"qdrant\", \"weaviate\" or \"lancedb\"\n",
"os.environ[\"VECTOR_DB_PROVIDER\"]=\"lancedb\" \n",
"# Not needed if using \"lancedb\" or \"pgvector\"\n",
"# os.environ[\"VECTOR_DB_URL\"]=\"\"\n",
"# os.environ[\"VECTOR_DB_KEY\"]=\"\"\n",
"\n",
"# Database provider\n",
"os.environ[\"DB_PROVIDER\"]=\"sqlite\" # or \"postgres\"\n",
"# Relational Database provider \"sqlite\" or \"postgres\"\n",
"os.environ[\"DB_PROVIDER\"]=\"sqlite\"\n",
"\n",
"# Database name\n",
"os.environ[\"DB_NAME\"]=\"cognee_db\"\n",
"\n",
"# Postgres specific parameters (Only if Postgres is run)\n",
"# Postgres specific parameters (Only if Postgres or PGVector is used)\n",
"# os.environ[\"DB_HOST\"]=\"127.0.0.1\"\n",
"# os.environ[\"DB_PORT\"]=\"5432\"\n",
"# os.environ[\"DB_USERNAME\"]=\"cognee\"\n",
@ -620,7 +622,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 10,
"id": "7c431fdef4921ae0",
"metadata": {
"ExecuteTime": {

View file

@ -52,7 +52,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@ -71,7 +71,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@ -90,23 +90,23 @@
"# \"neo4j\" or \"networkx\"\n",
"os.environ[\"GRAPH_DATABASE_PROVIDER\"]=\"networkx\" \n",
"# Not needed if using networkx\n",
"#GRAPH_DATABASE_URL=\"\"\n",
"#GRAPH_DATABASE_USERNAME=\"\"\n",
"#GRAPH_DATABASE_PASSWORD=\"\"\n",
"#os.environ[\"GRAPH_DATABASE_URL\"]=\"\"\n",
"#os.environ[\"GRAPH_DATABASE_USERNAME\"]=\"\"\n",
"#os.environ[\"GRAPH_DATABASE_PASSWORD\"]=\"\"\n",
"\n",
"# \"qdrant\", \"weaviate\" or \"lancedb\"\n",
"# \"pgvector\", \"qdrant\", \"weaviate\" or \"lancedb\"\n",
"os.environ[\"VECTOR_DB_PROVIDER\"]=\"lancedb\" \n",
"# Not needed if using \"lancedb\"\n",
"# Not needed if using \"lancedb\" or \"pgvector\"\n",
"# os.environ[\"VECTOR_DB_URL\"]=\"\"\n",
"# os.environ[\"VECTOR_DB_KEY\"]=\"\"\n",
"\n",
"# Database provider\n",
"os.environ[\"DB_PROVIDER\"]=\"sqlite\" # or \"postgres\"\n",
"# Relational Database provider \"sqlite\" or \"postgres\"\n",
"os.environ[\"DB_PROVIDER\"]=\"sqlite\"\n",
"\n",
"# Database name\n",
"os.environ[\"DB_NAME\"]=\"cognee_db\"\n",
"\n",
"# Postgres specific parameters (Only if Postgres is run)\n",
"# Postgres specific parameters (Only if Postgres or PGVector is used)\n",
"# os.environ[\"DB_HOST\"]=\"127.0.0.1\"\n",
"# os.environ[\"DB_PORT\"]=\"5432\"\n",
"# os.environ[\"DB_USERNAME\"]=\"cognee\"\n",
@ -130,8 +130,6 @@
"\n",
"from cognee.infrastructure.databases.vector.pgvector import create_db_and_tables as create_pgvector_db_and_tables\n",
"from cognee.infrastructure.databases.relational import create_db_and_tables as create_relational_db_and_tables\n",
"from cognee.infrastructure.databases.graph import get_graph_engine\n",
"from cognee.shared.utils import render_graph\n",
"from cognee.modules.users.models import User\n",
"from cognee.modules.users.methods import get_default_user\n",
"from cognee.tasks.ingestion.ingest_data_with_metadata import ingest_data_with_metadata\n",
@ -196,6 +194,9 @@
"source": [
"import graphistry\n",
"\n",
"from cognee.infrastructure.databases.graph import get_graph_engine\n",
"from cognee.shared.utils import render_graph\n",
"\n",
"# Get graph\n",
"graphistry.login(username=os.getenv(\"GRAPHISTRY_USERNAME\"), password=os.getenv(\"GRAPHISTRY_PASSWORD\"))\n",
"graph_engine = await get_graph_engine()\n",

View file

@ -0,0 +1,169 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Cognee GraphRAG with Multimedia files"
]
},
{
"cell_type": "markdown",
"metadata": {
"vscode": {
"languageId": "plaintext"
}
},
"source": [
"## Load Data\n",
"\n",
"We will use a few sample multimedia files which we have on GitHub for easy access."
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import pathlib\n",
"\n",
"# cognee knowledge graph will be created based on the text\n",
"# and description of these files\n",
"mp3_file_path = os.path.join(\n",
" os.path.abspath(''), \"../\",\n",
" \".data/multimedia/text_to_speech.mp3\",\n",
")\n",
"png_file_path = os.path.join(\n",
" os.path.abspath(''), \"../\",\n",
" \".data/multimedia/example.png\",\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Set environment variables"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"# Setting environment variables\n",
"if \"GRAPHISTRY_USERNAME\" not in os.environ: \n",
" os.environ[\"GRAPHISTRY_USERNAME\"] = \"\"\n",
"\n",
"if \"GRAPHISTRY_PASSWORD\" not in os.environ: \n",
" os.environ[\"GRAPHISTRY_PASSWORD\"] = \"\"\n",
"\n",
"if \"LLM_API_KEY\" not in os.environ:\n",
" os.environ[\"LLM_API_KEY\"] = \"\"\n",
"\n",
"# \"neo4j\" or \"networkx\"\n",
"os.environ[\"GRAPH_DATABASE_PROVIDER\"]=\"networkx\" \n",
"# Not needed if using networkx\n",
"#os.environ[\"GRAPH_DATABASE_URL\"]=\"\"\n",
"#os.environ[\"GRAPH_DATABASE_USERNAME\"]=\"\"\n",
"#os.environ[\"GRAPH_DATABASE_PASSWORD\"]=\"\"\n",
"\n",
"# \"pgvector\", \"qdrant\", \"weaviate\" or \"lancedb\"\n",
"os.environ[\"VECTOR_DB_PROVIDER\"]=\"lancedb\" \n",
"# Not needed if using \"lancedb\" or \"pgvector\"\n",
"# os.environ[\"VECTOR_DB_URL\"]=\"\"\n",
"# os.environ[\"VECTOR_DB_KEY\"]=\"\"\n",
"\n",
"# Relational Database provider \"sqlite\" or \"postgres\"\n",
"os.environ[\"DB_PROVIDER\"]=\"sqlite\"\n",
"\n",
"# Database name\n",
"os.environ[\"DB_NAME\"]=\"cognee_db\"\n",
"\n",
"# Postgres specific parameters (Only if Postgres or PGVector is used)\n",
"# os.environ[\"DB_HOST\"]=\"127.0.0.1\"\n",
"# os.environ[\"DB_PORT\"]=\"5432\"\n",
"# os.environ[\"DB_USERNAME\"]=\"cognee\"\n",
"# os.environ[\"DB_PASSWORD\"]=\"cognee\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Run Cognee with multimedia files"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import cognee\n",
"\n",
"# Create a clean slate for cognee -- reset data and system state\n",
"await cognee.prune.prune_data()\n",
"await cognee.prune.prune_system(metadata=True)\n",
"\n",
"# Add multimedia files and make them available for cognify\n",
"await cognee.add([mp3_file_path, png_file_path])\n",
"\n",
"# Create knowledge graph with cognee\n",
"await cognee.cognify()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Query Cognee for summaries related to multimedia files"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from cognee.api.v1.search import SearchType\n",
"\n",
"# Query cognee for summaries of the data in the multimedia files\n",
"search_results = await cognee.search(\n",
" SearchType.SUMMARIES,\n",
" query_text=\"What is in the multimedia files?\",\n",
")\n",
"\n",
"# Display search results\n",
"for result_text in search_results:\n",
" print(result_text)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}