feat: add sqlalchemy as dlt destination (#137)

* feat: add sqlalchemy as dlt destination

* Fix the demo, update Readme

* fix: add 1.5 notebook

---------

Co-authored-by: Vasilije <8619304+Vasilije1990@users.noreply.github.com>
This commit is contained in:
Boris 2024-09-21 15:58:28 +02:00 committed by GitHub
parent a09f7991e2
commit a9433e9283
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
56 changed files with 2435 additions and 2554 deletions

View file

@ -18,13 +18,6 @@ jobs:
name: docs changes name: docs changes
uses: ./.github/workflows/get_docs_changes.yml uses: ./.github/workflows/get_docs_changes.yml
setup_docker:
name: Set up Docker Buildx
runs-on: ubuntu-latest
steps:
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
run_neo4j_integration_test: run_neo4j_integration_test:
name: test name: test
needs: get_docs_changes needs: get_docs_changes
@ -35,18 +28,6 @@ jobs:
run: run:
shell: bash shell: bash
services:
postgres:
image: postgres:latest
env:
POSTGRES_USER: cognee
POSTGRES_PASSWORD: cognee
POSTGRES_DB: cognee_db
volumes:
- postgres_data:/var/lib/postgresql/data
ports:
- 5432:5432
steps: steps:
- name: Check out - name: Check out
uses: actions/checkout@master uses: actions/checkout@master
@ -66,18 +47,6 @@ jobs:
- name: Install dependencies - name: Install dependencies
run: poetry install --no-interaction run: poetry install --no-interaction
- name: Create .cognee_system directory and print path
run: |
mkdir .cognee_system
echo $(pwd)/.cognee_system
- name: Wait for PostgreSQL to be ready
run: |
echo "Waiting for PostgreSQL to be ready..."
until pg_isready -h localhost -p 5432 -U cognee; do
sleep 1
done
- name: Run default Neo4j - name: Run default Neo4j
env: env:
ENV: 'dev' ENV: 'dev'
@ -85,14 +54,4 @@ jobs:
GRAPH_DATABASE_URL: ${{ secrets.NEO4J_API_URL }} GRAPH_DATABASE_URL: ${{ secrets.NEO4J_API_URL }}
GRAPH_DATABASE_PASSWORD: ${{ secrets.NEO4J_API_KEY }} GRAPH_DATABASE_PASSWORD: ${{ secrets.NEO4J_API_KEY }}
GRAPH_DATABASE_USERNAME: "neo4j" GRAPH_DATABASE_USERNAME: "neo4j"
DB_USER: cognee
DB_PASSWORD: cognee
DB_NAME: cognee_db
DB_HOST: localhost
DB_PORT: 5432
DESTINATION__POSTGRES__CREDENTIALS__HOST: localhost
DESTINATION__POSTGRES__CREDENTIALS__PORT: 5432
DESTINATION__POSTGRES__CREDENTIALS__USERNAME: cognee
DESTINATION__POSTGRES__CREDENTIALS__PASSWORD: cognee
DESTINATION__POSTGRES__CREDENTIALS__DATABASE: cognee_db
run: poetry run python ./cognee/tests/test_neo4j.py run: poetry run python ./cognee/tests/test_neo4j.py

View file

@ -18,15 +18,6 @@ jobs:
name: docs changes name: docs changes
uses: ./.github/workflows/get_docs_changes.yml uses: ./.github/workflows/get_docs_changes.yml
setup_docker:
name: Set up Docker Buildx
runs-on: ubuntu-latest
strategy:
fail-fast: false
steps:
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
run_common: run_common:
name: test name: test
needs: get_docs_changes needs: get_docs_changes
@ -38,19 +29,6 @@ jobs:
run: run:
shell: bash shell: bash
services:
postgres:
image: postgres:latest
env:
PGUSER: cognee
POSTGRES_USER: cognee
POSTGRES_PASSWORD: cognee
POSTGRES_DB: cognee_db
volumes:
- postgres_data:/var/lib/postgresql/data
ports:
- 5432:5432
steps: steps:
- name: Check out - name: Check out
uses: actions/checkout@master uses: actions/checkout@master
@ -71,23 +49,6 @@ jobs:
- name: Install dependencies - name: Install dependencies
run: poetry install --no-interaction run: poetry install --no-interaction
- name: Create .cognee_system directory and print path
run: |
mkdir .cognee_system
echo $(pwd)/.cognee_system
- name: Wait for PostgreSQL to be ready
env:
PGUSER: cognee
POSTGRES_USER: cognee
POSTGRES_PASSWORD: cognee
POSTGRES_DB: cognee_db
run: |
echo "Waiting for PostgreSQL to be ready..."
until pg_isready -h localhost -p 5432 -U cognee; do
sleep 1
done
- name: Run tests - name: Run tests
run: poetry run pytest tests/ run: poetry run pytest tests/
@ -95,16 +56,6 @@ jobs:
env: env:
ENV: 'dev' ENV: 'dev'
LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
DB_HOST: localhost
DB_USERNAME: cognee
DB_PASSWORD: cognee
DB_DATABASE: cognee_db
DB_PORT: 5432
DESTINATION__POSTGRES__CREDENTIALS__HOST: localhost
DESTINATION__POSTGRES__CREDENTIALS__PORT: 5432
DESTINATION__POSTGRES__CREDENTIALS__USERNAME: cognee
DESTINATION__POSTGRES__CREDENTIALS__PASSWORD: cognee
DESTINATION__POSTGRES__CREDENTIALS__DATABASE: cognee_db
run: poetry run python ./cognee/tests/test_library.py run: poetry run python ./cognee/tests/test_library.py
- name: Clean up disk space - name: Clean up disk space

View file

@ -18,15 +18,6 @@ jobs:
name: docs changes name: docs changes
uses: ./.github/workflows/get_docs_changes.yml uses: ./.github/workflows/get_docs_changes.yml
setup_docker:
name: Set up Docker Buildx
runs-on: ubuntu-latest
strategy:
fail-fast: false
steps:
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
run_common: run_common:
name: test name: test
needs: get_docs_changes needs: get_docs_changes
@ -38,19 +29,6 @@ jobs:
run: run:
shell: bash shell: bash
services:
postgres:
image: postgres:latest
env:
PGUSER: cognee
POSTGRES_USER: cognee
POSTGRES_PASSWORD: cognee
POSTGRES_DB: cognee_db
volumes:
- postgres_data:/var/lib/postgresql/data
ports:
- 5432:5432
steps: steps:
- name: Check out - name: Check out
uses: actions/checkout@master uses: actions/checkout@master
@ -71,23 +49,6 @@ jobs:
- name: Install dependencies - name: Install dependencies
run: poetry install --no-interaction run: poetry install --no-interaction
- name: Create .cognee_system directory and print path
run: |
mkdir .cognee_system
echo $(pwd)/.cognee_system
- name: Wait for PostgreSQL to be ready
env:
PGUSER: cognee
POSTGRES_USER: cognee
POSTGRES_PASSWORD: cognee
POSTGRES_DB: cognee_db
run: |
echo "Waiting for PostgreSQL to be ready..."
until pg_isready -h localhost -p 5432 -U cognee; do
sleep 1
done
- name: Run tests - name: Run tests
run: poetry run pytest tests/ run: poetry run pytest tests/
@ -95,16 +56,6 @@ jobs:
env: env:
ENV: 'dev' ENV: 'dev'
LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
DB_HOST: localhost
DB_USERNAME: cognee
DB_PASSWORD: cognee
DB_DATABASE: cognee_db
DB_PORT: 5432
DESTINATION__POSTGRES__CREDENTIALS__HOST: localhost
DESTINATION__POSTGRES__CREDENTIALS__PORT: 5432
DESTINATION__POSTGRES__CREDENTIALS__USERNAME: cognee
DESTINATION__POSTGRES__CREDENTIALS__PASSWORD: cognee
DESTINATION__POSTGRES__CREDENTIALS__DATABASE: cognee_db
run: poetry run python ./cognee/tests/test_library.py run: poetry run python ./cognee/tests/test_library.py
- name: Clean up disk space - name: Clean up disk space

View file

@ -18,15 +18,6 @@ jobs:
name: docs changes name: docs changes
uses: ./.github/workflows/get_docs_changes.yml uses: ./.github/workflows/get_docs_changes.yml
setup_docker:
name: Set up Docker Buildx
runs-on: ubuntu-latest
strategy:
fail-fast: false
steps:
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
run_common: run_common:
name: test name: test
needs: get_docs_changes needs: get_docs_changes
@ -38,19 +29,6 @@ jobs:
run: run:
shell: bash shell: bash
services:
postgres:
image: postgres:latest
env:
PGUSER: cognee
POSTGRES_USER: cognee
POSTGRES_PASSWORD: cognee
POSTGRES_DB: cognee_db
volumes:
- postgres_data:/var/lib/postgresql/data
ports:
- 5432:5432
steps: steps:
- name: Check out - name: Check out
uses: actions/checkout@master uses: actions/checkout@master
@ -71,23 +49,6 @@ jobs:
- name: Install dependencies - name: Install dependencies
run: poetry install --no-interaction run: poetry install --no-interaction
- name: Create .cognee_system directory and print path
run: |
mkdir .cognee_system
echo $(pwd)/.cognee_system
- name: Wait for PostgreSQL to be ready
env:
PGUSER: cognee
POSTGRES_USER: cognee
POSTGRES_PASSWORD: cognee
POSTGRES_DB: cognee_db
run: |
echo "Waiting for PostgreSQL to be ready..."
until pg_isready -h localhost -p 5432 -U cognee; do
sleep 1
done
- name: Run tests - name: Run tests
run: poetry run pytest tests/ run: poetry run pytest tests/
@ -95,16 +56,6 @@ jobs:
env: env:
ENV: 'dev' ENV: 'dev'
LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
DB_HOST: localhost
DB_USERNAME: cognee
DB_PASSWORD: cognee
DB_DATABASE: cognee_db
DB_PORT: 5432
DESTINATION__POSTGRES__CREDENTIALS__HOST: localhost
DESTINATION__POSTGRES__CREDENTIALS__PORT: 5432
DESTINATION__POSTGRES__CREDENTIALS__USERNAME: cognee
DESTINATION__POSTGRES__CREDENTIALS__PASSWORD: cognee
DESTINATION__POSTGRES__CREDENTIALS__DATABASE: cognee_db
run: poetry run python ./cognee/tests/test_library.py run: poetry run python ./cognee/tests/test_library.py
- name: Clean up disk space - name: Clean up disk space

View file

@ -18,13 +18,6 @@ jobs:
name: docs changes name: docs changes
uses: ./.github/workflows/get_docs_changes.yml uses: ./.github/workflows/get_docs_changes.yml
setup_docker:
name: Set up Docker Buildx
runs-on: ubuntu-latest
steps:
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
run_qdrant_integration_test: run_qdrant_integration_test:
name: test name: test
needs: get_docs_changes needs: get_docs_changes
@ -35,18 +28,6 @@ jobs:
run: run:
shell: bash shell: bash
services:
postgres:
image: postgres:latest
env:
POSTGRES_USER: cognee
POSTGRES_PASSWORD: cognee
POSTGRES_DB: cognee_db
volumes:
- postgres_data:/var/lib/postgresql/data
ports:
- 5432:5432
steps: steps:
- name: Check out - name: Check out
uses: actions/checkout@master uses: actions/checkout@master
@ -66,32 +47,10 @@ jobs:
- name: Install dependencies - name: Install dependencies
run: poetry install --no-interaction run: poetry install --no-interaction
- name: Create .cognee_system directory and print path
run: |
mkdir .cognee_system
echo $(pwd)/.cognee_system
- name: Wait for PostgreSQL to be ready
run: |
echo "Waiting for PostgreSQL to be ready..."
until pg_isready -h localhost -p 5432 -U cognee; do
sleep 1
done
- name: Run default Qdrant - name: Run default Qdrant
env: env:
ENV: 'dev' ENV: 'dev'
LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
VECTOR_DB_URL: ${{ secrets.QDRANT_API_URL }} VECTOR_DB_URL: ${{ secrets.QDRANT_API_URL }}
VECTOR_DB_KEY: ${{ secrets.QDRANT_API_KEY }} VECTOR_DB_KEY: ${{ secrets.QDRANT_API_KEY }}
DB_USER: cognee
DB_PASSWORD: cognee
DB_NAME: cognee_db
DB_HOST: localhost
DB_PORT: 5432
DESTINATION__POSTGRES__CREDENTIALS__HOST: localhost
DESTINATION__POSTGRES__CREDENTIALS__PORT: 5432
DESTINATION__POSTGRES__CREDENTIALS__USERNAME: cognee
DESTINATION__POSTGRES__CREDENTIALS__PASSWORD: cognee
DESTINATION__POSTGRES__CREDENTIALS__DATABASE: cognee_db
run: poetry run python ./cognee/tests/test_qdrant.py run: poetry run python ./cognee/tests/test_qdrant.py

View file

@ -18,13 +18,6 @@ jobs:
name: docs changes name: docs changes
uses: ./.github/workflows/get_docs_changes.yml uses: ./.github/workflows/get_docs_changes.yml
setup_docker:
name: Set up Docker Buildx
runs-on: ubuntu-latest
steps:
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
run_weaviate_integration_test: run_weaviate_integration_test:
name: test name: test
needs: get_docs_changes needs: get_docs_changes
@ -35,18 +28,6 @@ jobs:
run: run:
shell: bash shell: bash
services:
postgres:
image: postgres:latest
env:
POSTGRES_USER: cognee
POSTGRES_PASSWORD: cognee
POSTGRES_DB: cognee_db
volumes:
- postgres_data:/var/lib/postgresql/data
ports:
- 5432:5432
steps: steps:
- name: Check out - name: Check out
uses: actions/checkout@master uses: actions/checkout@master
@ -66,32 +47,10 @@ jobs:
- name: Install dependencies - name: Install dependencies
run: poetry install --no-interaction run: poetry install --no-interaction
- name: Create .cognee_system directory and print path
run: |
mkdir .cognee_system
echo $(pwd)/.cognee_system
- name: Wait for PostgreSQL to be ready
run: |
echo "Waiting for PostgreSQL to be ready..."
until pg_isready -h localhost -p 5432 -U cognee; do
sleep 1
done
- name: Run default Weaviate - name: Run default Weaviate
env: env:
ENV: 'dev' ENV: 'dev'
LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
VECTOR_DB_URL: ${{ secrets.WEAVIATE_API_URL }} VECTOR_DB_URL: ${{ secrets.WEAVIATE_API_URL }}
VECTOR_DB_KEY: ${{ secrets.WEAVIATE_API_KEY }} VECTOR_DB_KEY: ${{ secrets.WEAVIATE_API_KEY }}
DB_USER: cognee
DB_PASSWORD: cognee
DB_NAME: cognee_db
DB_HOST: localhost
DB_PORT: 5432
DESTINATION__POSTGRES__CREDENTIALS__HOST: localhost
DESTINATION__POSTGRES__CREDENTIALS__PORT: 5432
DESTINATION__POSTGRES__CREDENTIALS__USERNAME: cognee
DESTINATION__POSTGRES__CREDENTIALS__PASSWORD: cognee
DESTINATION__POSTGRES__CREDENTIALS__DATABASE: cognee_db
run: poetry run python ./cognee/tests/test_weaviate.py run: poetry run python ./cognee/tests/test_weaviate.py

172
README.md
View file

@ -18,24 +18,12 @@ We build for developers who need a reliable, production-ready data layer for AI
</a> </a>
</p> </p>
cognee implements scalable, modular data pipelines that allow for creating the LLM-enriched data layer using graph and vector stores.
<p>
<i> cognee aims to be dbt for LLMOps</i>
</p>
## What is cognee?
cognee implements scalable, modular ECL (Extract, Cognify, Load) pipelines that allow you to interconnect and retrieve past conversations, documents, and audio transcriptions, while also reducing hallucinations, developer effort, and cost.
Try it in a Google collab <a href="https://colab.research.google.com/drive/1jayZ5JRwDaUGFvCw9UZySBG-iB9gpYfu?usp=sharing">notebook</a> or have a look at our <a href="https://topoteretes.github.io/cognee">documentation</a> Try it in a Google collab <a href="https://colab.research.google.com/drive/1jayZ5JRwDaUGFvCw9UZySBG-iB9gpYfu?usp=sharing">notebook</a> or have a look at our <a href="https://topoteretes.github.io/cognee">documentation</a>
If you have questions, join our <a href="https://discord.gg/NQPKmU5CCg">Discord</a> community If you have questions, join our <a href="https://discord.gg/NQPKmU5CCg">Discord</a> community
@ -58,7 +46,7 @@ poetry add cognee
``` ```
## 💻 Usage ## 💻 Basic Usage
### Setup ### Setup
@ -75,24 +63,6 @@ cognee.config.llm_api_key = "YOUR_OPENAI_API_KEY"
``` ```
You can use different LLM providers, for more info check out our <a href="https://topoteretes.github.io/cognee">documentation</a> You can use different LLM providers, for more info check out our <a href="https://topoteretes.github.io/cognee">documentation</a>
In the next step make sure to launch a Postgres instance. Here is an example from our docker-compose:
```
postgres:
image: postgres:latest
container_name: postgres
environment:
POSTGRES_USER: cognee
POSTGRES_PASSWORD: cognee
POSTGRES_DB: cognee_db
volumes:
- postgres_data:/var/lib/postgresql/data
ports:
- 5432:5432
networks:
- cognee-network
```
If you are using Networkx, create an account on Graphistry to visualize results: If you are using Networkx, create an account on Graphistry to visualize results:
``` ```
@ -106,12 +76,7 @@ docker-compose up cognee
``` ```
Then navigate to localhost:3000/wizard Then navigate to localhost:3000/wizard
### Run the default example ### Simple example
Make sure to launch the Postgres instance first. Navigate to the cognee folder and run:
```
docker compose up postgres
```
Run the default cognee pipeline: Run the default cognee pipeline:
@ -123,7 +88,7 @@ text = """Natural language processing (NLP) is an interdisciplinary
await cognee.add([text], "example_dataset") # Add a new piece of information await cognee.add([text], "example_dataset") # Add a new piece of information
await cognee.cognify() # Use LLMs and cognee to create knowledge await cognee.cognify() # Use LLMs and cognee to create a semantic graph
await search_results = cognee.search("SIMILARITY", {'query': 'Tell me about NLP'}) # Query cognee for the knowledge await search_results = cognee.search("SIMILARITY", {'query': 'Tell me about NLP'}) # Query cognee for the knowledge
@ -132,19 +97,20 @@ print(search_results)
``` ```
### Create your pipelines ### Create your own memory store
cognee framework consists of tasks that can be grouped into pipelines.
Each task can be an independent part of business logic, that can be tied to other tasks to form a pipeline.
These tasks persist data into your memory store enabling you to search for relevant context of past conversations, documents, or any other data you have stored.
### Example: Classify your documents
cognee framework consists of tasks that can be grouped into pipelines. Each task can be an independent part of business logic, that can be tied to other tasks to form a pipeline.
Here is an example of how it looks for a default cognify pipeline: Here is an example of how it looks for a default cognify pipeline:
1. To prepare the data for the pipeline run, first we need to add it to our metastore and normalize it: 1. To prepare the data for the pipeline run, first we need to add it to our metastore and normalize it:
Start with: Start with:
```
docker compose up postgres
```
And then run:
``` ```
text = """Natural language processing (NLP) is an interdisciplinary text = """Natural language processing (NLP) is an interdisciplinary
subfield of computer science and information retrieval""" subfield of computer science and information retrieval"""
@ -158,90 +124,62 @@ Here we show an example of creating a naive LLM classifier that takes a Pydantic
We provided just a snippet for reference, but feel free to check out the implementation in our repo. We provided just a snippet for reference, but feel free to check out the implementation in our repo.
``` ```
async def chunk_naive_llm_classifier(data_chunks: list[DocumentChunk], classification_model: Type[BaseModel]): async def chunk_naive_llm_classifier(
if len(data_chunks) == 0: data_chunks: list[DocumentChunk],
return data_chunks classification_model: Type[BaseModel]
):
# Extract classifications asynchronously
chunk_classifications = await asyncio.gather( chunk_classifications = await asyncio.gather(
*[extract_categories(chunk.text, classification_model) for chunk in data_chunks], *(extract_categories(chunk.text, classification_model) for chunk in data_chunks)
) )
classification_data_points = [] # Collect classification data points using a set to avoid duplicates
classification_data_points = {
for chunk_index, chunk in enumerate(data_chunks): uuid5(NAMESPACE_OID, cls.label.type)
chunk_classification = chunk_classifications[chunk_index] for cls in chunk_classifications
classification_data_points.append(uuid5(NAMESPACE_OID, chunk_classification.label.type)) } | {
classification_data_points.append(uuid5(NAMESPACE_OID, chunk_classification.label.type)) uuid5(NAMESPACE_OID, subclass.value)
for cls in chunk_classifications
for classification_subclass in chunk_classification.label.subclass: for subclass in cls.label.subclass
classification_data_points.append(uuid5(NAMESPACE_OID, classification_subclass.value)) }
vector_engine = get_vector_engine() vector_engine = get_vector_engine()
collection_name = "classification"
# Define the payload schema
class Keyword(BaseModel): class Keyword(BaseModel):
uuid: str uuid: str
text: str text: str
chunk_id: str chunk_id: str
document_id: str document_id: str
collection_name = "classification" # Ensure the collection exists and retrieve existing data points
if not await vector_engine.has_collection(collection_name):
if await vector_engine.has_collection(collection_name): await vector_engine.create_collection(collection_name, payload_schema=Keyword)
existing_data_points = await vector_engine.retrieve( existing_points_map = {}
collection_name,
list(set(classification_data_points)),
) if len(classification_data_points) > 0 else []
existing_points_map = {point.id: True for point in existing_data_points}
else: else:
existing_points_map = {} existing_points_map = {}
await vector_engine.create_collection(collection_name, payload_schema=Keyword) return data_chunks
data_points = []
nodes = []
edges = []
for (chunk_index, data_chunk) in enumerate(data_chunks):
chunk_classification = chunk_classifications[chunk_index]
classification_type_label = chunk_classification.label.type
classification_type_id = uuid5(NAMESPACE_OID, classification_type_label)
... ...
``` ```
To see existing tasks, have a look at the cognee.tasks We have a large number of tasks that can be used in your pipelines, and you can also create your own tasks to fit your business logic.
3. Once we have our tasks, it is time to group them into a pipeline. 3. Once we have our tasks, it is time to group them into a pipeline.
This snippet shows how a group of tasks can be added to a pipeline, and how they can pass the information forward from one to another. This simplified snippet demonstrates how tasks can be added to a pipeline, and how they can pass the information forward from one to another.
``` ```
tasks = [
Task(document_to_ontology, root_node_id = root_node_id),
Task(source_documents_to_chunks, parent_node_id = root_node_id), # Classify documents and save them as a nodes in graph db, extract text chunks based on the document type
Task(chunk_to_graph_decomposition, topology_model = KnowledgeGraph, task_config = { "batch_size": 10 }), # Set the graph topology for the document chunk data
Task(chunks_into_graph, graph_model = KnowledgeGraph, collection_name = "entities"), # Generate knowledge graphs from the document chunks and attach it to chunk nodes
Task(chunk_update_check, collection_name = "chunks"), # Find all affected chunks, so we don't process unchanged chunks
Task(
save_chunks_to_store,
collection_name = "chunks",
), # Save the document chunks in vector db and as nodes in graph db (connected to the document node and between each other)
run_tasks_parallel([
Task(
chunk_extract_summary,
summarization_model = cognee_config.summarization_model,
collection_name = "chunk_summaries",
), # Summarize the document chunks
Task(
chunk_naive_llm_classifier,
classification_model = cognee_config.classification_model,
),
]),
Task(chunk_remove_disconnected), # Remove the obsolete document chunks.
]
pipeline = run_tasks(tasks, documents) Task(
chunk_naive_llm_classifier,
classification_model = cognee_config.classification_model,
)
pipeline = run_tasks(tasks, documents)
``` ```
@ -277,3 +215,23 @@ Check out our demo notebook [here](https://github.com/topoteretes/cognee/blob/ma
[![Star History Chart](https://api.star-history.com/svg?repos=topoteretes/cognee&type=Date)](https://star-history.com/#topoteretes/cognee&Date) [![Star History Chart](https://api.star-history.com/svg?repos=topoteretes/cognee&type=Date)](https://star-history.com/#topoteretes/cognee&Date)
## Get Started
### Install Server
Please see the [cognee Quick Start Guide](https://topoteretes.github.io/cognee/quickstart/) for important configuration information.
```bash
docker compose up
```
### Install SDK
Please see the cognee [Development Guide](https://topoteretes.github.io/cognee/quickstart/) for important beta information and usage instructions.
```bash
pip install cognee
```

View file

@ -14,8 +14,6 @@ from cognee.modules.users.models import User
from cognee.modules.users.methods import get_authenticated_user from cognee.modules.users.methods import get_authenticated_user
from cognee.infrastructure.databases.relational import create_db_and_tables
# Set up logging # Set up logging
logging.basicConfig( logging.basicConfig(
level=logging.INFO, # Set the logging level (e.g., DEBUG, INFO, WARNING, ERROR, CRITICAL) level=logging.INFO, # Set the logging level (e.g., DEBUG, INFO, WARNING, ERROR, CRITICAL)
@ -34,8 +32,12 @@ from contextlib import asynccontextmanager
@asynccontextmanager @asynccontextmanager
async def lifespan(app: FastAPI): async def lifespan(app: FastAPI):
from cognee.infrastructure.databases.relational import create_db_and_tables
from cognee.modules.users.methods import get_default_user
# Not needed if you setup a migration system like Alembic # Not needed if you setup a migration system like Alembic
await create_db_and_tables() await create_db_and_tables()
await get_default_user()
yield yield
app = FastAPI(debug = os.getenv("ENV") != "prod", lifespan = lifespan) app = FastAPI(debug = os.getenv("ENV") != "prod", lifespan = lifespan)
@ -394,10 +396,10 @@ def start_api_server(host: str = "0.0.0.0", port: int = 8000):
try: try:
logger.info("Starting server at %s:%s", host, port) logger.info("Starting server at %s:%s", host, port)
import asyncio # import asyncio
from cognee.modules.data.deletion import prune_system, prune_data # from cognee.modules.data.deletion import prune_system, prune_data
asyncio.run(prune_data()) # asyncio.run(prune_data())
asyncio.run(prune_system(metadata = True)) # asyncio.run(prune_system(metadata = True))
uvicorn.run(app, host = host, port = port) uvicorn.run(app, host = host, port = port)
except Exception as e: except Exception as e:

View file

@ -2,7 +2,6 @@ from typing import List, Union, BinaryIO
from os import path from os import path
import asyncio import asyncio
import dlt import dlt
import duckdb
import cognee.modules.ingestion as ingestion import cognee.modules.ingestion as ingestion
from cognee.infrastructure.files.storage import LocalStorage from cognee.infrastructure.files.storage import LocalStorage
@ -81,22 +80,16 @@ async def add_files(file_paths: List[str], dataset_name: str, user: User = None)
relational_config = get_relational_config() relational_config = get_relational_config()
if relational_config.db_provider == "duckdb": destination = dlt.destinations.sqlalchemy(
db = duckdb.connect(relational_config.db_file_path) credentials = {
"host": relational_config.db_host,
destination = dlt.destinations.duckdb( "port": relational_config.db_port,
credentials = db, "username": relational_config.db_username,
) "password": relational_config.db_password,
else: "database": relational_config.db_name,
destination = dlt.destinations.postgres( "drivername": relational_config.db_provider,
credentials = { },
"host": relational_config.db_host, )
"port": relational_config.db_port,
"user": relational_config.db_user,
"password": relational_config.db_password,
"database": relational_config.db_name,
},
)
pipeline = dlt.pipeline( pipeline = dlt.pipeline(
pipeline_name = "file_load_from_filesystem", pipeline_name = "file_load_from_filesystem",

View file

@ -46,72 +46,6 @@ async def cognify(datasets: Union[str, list[str]] = None, user: User = None):
if type(datasets[0]) == str: if type(datasets[0]) == str:
datasets = await get_datasets_by_name(datasets, user.id) datasets = await get_datasets_by_name(datasets, user.id)
async def run_cognify_pipeline(dataset: Dataset):
data_documents: list[Data] = await get_dataset_data(dataset_id = dataset.id)
document_ids_str = [str(document.id) for document in data_documents]
dataset_id = dataset.id
dataset_name = generate_dataset_name(dataset.name)
async with update_status_lock:
task_status = await get_pipeline_status([dataset_id])
if dataset_id in task_status and task_status[dataset_id] == "DATASET_PROCESSING_STARTED":
logger.info("Dataset %s is already being processed.", dataset_name)
return
await log_pipeline_status(dataset_id, "DATASET_PROCESSING_STARTED", {
"dataset_name": dataset_name,
"files": document_ids_str,
})
try:
cognee_config = get_cognify_config()
root_node_id = None
tasks = [
Task(classify_documents),
Task(check_permissions_on_documents, user = user, permissions = ["write"]),
Task(infer_data_ontology, root_node_id = root_node_id, ontology_model = KnowledgeGraph),
Task(source_documents_to_chunks, parent_node_id = root_node_id), # Classify documents and save them as a nodes in graph db, extract text chunks based on the document type
Task(chunks_into_graph, graph_model = KnowledgeGraph, collection_name = "entities", task_config = { "batch_size": 10 }), # Generate knowledge graphs from the document chunks and attach it to chunk nodes
Task(chunk_update_check, collection_name = "chunks"), # Find all affected chunks, so we don't process unchanged chunks
Task(
save_chunks_to_store,
collection_name = "chunks",
), # Save the document chunks in vector db and as nodes in graph db (connected to the document node and between each other)
run_tasks_parallel([
Task(
chunk_extract_summary,
summarization_model = cognee_config.summarization_model,
collection_name = "chunk_summaries",
), # Summarize the document chunks
Task(
chunk_naive_llm_classifier,
classification_model = cognee_config.classification_model,
),
]),
Task(chunk_remove_disconnected), # Remove the obsolete document chunks.
]
pipeline = run_tasks(tasks, data_documents)
async for result in pipeline:
print(result)
await log_pipeline_status(dataset_id, "DATASET_PROCESSING_FINISHED", {
"dataset_name": dataset_name,
"files": document_ids_str,
})
except Exception as error:
await log_pipeline_status(dataset_id, "DATASET_PROCESSING_ERROR", {
"dataset_name": dataset_name,
"files": document_ids_str,
})
raise error
existing_datasets_map = { existing_datasets_map = {
generate_dataset_name(dataset.name): True for dataset in existing_datasets generate_dataset_name(dataset.name): True for dataset in existing_datasets
} }
@ -122,10 +56,76 @@ async def cognify(datasets: Union[str, list[str]] = None, user: User = None):
dataset_name = generate_dataset_name(dataset.name) dataset_name = generate_dataset_name(dataset.name)
if dataset_name in existing_datasets_map: if dataset_name in existing_datasets_map:
awaitables.append(run_cognify_pipeline(dataset)) awaitables.append(run_cognify_pipeline(dataset, user))
return await asyncio.gather(*awaitables) return await asyncio.gather(*awaitables)
async def run_cognify_pipeline(dataset: Dataset, user: User):
    """Run the default cognify task pipeline over every document of one dataset.

    The function records a ``DATASET_PROCESSING_STARTED`` status before the
    pipeline runs, ``DATASET_PROCESSING_FINISHED`` after it completes, and
    ``DATASET_PROCESSING_ERROR`` if any exception escapes the pipeline (the
    exception is then re-raised to the caller).

    Args:
        dataset: Dataset whose ``id`` and ``name`` are used to load documents
            and to label the status records.
        user: Passed to ``check_permissions_on_documents`` together with the
            ``"write"`` permission.

    Returns:
        ``None``. Returns early, without running anything, when the latest
        recorded status for the dataset is ``DATASET_PROCESSING_STARTED``.

    Raises:
        Exception: whatever the pipeline raised, after the error status has
            been logged.
    """
    data_documents: list[Data] = await get_dataset_data(dataset_id = dataset.id)

    # String form of the document ids; only used in the status payloads below.
    document_ids_str = [str(document.id) for document in data_documents]

    dataset_id = dataset.id
    dataset_name = generate_dataset_name(dataset.name)

    # The status check and the STARTED record happen under the same lock so the
    # "already running" test and the status write are not interleaved with
    # another caller holding this lock.
    # NOTE(review): update_status_lock is defined outside this block; presumably
    # an asyncio lock shared by all pipeline runs - confirm.
    async with update_status_lock:
        task_status = await get_pipeline_status([dataset_id])

        if dataset_id in task_status and task_status[dataset_id] == "DATASET_PROCESSING_STARTED":
            logger.info("Dataset %s is already being processed.", dataset_name)
            return

        await log_pipeline_status(dataset_id, "DATASET_PROCESSING_STARTED", {
            "dataset_name": dataset_name,
            "files": document_ids_str,
        })
    try:
        cognee_config = get_cognify_config()

        # No root node is supplied; the tasks below receive None for it.
        root_node_id = None

        tasks = [
            Task(classify_documents),
            Task(check_permissions_on_documents, user = user, permissions = ["write"]),
            Task(infer_data_ontology, root_node_id = root_node_id, ontology_model = KnowledgeGraph),
            Task(source_documents_to_chunks, parent_node_id = root_node_id), # Classify documents and save them as nodes in graph db, extract text chunks based on the document type
            Task(chunks_into_graph, graph_model = KnowledgeGraph, collection_name = "entities", task_config = { "batch_size": 10 }), # Generate knowledge graphs from the document chunks and attach it to chunk nodes
            Task(chunk_update_check, collection_name = "chunks"), # Find all affected chunks, so we don't process unchanged chunks
            Task(
                save_chunks_to_store,
                collection_name = "chunks",
            ), # Save the document chunks in vector db and as nodes in graph db (connected to the document node and between each other)
            run_tasks_parallel([
                Task(
                    chunk_extract_summary,
                    summarization_model = cognee_config.summarization_model,
                    collection_name = "chunk_summaries",
                ), # Summarize the document chunks
                Task(
                    chunk_naive_llm_classifier,
                    classification_model = cognee_config.classification_model,
                ),
            ]),
            Task(chunk_remove_disconnected), # Remove the obsolete document chunks.
        ]

        pipeline = run_tasks(tasks, data_documents)

        # run_tasks is consumed as an async iterator; each yielded result is
        # printed to stdout.
        async for result in pipeline:
            print(result)

        await log_pipeline_status(dataset_id, "DATASET_PROCESSING_FINISHED", {
            "dataset_name": dataset_name,
            "files": document_ids_str,
        })
    except Exception as error:
        # Record the failure first, then propagate the original exception.
        await log_pipeline_status(dataset_id, "DATASET_PROCESSING_ERROR", {
            "dataset_name": dataset_name,
            "files": document_ids_str,
        })
        raise error
def generate_dataset_name(dataset_name: str) -> str: def generate_dataset_name(dataset_name: str) -> str:
return dataset_name.replace(".", "_").replace(" ", "_") return dataset_name.replace(".", "_").replace(" ", "_")

View file

@ -1,23 +0,0 @@
from typing import Protocol
class DatabaseEngine(Protocol):
    """Structural interface listing the operations a relational engine exposes.

    Every method below is an empty stub that returns ``None``; the actual
    behaviour is supplied by the classes that implement this protocol.
    """

    async def ensure_tables(self):
        """Stub (coroutine): no behaviour is defined at the protocol level."""
        ...

    def database_exists(self, db_name: str) -> bool:
        """Stub: takes a database name; annotated to return a ``bool``."""
        ...

    def create_database(self, db_name: str):
        """Stub: takes the name of the database to create."""
        ...

    def drop_database(self, db_name: str):
        """Stub: takes the name of the database to drop."""
        ...

    async def table_exists(self, table_name: str) -> bool:
        """Stub (coroutine): takes a table name; annotated to return a ``bool``."""
        ...

    async def create_tables(self):
        """Stub (coroutine): no behaviour is defined at the protocol level."""
        ...

    async def create(self, data):
        """Stub (coroutine): takes the object to persist."""
        ...

View file

@ -1,29 +0,0 @@
import inspect
from typing import Any
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.orm import Session
class FakeAsyncSession:
    """Adapter that lets a synchronous ``Session`` be used with ``await``.

    Attribute access is forwarded to the wrapped session. Whenever the
    attribute of the same name on ``AsyncSession`` is a coroutine function,
    the synchronous method is wrapped in a coroutine so callers can ``await``
    it exactly as they would on a real ``AsyncSession``.
    """

    def __init__(self, session: Session):
        self.session = session

    def run_sync(self, *args, **kwargs):
        """Forward all arguments to ``execute`` and return its result.

        ``execute`` is not defined on this class, so it is resolved through
        ``__getattr__`` below.
        """
        # NOTE(review): AsyncSession.run_sync normally receives a callable to
        # invoke with the sync session; here the arguments go to ``execute``
        # instead - confirm this is what callers expect.
        return self.execute(*args, **kwargs)

    def __getattr__(self, name: str) -> Any:
        """
        If the method being called is async in AsyncSession, create a fake async version
        for Session so callers can `await` as usual. Think `commit`, `refresh`,
        `delete`, etc.

        Raises:
            AttributeError: if ``session`` itself has not been set yet, or if
                the wrapped session has no attribute called ``name``.
        """
        # __getattr__ only runs when normal lookup fails, so reaching this
        # point with name == "session" means the instance was never fully
        # initialised (e.g. during copy/unpickle). Reading self.session below
        # would then re-enter __getattr__ forever; fail fast instead.
        if name == "session":
            raise AttributeError(name)

        async_session_attr = getattr(AsyncSession, name, None)
        session_attr = getattr(self.session, name)

        # Attributes that are not coroutines on AsyncSession are passed through as-is.
        if not inspect.iscoroutinefunction(async_session_attr):
            return session_attr

        async def async_wrapper(*args, **kwargs):
            return session_attr(*args, **kwargs)

        return async_wrapper

View file

@ -1,7 +1,7 @@
from .ModelBase import Base from .ModelBase import Base
from .DatabaseEngine import DatabaseEngine
from .sqlite.SqliteEngine import SqliteEngine
from .duckdb.DuckDBAdapter import DuckDBAdapter
from .config import get_relational_config from .config import get_relational_config
from .create_db_and_tables import create_db_and_tables from .create_db_and_tables import create_db_and_tables
from .get_relational_engine import get_relational_engine from .get_relational_engine import get_relational_engine
# Global data types
from .data_types.UUID import UUID

View file

@ -1,4 +1,5 @@
import os import os
from typing import Union
from functools import lru_cache from functools import lru_cache
from pydantic_settings import BaseSettings, SettingsConfigDict from pydantic_settings import BaseSettings, SettingsConfigDict
from cognee.root_dir import get_absolute_path from cognee.root_dir import get_absolute_path
@ -6,13 +7,11 @@ from cognee.root_dir import get_absolute_path
class RelationalConfig(BaseSettings): class RelationalConfig(BaseSettings):
db_path: str = os.path.join(get_absolute_path(".cognee_system"), "databases") db_path: str = os.path.join(get_absolute_path(".cognee_system"), "databases")
db_name: str = "cognee_db" db_name: str = "cognee_db"
db_host: str = "localhost" db_host: Union[str, None] = None # "localhost"
db_port: str = "5432" db_port: Union[str, None] = None # "5432"
db_user: str = "cognee" db_username: Union[str, None] = None # "cognee"
db_password: str = "cognee" db_password: Union[str, None] = None # "cognee"
db_provider: str = "postgresql+asyncpg" db_provider: str = "sqlite"
db_file_path: str = os.path.join(db_path, db_name)
model_config = SettingsConfigDict(env_file = ".env", extra = "allow") model_config = SettingsConfigDict(env_file = ".env", extra = "allow")
@ -22,7 +21,7 @@ class RelationalConfig(BaseSettings):
"db_name": self.db_name, "db_name": self.db_name,
"db_host": self.db_host, "db_host": self.db_host,
"db_port": self.db_port, "db_port": self.db_port,
"db_user": self.db_user, "db_username": self.db_username,
"db_password": self.db_password, "db_password": self.db_password,
"db_provider": self.db_provider, "db_provider": self.db_provider,
} }

View file

@ -1,9 +1,14 @@
from cognee.infrastructure.files.storage import LocalStorage
from .ModelBase import Base from .ModelBase import Base
from .get_relational_engine import get_relational_engine from .get_relational_engine import get_relational_engine, get_relational_config
async def create_db_and_tables(): async def create_db_and_tables():
relational_config = get_relational_config()
relational_engine = get_relational_engine() relational_engine = get_relational_engine()
if relational_engine.engine.dialect.name == "sqlite":
LocalStorage.ensure_directory_exists(relational_config.db_path)
async with relational_engine.engine.begin() as connection: async with relational_engine.engine.begin() as connection:
if len(Base.metadata.tables.keys()) > 0: if len(Base.metadata.tables.keys()) > 0:
await connection.run_sync(Base.metadata.create_all) await connection.run_sync(Base.metadata.create_all)

View file

@ -3,18 +3,16 @@ from .sqlalchemy.SqlAlchemyAdapter import SQLAlchemyAdapter
def create_relational_engine( def create_relational_engine(
db_path: str, db_path: str,
db_name: str, db_name: str,
db_provider: str,
db_host: str, db_host: str,
db_port: str, db_port: str,
db_user: str, db_username: str,
db_password: str, db_password: str,
db_provider: str,
): ):
return SQLAlchemyAdapter( if db_provider == "sqlite":
db_name = db_name, connection_string = f"sqlite+aiosqlite:///{db_path}/{db_name}"
db_path = db_path,
db_type = db_provider, if db_provider == "postgres":
db_host = db_host, connection_string = f"postgresql+asyncpg://{db_username}:{db_password}@{db_host}:{db_port}/{db_name}"
db_port = db_port,
db_user = db_user, return SQLAlchemyAdapter(connection_string)
db_password = db_password
)

View file

@ -0,0 +1,43 @@
import uuid
from sqlalchemy.types import TypeDecorator, BINARY
from sqlalchemy.dialects.postgresql import UUID as psqlUUID
class UUID(TypeDecorator):
    """Platform-independent GUID type.

    Uses Postgresql's UUID type, otherwise uses
    BINARY(16), to store UUID.
    """

    impl = BINARY
    # The type takes no constructor arguments, so it is safe to take part in
    # SQLAlchemy's compiled-statement cache (and this silences the warning
    # SQLAlchemy emits for TypeDecorators that do not declare it).
    cache_ok = True

    def load_dialect_impl(self, dialect):
        """Pick the column implementation for ``dialect``."""
        if dialect.name == 'postgresql':
            return dialect.type_descriptor(psqlUUID())
        return dialect.type_descriptor(BINARY(16))

    def process_bind_param(self, value, dialect):
        """Convert a Python value into what is sent to the database.

        ``bytes``, ``int`` and ``str`` inputs are first coerced to
        ``uuid.UUID``. PostgreSQL then receives the canonical string form,
        every other dialect the 16 raw bytes. ``None`` is passed through.
        """
        if value is None:
            return value

        if not isinstance(value, uuid.UUID):
            if isinstance(value, bytes):
                value = uuid.UUID(bytes = value)
            elif isinstance(value, int):
                value = uuid.UUID(int = value)
            elif isinstance(value, str):
                value = uuid.UUID(value)

        if dialect.name == 'postgresql':
            return str(value)
        return value.bytes

    def process_result_value(self, value, dialect):
        """Convert a database value back into ``uuid.UUID`` (``None`` passes through)."""
        if value is None:
            return value

        # The PostgreSQL UUID type can already hand back uuid.UUID objects;
        # uuid.UUID(<UUID instance>) raises, so return those unchanged.
        if isinstance(value, uuid.UUID):
            return value

        if dialect.name == 'postgresql':
            return uuid.UUID(value)
        return uuid.UUID(bytes = value)

View file

@ -1,169 +0,0 @@
import duckdb
import os
class DuckDBAdapter():
    """Helper around a file-backed DuckDB database.

    A new connection is opened through ``get_connection`` for every operation
    and closed again when the ``with`` block ends.
    """

    # DDL for the "cognify" bookkeeping table, shared by the cognify helpers below.
    _CREATE_COGNIFY_TABLE_SQL = """
        CREATE TABLE IF NOT EXISTS cognify (
            document_id STRING,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            updated_at TIMESTAMP DEFAULT NULL,
            processed BOOLEAN DEFAULT FALSE,
            document_id_target STRING NULL
        );
    """

    def __init__(self, db_path: str, db_name: str):
        self.db_location = os.path.abspath(os.path.join(db_path, db_name))
        self.get_connection = lambda: duckdb.connect(self.db_location)

    def get_datasets(self):
        """Return the schema names that hold datasets.

        Schemas whose name ends with "staging" and the "cognee" schema are excluded.
        """
        with self.get_connection() as connection:
            tables = connection.sql("SELECT DISTINCT schema_name FROM duckdb_tables();").to_df().to_dict("list")

        return [
            schema_name for schema_name in tables["schema_name"]
            if not schema_name.endswith("staging") and schema_name != "cognee"
        ]

    def get_files_metadata(self, dataset_name: str):
        """Return the rows of ``<dataset_name>.file_metadata`` as a list of dicts."""
        # NOTE(review): dataset_name is interpolated as an identifier and cannot
        # be bound as a parameter - callers must pass a trusted name.
        with self.get_connection() as connection:
            return connection.sql(f"SELECT id, name, file_path, extension, mime_type FROM {dataset_name}.file_metadata;").to_df().to_dict("records")

    def create_table(self, schema_name: str, table_name: str, table_config: list[dict]):
        """Create the schema and the table if they do not exist yet.

        Each item of ``table_config`` supplies a column through its ``name``
        and ``type`` keys.
        """
        fields_query_parts = [
            f"{table_config_item['name']} {table_config_item['type']}"
            for table_config_item in table_config
        ]

        with self.get_connection() as connection:
            connection.execute(f"CREATE SCHEMA IF NOT EXISTS {schema_name};")
            connection.execute(
                f"CREATE TABLE IF NOT EXISTS {schema_name}.{table_name} ({', '.join(fields_query_parts)});"
            )

    def delete_table(self, table_name: str):
        """Drop ``table_name`` if it exists."""
        with self.get_connection() as connection:
            connection.execute(f"DROP TABLE IF EXISTS {table_name};")

    def insert_data(self, schema_name: str, table_name: str, data: list[dict]):
        """Insert ``data`` into ``schema_name.table_name``.

        Column names are taken from the first row; every row must provide the
        same keys. Values are bound as parameters, so non-string values and
        strings containing quotes are handled by the driver. An empty ``data``
        list is a no-op.
        """
        if not data:
            return

        column_names = list(data[0].keys())
        columns = ", ".join(column_names)
        placeholders = ", ".join("?" for _ in column_names)
        rows = [[data_entry[name] for name in column_names] for data_entry in data]

        with self.get_connection() as connection:
            query = f"INSERT INTO {schema_name}.{table_name} ({columns}) VALUES ({placeholders});"
            connection.executemany(query, rows)

    def get_data(self, table_name: str, filters: dict = None):
        """Return a ``{data_id: status}`` mapping for the rows of ``table_name``.

        ``filters`` maps a column name to either a single value (equality) or
        a list of values (``IN``); all conditions are combined with ``AND``.
        """
        def quote(value):
            # Values are compared as string literals; double embedded quotes so
            # a value containing "'" cannot terminate the literal.
            escaped = str(value).replace("'", "''")
            return f"'{escaped}'"

        query = f"SELECT * FROM {table_name}"

        if filters:
            conditions = [
                f"{key} IN ({', '.join(quote(item) for item in value)})" if isinstance(value, list)
                else f"{key} = {quote(value)}"
                for (key, value) in filters.items()
            ]
            query += f" WHERE {' AND '.join(conditions)}"

        query += ";"

        with self.get_connection() as connection:
            results = connection.sql(query).to_df().to_dict("records")

        return {
            result["data_id"]: result["status"] for result in results
        }

    def execute_query(self, query):
        """Run ``query`` as-is and return the result rows as a list of dicts."""
        with self.get_connection() as connection:
            return connection.sql(query).to_df().to_dict("records")

    def load_cognify_data(self, data):
        """Insert the ``document_id`` of every record in ``data`` into "cognify"."""
        rows = [[record.get("document_id")] for record in data]

        with self.get_connection() as connection:
            # Ensure the "cognify" table exists
            connection.execute(self._CREATE_COGNIFY_TABLE_SQL)

            # One connection and one prepared statement for all records.
            if rows:
                connection.executemany("INSERT INTO cognify (document_id) VALUES (?);", rows)

    def fetch_cognify_data(self, excluded_document_id: str):
        """Return the unprocessed "cognify" rows and mark them as processed.

        Rows whose ``document_id`` equals ``excluded_document_id`` are skipped.
        """
        select_data_sql = (
            "SELECT document_id, created_at, updated_at, processed FROM cognify "
            "WHERE document_id != ? AND processed = FALSE;"
        )

        with self.get_connection() as connection:
            connection.execute(self._CREATE_COGNIFY_TABLE_SQL)

            records = connection.execute(select_data_sql, [excluded_document_id]).df().to_dict("records")

            # If records are fetched, update the "processed" column to "True"
            if records:
                document_ids = [record["document_id"] for record in records]
                # One placeholder per id: formatting a Python tuple into the
                # statement breaks for a single element ("('x',)").
                placeholders = ", ".join("?" for _ in document_ids)
                connection.execute(
                    f"UPDATE cognify SET processed = TRUE WHERE document_id IN ({placeholders});",
                    document_ids,
                )

        # Return the fetched records
        return records

    def delete_cognify_data(self):
        """Remove the "cognify" table together with all of its rows."""
        with self.get_connection() as connection:
            # Dropping the table also discards its rows; IF EXISTS covers the
            # case where the table was never created.
            connection.execute("DROP TABLE IF EXISTS cognify;")

    def delete_database(self):
        """Delete the database file and its ``.wal`` companion if they exist."""
        from cognee.infrastructure.files.storage import LocalStorage

        if LocalStorage.file_exists(self.db_location):
            LocalStorage.remove(self.db_location)

        if LocalStorage.file_exists(self.db_location + ".wal"):
            LocalStorage.remove(self.db_location + ".wal")

View file

@ -1,26 +0,0 @@
from abc import abstractmethod
from typing import Protocol, TypeVar, Type, List
RowDataType = TypeVar('RowDataType')


class RelationalDBInterface(Protocol):
    """Abstract row/table level interface for a relational database adapter.

    Every method is declared abstract and raises ``NotImplementedError`` when
    the default body is awaited.
    """

    @abstractmethod
    async def create_database(self, database_name: str, database_path: str):
        """Abstract: takes a database name and the path it lives at."""
        raise NotImplementedError

    @abstractmethod
    async def create_table(self, table_name: str, table_config: object):
        """Abstract: takes a table name and its configuration object."""
        raise NotImplementedError

    @abstractmethod
    async def add_row(self, table_name: str, row_data: Type[RowDataType]):
        """Abstract: takes a table name and the data of one row."""
        raise NotImplementedError

    @abstractmethod
    async def add_rows(self, table_name: str, rows_data: List[Type[RowDataType]]):
        """Abstract: takes a table name and the data of several rows."""
        raise NotImplementedError

    @abstractmethod
    async def get_row(self, table_name: str, row_id: str):
        """Abstract: takes a table name and a row id."""
        raise NotImplementedError

    @abstractmethod
    async def update_row(self, table_name: str, row_id: str, row_data: Type[RowDataType]):
        """Abstract: takes a table name, a row id and the new row data."""
        raise NotImplementedError

    @abstractmethod
    async def delete_row(self, table_name: str, row_id: str):
        """Abstract: takes a table name and a row id."""
        raise NotImplementedError

View file

@ -1,39 +1,18 @@
import os
import asyncio
from typing import AsyncGenerator from typing import AsyncGenerator
from contextlib import asynccontextmanager from contextlib import asynccontextmanager
from sqlalchemy import create_engine, text, select from sqlalchemy import text, select
from sqlalchemy.orm import sessionmaker, joinedload from sqlalchemy.orm import joinedload
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine, async_sessionmaker from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine, async_sessionmaker
from cognee.infrastructure.files.storage import LocalStorage
from cognee.infrastructure.databases.relational.FakeAsyncSession import FakeAsyncSession
from ..ModelBase import Base from ..ModelBase import Base
def make_async_sessionmaker(sessionmaker):
@asynccontextmanager
async def async_session_maker():
await asyncio.sleep(0.1)
session = FakeAsyncSession(sessionmaker())
try:
yield session
finally:
await session.close() # Ensure the session is closed
return async_session_maker
class SQLAlchemyAdapter(): class SQLAlchemyAdapter():
def __init__(self, db_type: str, db_path: str, db_name: str, db_user: str, db_password: str, db_host: str, db_port: str): def __init__(self, connection_string: str):
self.db_location = os.path.abspath(os.path.join(db_path, db_name)) self.engine = create_async_engine(connection_string)
self.db_name = db_name self.sessionmaker = async_sessionmaker(bind=self.engine, expire_on_commit=False)
if db_type == "duckdb": if self.engine.dialect.name == "sqlite":
LocalStorage.ensure_directory_exists(db_path) self.db_path = connection_string.split("///")[1]
self.engine = create_engine(f"duckdb:///{self.db_location}")
self.sessionmaker = make_async_sessionmaker(sessionmaker(bind=self.engine))
else:
self.engine = create_async_engine(f"postgresql+asyncpg://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}")
self.sessionmaker = async_sessionmaker(bind=self.engine, expire_on_commit=False)
@asynccontextmanager @asynccontextmanager
async def get_async_session(self) -> AsyncGenerator[AsyncSession, None]: async def get_async_session(self) -> AsyncGenerator[AsyncSession, None]:
@ -72,6 +51,7 @@ class SQLAlchemyAdapter():
await connection.execute(text(f"DROP TABLE IF EXISTS {table_name} CASCADE;")) await connection.execute(text(f"DROP TABLE IF EXISTS {table_name} CASCADE;"))
await connection.close() await connection.close()
async def insert_data(self, schema_name: str, table_name: str, data: list[dict]): async def insert_data(self, schema_name: str, table_name: str, data: list[dict]):
columns = ", ".join(data[0].keys()) columns = ", ".join(data[0].keys())
values = ", ".join([f"({', '.join([f':{key}' for key in row.keys()])})" for row in data]) values = ", ".join([f"({', '.join([f':{key}' for key in row.keys()])})" for row in data])
@ -80,6 +60,7 @@ class SQLAlchemyAdapter():
async with self.engine.begin() as connection: async with self.engine.begin() as connection:
await connection.execute(insert_query, data) await connection.execute(insert_query, data)
await connection.close() await connection.close()
async def get_data(self, table_name: str, filters: dict = None): async def get_data(self, table_name: str, filters: dict = None):
async with self.engine.begin() as connection: async with self.engine.begin() as connection:
query = f"SELECT * FROM {table_name}" query = f"SELECT * FROM {table_name}"
@ -113,11 +94,19 @@ class SQLAlchemyAdapter():
print(f"Error dropping database tables: {e}") print(f"Error dropping database tables: {e}")
async def delete_database(self): async def delete_database(self):
async with self.engine.begin() as connection: try:
try: if self.engine.dialect.name == "sqlite":
for table in Base.metadata.sorted_tables: from cognee.infrastructure.files.storage import LocalStorage
drop_table_query = text(f'DROP TABLE IF EXISTS {table.name} CASCADE')
await connection.execute(drop_table_query) LocalStorage.remove(self.db_path)
print("Database deleted successfully.") self.db_path = None
except Exception as e: else:
print(f"Error deleting database: {e}") async with self.engine.begin() as connection:
for table in Base.metadata.sorted_tables:
drop_table_query = text(f'DROP TABLE IF EXISTS {table.name} CASCADE')
await connection.execute(drop_table_query)
except Exception as e:
print(f"Error deleting database: {e}")
print("Database deleted successfully.")

View file

@ -1,82 +0,0 @@
import os
import asyncio
from typing import Callable
from sqlalchemy.inspection import inspect
from sqlalchemy.ext.asyncio import create_async_engine, async_sessionmaker, AsyncEngine, AsyncSession, async_scoped_session
from sqlalchemy.future import select
from cognee.infrastructure.files.storage.LocalStorage import LocalStorage
from ..DatabaseEngine import DatabaseEngine
from ..ModelBase import Base
from ..utils import with_rollback
class SqliteEngine(DatabaseEngine):
    """``DatabaseEngine`` implementation on top of SQLite via ``sqlite+aiosqlite``."""

    db_path: str = None
    db_name: str = None
    engine: AsyncEngine = None
    session_maker: Callable[[], async_scoped_session[AsyncSession]] = None
    is_db_done: bool = False

    def __init__(self, db_path: str, db_name: str):
        self.db_path = db_path
        self.db_name = db_name
        self.db_location = db_path + "/" + db_name
        self.engine = create_async_engine(
            f"sqlite+aiosqlite:///{self.db_location}",
            pool_recycle = 3600,
            echo = False
        )
        # NOTE(review): every call builds a brand-new scoped-session registry,
        # so sessions are not actually shared per task - confirm this is intended.
        self.session_maker = lambda: async_scoped_session(
            async_sessionmaker(
                bind = self.engine,
                class_ = AsyncSession
            ),
            scopefunc = asyncio.current_task
        )

    async def ensure_tables(self):
        """Create the database file if it is missing, then create all tables.

        Sets ``is_db_done`` and returns ``True``.
        """
        if not self.database_exists(self.db_name):
            self.create_database(self.db_name)

        await self.create_tables()

        self.is_db_done = True

        return True

    def database_exists(self, db_name: str) -> bool:
        """Return whether the file ``<db_path>/<db_name>`` exists."""
        return os.path.exists(self.db_path + "/" + db_name)

    def create_database(self, db_name: str):
        """Create (or truncate to empty) the file ``<db_path>/<db_name>``."""
        LocalStorage.ensure_directory_exists(self.db_path)

        with open(self.db_path + "/" + db_name, mode = "w+", encoding = "utf-8") as file:
            file.write("")

    def drop_database(self, db_name: str):
        """Remove the engine's database file."""
        # NOTE(review): ``db_name`` is ignored; the file removed is always
        # ``self.db_location`` - confirm callers only pass this engine's own name.
        os.remove(self.db_location)

    async def table_exists(self, table_name: str) -> bool:
        """Return whether ``table_name`` exists in the database."""
        # inspect() cannot be applied to an AsyncEngine directly; the inspector
        # has to be created on the underlying sync connection via run_sync().
        async with self.engine.connect() as connection:
            return await connection.run_sync(
                lambda sync_connection: inspect(sync_connection).has_table(table_name)
            )

    async def create_tables(self):
        """Create every table registered on ``Base.metadata``."""
        async with self.engine.begin() as connection:
            return await connection.run_sync(Base.metadata.create_all)

    async def create(self, data):
        """Add ``data`` to a session inside a ``with_rollback`` scope."""
        async with with_rollback(self.session_maker()) as session:
            session.add(data)

    async def query(self, query_term):
        """Execute ``query_term`` and return the result."""
        async with with_rollback(self.session_maker()) as session:
            return await session.execute(query_term)

    async def query_entity(self, entity):
        """Select the rows of ``type(entity)`` whose ``id`` equals ``entity.id``."""
        async with with_rollback(self.session_maker()) as session:
            return await session.execute(
                select(type(entity))
                .where(type(entity).id == entity.id)
            )

    async def update(self, data_update_fn):
        """Call ``data_update_fn()`` inside a ``with_rollback`` scope."""
        async with with_rollback(self.session_maker()):
            data_update_fn()

View file

@ -1 +0,0 @@
from .with_rollback import with_rollback

View file

@ -1,18 +0,0 @@
import logging
from contextlib import asynccontextmanager
from sqlalchemy.ext.asyncio import async_scoped_session
logger = logging.getLogger(__name__)


@asynccontextmanager
async def with_rollback(session: "async_scoped_session"):
    """Provide a transactional scope around a series of operations.

    Yields ``session`` to the caller. If the managed block finishes normally,
    the session is committed. If the block (or the commit) raises, the session
    is rolled back, the error is logged and the exception is re-raised.
    In both cases ``session.remove()`` is awaited afterwards so the scoped
    session is always released.
    """
    try:
        # async with session.begin():
        yield session
        await session.commit()
    except Exception as exception:
        await session.rollback()
        logger.error("Session rolled back due to: %s", str(exception))
        # Bare raise keeps the original traceback intact.
        raise
    finally:
        # Previously only reached on success, which left the scoped session
        # registered after a rollback.
        await session.remove()

View file

@ -4,7 +4,7 @@ import litellm
from litellm import aembedding from litellm import aembedding
from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine
litellm.set_verbose = True litellm.set_verbose = False
class LiteLLMEmbeddingEngine(EmbeddingEngine): class LiteLLMEmbeddingEngine(EmbeddingEngine):
api_key: str api_key: str

View file

@ -106,11 +106,10 @@ class QDrantAdapter(VectorDBInterface):
points = [convert_to_qdrant_point(point) for point in data_points] points = [convert_to_qdrant_point(point) for point in data_points]
try: try:
result = await client.upload_points( client.upload_points(
collection_name = collection_name, collection_name = collection_name,
points = points points = points
) )
return result
except Exception as error: except Exception as error:
logger.error("Error uploading data points to Qdrant: %s", str(error)) logger.error("Error uploading data points to Qdrant: %s", str(error))
raise error raise error

View file

@ -1,7 +1,7 @@
from datetime import datetime from datetime import datetime
from sqlalchemy.orm import Mapped, MappedColumn from sqlalchemy.orm import Mapped, MappedColumn
from sqlalchemy import Column, String, DateTime, ForeignKey, Enum, UUID, JSON from sqlalchemy import Column, DateTime, ForeignKey, Enum, JSON
from cognee.infrastructure.databases.relational import ModelBase from cognee.infrastructure.databases.relational import Base, UUID
class OperationType(Enum): class OperationType(Enum):
MERGE_DATA = "MERGE_DATA" MERGE_DATA = "MERGE_DATA"
@ -14,10 +14,10 @@ class OperationStatus(Enum):
ERROR = "OPERATION_ERROR" ERROR = "OPERATION_ERROR"
CANCELLED = "OPERATION_CANCELLED" CANCELLED = "OPERATION_CANCELLED"
class Operation(ModelBase): class Operation(Base):
__tablename__ = "operation" __tablename__ = "operation"
id = Column(String, primary_key = True) id = Column(UUID, primary_key = True)
status = Column(Enum(OperationStatus)) status = Column(Enum(OperationStatus))
operation_type = Column(Enum(OperationType)) operation_type = Column(Enum(OperationType))

View file

@ -7,6 +7,8 @@ async def get_datasets_by_name(dataset_names: list[str], user_id: UUID) -> list[
db_engine = get_relational_engine() db_engine = get_relational_engine()
async with db_engine.get_async_session() as session: async with db_engine.get_async_session() as session:
if isinstance(dataset_names, str):
dataset_names = [dataset_names]
datasets = (await session.scalars( datasets = (await session.scalars(
select(Dataset) select(Dataset)
.filter(Dataset.owner_id == user_id) .filter(Dataset.owner_id == user_id)

View file

@ -2,14 +2,14 @@ from uuid import uuid4
from typing import List from typing import List
from datetime import datetime, timezone from datetime import datetime, timezone
from sqlalchemy.orm import relationship, Mapped from sqlalchemy.orm import relationship, Mapped
from sqlalchemy import Column, String, DateTime, UUID from sqlalchemy import Column, String, DateTime
from cognee.infrastructure.databases.relational import Base from cognee.infrastructure.databases.relational import Base, UUID
from .DatasetData import DatasetData from .DatasetData import DatasetData
class Data(Base): class Data(Base):
__tablename__ = "data" __tablename__ = "data"
id = Column(UUID(as_uuid = True), primary_key = True, default = uuid4) id = Column(UUID, primary_key = True, default = uuid4)
name = Column(String) name = Column(String)
extension = Column(String) extension = Column(String)

View file

@ -2,14 +2,14 @@ from uuid import uuid4
from typing import List from typing import List
from datetime import datetime, timezone from datetime import datetime, timezone
from sqlalchemy.orm import relationship, Mapped from sqlalchemy.orm import relationship, Mapped
from sqlalchemy import Column, Text, DateTime, UUID from sqlalchemy import Column, Text, DateTime
from cognee.infrastructure.databases.relational import Base from cognee.infrastructure.databases.relational import Base, UUID
from .DatasetData import DatasetData from .DatasetData import DatasetData
class Dataset(Base): class Dataset(Base):
__tablename__ = "datasets" __tablename__ = "datasets"
id = Column(UUID(as_uuid = True), primary_key = True, default = uuid4) id = Column(UUID, primary_key = True, default = uuid4)
name = Column(Text) name = Column(Text)

View file

@ -1,11 +1,11 @@
from datetime import datetime, timezone from datetime import datetime, timezone
from sqlalchemy import Column, DateTime, UUID, ForeignKey from sqlalchemy import Column, DateTime, ForeignKey
from cognee.infrastructure.databases.relational import Base from cognee.infrastructure.databases.relational import Base, UUID
class DatasetData(Base): class DatasetData(Base):
__tablename__ = "dataset_data" __tablename__ = "dataset_data"
created_at = Column(DateTime(timezone = True), default = lambda: datetime.now(timezone.utc)) created_at = Column(DateTime(timezone = True), default = lambda: datetime.now(timezone.utc))
dataset_id = Column(UUID(as_uuid = True), ForeignKey("datasets.id"), primary_key = True) dataset_id = Column(UUID, ForeignKey("datasets.id"), primary_key = True)
data_id = Column(UUID(as_uuid = True), ForeignKey("data.id"), primary_key = True) data_id = Column(UUID, ForeignKey("data.id"), primary_key = True)

View file

@ -6,18 +6,18 @@ from .Document import Document
class AudioDocument(Document): class AudioDocument(Document):
type: str = "audio" type: str = "audio"
title: str title: str
file_path: str raw_data_location: str
chunking_strategy:str chunking_strategy: str
def __init__(self, id: UUID, title: str, file_path: str, chunking_strategy:str="paragraph"): def __init__(self, id: UUID, title: str, raw_data_location: str, chunking_strategy:str="paragraph"):
self.id = id or uuid5(NAMESPACE_OID, title) self.id = id or uuid5(NAMESPACE_OID, title)
self.title = title self.title = title
self.file_path = file_path self.raw_data_location = raw_data_location
self.chunking_strategy = chunking_strategy self.chunking_strategy = chunking_strategy
def read(self): def read(self):
# Transcribe the audio file # Transcribe the audio file
result = get_llm_client().create_transcript(self.file_path) result = get_llm_client().create_transcript(self.raw_data_location)
text = result.text text = result.text
chunker = TextChunker(self.id, get_text = lambda: text) chunker = TextChunker(self.id, get_text = lambda: text)
@ -30,5 +30,5 @@ class AudioDocument(Document):
id=str(self.id), id=str(self.id),
type=self.type, type=self.type,
title=self.title, title=self.title,
file_path=self.file_path, raw_data_location=self.raw_data_location,
) )

View file

@ -5,7 +5,7 @@ class Document(Protocol):
id: UUID id: UUID
type: str type: str
title: str title: str
file_path: str raw_data_location: str
def read(self) -> str: def read(self) -> str:
pass pass

View file

@ -7,16 +7,16 @@ from .Document import Document
class ImageDocument(Document): class ImageDocument(Document):
type: str = "image" type: str = "image"
title: str title: str
file_path: str raw_data_location: str
def __init__(self, id: UUID, title: str, file_path: str): def __init__(self, id: UUID, title: str, raw_data_location: str):
self.id = id or uuid5(NAMESPACE_OID, title) self.id = id or uuid5(NAMESPACE_OID, title)
self.title = title self.title = title
self.file_path = file_path self.raw_data_location = raw_data_location
def read(self): def read(self):
# Transcribe the image file # Transcribe the image file
result = get_llm_client().transcribe_image(self.file_path) result = get_llm_client().transcribe_image(self.raw_data_location)
text = result.choices[0].message.content text = result.choices[0].message.content
chunker = TextChunker(self.id, get_text = lambda: text) chunker = TextChunker(self.id, get_text = lambda: text)
@ -29,5 +29,5 @@ class ImageDocument(Document):
id=str(self.id), id=str(self.id),
type=self.type, type=self.type,
title=self.title, title=self.title,
file_path=self.file_path, raw_data_location=self.raw_data_location,
) )

View file

@ -6,15 +6,15 @@ from .Document import Document
class PdfDocument(Document): class PdfDocument(Document):
type: str = "pdf" type: str = "pdf"
title: str title: str
file_path: str raw_data_location: str
def __init__(self, id: UUID, title: str, file_path: str): def __init__(self, id: UUID, title: str, raw_data_location: str):
self.id = id or uuid5(NAMESPACE_OID, title) self.id = id or uuid5(NAMESPACE_OID, title)
self.title = title self.title = title
self.file_path = file_path self.raw_data_location = raw_data_location
def read(self) -> PdfReader: def read(self) -> PdfReader:
file = PdfReader(self.file_path) file = PdfReader(self.raw_data_location)
def get_text(): def get_text():
for page in file.pages: for page in file.pages:
@ -32,5 +32,5 @@ class PdfDocument(Document):
id = str(self.id), id = str(self.id),
type = self.type, type = self.type,
title = self.title, title = self.title,
file_path = self.file_path, raw_data_location = self.raw_data_location,
) )

View file

@ -5,16 +5,16 @@ from .Document import Document
class TextDocument(Document): class TextDocument(Document):
type: str = "text" type: str = "text"
title: str title: str
file_path: str raw_data_location: str
def __init__(self, id: UUID, title: str, file_path: str): def __init__(self, id: UUID, title: str, raw_data_location: str):
self.id = id or uuid5(NAMESPACE_OID, title) self.id = id or uuid5(NAMESPACE_OID, title)
self.title = title self.title = title
self.file_path = file_path self.raw_data_location = raw_data_location
def read(self): def read(self):
def get_text(): def get_text():
with open(self.file_path, mode = "r", encoding = "utf-8") as file: with open(self.raw_data_location, mode = "r", encoding = "utf-8") as file:
while True: while True:
text = file.read(1024) text = file.read(1024)
@ -34,5 +34,5 @@ class TextDocument(Document):
id = str(self.id), id = str(self.id),
type = self.type, type = self.type,
title = self.title, title = self.title,
file_path = self.file_path, raw_data_location = self.raw_data_location,
) )

View file

@ -1,14 +1,14 @@
from uuid import uuid4 from uuid import uuid4
from datetime import datetime, timezone from datetime import datetime, timezone
from sqlalchemy import Column, UUID, DateTime, String, Text from sqlalchemy import Column, DateTime, String, Text
from sqlalchemy.orm import relationship, Mapped from sqlalchemy.orm import relationship, Mapped
from cognee.infrastructure.databases.relational import Base from cognee.infrastructure.databases.relational import Base, UUID
from .PipelineTask import PipelineTask from .PipelineTask import PipelineTask
class Pipeline(Base): class Pipeline(Base):
__tablename__ = "pipelines" __tablename__ = "pipelines"
id = Column(UUID(as_uuid = True), primary_key = True, default = uuid4) id = Column(UUID, primary_key = True, default = uuid4)
name = Column(String) name = Column(String)
description = Column(Text, nullable = True) description = Column(Text, nullable = True)

View file

@ -1,16 +1,16 @@
from uuid import uuid4 from uuid import uuid4
from datetime import datetime, timezone from datetime import datetime, timezone
from sqlalchemy import Column, UUID, DateTime, String, JSON from sqlalchemy import Column, DateTime, String, JSON
from cognee.infrastructure.databases.relational import Base from cognee.infrastructure.databases.relational import Base, UUID
class PipelineRun(Base): class PipelineRun(Base):
__tablename__ = "pipeline_runs" __tablename__ = "pipeline_runs"
id = Column(UUID(as_uuid = True), primary_key = True, default = uuid4) id = Column(UUID, primary_key = True, default = uuid4)
created_at = Column(DateTime(timezone = True), default = lambda: datetime.now(timezone.utc)) created_at = Column(DateTime(timezone = True), default = lambda: datetime.now(timezone.utc))
status = Column(String) status = Column(String)
run_id = Column(UUID(as_uuid = True), index = True) run_id = Column(UUID, index = True)
run_info = Column(JSON) run_info = Column(JSON)

View file

@ -1,11 +1,11 @@
from datetime import datetime, timezone from datetime import datetime, timezone
from sqlalchemy import Column, DateTime, UUID, ForeignKey from sqlalchemy import Column, DateTime, ForeignKey
from cognee.infrastructure.databases.relational import Base from cognee.infrastructure.databases.relational import Base, UUID
class PipelineTask(Base): class PipelineTask(Base):
__tablename__ = "pipeline_task" __tablename__ = "pipeline_task"
created_at = Column(DateTime(timezone = True), default = lambda: datetime.now(timezone.utc)) created_at = Column(DateTime(timezone = True), default = lambda: datetime.now(timezone.utc))
pipeline_id = Column("pipeline", UUID(as_uuid = True), ForeignKey("pipeline.id"), primary_key = True) pipeline_id = Column("pipeline", UUID, ForeignKey("pipeline.id"), primary_key = True)
task_id = Column("task", UUID(as_uuid = True), ForeignKey("task.id"), primary_key = True) task_id = Column("task", UUID, ForeignKey("task.id"), primary_key = True)

View file

@ -1,14 +1,14 @@
from uuid import uuid4 from uuid import uuid4
from datetime import datetime, timezone from datetime import datetime, timezone
from sqlalchemy.orm import relationship, Mapped from sqlalchemy.orm import relationship, Mapped
from sqlalchemy import Column, String, DateTime, UUID, Text from sqlalchemy import Column, String, DateTime, Text
from cognee.infrastructure.databases.relational import Base from cognee.infrastructure.databases.relational import Base, UUID
from .PipelineTask import PipelineTask from .PipelineTask import PipelineTask
class Task(Base): class Task(Base):
__tablename__ = "tasks" __tablename__ = "tasks"
id = Column(UUID(as_uuid = True), primary_key = True, default = uuid4) id = Column(UUID, primary_key = True, default = uuid4)
name = Column(String) name = Column(String)
description = Column(Text, nullable = True) description = Column(Text, nullable = True)

View file

@ -1,12 +1,12 @@
from uuid import uuid4 from uuid import uuid4
from datetime import datetime, timezone from datetime import datetime, timezone
from sqlalchemy import Column, UUID, DateTime, String, JSON from sqlalchemy import Column, DateTime, String, JSON
from cognee.infrastructure.databases.relational import Base from cognee.infrastructure.databases.relational import Base, UUID
class TaskRun(Base): class TaskRun(Base):
__tablename__ = "task_runs" __tablename__ = "task_runs"
id = Column(UUID(as_uuid = True), primary_key = True, default = uuid4) id = Column(UUID, primary_key = True, default = uuid4)
task_name = Column(String) task_name = Column(String)

View file

@ -1,20 +1,20 @@
from uuid import uuid4 from uuid import uuid4
from datetime import datetime, timezone from datetime import datetime, timezone
from sqlalchemy.orm import relationship, Mapped from sqlalchemy.orm import relationship, Mapped
from sqlalchemy import Column, ForeignKey, DateTime, UUID from sqlalchemy import Column, ForeignKey, DateTime
from cognee.infrastructure.databases.relational import Base from cognee.infrastructure.databases.relational import Base, UUID
from .ACLResources import ACLResources from .ACLResources import ACLResources
class ACL(Base): class ACL(Base):
__tablename__ = "acls" __tablename__ = "acls"
id = Column(UUID(as_uuid = True), primary_key = True, default = uuid4) id = Column(UUID, primary_key = True, default = uuid4)
created_at = Column(DateTime(timezone = True), default = lambda: datetime.now(timezone.utc)) created_at = Column(DateTime(timezone = True), default = lambda: datetime.now(timezone.utc))
updated_at = Column(DateTime(timezone = True), onupdate = lambda: datetime.now(timezone.utc)) updated_at = Column(DateTime(timezone = True), onupdate = lambda: datetime.now(timezone.utc))
principal_id = Column(UUID(as_uuid = True), ForeignKey("principals.id")) principal_id = Column(UUID, ForeignKey("principals.id"))
permission_id = Column(UUID(as_uuid = True), ForeignKey("permissions.id")) permission_id = Column(UUID, ForeignKey("permissions.id"))
principal = relationship("Principal") principal = relationship("Principal")
permission = relationship("Permission") permission = relationship("Permission")

View file

@ -1,11 +1,11 @@
from datetime import datetime, timezone from datetime import datetime, timezone
from sqlalchemy import Column, ForeignKey, UUID, DateTime from sqlalchemy import Column, ForeignKey, DateTime
from cognee.infrastructure.databases.relational import Base from cognee.infrastructure.databases.relational import Base, UUID
class ACLResources(Base): class ACLResources(Base):
__tablename__ = "acl_resources" __tablename__ = "acl_resources"
created_at = Column(DateTime(timezone = True), default = lambda: datetime.now(timezone.utc)) created_at = Column(DateTime(timezone = True), default = lambda: datetime.now(timezone.utc))
acl_id = Column(UUID(as_uuid = True), ForeignKey("acls.id"), primary_key = True) acl_id = Column(UUID, ForeignKey("acls.id"), primary_key = True)
resource_id = Column(UUID(as_uuid = True), ForeignKey("resources.id"), primary_key = True) resource_id = Column(UUID, ForeignKey("resources.id"), primary_key = True)

View file

@ -1,12 +1,13 @@
from sqlalchemy.orm import relationship, Mapped from sqlalchemy.orm import relationship, Mapped
from sqlalchemy import Column, String, ForeignKey, UUID from sqlalchemy import Column, String, ForeignKey
from cognee.infrastructure.databases.relational import UUID
from .Principal import Principal from .Principal import Principal
from .UserGroup import UserGroup from .UserGroup import UserGroup
class Group(Principal): class Group(Principal):
__tablename__ = "groups" __tablename__ = "groups"
id = Column(UUID(as_uuid = True), ForeignKey("principals.id"), primary_key = True) id = Column(UUID, ForeignKey("principals.id"), primary_key = True)
name = Column(String, unique = True, nullable = False, index = True) name = Column(String, unique = True, nullable = False, index = True)

View file

@ -1,8 +1,8 @@
from uuid import uuid4 from uuid import uuid4
from datetime import datetime, timezone from datetime import datetime, timezone
# from sqlalchemy.orm import relationship # from sqlalchemy.orm import relationship
from sqlalchemy import Column, DateTime, UUID, String from sqlalchemy import Column, DateTime, String
from cognee.infrastructure.databases.relational import Base from cognee.infrastructure.databases.relational import Base, UUID
class Permission(Base): class Permission(Base):
__tablename__ = "permissions" __tablename__ = "permissions"

View file

@ -1,12 +1,12 @@
from uuid import uuid4 from uuid import uuid4
from datetime import datetime, timezone from datetime import datetime, timezone
from sqlalchemy import Column, String, DateTime, UUID from sqlalchemy import Column, String, DateTime
from cognee.infrastructure.databases.relational import Base from cognee.infrastructure.databases.relational import Base, UUID
class Principal(Base): class Principal(Base):
__tablename__ = "principals" __tablename__ = "principals"
id = Column(UUID(as_uuid = True), primary_key = True, index = True, default = uuid4) id = Column(UUID, primary_key = True, index = True, default = uuid4)
created_at = Column(DateTime(timezone = True), default = lambda: datetime.now(timezone.utc)) created_at = Column(DateTime(timezone = True), default = lambda: datetime.now(timezone.utc))
updated_at = Column(DateTime(timezone = True), onupdate = lambda: datetime.now(timezone.utc)) updated_at = Column(DateTime(timezone = True), onupdate = lambda: datetime.now(timezone.utc))

View file

@ -1,18 +1,18 @@
from uuid import uuid4 from uuid import uuid4
from datetime import datetime, timezone from datetime import datetime, timezone
from sqlalchemy.orm import relationship from sqlalchemy.orm import relationship
from sqlalchemy import Column, DateTime, UUID from sqlalchemy import Column, DateTime
from cognee.infrastructure.databases.relational import Base from cognee.infrastructure.databases.relational import Base, UUID
from .ACLResources import ACLResources from .ACLResources import ACLResources
class Resource(Base): class Resource(Base):
__tablename__ = "resources" __tablename__ = "resources"
id = Column(UUID(as_uuid = True), primary_key = True, default = uuid4) id = Column(UUID, primary_key = True, default = uuid4)
created_at = Column(DateTime(timezone = True), default = lambda: datetime.now(timezone.utc)) created_at = Column(DateTime(timezone = True), default = lambda: datetime.now(timezone.utc))
updated_at = Column(DateTime(timezone = True), onupdate = lambda: datetime.now(timezone.utc)) updated_at = Column(DateTime(timezone = True), onupdate = lambda: datetime.now(timezone.utc))
resource_id = Column(UUID(as_uuid = True), nullable = False) resource_id = Column(UUID, nullable = False)
acls = relationship("ACL", secondary = ACLResources.__tablename__, back_populates = "resources") acls = relationship("ACL", secondary = ACLResources.__tablename__, back_populates = "resources")

View file

@ -1,14 +1,15 @@
from uuid import UUID as uuid_UUID from uuid import UUID as uuid_UUID
from sqlalchemy import ForeignKey, UUID, Column from sqlalchemy import ForeignKey, Column
from sqlalchemy.orm import relationship, Mapped from sqlalchemy.orm import relationship, Mapped
from fastapi_users.db import SQLAlchemyBaseUserTableUUID from fastapi_users.db import SQLAlchemyBaseUserTableUUID
from cognee.infrastructure.databases.relational import UUID
from .Principal import Principal from .Principal import Principal
from .UserGroup import UserGroup from .UserGroup import UserGroup
class User(SQLAlchemyBaseUserTableUUID, Principal): class User(SQLAlchemyBaseUserTableUUID, Principal):
__tablename__ = "users" __tablename__ = "users"
id = Column(UUID(as_uuid = True), ForeignKey("principals.id"), primary_key = True) id = Column(UUID, ForeignKey("principals.id"), primary_key = True)
groups: Mapped[list["Group"]] = relationship( groups: Mapped[list["Group"]] = relationship(
secondary = UserGroup.__tablename__, secondary = UserGroup.__tablename__,

View file

@ -1,11 +1,11 @@
from datetime import datetime, timezone from datetime import datetime, timezone
from sqlalchemy import Column, ForeignKey, DateTime, UUID from sqlalchemy import Column, ForeignKey, DateTime
from cognee.infrastructure.databases.relational import Base from cognee.infrastructure.databases.relational import Base, UUID
class UserGroup(Base): class UserGroup(Base):
__tablename__ = "user_groups" __tablename__ = "user_groups"
created_at = Column(DateTime(timezone = True), default = lambda: datetime.now(timezone.utc)) created_at = Column(DateTime(timezone = True), default = lambda: datetime.now(timezone.utc))
user_id = Column(UUID(as_uuid = True), ForeignKey("users.id"), primary_key = True) user_id = Column(UUID, ForeignKey("users.id"), primary_key = True)
group_id = Column(UUID(as_uuid = True), ForeignKey("groups.id"), primary_key = True) group_id = Column(UUID, ForeignKey("groups.id"), primary_key = True)

View file

@ -24,7 +24,7 @@ def chunk_by_word(data: str):
while next_character is not None and (re.match(paragraph_endings, next_character) or next_character == " "): while next_character is not None and (re.match(paragraph_endings, next_character) or next_character == " "):
j += 1 j += 1
next_character = data[j] if j < len(data) else None next_character = data[j] if j < len(data) else None
if next_character.isupper(): if next_character and next_character.isupper():
return True return True
return False return False

View file

@ -3,10 +3,10 @@ from cognee.modules.data.processing.document_types import Document, PdfDocument,
def classify_documents(data_documents: list[Data]) -> list[Document]: def classify_documents(data_documents: list[Data]) -> list[Document]:
documents = [ documents = [
PdfDocument(id = data_item.id, title=f"{data_item.name}.{data_item.extension}", file_path=data_item.raw_data_location) if data_item.extension == "pdf" else PdfDocument(id = data_item.id, title=f"{data_item.name}.{data_item.extension}", raw_data_location=data_item.raw_data_location) if data_item.extension == "pdf" else
AudioDocument(id = data_item.id, title=f"{data_item.name}.{data_item.extension}", file_path=data_item.raw_data_location) if data_item.extension == "audio" else AudioDocument(id = data_item.id, title=f"{data_item.name}.{data_item.extension}", raw_data_location=data_item.raw_data_location) if data_item.extension == "audio" else
ImageDocument(id = data_item.id, title=f"{data_item.name}.{data_item.extension}", file_path=data_item.raw_data_location) if data_item.extension == "image" else ImageDocument(id = data_item.id, title=f"{data_item.name}.{data_item.extension}", raw_data_location=data_item.raw_data_location) if data_item.extension == "image" else
TextDocument(id = data_item.id, title=f"{data_item.name}.{data_item.extension}", file_path=data_item.raw_data_location) TextDocument(id = data_item.id, title=f"{data_item.name}.{data_item.extension}", raw_data_location=data_item.raw_data_location)
for data_item in data_documents for data_item in data_documents
] ]

View file

@ -89,7 +89,7 @@ class OntologyEngine:
chunk_strategy = chunk_config.chunk_strategy chunk_strategy = chunk_config.chunk_strategy
for base_file in documents: for base_file in documents:
with open(base_file.file_path, "rb") as file: with open(base_file.raw_data_location, "rb") as file:
try: try:
file_type = guess_file_type(file) file_type = guess_file_type(file)
text = extract_text_from_file(file, file_type) text = extract_text_from_file(file, file_type)
@ -175,7 +175,7 @@ async def infer_data_ontology(documents, ontology_model = KnowledgeGraph, root_n
ontology_engine = OntologyEngine() ontology_engine = OntologyEngine()
root_node_id = await ontology_engine.add_graph_ontology(documents = documents) root_node_id = await ontology_engine.add_graph_ontology(documents = documents)
else: else:
graph_engine = get_graph_engine() graph_engine = await get_graph_engine()
await add_model_class_to_graph(ontology_model, graph_engine) await add_model_class_to_graph(ontology_model, graph_engine)
yield (documents, root_node_id) yield (documents, root_node_id)

View file

@ -0,0 +1,512 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "958375a6ffc0c2e4",
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-20T14:02:47.336283Z",
"start_time": "2024-09-20T14:02:43.652444Z"
}
},
"outputs": [],
"source": [
"import asyncio\n",
"import logging\n",
"from typing import Union\n",
"\n",
"from cognee.modules.cognify.config import get_cognify_config\n",
"from cognee.shared.data_models import KnowledgeGraph\n",
"from cognee.modules.data.models import Dataset, Data\n",
"from cognee.modules.data.methods.get_dataset_data import get_dataset_data\n",
"from cognee.modules.data.methods import get_datasets, get_datasets_by_name\n",
"from cognee.modules.pipelines.tasks.Task import Task\n",
"from cognee.modules.pipelines import run_tasks, run_tasks_parallel\n",
"from cognee.modules.users.models import User\n",
"from cognee.modules.users.methods import get_default_user\n",
"from cognee.modules.pipelines.operations.get_pipeline_status import get_pipeline_status\n",
"from cognee.modules.pipelines.operations.log_pipeline_status import log_pipeline_status\n",
"from cognee.tasks import chunk_extract_summary, \\\n",
" chunk_naive_llm_classifier, \\\n",
" chunk_remove_disconnected, \\\n",
" infer_data_ontology, \\\n",
" save_chunks_to_store, \\\n",
" chunk_update_check, \\\n",
" chunks_into_graph, \\\n",
" source_documents_to_chunks, \\\n",
" check_permissions_on_documents, \\\n",
" classify_documents"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "df16431d0f48b006",
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-20T14:02:48.519686Z",
"start_time": "2024-09-20T14:02:48.515589Z"
}
},
"outputs": [],
"source": [
"job_position = \"\"\"Senior Data Scientist (Machine Learning)\n",
"\n",
"Company: TechNova Solutions\n",
"Location: San Francisco, CA\n",
"\n",
"Job Description:\n",
"\n",
"TechNova Solutions is seeking a Senior Data Scientist specializing in Machine Learning to join our dynamic analytics team. The ideal candidate will have a strong background in developing and deploying machine learning models, working with large datasets, and translating complex data into actionable insights.\n",
"\n",
"Responsibilities:\n",
"\n",
"Develop and implement advanced machine learning algorithms and models.\n",
"Analyze large, complex datasets to extract meaningful patterns and insights.\n",
"Collaborate with cross-functional teams to integrate predictive models into products.\n",
"Stay updated with the latest advancements in machine learning and data science.\n",
"Mentor junior data scientists and provide technical guidance.\n",
"Qualifications:\n",
"\n",
"Masters or Ph.D. in Data Science, Computer Science, Statistics, or a related field.\n",
"5+ years of experience in data science and machine learning.\n",
"Proficient in Python, R, and SQL.\n",
"Experience with deep learning frameworks (e.g., TensorFlow, PyTorch).\n",
"Strong problem-solving skills and attention to detail.\n",
"Candidate CVs\n",
"\"\"\"\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "9086abf3af077ab4",
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-20T14:02:49.120838Z",
"start_time": "2024-09-20T14:02:49.118294Z"
}
},
"outputs": [],
"source": [
"job_1 = \"\"\"\n",
"CV 1: Relevant\n",
"Name: Dr. Emily Carter\n",
"Contact Information:\n",
"\n",
"Email: emily.carter@example.com\n",
"Phone: (555) 123-4567\n",
"Summary:\n",
"\n",
"Senior Data Scientist with over 8 years of experience in machine learning and predictive analytics. Expertise in developing advanced algorithms and deploying scalable models in production environments.\n",
"\n",
"Education:\n",
"\n",
"Ph.D. in Computer Science, Stanford University (2014)\n",
"B.S. in Mathematics, University of California, Berkeley (2010)\n",
"Experience:\n",
"\n",
"Senior Data Scientist, InnovateAI Labs (2016 Present)\n",
"Led a team in developing machine learning models for natural language processing applications.\n",
"Implemented deep learning algorithms that improved prediction accuracy by 25%.\n",
"Collaborated with cross-functional teams to integrate models into cloud-based platforms.\n",
"Data Scientist, DataWave Analytics (2014 2016)\n",
"Developed predictive models for customer segmentation and churn analysis.\n",
"Analyzed large datasets using Hadoop and Spark frameworks.\n",
"Skills:\n",
"\n",
"Programming Languages: Python, R, SQL\n",
"Machine Learning: TensorFlow, Keras, Scikit-Learn\n",
"Big Data Technologies: Hadoop, Spark\n",
"Data Visualization: Tableau, Matplotlib\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "a9de0cc07f798b7f",
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-20T14:02:49.675003Z",
"start_time": "2024-09-20T14:02:49.671615Z"
}
},
"outputs": [],
"source": [
"job_2 = \"\"\"\n",
"CV 2: Relevant\n",
"Name: Michael Rodriguez\n",
"Contact Information:\n",
"\n",
"Email: michael.rodriguez@example.com\n",
"Phone: (555) 234-5678\n",
"Summary:\n",
"\n",
"Data Scientist with a strong background in machine learning and statistical modeling. Skilled in handling large datasets and translating data into actionable business insights.\n",
"\n",
"Education:\n",
"\n",
"M.S. in Data Science, Carnegie Mellon University (2013)\n",
"B.S. in Computer Science, University of Michigan (2011)\n",
"Experience:\n",
"\n",
"Senior Data Scientist, Alpha Analytics (2017 Present)\n",
"Developed machine learning models to optimize marketing strategies.\n",
"Reduced customer acquisition cost by 15% through predictive modeling.\n",
"Data Scientist, TechInsights (2013 2017)\n",
"Analyzed user behavior data to improve product features.\n",
"Implemented A/B testing frameworks to evaluate product changes.\n",
"Skills:\n",
"\n",
"Programming Languages: Python, Java, SQL\n",
"Machine Learning: Scikit-Learn, XGBoost\n",
"Data Visualization: Seaborn, Plotly\n",
"Databases: MySQL, MongoDB\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "185ff1c102d06111",
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-20T14:02:50.286828Z",
"start_time": "2024-09-20T14:02:50.284369Z"
}
},
"outputs": [],
"source": [
"job_3 = \"\"\"\n",
"CV 3: Relevant\n",
"Name: Sarah Nguyen\n",
"Contact Information:\n",
"\n",
"Email: sarah.nguyen@example.com\n",
"Phone: (555) 345-6789\n",
"Summary:\n",
"\n",
"Data Scientist specializing in machine learning with 6 years of experience. Passionate about leveraging data to drive business solutions and improve product performance.\n",
"\n",
"Education:\n",
"\n",
"M.S. in Statistics, University of Washington (2014)\n",
"B.S. in Applied Mathematics, University of Texas at Austin (2012)\n",
"Experience:\n",
"\n",
"Data Scientist, QuantumTech (2016 Present)\n",
"Designed and implemented machine learning algorithms for financial forecasting.\n",
"Improved model efficiency by 20% through algorithm optimization.\n",
"Junior Data Scientist, DataCore Solutions (2014 2016)\n",
"Assisted in developing predictive models for supply chain optimization.\n",
"Conducted data cleaning and preprocessing on large datasets.\n",
"Skills:\n",
"\n",
"Programming Languages: Python, R\n",
"Machine Learning Frameworks: PyTorch, Scikit-Learn\n",
"Statistical Analysis: SAS, SPSS\n",
"Cloud Platforms: AWS, Azure\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "d55ce4c58f8efb67",
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-20T14:02:50.950343Z",
"start_time": "2024-09-20T14:02:50.946378Z"
}
},
"outputs": [],
"source": [
"job_4 = \"\"\"\n",
"CV 4: Not Relevant\n",
"Name: David Thompson\n",
"Contact Information:\n",
"\n",
"Email: david.thompson@example.com\n",
"Phone: (555) 456-7890\n",
"Summary:\n",
"\n",
"Creative Graphic Designer with over 8 years of experience in visual design and branding. Proficient in Adobe Creative Suite and passionate about creating compelling visuals.\n",
"\n",
"Education:\n",
"\n",
"B.F.A. in Graphic Design, Rhode Island School of Design (2012)\n",
"Experience:\n",
"\n",
"Senior Graphic Designer, CreativeWorks Agency (2015 Present)\n",
"Led design projects for clients in various industries.\n",
"Created branding materials that increased client engagement by 30%.\n",
"Graphic Designer, Visual Innovations (2012 2015)\n",
"Designed marketing collateral, including brochures, logos, and websites.\n",
"Collaborated with the marketing team to develop cohesive brand strategies.\n",
"Skills:\n",
"\n",
"Design Software: Adobe Photoshop, Illustrator, InDesign\n",
"Web Design: HTML, CSS\n",
"Specialties: Branding and Identity, Typography\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "ca4ecc32721ad332",
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-20T14:02:51.548191Z",
"start_time": "2024-09-20T14:02:51.545520Z"
}
},
"outputs": [],
"source": [
"job_5 = \"\"\"\n",
"CV 5: Not Relevant\n",
"Name: Jessica Miller\n",
"Contact Information:\n",
"\n",
"Email: jessica.miller@example.com\n",
"Phone: (555) 567-8901\n",
"Summary:\n",
"\n",
"Experienced Sales Manager with a strong track record in driving sales growth and building high-performing teams. Excellent communication and leadership skills.\n",
"\n",
"Education:\n",
"\n",
"B.A. in Business Administration, University of Southern California (2010)\n",
"Experience:\n",
"\n",
"Sales Manager, Global Enterprises (2015 Present)\n",
"Managed a sales team of 15 members, achieving a 20% increase in annual revenue.\n",
"Developed sales strategies that expanded customer base by 25%.\n",
"Sales Representative, Market Leaders Inc. (2010 2015)\n",
"Consistently exceeded sales targets and received the 'Top Salesperson' award in 2013.\n",
"Skills:\n",
"\n",
"Sales Strategy and Planning\n",
"Team Leadership and Development\n",
"CRM Software: Salesforce, Zoho\n",
"Negotiation and Relationship Building\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "904df61ba484a8e5",
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-20T14:02:54.243987Z",
"start_time": "2024-09-20T14:02:52.498195Z"
}
},
"outputs": [],
"source": [
"import cognee\n",
"from os import listdir, path\n",
"\n",
"data_path = path.abspath(\".data\")\n",
"\n",
"results = await cognee.add([job_1, job_2,job_3,job_4,job_5,job_position], \"example\")\n",
"\n",
"for result in results:\n",
" print(result)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "6f9b564de121713d",
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-20T14:02:55.564445Z",
"start_time": "2024-09-20T14:02:55.562784Z"
}
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 9,
"id": "8911f8bd4f8c440a",
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-20T14:02:56.714408Z",
"start_time": "2024-09-20T14:02:56.711812Z"
}
},
"outputs": [],
"source": [
"# from enum import Enum, auto\n",
"# from typing import Optional, List, Union, Dict, Any\n",
"# from pydantic import BaseModel, Field\n",
"# \n",
"# class Node(BaseModel):\n",
"# \"\"\"Node in a knowledge graph.\"\"\"\n",
"# id: str\n",
"# name: str\n",
"# type: str\n",
"# description: str\n",
"# properties: Optional[Dict[str, Any]] = Field(None, description = \"A dictionary of properties associated with the node.\")\n",
"# \n",
"# class Edge(BaseModel):\n",
"# \"\"\"Edge in a knowledge graph.\"\"\"\n",
"# source_node_id: str\n",
"# target_node_id: str\n",
"# relationship_name: str\n",
"# properties: Optional[Dict[str, Any]] = Field(None, description = \"A dictionary of properties associated with the edge.\")\n",
"# \n",
"# class KnowledgeGraph(BaseModel):\n",
"# \"\"\"Knowledge graph.\"\"\"\n",
"# nodes: List[Node] = Field(..., default_factory=list)\n",
"# edges: List[Edge] = Field(..., default_factory=list)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "7c431fdef4921ae0",
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-20T14:02:57.925667Z",
"start_time": "2024-09-20T14:02:57.922353Z"
}
},
"outputs": [],
"source": [
"async def run_cognify_pipeline(dataset: Dataset, user: User = None):\n",
" data_documents: list[Data] = await get_dataset_data(dataset_id = dataset.id)\n",
"\n",
" try:\n",
"\n",
" root_node_id = None\n",
"\n",
" tasks = [\n",
" Task(classify_documents),\n",
" Task(check_permissions_on_documents, user = user, permissions = [\"write\"]),\n",
" Task(infer_data_ontology, root_node_id = root_node_id, ontology_model = KnowledgeGraph),\n",
" Task(source_documents_to_chunks, parent_node_id = root_node_id), # Classify documents and save them as a nodes in graph db, extract text chunks based on the document type\n",
" Task(chunks_into_graph, graph_model = KnowledgeGraph, collection_name = \"entities\", task_config = { \"batch_size\": 10 }), # Generate knowledge graphs from the document chunks and attach it to chunk nodes\n",
" Task(chunk_update_check, collection_name = \"chunks\"), # Find all affected chunks, so we don't process unchanged chunks\n",
" Task(\n",
" save_chunks_to_store,\n",
" collection_name = \"chunks\",\n",
" ), \n",
" Task(chunk_remove_disconnected), # Remove the obsolete document chunks.\n",
" ]\n",
"\n",
" pipeline = run_tasks(tasks, data_documents)\n",
"\n",
" async for result in pipeline:\n",
" print(result)\n",
" except Exception as error:\n",
" raise error"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f0a91b99c6215e09",
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-20T14:02:58.905774Z",
"start_time": "2024-09-20T14:02:58.625915Z"
}
},
"outputs": [],
"source": [
"user = await get_default_user()\n",
"datasets = await get_datasets_by_name([\"example\"], user.id)\n",
"await run_cognify_pipeline(datasets[0], user)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "080389e5",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from cognee.shared.utils import render_graph\n",
"from cognee.infrastructure.databases.graph import get_graph_engine\n",
"import graphistry\n",
"\n",
"# # Setting an environment variable\n",
"# os.environ[\"GRAPHISTRY_USERNAME\"] = placeholder\n",
"# os.environ[\"GRAPHISTRY_PASSWORD\"] = placeholder\n",
"\n",
"\n",
"graphistry.login(username=os.getenv(\"GRAPHISTRY_USERNAME\"), password=os.getenv(\"GRAPHISTRY_PASSWORD\"))\n",
"\n",
"graph_engine = await get_graph_engine()\n",
"\n",
"graph_url = await render_graph(graph_engine.graph)\n",
"print(graph_url)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e5e7dfc8",
"metadata": {},
"outputs": [],
"source": [
"async def search(\n",
" vector_engine,\n",
" collection_name: str,\n",
" query_text: str = None,\n",
"):\n",
" query_vector = (await vector_engine.embedding_engine.embed_text([query_text]))[0]\n",
"\n",
" connection = await vector_engine.get_connection()\n",
" collection = await connection.open_table(collection_name)\n",
"\n",
" results = await collection.vector_search(query_vector).limit(10).to_pandas()\n",
"\n",
" result_values = list(results.to_dict(\"index\").values())\n",
"\n",
" return [dict(\n",
" id = str(result[\"id\"]),\n",
" payload = result[\"payload\"],\n",
" score = result[\"_distance\"],\n",
" ) for result in result_values]\n",
"\n",
"\n",
"from cognee.infrastructure.databases.vector import get_vector_engine\n",
"\n",
"vector_engine = get_vector_engine()\n",
"results = await search(vector_engine, \"entities\", \"sarah.nguyen@example.com\")\n",
"for result in results:\n",
" print(result)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

3167
poetry.lock generated

File diff suppressed because it is too large Load diff

View file

@ -38,12 +38,10 @@ greenlet = "^3.0.3"
ruff = "^0.2.2" ruff = "^0.2.2"
filetype = "^1.2.0" filetype = "^1.2.0"
nltk = "^3.8.1" nltk = "^3.8.1"
dlt = {extras = ["postgres"], version = "^0.5.2"} dlt = {extras = ["sqlalchemy"], version = "^1.0.0"}
duckdb = {version = "^0.10.0", extras = ["dlt"]}
overrides = "^7.7.0" overrides = "^7.7.0"
aiofiles = "^23.2.1" aiofiles = "^23.2.1"
qdrant-client = "^1.9.0" qdrant-client = "^1.9.0"
duckdb-engine = "0.13.0"
graphistry = "^0.33.5" graphistry = "^0.33.5"
tenacity = "^8.2.3" tenacity = "^8.2.3"
weaviate-client = "4.6.7" weaviate-client = "4.6.7"
@ -75,14 +73,12 @@ asyncpg = "^0.29.0"
[tool.poetry.extras] [tool.poetry.extras]
duckdb = ["duckdb"]
filesystem = ["s3fs", "botocore"] filesystem = ["s3fs", "botocore"]
motherduck = ["duckdb"]
cli = ["pipdeptree", "cron-descriptor"] cli = ["pipdeptree", "cron-descriptor"]
weaviate = ["weaviate-client"] weaviate = ["weaviate-client"]
qdrant = ["qdrant-client"] qdrant = ["qdrant-client"]
neo4j = ["neo4j", "py2neo"] neo4j = ["neo4j"]
notebook = ["ipykernel","overrides", "ipywidgets", "jupyterlab", "jupyterlab_widgets", "jupyterlab-server", "jupyterlab-git"] notebook = ["ipykernel", "overrides", "ipywidgets", "jupyterlab", "jupyterlab_widgets", "jupyterlab-server", "jupyterlab-git"]
[tool.poetry.group.dev.dependencies] [tool.poetry.group.dev.dependencies]
pytest = "^7.4.0" pytest = "^7.4.0"