feat: add sqlalchemy as dlt destination (#137)

* feat: add sqlalchemy as dlt destination

* Fix the demo, update Readme

* fix: add 1.5 notebook

---------

Co-authored-by: Vasilije <8619304+Vasilije1990@users.noreply.github.com>
Boris 2024-09-21 15:58:28 +02:00 committed by GitHub
parent a09f7991e2
commit a9433e9283
56 changed files with 2435 additions and 2554 deletions


@ -18,13 +18,6 @@ jobs:
name: docs changes
uses: ./.github/workflows/get_docs_changes.yml
setup_docker:
name: Set up Docker Buildx
runs-on: ubuntu-latest
steps:
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
run_neo4j_integration_test:
name: test
needs: get_docs_changes
@ -35,18 +28,6 @@ jobs:
run:
shell: bash
services:
postgres:
image: postgres:latest
env:
POSTGRES_USER: cognee
POSTGRES_PASSWORD: cognee
POSTGRES_DB: cognee_db
volumes:
- postgres_data:/var/lib/postgresql/data
ports:
- 5432:5432
steps:
- name: Check out
uses: actions/checkout@master
@ -66,18 +47,6 @@ jobs:
- name: Install dependencies
run: poetry install --no-interaction
- name: Create .cognee_system directory and print path
run: |
mkdir .cognee_system
echo $(pwd)/.cognee_system
- name: Wait for PostgreSQL to be ready
run: |
echo "Waiting for PostgreSQL to be ready..."
until pg_isready -h localhost -p 5432 -U cognee; do
sleep 1
done
- name: Run default Neo4j
env:
ENV: 'dev'
@ -85,14 +54,4 @@ jobs:
GRAPH_DATABASE_URL: ${{ secrets.NEO4J_API_URL }}
GRAPH_DATABASE_PASSWORD: ${{ secrets.NEO4J_API_KEY }}
GRAPH_DATABASE_USERNAME: "neo4j"
DB_USER: cognee
DB_PASSWORD: cognee
DB_NAME: cognee_db
DB_HOST: localhost
DB_PORT: 5432
DESTINATION__POSTGRES__CREDENTIALS__HOST: localhost
DESTINATION__POSTGRES__CREDENTIALS__PORT: 5432
DESTINATION__POSTGRES__CREDENTIALS__USERNAME: cognee
DESTINATION__POSTGRES__CREDENTIALS__PASSWORD: cognee
DESTINATION__POSTGRES__CREDENTIALS__DATABASE: cognee_db
run: poetry run python ./cognee/tests/test_neo4j.py


@ -18,15 +18,6 @@ jobs:
name: docs changes
uses: ./.github/workflows/get_docs_changes.yml
setup_docker:
name: Set up Docker Buildx
runs-on: ubuntu-latest
strategy:
fail-fast: false
steps:
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
run_common:
name: test
needs: get_docs_changes
@ -38,19 +29,6 @@ jobs:
run:
shell: bash
services:
postgres:
image: postgres:latest
env:
PGUSER: cognee
POSTGRES_USER: cognee
POSTGRES_PASSWORD: cognee
POSTGRES_DB: cognee_db
volumes:
- postgres_data:/var/lib/postgresql/data
ports:
- 5432:5432
steps:
- name: Check out
uses: actions/checkout@master
@ -71,23 +49,6 @@ jobs:
- name: Install dependencies
run: poetry install --no-interaction
- name: Create .cognee_system directory and print path
run: |
mkdir .cognee_system
echo $(pwd)/.cognee_system
- name: Wait for PostgreSQL to be ready
env:
PGUSER: cognee
POSTGRES_USER: cognee
POSTGRES_PASSWORD: cognee
POSTGRES_DB: cognee_db
run: |
echo "Waiting for PostgreSQL to be ready..."
until pg_isready -h localhost -p 5432 -U cognee; do
sleep 1
done
- name: Run tests
run: poetry run pytest tests/
@ -95,16 +56,6 @@ jobs:
env:
ENV: 'dev'
LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
DB_HOST: localhost
DB_USERNAME: cognee
DB_PASSWORD: cognee
DB_DATABASE: cognee_db
DB_PORT: 5432
DESTINATION__POSTGRES__CREDENTIALS__HOST: localhost
DESTINATION__POSTGRES__CREDENTIALS__PORT: 5432
DESTINATION__POSTGRES__CREDENTIALS__USERNAME: cognee
DESTINATION__POSTGRES__CREDENTIALS__PASSWORD: cognee
DESTINATION__POSTGRES__CREDENTIALS__DATABASE: cognee_db
run: poetry run python ./cognee/tests/test_library.py
- name: Clean up disk space


@ -18,15 +18,6 @@ jobs:
name: docs changes
uses: ./.github/workflows/get_docs_changes.yml
setup_docker:
name: Set up Docker Buildx
runs-on: ubuntu-latest
strategy:
fail-fast: false
steps:
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
run_common:
name: test
needs: get_docs_changes
@ -38,19 +29,6 @@ jobs:
run:
shell: bash
services:
postgres:
image: postgres:latest
env:
PGUSER: cognee
POSTGRES_USER: cognee
POSTGRES_PASSWORD: cognee
POSTGRES_DB: cognee_db
volumes:
- postgres_data:/var/lib/postgresql/data
ports:
- 5432:5432
steps:
- name: Check out
uses: actions/checkout@master
@ -71,23 +49,6 @@ jobs:
- name: Install dependencies
run: poetry install --no-interaction
- name: Create .cognee_system directory and print path
run: |
mkdir .cognee_system
echo $(pwd)/.cognee_system
- name: Wait for PostgreSQL to be ready
env:
PGUSER: cognee
POSTGRES_USER: cognee
POSTGRES_PASSWORD: cognee
POSTGRES_DB: cognee_db
run: |
echo "Waiting for PostgreSQL to be ready..."
until pg_isready -h localhost -p 5432 -U cognee; do
sleep 1
done
- name: Run tests
run: poetry run pytest tests/
@ -95,16 +56,6 @@ jobs:
env:
ENV: 'dev'
LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
DB_HOST: localhost
DB_USERNAME: cognee
DB_PASSWORD: cognee
DB_DATABASE: cognee_db
DB_PORT: 5432
DESTINATION__POSTGRES__CREDENTIALS__HOST: localhost
DESTINATION__POSTGRES__CREDENTIALS__PORT: 5432
DESTINATION__POSTGRES__CREDENTIALS__USERNAME: cognee
DESTINATION__POSTGRES__CREDENTIALS__PASSWORD: cognee
DESTINATION__POSTGRES__CREDENTIALS__DATABASE: cognee_db
run: poetry run python ./cognee/tests/test_library.py
- name: Clean up disk space


@ -18,15 +18,6 @@ jobs:
name: docs changes
uses: ./.github/workflows/get_docs_changes.yml
setup_docker:
name: Set up Docker Buildx
runs-on: ubuntu-latest
strategy:
fail-fast: false
steps:
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
run_common:
name: test
needs: get_docs_changes
@ -38,19 +29,6 @@ jobs:
run:
shell: bash
services:
postgres:
image: postgres:latest
env:
PGUSER: cognee
POSTGRES_USER: cognee
POSTGRES_PASSWORD: cognee
POSTGRES_DB: cognee_db
volumes:
- postgres_data:/var/lib/postgresql/data
ports:
- 5432:5432
steps:
- name: Check out
uses: actions/checkout@master
@ -71,23 +49,6 @@ jobs:
- name: Install dependencies
run: poetry install --no-interaction
- name: Create .cognee_system directory and print path
run: |
mkdir .cognee_system
echo $(pwd)/.cognee_system
- name: Wait for PostgreSQL to be ready
env:
PGUSER: cognee
POSTGRES_USER: cognee
POSTGRES_PASSWORD: cognee
POSTGRES_DB: cognee_db
run: |
echo "Waiting for PostgreSQL to be ready..."
until pg_isready -h localhost -p 5432 -U cognee; do
sleep 1
done
- name: Run tests
run: poetry run pytest tests/
@ -95,16 +56,6 @@ jobs:
env:
ENV: 'dev'
LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
DB_HOST: localhost
DB_USERNAME: cognee
DB_PASSWORD: cognee
DB_DATABASE: cognee_db
DB_PORT: 5432
DESTINATION__POSTGRES__CREDENTIALS__HOST: localhost
DESTINATION__POSTGRES__CREDENTIALS__PORT: 5432
DESTINATION__POSTGRES__CREDENTIALS__USERNAME: cognee
DESTINATION__POSTGRES__CREDENTIALS__PASSWORD: cognee
DESTINATION__POSTGRES__CREDENTIALS__DATABASE: cognee_db
run: poetry run python ./cognee/tests/test_library.py
- name: Clean up disk space


@ -18,13 +18,6 @@ jobs:
name: docs changes
uses: ./.github/workflows/get_docs_changes.yml
setup_docker:
name: Set up Docker Buildx
runs-on: ubuntu-latest
steps:
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
run_qdrant_integration_test:
name: test
needs: get_docs_changes
@ -35,18 +28,6 @@ jobs:
run:
shell: bash
services:
postgres:
image: postgres:latest
env:
POSTGRES_USER: cognee
POSTGRES_PASSWORD: cognee
POSTGRES_DB: cognee_db
volumes:
- postgres_data:/var/lib/postgresql/data
ports:
- 5432:5432
steps:
- name: Check out
uses: actions/checkout@master
@ -66,32 +47,10 @@ jobs:
- name: Install dependencies
run: poetry install --no-interaction
- name: Create .cognee_system directory and print path
run: |
mkdir .cognee_system
echo $(pwd)/.cognee_system
- name: Wait for PostgreSQL to be ready
run: |
echo "Waiting for PostgreSQL to be ready..."
until pg_isready -h localhost -p 5432 -U cognee; do
sleep 1
done
- name: Run default Qdrant
env:
ENV: 'dev'
LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
VECTOR_DB_URL: ${{ secrets.QDRANT_API_URL }}
VECTOR_DB_KEY: ${{ secrets.QDRANT_API_KEY }}
DB_USER: cognee
DB_PASSWORD: cognee
DB_NAME: cognee_db
DB_HOST: localhost
DB_PORT: 5432
DESTINATION__POSTGRES__CREDENTIALS__HOST: localhost
DESTINATION__POSTGRES__CREDENTIALS__PORT: 5432
DESTINATION__POSTGRES__CREDENTIALS__USERNAME: cognee
DESTINATION__POSTGRES__CREDENTIALS__PASSWORD: cognee
DESTINATION__POSTGRES__CREDENTIALS__DATABASE: cognee_db
run: poetry run python ./cognee/tests/test_qdrant.py


@ -18,13 +18,6 @@ jobs:
name: docs changes
uses: ./.github/workflows/get_docs_changes.yml
setup_docker:
name: Set up Docker Buildx
runs-on: ubuntu-latest
steps:
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
run_weaviate_integration_test:
name: test
needs: get_docs_changes
@ -35,18 +28,6 @@ jobs:
run:
shell: bash
services:
postgres:
image: postgres:latest
env:
POSTGRES_USER: cognee
POSTGRES_PASSWORD: cognee
POSTGRES_DB: cognee_db
volumes:
- postgres_data:/var/lib/postgresql/data
ports:
- 5432:5432
steps:
- name: Check out
uses: actions/checkout@master
@ -66,32 +47,10 @@ jobs:
- name: Install dependencies
run: poetry install --no-interaction
- name: Create .cognee_system directory and print path
run: |
mkdir .cognee_system
echo $(pwd)/.cognee_system
- name: Wait for PostgreSQL to be ready
run: |
echo "Waiting for PostgreSQL to be ready..."
until pg_isready -h localhost -p 5432 -U cognee; do
sleep 1
done
- name: Run default Weaviate
env:
ENV: 'dev'
LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
VECTOR_DB_URL: ${{ secrets.WEAVIATE_API_URL }}
VECTOR_DB_KEY: ${{ secrets.WEAVIATE_API_KEY }}
DB_USER: cognee
DB_PASSWORD: cognee
DB_NAME: cognee_db
DB_HOST: localhost
DB_PORT: 5432
DESTINATION__POSTGRES__CREDENTIALS__HOST: localhost
DESTINATION__POSTGRES__CREDENTIALS__PORT: 5432
DESTINATION__POSTGRES__CREDENTIALS__USERNAME: cognee
DESTINATION__POSTGRES__CREDENTIALS__PASSWORD: cognee
DESTINATION__POSTGRES__CREDENTIALS__DATABASE: cognee_db
run: poetry run python ./cognee/tests/test_weaviate.py

README.md

@ -18,24 +18,12 @@ We build for developers who need a reliable, production-ready data layer for AI
</a>
</p>
cognee implements scalable, modular data pipelines that allow for creating the LLM-enriched data layer using graph and vector stores.
<p>
<i> cognee aims to be dbt for LLMOps</i>
</p>
## What is cognee?
cognee implements scalable, modular ECL (Extract, Cognify, Load) pipelines that let you interconnect and retrieve past conversations, documents, and audio transcriptions, while reducing hallucinations, developer effort, and cost.
Try it in a Google Colab <a href="https://colab.research.google.com/drive/1jayZ5JRwDaUGFvCw9UZySBG-iB9gpYfu?usp=sharing">notebook</a> or have a look at our <a href="https://topoteretes.github.io/cognee">documentation</a>.
If you have questions, join our <a href="https://discord.gg/NQPKmU5CCg">Discord</a> community.
@ -58,7 +46,7 @@ poetry add cognee
```
## 💻 Usage
## 💻 Basic Usage
### Setup
@ -75,24 +63,6 @@ cognee.config.llm_api_key = "YOUR_OPENAI_API_KEY"
```
You can use different LLM providers; for more info, check out our <a href="https://topoteretes.github.io/cognee">documentation</a>.
In the next step make sure to launch a Postgres instance. Here is an example from our docker-compose:
```
postgres:
image: postgres:latest
container_name: postgres
environment:
POSTGRES_USER: cognee
POSTGRES_PASSWORD: cognee
POSTGRES_DB: cognee_db
volumes:
- postgres_data:/var/lib/postgresql/data
ports:
- 5432:5432
networks:
- cognee-network
```
If you are using NetworkX, create an account on Graphistry to visualize results:
```
@ -106,12 +76,7 @@ docker-compose up cognee
```
Then navigate to localhost:3000/wizard
### Run the default example
Make sure to launch the Postgres instance first. Navigate to the cognee folder and run:
```
docker compose up postgres
```
### Simple example
Run the default cognee pipeline:
@ -123,7 +88,7 @@ text = """Natural language processing (NLP) is an interdisciplinary
await cognee.add([text], "example_dataset") # Add a new piece of information
await cognee.cognify() # Use LLMs and cognee to create knowledge
await cognee.cognify() # Use LLMs and cognee to create a semantic graph
await search_results = cognee.search("SIMILARITY", {'query': 'Tell me about NLP'}) # Query cognee for the knowledge
@ -132,19 +97,20 @@ print(search_results)
```
### Create your pipelines
### Create your own memory store
The cognee framework consists of tasks that can be grouped into pipelines.
Each task can be an independent part of business logic that can be tied to other tasks to form a pipeline.
These tasks persist data into your memory store, enabling you to search for the relevant context of past conversations, documents, or any other data you have stored.
### Example: Classify your documents
The cognee framework consists of tasks that can be grouped into pipelines. Each task can be an independent part of business logic that can be tied to other tasks to form a pipeline.
Here is an example of how it looks for a default cognify pipeline:
1. To prepare the data for the pipeline run, first we need to add it to our metastore and normalize it:
Start with:
```
docker compose up postgres
```
And then run:
Start with:
```
text = """Natural language processing (NLP) is an interdisciplinary
subfield of computer science and information retrieval"""
@ -158,90 +124,62 @@ Here we show an example of creating a naive LLM classifier that takes a Pydantic
We provided just a snippet for reference, but feel free to check out the implementation in our repo.
```
async def chunk_naive_llm_classifier(data_chunks: list[DocumentChunk], classification_model: Type[BaseModel]):
if len(data_chunks) == 0:
return data_chunks
async def chunk_naive_llm_classifier(
data_chunks: list[DocumentChunk],
classification_model: Type[BaseModel]
):
# Extract classifications asynchronously
chunk_classifications = await asyncio.gather(
*[extract_categories(chunk.text, classification_model) for chunk in data_chunks],
*(extract_categories(chunk.text, classification_model) for chunk in data_chunks)
)
classification_data_points = []
for chunk_index, chunk in enumerate(data_chunks):
chunk_classification = chunk_classifications[chunk_index]
classification_data_points.append(uuid5(NAMESPACE_OID, chunk_classification.label.type))
classification_data_points.append(uuid5(NAMESPACE_OID, chunk_classification.label.type))
for classification_subclass in chunk_classification.label.subclass:
classification_data_points.append(uuid5(NAMESPACE_OID, classification_subclass.value))
# Collect classification data points using a set to avoid duplicates
classification_data_points = {
uuid5(NAMESPACE_OID, cls.label.type)
for cls in chunk_classifications
} | {
uuid5(NAMESPACE_OID, subclass.value)
for cls in chunk_classifications
for subclass in cls.label.subclass
}
vector_engine = get_vector_engine()
collection_name = "classification"
# Define the payload schema
class Keyword(BaseModel):
uuid: str
text: str
chunk_id: str
document_id: str
collection_name = "classification"
if await vector_engine.has_collection(collection_name):
existing_data_points = await vector_engine.retrieve(
collection_name,
list(set(classification_data_points)),
) if len(classification_data_points) > 0 else []
existing_points_map = {point.id: True for point in existing_data_points}
# Ensure the collection exists and retrieve existing data points
if not await vector_engine.has_collection(collection_name):
await vector_engine.create_collection(collection_name, payload_schema=Keyword)
existing_points_map = {}
else:
existing_points_map = {}
await vector_engine.create_collection(collection_name, payload_schema=Keyword)
data_points = []
nodes = []
edges = []
for (chunk_index, data_chunk) in enumerate(data_chunks):
chunk_classification = chunk_classifications[chunk_index]
classification_type_label = chunk_classification.label.type
classification_type_id = uuid5(NAMESPACE_OID, classification_type_label)
return data_chunks
...
```
To see the existing tasks, have a look at the cognee.tasks module.
We have a large number of tasks that can be used in your pipelines, and you can also create your own tasks to fit your business logic.
3. Once we have our tasks, it is time to group them into a pipeline.
This snippet shows how a group of tasks can be added to a pipeline, and how they can pass the information forward from one to another.
This simplified snippet demonstrates how tasks can be added to a pipeline, and how they can pass information forward from one to another.
```
tasks = [
Task(document_to_ontology, root_node_id = root_node_id),
Task(source_documents_to_chunks, parent_node_id = root_node_id), # Classify documents and save them as a nodes in graph db, extract text chunks based on the document type
Task(chunk_to_graph_decomposition, topology_model = KnowledgeGraph, task_config = { "batch_size": 10 }), # Set the graph topology for the document chunk data
Task(chunks_into_graph, graph_model = KnowledgeGraph, collection_name = "entities"), # Generate knowledge graphs from the document chunks and attach it to chunk nodes
Task(chunk_update_check, collection_name = "chunks"), # Find all affected chunks, so we don't process unchanged chunks
Task(
save_chunks_to_store,
collection_name = "chunks",
), # Save the document chunks in vector db and as nodes in graph db (connected to the document node and between each other)
run_tasks_parallel([
Task(
chunk_extract_summary,
summarization_model = cognee_config.summarization_model,
collection_name = "chunk_summaries",
), # Summarize the document chunks
Task(
chunk_naive_llm_classifier,
classification_model = cognee_config.classification_model,
),
]),
Task(chunk_remove_disconnected), # Remove the obsolete document chunks.
]
pipeline = run_tasks(tasks, documents)
Task(
chunk_naive_llm_classifier,
classification_model = cognee_config.classification_model,
)
pipeline = run_tasks(tasks, documents)
```
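For reference, the `pipeline` object returned by `run_tasks` is an async generator; a minimal sketch of driving it end to end (assuming `tasks` and `documents` are defined as in the snippet above) might look like:

```python
import asyncio

async def main():
    # run_tasks wires the tasks together; iterating the returned async
    # generator executes the pipeline and yields each intermediate result.
    pipeline = run_tasks(tasks, documents)
    async for result in pipeline:
        print(result)

asyncio.run(main())
```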
@ -277,3 +215,23 @@ Check out our demo notebook [here](https://github.com/topoteretes/cognee/blob/ma
[![Star History Chart](https://api.star-history.com/svg?repos=topoteretes/cognee&type=Date)](https://star-history.com/#topoteretes/cognee&Date)
## Get Started
### Install Server
Please see the [cognee Quick Start Guide](https://topoteretes.github.io/cognee/quickstart/) for important configuration information.
```bash
docker compose up
```
### Install SDK
Please see the cognee [Development Guide](https://topoteretes.github.io/cognee/quickstart/) for important beta information and usage instructions.
```bash
pip install cognee
```


@ -14,8 +14,6 @@ from cognee.modules.users.models import User
from cognee.modules.users.methods import get_authenticated_user
from cognee.infrastructure.databases.relational import create_db_and_tables
# Set up logging
logging.basicConfig(
level=logging.INFO, # Set the logging level (e.g., DEBUG, INFO, WARNING, ERROR, CRITICAL)
@ -34,8 +32,12 @@ from contextlib import asynccontextmanager
@asynccontextmanager
async def lifespan(app: FastAPI):
from cognee.infrastructure.databases.relational import create_db_and_tables
from cognee.modules.users.methods import get_default_user
# Not needed if you setup a migration system like Alembic
await create_db_and_tables()
await get_default_user()
yield
app = FastAPI(debug = os.getenv("ENV") != "prod", lifespan = lifespan)
@ -394,10 +396,10 @@ def start_api_server(host: str = "0.0.0.0", port: int = 8000):
try:
logger.info("Starting server at %s:%s", host, port)
import asyncio
from cognee.modules.data.deletion import prune_system, prune_data
asyncio.run(prune_data())
asyncio.run(prune_system(metadata = True))
# import asyncio
# from cognee.modules.data.deletion import prune_system, prune_data
# asyncio.run(prune_data())
# asyncio.run(prune_system(metadata = True))
uvicorn.run(app, host = host, port = port)
except Exception as e:


@ -2,7 +2,6 @@ from typing import List, Union, BinaryIO
from os import path
import asyncio
import dlt
import duckdb
import cognee.modules.ingestion as ingestion
from cognee.infrastructure.files.storage import LocalStorage
@ -81,22 +80,16 @@ async def add_files(file_paths: List[str], dataset_name: str, user: User = None)
relational_config = get_relational_config()
if relational_config.db_provider == "duckdb":
db = duckdb.connect(relational_config.db_file_path)
destination = dlt.destinations.duckdb(
credentials = db,
)
else:
destination = dlt.destinations.postgres(
credentials = {
"host": relational_config.db_host,
"port": relational_config.db_port,
"user": relational_config.db_user,
"password": relational_config.db_password,
"database": relational_config.db_name,
},
)
destination = dlt.destinations.sqlalchemy(
credentials = {
"host": relational_config.db_host,
"port": relational_config.db_port,
"username": relational_config.db_username,
"password": relational_config.db_password,
"database": relational_config.db_name,
"drivername": relational_config.db_provider,
},
)
pipeline = dlt.pipeline(
pipeline_name = "file_load_from_filesystem",
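The change above replaces the duckdb/postgres branch with dlt's generic `sqlalchemy` destination, so one code path serves both SQLite and Postgres. A hedged standalone sketch (assuming dlt ≥ 1.0, where `dlt.destinations.sqlalchemy` also accepts a plain connection string in place of the credentials dict shown above):

```python
import dlt

# Hypothetical minimal pipeline mirroring the wiring above; swap the URL for
# "postgresql://cognee:cognee@localhost:5432/cognee_db" to target Postgres.
destination = dlt.destinations.sqlalchemy(credentials="sqlite:///cognee_db.sqlite")

pipeline = dlt.pipeline(
    pipeline_name="file_load_from_filesystem",
    destination=destination,
    dataset_name="example_dataset",
)

load_info = pipeline.run(
    [{"id": "1", "name": "example.txt", "mime_type": "text/plain"}],
    table_name="file_metadata",
)
print(load_info)
```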


@ -46,72 +46,6 @@ async def cognify(datasets: Union[str, list[str]] = None, user: User = None):
if type(datasets[0]) == str:
datasets = await get_datasets_by_name(datasets, user.id)
async def run_cognify_pipeline(dataset: Dataset):
data_documents: list[Data] = await get_dataset_data(dataset_id = dataset.id)
document_ids_str = [str(document.id) for document in data_documents]
dataset_id = dataset.id
dataset_name = generate_dataset_name(dataset.name)
async with update_status_lock:
task_status = await get_pipeline_status([dataset_id])
if dataset_id in task_status and task_status[dataset_id] == "DATASET_PROCESSING_STARTED":
logger.info("Dataset %s is already being processed.", dataset_name)
return
await log_pipeline_status(dataset_id, "DATASET_PROCESSING_STARTED", {
"dataset_name": dataset_name,
"files": document_ids_str,
})
try:
cognee_config = get_cognify_config()
root_node_id = None
tasks = [
Task(classify_documents),
Task(check_permissions_on_documents, user = user, permissions = ["write"]),
Task(infer_data_ontology, root_node_id = root_node_id, ontology_model = KnowledgeGraph),
Task(source_documents_to_chunks, parent_node_id = root_node_id), # Classify documents and save them as a nodes in graph db, extract text chunks based on the document type
Task(chunks_into_graph, graph_model = KnowledgeGraph, collection_name = "entities", task_config = { "batch_size": 10 }), # Generate knowledge graphs from the document chunks and attach it to chunk nodes
Task(chunk_update_check, collection_name = "chunks"), # Find all affected chunks, so we don't process unchanged chunks
Task(
save_chunks_to_store,
collection_name = "chunks",
), # Save the document chunks in vector db and as nodes in graph db (connected to the document node and between each other)
run_tasks_parallel([
Task(
chunk_extract_summary,
summarization_model = cognee_config.summarization_model,
collection_name = "chunk_summaries",
), # Summarize the document chunks
Task(
chunk_naive_llm_classifier,
classification_model = cognee_config.classification_model,
),
]),
Task(chunk_remove_disconnected), # Remove the obsolete document chunks.
]
pipeline = run_tasks(tasks, data_documents)
async for result in pipeline:
print(result)
await log_pipeline_status(dataset_id, "DATASET_PROCESSING_FINISHED", {
"dataset_name": dataset_name,
"files": document_ids_str,
})
except Exception as error:
await log_pipeline_status(dataset_id, "DATASET_PROCESSING_ERROR", {
"dataset_name": dataset_name,
"files": document_ids_str,
})
raise error
existing_datasets_map = {
generate_dataset_name(dataset.name): True for dataset in existing_datasets
}
@ -122,10 +56,76 @@ async def cognify(datasets: Union[str, list[str]] = None, user: User = None):
dataset_name = generate_dataset_name(dataset.name)
if dataset_name in existing_datasets_map:
awaitables.append(run_cognify_pipeline(dataset))
awaitables.append(run_cognify_pipeline(dataset, user))
return await asyncio.gather(*awaitables)
async def run_cognify_pipeline(dataset: Dataset, user: User):
data_documents: list[Data] = await get_dataset_data(dataset_id = dataset.id)
document_ids_str = [str(document.id) for document in data_documents]
dataset_id = dataset.id
dataset_name = generate_dataset_name(dataset.name)
async with update_status_lock:
task_status = await get_pipeline_status([dataset_id])
if dataset_id in task_status and task_status[dataset_id] == "DATASET_PROCESSING_STARTED":
logger.info("Dataset %s is already being processed.", dataset_name)
return
await log_pipeline_status(dataset_id, "DATASET_PROCESSING_STARTED", {
"dataset_name": dataset_name,
"files": document_ids_str,
})
try:
cognee_config = get_cognify_config()
root_node_id = None
tasks = [
Task(classify_documents),
Task(check_permissions_on_documents, user = user, permissions = ["write"]),
Task(infer_data_ontology, root_node_id = root_node_id, ontology_model = KnowledgeGraph),
Task(source_documents_to_chunks, parent_node_id = root_node_id), # Classify documents and save them as a nodes in graph db, extract text chunks based on the document type
Task(chunks_into_graph, graph_model = KnowledgeGraph, collection_name = "entities", task_config = { "batch_size": 10 }), # Generate knowledge graphs from the document chunks and attach it to chunk nodes
Task(chunk_update_check, collection_name = "chunks"), # Find all affected chunks, so we don't process unchanged chunks
Task(
save_chunks_to_store,
collection_name = "chunks",
), # Save the document chunks in vector db and as nodes in graph db (connected to the document node and between each other)
run_tasks_parallel([
Task(
chunk_extract_summary,
summarization_model = cognee_config.summarization_model,
collection_name = "chunk_summaries",
), # Summarize the document chunks
Task(
chunk_naive_llm_classifier,
classification_model = cognee_config.classification_model,
),
]),
Task(chunk_remove_disconnected), # Remove the obsolete document chunks.
]
pipeline = run_tasks(tasks, data_documents)
async for result in pipeline:
print(result)
await log_pipeline_status(dataset_id, "DATASET_PROCESSING_FINISHED", {
"dataset_name": dataset_name,
"files": document_ids_str,
})
except Exception as error:
await log_pipeline_status(dataset_id, "DATASET_PROCESSING_ERROR", {
"dataset_name": dataset_name,
"files": document_ids_str,
})
raise error
def generate_dataset_name(dataset_name: str) -> str:
return dataset_name.replace(".", "_").replace(" ", "_")
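The block above is mostly a move: `run_cognify_pipeline` now lives at module level and receives the `user` explicitly instead of closing over it. From the caller's perspective the entry point is unchanged; a sketch based on the README usage (not a verbatim test):

```python
import asyncio
import cognee

async def main():
    text = "Natural language processing (NLP) is an interdisciplinary subfield of computer science."
    await cognee.add([text], "example_dataset")   # ingest into the metastore
    await cognee.cognify(["example_dataset"])     # runs run_cognify_pipeline for the dataset

asyncio.run(main())
```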


@ -1,23 +0,0 @@
from typing import Protocol
class DatabaseEngine(Protocol):
async def ensure_tables(self):
pass
def database_exists(self, db_name: str) -> bool:
pass
def create_database(self, db_name: str):
pass
def drop_database(self, db_name: str):
pass
async def table_exists(self, table_name: str) -> bool:
pass
async def create_tables(self):
pass
async def create(self, data):
pass


@ -1,29 +0,0 @@
import inspect
from typing import Any
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.orm import Session
class FakeAsyncSession:
def __init__(self, session: Session):
self.session = session
def run_sync(self, *args, **kwargs):
return self.execute(*args, **kwargs)
def __getattr__(self, name: str) -> Any:
"""
If the method being called is async in AsyncSession, create a fake async version
for Session so callers can `await` as usual. Think `commit`, `refresh`,
`delete`, etc.
"""
async_session_attr = getattr(AsyncSession, name, None)
session_attr = getattr(self.session, name)
if not inspect.iscoroutinefunction(async_session_attr):
return session_attr
async def async_wrapper(*args, **kwargs):
return session_attr(*args, **kwargs)
return async_wrapper


@ -1,7 +1,7 @@
from .ModelBase import Base
from .DatabaseEngine import DatabaseEngine
from .sqlite.SqliteEngine import SqliteEngine
from .duckdb.DuckDBAdapter import DuckDBAdapter
from .config import get_relational_config
from .create_db_and_tables import create_db_and_tables
from .get_relational_engine import get_relational_engine
# Global data types
from .data_types.UUID import UUID


@ -1,4 +1,5 @@
import os
from typing import Union
from functools import lru_cache
from pydantic_settings import BaseSettings, SettingsConfigDict
from cognee.root_dir import get_absolute_path
@ -6,13 +7,11 @@ from cognee.root_dir import get_absolute_path
class RelationalConfig(BaseSettings):
db_path: str = os.path.join(get_absolute_path(".cognee_system"), "databases")
db_name: str = "cognee_db"
db_host: str = "localhost"
db_port: str = "5432"
db_user: str = "cognee"
db_password: str = "cognee"
db_provider: str = "postgresql+asyncpg"
db_file_path: str = os.path.join(db_path, db_name)
db_host: Union[str, None] = None # "localhost"
db_port: Union[str, None] = None # "5432"
db_username: Union[str, None] = None # "cognee"
db_password: Union[str, None] = None # "cognee"
db_provider: str = "sqlite"
model_config = SettingsConfigDict(env_file = ".env", extra = "allow")
@ -22,7 +21,7 @@ class RelationalConfig(BaseSettings):
"db_name": self.db_name,
"db_host": self.db_host,
"db_port": self.db_port,
"db_user": self.db_user,
"db_username": self.db_username,
"db_password": self.db_password,
"db_provider": self.db_provider,
}
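With these defaults the relational store falls back to SQLite and the Postgres fields stay unset. A hedged sketch of switching providers through the environment read by this pydantic `BaseSettings` class (the variable names are inferred from the field names and the CI workflows earlier in this diff):

```python
import os

# Assumed field-to-env mapping (pydantic-settings matches field names
# case-insensitively); set these before the cached config is first read.
os.environ.update({
    "DB_PROVIDER": "postgres",
    "DB_HOST": "localhost",
    "DB_PORT": "5432",
    "DB_USERNAME": "cognee",
    "DB_PASSWORD": "cognee",
    "DB_NAME": "cognee_db",
})

from cognee.infrastructure.databases.relational import get_relational_config

config = get_relational_config()
print(config.db_provider, config.db_host, config.db_name)
```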


@ -1,9 +1,14 @@
from cognee.infrastructure.files.storage import LocalStorage
from .ModelBase import Base
from .get_relational_engine import get_relational_engine
from .get_relational_engine import get_relational_engine, get_relational_config
async def create_db_and_tables():
relational_config = get_relational_config()
relational_engine = get_relational_engine()
if relational_engine.engine.dialect.name == "sqlite":
LocalStorage.ensure_directory_exists(relational_config.db_path)
async with relational_engine.engine.begin() as connection:
if len(Base.metadata.tables.keys()) > 0:
await connection.run_sync(Base.metadata.create_all)


@ -3,18 +3,16 @@ from .sqlalchemy.SqlAlchemyAdapter import SQLAlchemyAdapter
def create_relational_engine(
db_path: str,
db_name: str,
db_provider: str,
db_host: str,
db_port: str,
db_user: str,
db_username: str,
db_password: str,
db_provider: str,
):
return SQLAlchemyAdapter(
db_name = db_name,
db_path = db_path,
db_type = db_provider,
db_host = db_host,
db_port = db_port,
db_user = db_user,
db_password = db_password
)
if db_provider == "sqlite":
connection_string = f"sqlite+aiosqlite:///{db_path}/{db_name}"
if db_provider == "postgres":
connection_string = f"postgresql+asyncpg://{db_username}:{db_password}@{db_host}:{db_port}/{db_name}"
return SQLAlchemyAdapter(connection_string)


@ -0,0 +1,43 @@
import uuid
from sqlalchemy.types import TypeDecorator, BINARY
from sqlalchemy.dialects.postgresql import UUID as psqlUUID
class UUID(TypeDecorator):
"""Platform-independent GUID type.
Uses Postgresql's UUID type, otherwise uses
BINARY(16), to store UUID.
"""
impl = BINARY
def load_dialect_impl(self, dialect):
if dialect.name == 'postgresql':
return dialect.type_descriptor(psqlUUID())
else:
return dialect.type_descriptor(BINARY(16))
def process_bind_param(self, value, dialect):
if value is None:
return value
else:
if not isinstance(value, uuid.UUID):
if isinstance(value, bytes):
value = uuid.UUID(bytes = value)
elif isinstance(value, int):
value = uuid.UUID(int = value)
elif isinstance(value, str):
value = uuid.UUID(value)
if dialect.name == 'postgresql':
return str(value)
else:
return value.bytes
def process_result_value(self, value, dialect):
if value is None:
return value
if dialect.name == 'postgresql':
return uuid.UUID(value)
else:
return uuid.UUID(bytes = value)
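This new cross-dialect `UUID` type is what the model changes below switch to. A minimal sketch of a model using it (the `Example` class is hypothetical, but follows the `id = Column(UUID, primary_key = True, default = uuid4)` pattern in the models that follow):

```python
from uuid import uuid4
from sqlalchemy import Column, String
from cognee.infrastructure.databases.relational import Base, UUID

class Example(Base):
    __tablename__ = "examples"

    # Native UUID on Postgres, BINARY(16) on SQLite and other dialects.
    id = Column(UUID, primary_key=True, default=uuid4)
    name = Column(String)
```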


@ -1,169 +0,0 @@
import duckdb
import os
class DuckDBAdapter():
def __init__(self, db_path: str, db_name: str):
self.db_location = os.path.abspath(os.path.join(db_path, db_name))
self.get_connection = lambda: duckdb.connect(self.db_location)
def get_datasets(self):
with self.get_connection() as connection:
tables = connection.sql("SELECT DISTINCT schema_name FROM duckdb_tables();").to_df().to_dict("list")
return list(
filter(
lambda schema_name: not schema_name.endswith("staging") and schema_name != "cognee",
tables["schema_name"]
)
)
def get_files_metadata(self, dataset_name: str):
with self.get_connection() as connection:
return connection.sql(f"SELECT id, name, file_path, extension, mime_type FROM {dataset_name}.file_metadata;").to_df().to_dict("records")
def create_table(self, schema_name: str, table_name: str, table_config: list[dict]):
fields_query_parts = []
for table_config_item in table_config:
fields_query_parts.append(f"{table_config_item['name']} {table_config_item['type']}")
with self.get_connection() as connection:
query = f"CREATE SCHEMA IF NOT EXISTS {schema_name};"
connection.execute(query)
with self.get_connection() as connection:
query = f"CREATE TABLE IF NOT EXISTS {schema_name}.{table_name} ({', '.join(fields_query_parts)});"
connection.execute(query)
def delete_table(self, table_name: str):
with self.get_connection() as connection:
query = f"DROP TABLE IF EXISTS {table_name};"
connection.execute(query)
def insert_data(self, schema_name: str, table_name: str, data: list[dict]):
def get_values(data_entry: list):
return ", ".join([f"'{value}'" if isinstance(value, str) else value for value in data_entry])
columns = ", ".join(data[0].keys())
values = ", ".join([f"({get_values(data_entry.values())})" for data_entry in data])
with self.get_connection() as connection:
query = f"INSERT INTO {schema_name}.{table_name} ({columns}) VALUES {values};"
connection.execute(query)
def get_data(self, table_name: str, filters: dict = None):
with self.get_connection() as connection:
def get_values(values: list):
return ", ".join([f"'{value}'" for value in values])
def get_filters(filters: dict):
return " AND ".join([
f"{key} IN ({get_values(value)})" if isinstance(value, list)
else f"{key} = '{value}'" for (key, value) in filters.items()
])
query = f"SELECT * FROM {table_name}" + (";" if filters is None else f" WHERE {get_filters(filters)};")
results = connection.sql(query).to_df().to_dict("records")
return {
result["data_id"]: result["status"] for result in results
}
def execute_query(self, query):
with self.get_connection() as connection:
return connection.sql(query).to_df().to_dict("records")
def load_cognify_data(self, data):
with self.get_connection() as connection:
# Ensure the "cognify" table exists
connection.execute("""
CREATE TABLE IF NOT EXISTS cognify (
document_id STRING,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT NULL,
processed BOOLEAN DEFAULT FALSE,
document_id_target STRING NULL
);
""")
# Prepare the insert statement
insert_query = """
INSERT INTO cognify (document_id)
VALUES (?);
"""
# Insert each record into the "cognify" table
for record in data:
with self.get_connection() as connection:
connection.execute(insert_query, [
record.get("document_id"),
])
def fetch_cognify_data(self, excluded_document_id: str):
# SQL command to create the "cognify" table with the specified columns
create_table_sql = """
CREATE TABLE IF NOT EXISTS cognify (
document_id STRING,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT NULL,
processed BOOLEAN DEFAULT FALSE,
document_id_target STRING NULL
);
"""
with self.get_connection() as connection:
# Execute the SQL command to create the table
connection.execute(create_table_sql)
# SQL command to select data from the "cognify" table
select_data_sql = f"SELECT document_id, created_at, updated_at, processed FROM cognify WHERE document_id != '{excluded_document_id}' AND processed = FALSE;"
with self.get_connection() as connection:
# Execute the query and fetch the results
records = connection.sql(select_data_sql).to_df().to_dict("records")
# If records are fetched, update the "processed" column to "True"
if records:
# Fetching document_ids from the records to update the "processed" column
document_ids = tuple(record["document_id"] for record in records)
# SQL command to update the "processed" column to "True" for fetched records
update_data_sql = f"UPDATE cognify SET processed = TRUE WHERE document_id IN {document_ids};"
with self.get_connection() as connection:
# Execute the update query
connection.execute(update_data_sql)
# Return the fetched records
return records
def delete_cognify_data(self):
# SQL command to create the "cognify" table with the specified columns
create_table_sql = """
CREATE TABLE IF NOT EXISTS cognify (
document_id STRING,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT NULL,
processed BOOLEAN DEFAULT FALSE,
document_id_target STRING NULL
);
"""
with self.get_connection() as connection:
# Execute the SQL command to create the table
connection.execute(create_table_sql)
with self.get_connection() as connection:
# SQL command to select data from the "cognify" table
select_data_sql = "DELETE FROM cognify;"
connection.sql(select_data_sql)
drop_data_sql = "DROP TABLE cognify;"
connection.sql(drop_data_sql)
def delete_database(self):
from cognee.infrastructure.files.storage import LocalStorage
if LocalStorage.file_exists(self.db_location):
LocalStorage.remove(self.db_location)
if LocalStorage.file_exists(self.db_location + ".wal"):
LocalStorage.remove(self.db_location + ".wal")


@ -1,26 +0,0 @@
from abc import abstractmethod
from typing import Protocol, TypeVar, Type, List
RowDataType = TypeVar('RowDataType')
class RelationalDBInterface(Protocol):
@abstractmethod
async def create_database(self, database_name: str, database_path: str): raise NotImplementedError
@abstractmethod
async def create_table(self, table_name: str, table_config: object): raise NotImplementedError
@abstractmethod
async def add_row(self, table_name: str, row_data: Type[RowDataType]): raise NotImplementedError
@abstractmethod
async def add_rows(self, table_name: str, rows_data: List[Type[RowDataType]]): raise NotImplementedError
@abstractmethod
async def get_row(self, table_name: str, row_id: str): raise NotImplementedError
@abstractmethod
async def update_row(self, table_name: str, row_id: str, row_data: Type[RowDataType]): raise NotImplementedError
@abstractmethod
async def delete_row(self, table_name: str, row_id: str): raise NotImplementedError


@ -1,39 +1,18 @@
import os
import asyncio
from typing import AsyncGenerator
from contextlib import asynccontextmanager
from sqlalchemy import create_engine, text, select
from sqlalchemy.orm import sessionmaker, joinedload
from sqlalchemy import text, select
from sqlalchemy.orm import joinedload
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine, async_sessionmaker
from cognee.infrastructure.files.storage import LocalStorage
from cognee.infrastructure.databases.relational.FakeAsyncSession import FakeAsyncSession
from ..ModelBase import Base
def make_async_sessionmaker(sessionmaker):
@asynccontextmanager
async def async_session_maker():
await asyncio.sleep(0.1)
session = FakeAsyncSession(sessionmaker())
try:
yield session
finally:
await session.close() # Ensure the session is closed
return async_session_maker
class SQLAlchemyAdapter():
def __init__(self, db_type: str, db_path: str, db_name: str, db_user: str, db_password: str, db_host: str, db_port: str):
self.db_location = os.path.abspath(os.path.join(db_path, db_name))
self.db_name = db_name
def __init__(self, connection_string: str):
self.engine = create_async_engine(connection_string)
self.sessionmaker = async_sessionmaker(bind=self.engine, expire_on_commit=False)
if db_type == "duckdb":
LocalStorage.ensure_directory_exists(db_path)
self.engine = create_engine(f"duckdb:///{self.db_location}")
self.sessionmaker = make_async_sessionmaker(sessionmaker(bind=self.engine))
else:
self.engine = create_async_engine(f"postgresql+asyncpg://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}")
self.sessionmaker = async_sessionmaker(bind=self.engine, expire_on_commit=False)
if self.engine.dialect.name == "sqlite":
self.db_path = connection_string.split("///")[1]
@asynccontextmanager
async def get_async_session(self) -> AsyncGenerator[AsyncSession, None]:
@ -72,6 +51,7 @@ class SQLAlchemyAdapter():
await connection.execute(text(f"DROP TABLE IF EXISTS {table_name} CASCADE;"))
await connection.close()
async def insert_data(self, schema_name: str, table_name: str, data: list[dict]):
columns = ", ".join(data[0].keys())
values = ", ".join([f"({', '.join([f':{key}' for key in row.keys()])})" for row in data])
@ -80,6 +60,7 @@ class SQLAlchemyAdapter():
async with self.engine.begin() as connection:
await connection.execute(insert_query, data)
await connection.close()
async def get_data(self, table_name: str, filters: dict = None):
async with self.engine.begin() as connection:
query = f"SELECT * FROM {table_name}"
@ -113,11 +94,19 @@ class SQLAlchemyAdapter():
print(f"Error dropping database tables: {e}")
async def delete_database(self):
async with self.engine.begin() as connection:
try:
for table in Base.metadata.sorted_tables:
drop_table_query = text(f'DROP TABLE IF EXISTS {table.name} CASCADE')
await connection.execute(drop_table_query)
print("Database deleted successfully.")
except Exception as e:
print(f"Error deleting database: {e}")
try:
if self.engine.dialect.name == "sqlite":
from cognee.infrastructure.files.storage import LocalStorage
LocalStorage.remove(self.db_path)
self.db_path = None
else:
async with self.engine.begin() as connection:
for table in Base.metadata.sorted_tables:
drop_table_query = text(f'DROP TABLE IF EXISTS {table.name} CASCADE')
await connection.execute(drop_table_query)
except Exception as e:
print(f"Error deleting database: {e}")
print("Database deleted successfully.")


@ -1,82 +0,0 @@
import os
import asyncio
from typing import Callable
from sqlalchemy.inspection import inspect
from sqlalchemy.ext.asyncio import create_async_engine, async_sessionmaker, AsyncEngine, AsyncSession, async_scoped_session
from sqlalchemy.future import select
from cognee.infrastructure.files.storage.LocalStorage import LocalStorage
from ..DatabaseEngine import DatabaseEngine
from ..ModelBase import Base
from ..utils import with_rollback
class SqliteEngine(DatabaseEngine):
db_path: str = None
db_name: str = None
engine: AsyncEngine = None
session_maker: Callable[[], async_scoped_session[AsyncSession]] = None
is_db_done: bool = False
def __init__(self, db_path: str, db_name: str):
self.db_path = db_path
self.db_name = db_name
self.db_location = db_path + "/" + db_name
self.engine = create_async_engine(
f"sqlite+aiosqlite:///{self.db_location}",
pool_recycle = 3600,
echo = False
)
self.session_maker = lambda: async_scoped_session(
async_sessionmaker(
bind = self.engine,
class_ = AsyncSession
),
scopefunc = asyncio.current_task
)
async def ensure_tables(self):
if not self.database_exists(self.db_name):
self.create_database(self.db_name)
await self.create_tables()
self.is_db_done = True
return True
def database_exists(self, db_name: str) -> bool:
return os.path.exists(self.db_path + "/" + db_name)
def create_database(self, db_name: str):
LocalStorage.ensure_directory_exists(self.db_path)
with open(self.db_path + "/" + db_name, mode = "w+", encoding = "utf-8") as file:
file.write("")
def drop_database(self, db_name: str):
os.remove(self.db_location)
async def table_exists(self, table_name: str) -> bool:
return inspect(self.engine).has_table(table_name)
async def create_tables(self):
async with self.engine.begin() as connection:
return await connection.run_sync(Base.metadata.create_all)
async def create(self, data):
async with with_rollback(self.session_maker()) as session:
session.add(data)
async def query(self, query_term):
async with with_rollback(self.session_maker()) as session:
return await session.execute(query_term)
async def query_entity(self, entity):
async with with_rollback(self.session_maker()) as session:
return await session.execute(
select(type(entity))
.where(type(entity).id == entity.id)
)
async def update(self, data_update_fn):
async with with_rollback(self.session_maker()):
data_update_fn()


@ -1 +0,0 @@
from .with_rollback import with_rollback


@ -1,18 +0,0 @@
import logging
from contextlib import asynccontextmanager
from sqlalchemy.ext.asyncio import async_scoped_session
logger = logging.getLogger(__name__)
@asynccontextmanager
async def with_rollback(session: async_scoped_session):
"""Provide a transactional scope around a series of operations."""
try:
# async with session.begin():
yield session
await session.commit()
await session.remove()
except Exception as exception:
await session.rollback()
logger.error("Session rolled back due to: %s", str(exception))
raise exception


@ -4,7 +4,7 @@ import litellm
from litellm import aembedding
from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine
litellm.set_verbose = True
litellm.set_verbose = False
class LiteLLMEmbeddingEngine(EmbeddingEngine):
api_key: str


@ -106,11 +106,10 @@ class QDrantAdapter(VectorDBInterface):
points = [convert_to_qdrant_point(point) for point in data_points]
try:
result = await client.upload_points(
client.upload_points(
collection_name = collection_name,
points = points
)
return result
except Exception as error:
logger.error("Error uploading data points to Qdrant: %s", str(error))
raise error


@ -1,7 +1,7 @@
from datetime import datetime
from sqlalchemy.orm import Mapped, MappedColumn
from sqlalchemy import Column, String, DateTime, ForeignKey, Enum, UUID, JSON
from cognee.infrastructure.databases.relational import ModelBase
from sqlalchemy import Column, DateTime, ForeignKey, Enum, JSON
from cognee.infrastructure.databases.relational import Base, UUID
class OperationType(Enum):
MERGE_DATA = "MERGE_DATA"
@ -14,10 +14,10 @@ class OperationStatus(Enum):
ERROR = "OPERATION_ERROR"
CANCELLED = "OPERATION_CANCELLED"
class Operation(ModelBase):
class Operation(Base):
__tablename__ = "operation"
id = Column(String, primary_key = True)
id = Column(UUID, primary_key = True)
status = Column(Enum(OperationStatus))
operation_type = Column(Enum(OperationType))


@ -7,6 +7,8 @@ async def get_datasets_by_name(dataset_names: list[str], user_id: UUID) -> list[
db_engine = get_relational_engine()
async with db_engine.get_async_session() as session:
if isinstance(dataset_names, str):
dataset_names = [dataset_names]
datasets = (await session.scalars(
select(Dataset)
.filter(Dataset.owner_id == user_id)


@ -2,14 +2,14 @@ from uuid import uuid4
from typing import List
from datetime import datetime, timezone
from sqlalchemy.orm import relationship, Mapped
from sqlalchemy import Column, String, DateTime, UUID
from cognee.infrastructure.databases.relational import Base
from sqlalchemy import Column, String, DateTime
from cognee.infrastructure.databases.relational import Base, UUID
from .DatasetData import DatasetData
class Data(Base):
__tablename__ = "data"
id = Column(UUID(as_uuid = True), primary_key = True, default = uuid4)
id = Column(UUID, primary_key = True, default = uuid4)
name = Column(String)
extension = Column(String)


@ -2,14 +2,14 @@ from uuid import uuid4
from typing import List
from datetime import datetime, timezone
from sqlalchemy.orm import relationship, Mapped
from sqlalchemy import Column, Text, DateTime, UUID
from cognee.infrastructure.databases.relational import Base
from sqlalchemy import Column, Text, DateTime
from cognee.infrastructure.databases.relational import Base, UUID
from .DatasetData import DatasetData
class Dataset(Base):
__tablename__ = "datasets"
id = Column(UUID(as_uuid = True), primary_key = True, default = uuid4)
id = Column(UUID, primary_key = True, default = uuid4)
name = Column(Text)

View file

@ -1,11 +1,11 @@
from datetime import datetime, timezone
from sqlalchemy import Column, DateTime, UUID, ForeignKey
from cognee.infrastructure.databases.relational import Base
from sqlalchemy import Column, DateTime, ForeignKey
from cognee.infrastructure.databases.relational import Base, UUID
class DatasetData(Base):
__tablename__ = "dataset_data"
created_at = Column(DateTime(timezone = True), default = lambda: datetime.now(timezone.utc))
dataset_id = Column(UUID(as_uuid = True), ForeignKey("datasets.id"), primary_key = True)
data_id = Column(UUID(as_uuid = True), ForeignKey("data.id"), primary_key = True)
dataset_id = Column(UUID, ForeignKey("datasets.id"), primary_key = True)
data_id = Column(UUID, ForeignKey("data.id"), primary_key = True)


@ -6,18 +6,18 @@ from .Document import Document
class AudioDocument(Document):
type: str = "audio"
title: str
file_path: str
chunking_strategy:str
raw_data_location: str
chunking_strategy: str
def __init__(self, id: UUID, title: str, file_path: str, chunking_strategy:str="paragraph"):
def __init__(self, id: UUID, title: str, raw_data_location: str, chunking_strategy:str="paragraph"):
self.id = id or uuid5(NAMESPACE_OID, title)
self.title = title
self.file_path = file_path
self.raw_data_location = raw_data_location
self.chunking_strategy = chunking_strategy
def read(self):
# Transcribe the audio file
result = get_llm_client().create_transcript(self.file_path)
result = get_llm_client().create_transcript(self.raw_data_location)
text = result.text
chunker = TextChunker(self.id, get_text = lambda: text)
@ -30,5 +30,5 @@ class AudioDocument(Document):
id=str(self.id),
type=self.type,
title=self.title,
file_path=self.file_path,
raw_data_location=self.raw_data_location,
)


@ -5,7 +5,7 @@ class Document(Protocol):
id: UUID
type: str
title: str
file_path: str
raw_data_location: str
def read(self) -> str:
pass


@ -7,16 +7,16 @@ from .Document import Document
class ImageDocument(Document):
type: str = "image"
title: str
file_path: str
raw_data_location: str
def __init__(self, id: UUID, title: str, file_path: str):
def __init__(self, id: UUID, title: str, raw_data_location: str):
self.id = id or uuid5(NAMESPACE_OID, title)
self.title = title
self.file_path = file_path
self.raw_data_location = raw_data_location
def read(self):
# Transcribe the image file
result = get_llm_client().transcribe_image(self.file_path)
result = get_llm_client().transcribe_image(self.raw_data_location)
text = result.choices[0].message.content
chunker = TextChunker(self.id, get_text = lambda: text)
@ -29,5 +29,5 @@ class ImageDocument(Document):
id=str(self.id),
type=self.type,
title=self.title,
file_path=self.file_path,
raw_data_location=self.raw_data_location,
)


@ -6,15 +6,15 @@ from .Document import Document
class PdfDocument(Document):
type: str = "pdf"
title: str
file_path: str
raw_data_location: str
def __init__(self, id: UUID, title: str, file_path: str):
def __init__(self, id: UUID, title: str, raw_data_location: str):
self.id = id or uuid5(NAMESPACE_OID, title)
self.title = title
self.file_path = file_path
self.raw_data_location = raw_data_location
def read(self) -> PdfReader:
file = PdfReader(self.file_path)
file = PdfReader(self.raw_data_location)
def get_text():
for page in file.pages:
@ -32,5 +32,5 @@ class PdfDocument(Document):
id = str(self.id),
type = self.type,
title = self.title,
file_path = self.file_path,
raw_data_location = self.raw_data_location,
)


@ -5,16 +5,16 @@ from .Document import Document
class TextDocument(Document):
type: str = "text"
title: str
file_path: str
raw_data_location: str
def __init__(self, id: UUID, title: str, file_path: str):
def __init__(self, id: UUID, title: str, raw_data_location: str):
self.id = id or uuid5(NAMESPACE_OID, title)
self.title = title
self.file_path = file_path
self.raw_data_location = raw_data_location
def read(self):
def get_text():
with open(self.file_path, mode = "r", encoding = "utf-8") as file:
with open(self.raw_data_location, mode = "r", encoding = "utf-8") as file:
while True:
text = file.read(1024)
@ -34,5 +34,5 @@ class TextDocument(Document):
id = str(self.id),
type = self.type,
title = self.title,
file_path = self.file_path,
raw_data_location = self.raw_data_location,
)


@ -1,14 +1,14 @@
from uuid import uuid4
from datetime import datetime, timezone
from sqlalchemy import Column, UUID, DateTime, String, Text
from sqlalchemy import Column, DateTime, String, Text
from sqlalchemy.orm import relationship, Mapped
from cognee.infrastructure.databases.relational import Base
from cognee.infrastructure.databases.relational import Base, UUID
from .PipelineTask import PipelineTask
class Pipeline(Base):
__tablename__ = "pipelines"
id = Column(UUID(as_uuid = True), primary_key = True, default = uuid4)
id = Column(UUID, primary_key = True, default = uuid4)
name = Column(String)
description = Column(Text, nullable = True)


@ -1,16 +1,16 @@
from uuid import uuid4
from datetime import datetime, timezone
from sqlalchemy import Column, UUID, DateTime, String, JSON
from cognee.infrastructure.databases.relational import Base
from sqlalchemy import Column, DateTime, String, JSON
from cognee.infrastructure.databases.relational import Base, UUID
class PipelineRun(Base):
__tablename__ = "pipeline_runs"
id = Column(UUID(as_uuid = True), primary_key = True, default = uuid4)
id = Column(UUID, primary_key = True, default = uuid4)
created_at = Column(DateTime(timezone = True), default = lambda: datetime.now(timezone.utc))
status = Column(String)
run_id = Column(UUID(as_uuid = True), index = True)
run_id = Column(UUID, index = True)
run_info = Column(JSON)


@ -1,11 +1,11 @@
from datetime import datetime, timezone
from sqlalchemy import Column, DateTime, UUID, ForeignKey
from cognee.infrastructure.databases.relational import Base
from sqlalchemy import Column, DateTime, ForeignKey
from cognee.infrastructure.databases.relational import Base, UUID
class PipelineTask(Base):
__tablename__ = "pipeline_task"
created_at = Column(DateTime(timezone = True), default = lambda: datetime.now(timezone.utc))
pipeline_id = Column("pipeline", UUID(as_uuid = True), ForeignKey("pipeline.id"), primary_key = True)
task_id = Column("task", UUID(as_uuid = True), ForeignKey("task.id"), primary_key = True)
pipeline_id = Column("pipeline", UUID, ForeignKey("pipeline.id"), primary_key = True)
task_id = Column("task", UUID, ForeignKey("task.id"), primary_key = True)

View file

@ -1,14 +1,14 @@
from uuid import uuid4
from datetime import datetime, timezone
from sqlalchemy.orm import relationship, Mapped
from sqlalchemy import Column, String, DateTime, UUID, Text
from cognee.infrastructure.databases.relational import Base
from sqlalchemy import Column, String, DateTime, Text
from cognee.infrastructure.databases.relational import Base, UUID
from .PipelineTask import PipelineTask
class Task(Base):
__tablename__ = "tasks"
id = Column(UUID(as_uuid = True), primary_key = True, default = uuid4)
id = Column(UUID, primary_key = True, default = uuid4)
name = Column(String)
description = Column(Text, nullable = True)


@ -1,12 +1,12 @@
from uuid import uuid4
from datetime import datetime, timezone
from sqlalchemy import Column, UUID, DateTime, String, JSON
from cognee.infrastructure.databases.relational import Base
from sqlalchemy import Column, DateTime, String, JSON
from cognee.infrastructure.databases.relational import Base, UUID
class TaskRun(Base):
__tablename__ = "task_runs"
id = Column(UUID(as_uuid = True), primary_key = True, default = uuid4)
id = Column(UUID, primary_key = True, default = uuid4)
task_name = Column(String)


@ -1,20 +1,20 @@
from uuid import uuid4
from datetime import datetime, timezone
from sqlalchemy.orm import relationship, Mapped
from sqlalchemy import Column, ForeignKey, DateTime, UUID
from cognee.infrastructure.databases.relational import Base
from sqlalchemy import Column, ForeignKey, DateTime
from cognee.infrastructure.databases.relational import Base, UUID
from .ACLResources import ACLResources
class ACL(Base):
__tablename__ = "acls"
id = Column(UUID(as_uuid = True), primary_key = True, default = uuid4)
id = Column(UUID, primary_key = True, default = uuid4)
created_at = Column(DateTime(timezone = True), default = lambda: datetime.now(timezone.utc))
updated_at = Column(DateTime(timezone = True), onupdate = lambda: datetime.now(timezone.utc))
principal_id = Column(UUID(as_uuid = True), ForeignKey("principals.id"))
permission_id = Column(UUID(as_uuid = True), ForeignKey("permissions.id"))
principal_id = Column(UUID, ForeignKey("principals.id"))
permission_id = Column(UUID, ForeignKey("permissions.id"))
principal = relationship("Principal")
permission = relationship("Permission")


@ -1,11 +1,11 @@
from datetime import datetime, timezone
from sqlalchemy import Column, ForeignKey, UUID, DateTime
from cognee.infrastructure.databases.relational import Base
from sqlalchemy import Column, ForeignKey, DateTime
from cognee.infrastructure.databases.relational import Base, UUID
class ACLResources(Base):
__tablename__ = "acl_resources"
created_at = Column(DateTime(timezone = True), default = lambda: datetime.now(timezone.utc))
acl_id = Column(UUID(as_uuid = True), ForeignKey("acls.id"), primary_key = True)
resource_id = Column(UUID(as_uuid = True), ForeignKey("resources.id"), primary_key = True)
acl_id = Column(UUID, ForeignKey("acls.id"), primary_key = True)
resource_id = Column(UUID, ForeignKey("resources.id"), primary_key = True)

View file

@@ -1,12 +1,13 @@
from sqlalchemy.orm import relationship, Mapped
from sqlalchemy import Column, String, ForeignKey, UUID
from sqlalchemy import Column, String, ForeignKey
from cognee.infrastructure.databases.relational import UUID
from .Principal import Principal
from .UserGroup import UserGroup
class Group(Principal):
__tablename__ = "groups"
id = Column(UUID(as_uuid = True), ForeignKey("principals.id"), primary_key = True)
id = Column(UUID, ForeignKey("principals.id"), primary_key = True)
name = Column(String, unique = True, nullable = False, index = True)

View file

@@ -1,8 +1,8 @@
from uuid import uuid4
from datetime import datetime, timezone
# from sqlalchemy.orm import relationship
from sqlalchemy import Column, DateTime, UUID, String
from cognee.infrastructure.databases.relational import Base
from sqlalchemy import Column, DateTime, String
from cognee.infrastructure.databases.relational import Base, UUID
class Permission(Base):
__tablename__ = "permissions"

View file

@@ -1,12 +1,12 @@
from uuid import uuid4
from datetime import datetime, timezone
from sqlalchemy import Column, String, DateTime, UUID
from cognee.infrastructure.databases.relational import Base
from sqlalchemy import Column, String, DateTime
from cognee.infrastructure.databases.relational import Base, UUID
class Principal(Base):
__tablename__ = "principals"
id = Column(UUID(as_uuid = True), primary_key = True, index = True, default = uuid4)
id = Column(UUID, primary_key = True, index = True, default = uuid4)
created_at = Column(DateTime(timezone = True), default = lambda: datetime.now(timezone.utc))
updated_at = Column(DateTime(timezone = True), onupdate = lambda: datetime.now(timezone.utc))

View file

@@ -1,18 +1,18 @@
from uuid import uuid4
from datetime import datetime, timezone
from sqlalchemy.orm import relationship
from sqlalchemy import Column, DateTime, UUID
from cognee.infrastructure.databases.relational import Base
from sqlalchemy import Column, DateTime
from cognee.infrastructure.databases.relational import Base, UUID
from .ACLResources import ACLResources
class Resource(Base):
__tablename__ = "resources"
id = Column(UUID(as_uuid = True), primary_key = True, default = uuid4)
id = Column(UUID, primary_key = True, default = uuid4)
created_at = Column(DateTime(timezone = True), default = lambda: datetime.now(timezone.utc))
updated_at = Column(DateTime(timezone = True), onupdate = lambda: datetime.now(timezone.utc))
resource_id = Column(UUID(as_uuid = True), nullable = False)
resource_id = Column(UUID, nullable = False)
acls = relationship("ACL", secondary = ACLResources.__tablename__, back_populates = "resources")

View file

@@ -1,14 +1,15 @@
from uuid import UUID as uuid_UUID
from sqlalchemy import ForeignKey, UUID, Column
from sqlalchemy import ForeignKey, Column
from sqlalchemy.orm import relationship, Mapped
from fastapi_users.db import SQLAlchemyBaseUserTableUUID
from cognee.infrastructure.databases.relational import UUID
from .Principal import Principal
from .UserGroup import UserGroup
class User(SQLAlchemyBaseUserTableUUID, Principal):
__tablename__ = "users"
id = Column(UUID(as_uuid = True), ForeignKey("principals.id"), primary_key = True)
id = Column(UUID, ForeignKey("principals.id"), primary_key = True)
groups: Mapped[list["Group"]] = relationship(
secondary = UserGroup.__tablename__,

View file

@@ -1,11 +1,11 @@
from datetime import datetime, timezone
from sqlalchemy import Column, ForeignKey, DateTime, UUID
from cognee.infrastructure.databases.relational import Base
from sqlalchemy import Column, ForeignKey, DateTime
from cognee.infrastructure.databases.relational import Base, UUID
class UserGroup(Base):
__tablename__ = "user_groups"
created_at = Column(DateTime(timezone = True), default = lambda: datetime.now(timezone.utc))
user_id = Column(UUID(as_uuid = True), ForeignKey("users.id"), primary_key = True)
group_id = Column(UUID(as_uuid = True), ForeignKey("groups.id"), primary_key = True)
user_id = Column(UUID, ForeignKey("users.id"), primary_key = True)
group_id = Column(UUID, ForeignKey("groups.id"), primary_key = True)

View file

@@ -24,7 +24,7 @@ def chunk_by_word(data: str):
while next_character is not None and (re.match(paragraph_endings, next_character) or next_character == " "):
j += 1
next_character = data[j] if j < len(data) else None
if next_character.isupper():
if next_character and next_character.isupper():
return True
return False
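
The added "next_character and" guard covers the case where the lookahead walks past the end of the input and next_character becomes None; calling .isupper() on None raised an AttributeError before this fix. A standalone illustration (not the module's code):

# Minimal reproduction of the guarded lookahead at the end of the input.
data = "End of text. "
j = len(data)                                    # lookahead already past the last character
next_character = data[j] if j < len(data) else None

if next_character and next_character.isupper():  # guarded, as in the fix above
    print("sentence boundary")
else:
    print("no boundary")                         # reached instead of crashing on None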

View file

@@ -3,10 +3,10 @@ from cognee.modules.data.processing.document_types import Document, PdfDocument,
def classify_documents(data_documents: list[Data]) -> list[Document]:
documents = [
PdfDocument(id = data_item.id, title=f"{data_item.name}.{data_item.extension}", file_path=data_item.raw_data_location) if data_item.extension == "pdf" else
AudioDocument(id = data_item.id, title=f"{data_item.name}.{data_item.extension}", file_path=data_item.raw_data_location) if data_item.extension == "audio" else
ImageDocument(id = data_item.id, title=f"{data_item.name}.{data_item.extension}", file_path=data_item.raw_data_location) if data_item.extension == "image" else
TextDocument(id = data_item.id, title=f"{data_item.name}.{data_item.extension}", file_path=data_item.raw_data_location)
PdfDocument(id = data_item.id, title=f"{data_item.name}.{data_item.extension}", raw_data_location=data_item.raw_data_location) if data_item.extension == "pdf" else
AudioDocument(id = data_item.id, title=f"{data_item.name}.{data_item.extension}", raw_data_location=data_item.raw_data_location) if data_item.extension == "audio" else
ImageDocument(id = data_item.id, title=f"{data_item.name}.{data_item.extension}", raw_data_location=data_item.raw_data_location) if data_item.extension == "image" else
TextDocument(id = data_item.id, title=f"{data_item.name}.{data_item.extension}", raw_data_location=data_item.raw_data_location)
for data_item in data_documents
]
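
Besides the keyword rename from file_path to raw_data_location (matching the field on the Data record it is copied from), the chained conditional above is an extension-to-class dispatch; an illustrative rewrite as a mapping, assuming the document classes imported at the top of the module:

from cognee.modules.data.processing.document_types import (
    PdfDocument, AudioDocument, ImageDocument, TextDocument,
)

# Hypothetical rewrite for readability; not the committed implementation.
DOCUMENT_CLASSES = {
    "pdf": PdfDocument,
    "audio": AudioDocument,
    "image": ImageDocument,
}

def classify(data_item):
    document_class = DOCUMENT_CLASSES.get(data_item.extension, TextDocument)
    return document_class(
        id = data_item.id,
        title = f"{data_item.name}.{data_item.extension}",
        raw_data_location = data_item.raw_data_location,
    )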

View file

@@ -89,7 +89,7 @@ class OntologyEngine:
chunk_strategy = chunk_config.chunk_strategy
for base_file in documents:
with open(base_file.file_path, "rb") as file:
with open(base_file.raw_data_location, "rb") as file:
try:
file_type = guess_file_type(file)
text = extract_text_from_file(file, file_type)
@@ -175,7 +175,7 @@ async def infer_data_ontology(documents, ontology_model = KnowledgeGraph, root_n
ontology_engine = OntologyEngine()
root_node_id = await ontology_engine.add_graph_ontology(documents = documents)
else:
graph_engine = get_graph_engine()
graph_engine = await get_graph_engine()
await add_model_class_to_graph(ontology_model, graph_engine)
yield (documents, root_node_id)
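
The second hunk adds the missing await: get_graph_engine is a coroutine function, so calling it bare hands a coroutine object rather than an engine to add_model_class_to_graph. A standalone illustration with a stand-in async factory:

import asyncio

async def get_graph_engine():              # stand-in for the real async factory
    return "graph-engine"

async def main():
    broken = get_graph_engine()            # a coroutine object, not an engine
    working = await get_graph_engine()     # the awaited result
    print(type(broken).__name__, working)  # "coroutine graph-engine"
    broken.close()                         # close the never-awaited coroutine

asyncio.run(main())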

View file

@@ -0,0 +1,512 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "958375a6ffc0c2e4",
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-20T14:02:47.336283Z",
"start_time": "2024-09-20T14:02:43.652444Z"
}
},
"outputs": [],
"source": [
"import asyncio\n",
"import logging\n",
"from typing import Union\n",
"\n",
"from cognee.modules.cognify.config import get_cognify_config\n",
"from cognee.shared.data_models import KnowledgeGraph\n",
"from cognee.modules.data.models import Dataset, Data\n",
"from cognee.modules.data.methods.get_dataset_data import get_dataset_data\n",
"from cognee.modules.data.methods import get_datasets, get_datasets_by_name\n",
"from cognee.modules.pipelines.tasks.Task import Task\n",
"from cognee.modules.pipelines import run_tasks, run_tasks_parallel\n",
"from cognee.modules.users.models import User\n",
"from cognee.modules.users.methods import get_default_user\n",
"from cognee.modules.pipelines.operations.get_pipeline_status import get_pipeline_status\n",
"from cognee.modules.pipelines.operations.log_pipeline_status import log_pipeline_status\n",
"from cognee.tasks import chunk_extract_summary, \\\n",
" chunk_naive_llm_classifier, \\\n",
" chunk_remove_disconnected, \\\n",
" infer_data_ontology, \\\n",
" save_chunks_to_store, \\\n",
" chunk_update_check, \\\n",
" chunks_into_graph, \\\n",
" source_documents_to_chunks, \\\n",
" check_permissions_on_documents, \\\n",
" classify_documents"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "df16431d0f48b006",
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-20T14:02:48.519686Z",
"start_time": "2024-09-20T14:02:48.515589Z"
}
},
"outputs": [],
"source": [
"job_position = \"\"\"Senior Data Scientist (Machine Learning)\n",
"\n",
"Company: TechNova Solutions\n",
"Location: San Francisco, CA\n",
"\n",
"Job Description:\n",
"\n",
"TechNova Solutions is seeking a Senior Data Scientist specializing in Machine Learning to join our dynamic analytics team. The ideal candidate will have a strong background in developing and deploying machine learning models, working with large datasets, and translating complex data into actionable insights.\n",
"\n",
"Responsibilities:\n",
"\n",
"Develop and implement advanced machine learning algorithms and models.\n",
"Analyze large, complex datasets to extract meaningful patterns and insights.\n",
"Collaborate with cross-functional teams to integrate predictive models into products.\n",
"Stay updated with the latest advancements in machine learning and data science.\n",
"Mentor junior data scientists and provide technical guidance.\n",
"Qualifications:\n",
"\n",
"Masters or Ph.D. in Data Science, Computer Science, Statistics, or a related field.\n",
"5+ years of experience in data science and machine learning.\n",
"Proficient in Python, R, and SQL.\n",
"Experience with deep learning frameworks (e.g., TensorFlow, PyTorch).\n",
"Strong problem-solving skills and attention to detail.\n",
"Candidate CVs\n",
"\"\"\"\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "9086abf3af077ab4",
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-20T14:02:49.120838Z",
"start_time": "2024-09-20T14:02:49.118294Z"
}
},
"outputs": [],
"source": [
"job_1 = \"\"\"\n",
"CV 1: Relevant\n",
"Name: Dr. Emily Carter\n",
"Contact Information:\n",
"\n",
"Email: emily.carter@example.com\n",
"Phone: (555) 123-4567\n",
"Summary:\n",
"\n",
"Senior Data Scientist with over 8 years of experience in machine learning and predictive analytics. Expertise in developing advanced algorithms and deploying scalable models in production environments.\n",
"\n",
"Education:\n",
"\n",
"Ph.D. in Computer Science, Stanford University (2014)\n",
"B.S. in Mathematics, University of California, Berkeley (2010)\n",
"Experience:\n",
"\n",
"Senior Data Scientist, InnovateAI Labs (2016 Present)\n",
"Led a team in developing machine learning models for natural language processing applications.\n",
"Implemented deep learning algorithms that improved prediction accuracy by 25%.\n",
"Collaborated with cross-functional teams to integrate models into cloud-based platforms.\n",
"Data Scientist, DataWave Analytics (2014 2016)\n",
"Developed predictive models for customer segmentation and churn analysis.\n",
"Analyzed large datasets using Hadoop and Spark frameworks.\n",
"Skills:\n",
"\n",
"Programming Languages: Python, R, SQL\n",
"Machine Learning: TensorFlow, Keras, Scikit-Learn\n",
"Big Data Technologies: Hadoop, Spark\n",
"Data Visualization: Tableau, Matplotlib\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "a9de0cc07f798b7f",
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-20T14:02:49.675003Z",
"start_time": "2024-09-20T14:02:49.671615Z"
}
},
"outputs": [],
"source": [
"job_2 = \"\"\"\n",
"CV 2: Relevant\n",
"Name: Michael Rodriguez\n",
"Contact Information:\n",
"\n",
"Email: michael.rodriguez@example.com\n",
"Phone: (555) 234-5678\n",
"Summary:\n",
"\n",
"Data Scientist with a strong background in machine learning and statistical modeling. Skilled in handling large datasets and translating data into actionable business insights.\n",
"\n",
"Education:\n",
"\n",
"M.S. in Data Science, Carnegie Mellon University (2013)\n",
"B.S. in Computer Science, University of Michigan (2011)\n",
"Experience:\n",
"\n",
"Senior Data Scientist, Alpha Analytics (2017 Present)\n",
"Developed machine learning models to optimize marketing strategies.\n",
"Reduced customer acquisition cost by 15% through predictive modeling.\n",
"Data Scientist, TechInsights (2013 2017)\n",
"Analyzed user behavior data to improve product features.\n",
"Implemented A/B testing frameworks to evaluate product changes.\n",
"Skills:\n",
"\n",
"Programming Languages: Python, Java, SQL\n",
"Machine Learning: Scikit-Learn, XGBoost\n",
"Data Visualization: Seaborn, Plotly\n",
"Databases: MySQL, MongoDB\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "185ff1c102d06111",
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-20T14:02:50.286828Z",
"start_time": "2024-09-20T14:02:50.284369Z"
}
},
"outputs": [],
"source": [
"job_3 = \"\"\"\n",
"CV 3: Relevant\n",
"Name: Sarah Nguyen\n",
"Contact Information:\n",
"\n",
"Email: sarah.nguyen@example.com\n",
"Phone: (555) 345-6789\n",
"Summary:\n",
"\n",
"Data Scientist specializing in machine learning with 6 years of experience. Passionate about leveraging data to drive business solutions and improve product performance.\n",
"\n",
"Education:\n",
"\n",
"M.S. in Statistics, University of Washington (2014)\n",
"B.S. in Applied Mathematics, University of Texas at Austin (2012)\n",
"Experience:\n",
"\n",
"Data Scientist, QuantumTech (2016 Present)\n",
"Designed and implemented machine learning algorithms for financial forecasting.\n",
"Improved model efficiency by 20% through algorithm optimization.\n",
"Junior Data Scientist, DataCore Solutions (2014 2016)\n",
"Assisted in developing predictive models for supply chain optimization.\n",
"Conducted data cleaning and preprocessing on large datasets.\n",
"Skills:\n",
"\n",
"Programming Languages: Python, R\n",
"Machine Learning Frameworks: PyTorch, Scikit-Learn\n",
"Statistical Analysis: SAS, SPSS\n",
"Cloud Platforms: AWS, Azure\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "d55ce4c58f8efb67",
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-20T14:02:50.950343Z",
"start_time": "2024-09-20T14:02:50.946378Z"
}
},
"outputs": [],
"source": [
"job_4 = \"\"\"\n",
"CV 4: Not Relevant\n",
"Name: David Thompson\n",
"Contact Information:\n",
"\n",
"Email: david.thompson@example.com\n",
"Phone: (555) 456-7890\n",
"Summary:\n",
"\n",
"Creative Graphic Designer with over 8 years of experience in visual design and branding. Proficient in Adobe Creative Suite and passionate about creating compelling visuals.\n",
"\n",
"Education:\n",
"\n",
"B.F.A. in Graphic Design, Rhode Island School of Design (2012)\n",
"Experience:\n",
"\n",
"Senior Graphic Designer, CreativeWorks Agency (2015 Present)\n",
"Led design projects for clients in various industries.\n",
"Created branding materials that increased client engagement by 30%.\n",
"Graphic Designer, Visual Innovations (2012 2015)\n",
"Designed marketing collateral, including brochures, logos, and websites.\n",
"Collaborated with the marketing team to develop cohesive brand strategies.\n",
"Skills:\n",
"\n",
"Design Software: Adobe Photoshop, Illustrator, InDesign\n",
"Web Design: HTML, CSS\n",
"Specialties: Branding and Identity, Typography\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "ca4ecc32721ad332",
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-20T14:02:51.548191Z",
"start_time": "2024-09-20T14:02:51.545520Z"
}
},
"outputs": [],
"source": [
"job_5 = \"\"\"\n",
"CV 5: Not Relevant\n",
"Name: Jessica Miller\n",
"Contact Information:\n",
"\n",
"Email: jessica.miller@example.com\n",
"Phone: (555) 567-8901\n",
"Summary:\n",
"\n",
"Experienced Sales Manager with a strong track record in driving sales growth and building high-performing teams. Excellent communication and leadership skills.\n",
"\n",
"Education:\n",
"\n",
"B.A. in Business Administration, University of Southern California (2010)\n",
"Experience:\n",
"\n",
"Sales Manager, Global Enterprises (2015 Present)\n",
"Managed a sales team of 15 members, achieving a 20% increase in annual revenue.\n",
"Developed sales strategies that expanded customer base by 25%.\n",
"Sales Representative, Market Leaders Inc. (2010 2015)\n",
"Consistently exceeded sales targets and received the 'Top Salesperson' award in 2013.\n",
"Skills:\n",
"\n",
"Sales Strategy and Planning\n",
"Team Leadership and Development\n",
"CRM Software: Salesforce, Zoho\n",
"Negotiation and Relationship Building\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "904df61ba484a8e5",
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-20T14:02:54.243987Z",
"start_time": "2024-09-20T14:02:52.498195Z"
}
},
"outputs": [],
"source": [
"import cognee\n",
"from os import listdir, path\n",
"\n",
"data_path = path.abspath(\".data\")\n",
"\n",
"results = await cognee.add([job_1, job_2,job_3,job_4,job_5,job_position], \"example\")\n",
"\n",
"for result in results:\n",
" print(result)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "6f9b564de121713d",
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-20T14:02:55.564445Z",
"start_time": "2024-09-20T14:02:55.562784Z"
}
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 9,
"id": "8911f8bd4f8c440a",
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-20T14:02:56.714408Z",
"start_time": "2024-09-20T14:02:56.711812Z"
}
},
"outputs": [],
"source": [
"# from enum import Enum, auto\n",
"# from typing import Optional, List, Union, Dict, Any\n",
"# from pydantic import BaseModel, Field\n",
"# \n",
"# class Node(BaseModel):\n",
"# \"\"\"Node in a knowledge graph.\"\"\"\n",
"# id: str\n",
"# name: str\n",
"# type: str\n",
"# description: str\n",
"# properties: Optional[Dict[str, Any]] = Field(None, description = \"A dictionary of properties associated with the node.\")\n",
"# \n",
"# class Edge(BaseModel):\n",
"# \"\"\"Edge in a knowledge graph.\"\"\"\n",
"# source_node_id: str\n",
"# target_node_id: str\n",
"# relationship_name: str\n",
"# properties: Optional[Dict[str, Any]] = Field(None, description = \"A dictionary of properties associated with the edge.\")\n",
"# \n",
"# class KnowledgeGraph(BaseModel):\n",
"# \"\"\"Knowledge graph.\"\"\"\n",
"# nodes: List[Node] = Field(..., default_factory=list)\n",
"# edges: List[Edge] = Field(..., default_factory=list)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "7c431fdef4921ae0",
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-20T14:02:57.925667Z",
"start_time": "2024-09-20T14:02:57.922353Z"
}
},
"outputs": [],
"source": [
"async def run_cognify_pipeline(dataset: Dataset, user: User = None):\n",
" data_documents: list[Data] = await get_dataset_data(dataset_id = dataset.id)\n",
"\n",
" try:\n",
"\n",
" root_node_id = None\n",
"\n",
" tasks = [\n",
" Task(classify_documents),\n",
" Task(check_permissions_on_documents, user = user, permissions = [\"write\"]),\n",
" Task(infer_data_ontology, root_node_id = root_node_id, ontology_model = KnowledgeGraph),\n",
" Task(source_documents_to_chunks, parent_node_id = root_node_id), # Classify documents and save them as a nodes in graph db, extract text chunks based on the document type\n",
" Task(chunks_into_graph, graph_model = KnowledgeGraph, collection_name = \"entities\", task_config = { \"batch_size\": 10 }), # Generate knowledge graphs from the document chunks and attach it to chunk nodes\n",
" Task(chunk_update_check, collection_name = \"chunks\"), # Find all affected chunks, so we don't process unchanged chunks\n",
" Task(\n",
" save_chunks_to_store,\n",
" collection_name = \"chunks\",\n",
" ), \n",
" Task(chunk_remove_disconnected), # Remove the obsolete document chunks.\n",
" ]\n",
"\n",
" pipeline = run_tasks(tasks, data_documents)\n",
"\n",
" async for result in pipeline:\n",
" print(result)\n",
" except Exception as error:\n",
" raise error"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f0a91b99c6215e09",
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-20T14:02:58.905774Z",
"start_time": "2024-09-20T14:02:58.625915Z"
}
},
"outputs": [],
"source": [
"user = await get_default_user()\n",
"datasets = await get_datasets_by_name([\"example\"], user.id)\n",
"await run_cognify_pipeline(datasets[0], user)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "080389e5",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from cognee.shared.utils import render_graph\n",
"from cognee.infrastructure.databases.graph import get_graph_engine\n",
"import graphistry\n",
"\n",
"# # Setting an environment variable\n",
"# os.environ[\"GRAPHISTRY_USERNAME\"] = placeholder\n",
"# os.environ[\"GRAPHISTRY_PASSWORD\"] = placeholder\n",
"\n",
"\n",
"graphistry.login(username=os.getenv(\"GRAPHISTRY_USERNAME\"), password=os.getenv(\"GRAPHISTRY_PASSWORD\"))\n",
"\n",
"graph_engine = await get_graph_engine()\n",
"\n",
"graph_url = await render_graph(graph_engine.graph)\n",
"print(graph_url)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e5e7dfc8",
"metadata": {},
"outputs": [],
"source": [
"async def search(\n",
" vector_engine,\n",
" collection_name: str,\n",
" query_text: str = None,\n",
"):\n",
" query_vector = (await vector_engine.embedding_engine.embed_text([query_text]))[0]\n",
"\n",
" connection = await vector_engine.get_connection()\n",
" collection = await connection.open_table(collection_name)\n",
"\n",
" results = await collection.vector_search(query_vector).limit(10).to_pandas()\n",
"\n",
" result_values = list(results.to_dict(\"index\").values())\n",
"\n",
" return [dict(\n",
" id = str(result[\"id\"]),\n",
" payload = result[\"payload\"],\n",
" score = result[\"_distance\"],\n",
" ) for result in result_values]\n",
"\n",
"\n",
"from cognee.infrastructure.databases.vector import get_vector_engine\n",
"\n",
"vector_engine = get_vector_engine()\n",
"results = await search(vector_engine, \"entities\", \"sarah.nguyen@example.com\")\n",
"for result in results:\n",
" print(result)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

poetry.lock (generated): 3167 changes

File diff suppressed because it is too large.

View file

@@ -38,12 +38,10 @@ greenlet = "^3.0.3"
ruff = "^0.2.2"
filetype = "^1.2.0"
nltk = "^3.8.1"
dlt = {extras = ["postgres"], version = "^0.5.2"}
duckdb = {version = "^0.10.0", extras = ["dlt"]}
dlt = {extras = ["sqlalchemy"], version = "^1.0.0"}
overrides = "^7.7.0"
aiofiles = "^23.2.1"
qdrant-client = "^1.9.0"
duckdb-engine = "0.13.0"
graphistry = "^0.33.5"
tenacity = "^8.2.3"
weaviate-client = "4.6.7"
@@ -75,14 +73,12 @@ asyncpg = "^0.29.0"
[tool.poetry.extras]
duckdb = ["duckdb"]
filesystem = ["s3fs", "botocore"]
motherduck = ["duckdb"]
cli = ["pipdeptree", "cron-descriptor"]
weaviate = ["weaviate-client"]
qdrant = ["qdrant-client"]
neo4j = ["neo4j", "py2neo"]
notebook = ["ipykernel","overrides", "ipywidgets", "jupyterlab", "jupyterlab_widgets", "jupyterlab-server", "jupyterlab-git"]
neo4j = ["neo4j"]
notebook = ["ipykernel", "overrides", "ipywidgets", "jupyterlab", "jupyterlab_widgets", "jupyterlab-server", "jupyterlab-git"]
[tool.poetry.group.dev.dependencies]
pytest = "^7.4.0"
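
The dependency changes above carry the headline of this commit: dlt with the postgres extra, duckdb, and duckdb-engine are dropped in favor of dlt[sqlalchemy] >= 1.0, so ingestion can load into any database SQLAlchemy can reach. A minimal sketch of that destination in use, with a placeholder connection string and table name rather than cognee's actual wiring:

# Illustrative only; pipeline name, table name, and the SQLite URL are placeholders.
import dlt

pipeline = dlt.pipeline(
    pipeline_name = "cognee_example",
    destination = dlt.destinations.sqlalchemy("sqlite:///cognee.db"),
    dataset_name = "example",
)

load_info = pipeline.run(
    [{"id": 1, "name": "document.pdf"}],   # any iterable of dicts works as a resource
    table_name = "file_metadata",
)
print(load_info)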