feat: add sqlalchemy as dlt destination (#137)
* feat: add sqlalchemy as dlt destination
* Fix the demo, update Readme
* fix: add 1.5 notebook

Co-authored-by: Vasilije <8619304+Vasilije1990@users.noreply.github.com>

Parent: a09f7991e2
Commit: a9433e9283
56 changed files with 2435 additions and 2554 deletions
.github/workflows/test_neo4j.yml (vendored), 41 changed lines

@@ -18,13 +18,6 @@ jobs:
name: docs changes
uses: ./.github/workflows/get_docs_changes.yml

setup_docker:
name: Set up Docker Buildx
runs-on: ubuntu-latest
steps:
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3

run_neo4j_integration_test:
name: test
needs: get_docs_changes

@@ -35,18 +28,6 @@ jobs:
run:
shell: bash

services:
postgres:
image: postgres:latest
env:
POSTGRES_USER: cognee
POSTGRES_PASSWORD: cognee
POSTGRES_DB: cognee_db
volumes:
- postgres_data:/var/lib/postgresql/data
ports:
- 5432:5432

steps:
- name: Check out
uses: actions/checkout@master

@@ -66,18 +47,6 @@ jobs:
- name: Install dependencies
run: poetry install --no-interaction

- name: Create .cognee_system directory and print path
run: |
mkdir .cognee_system
echo $(pwd)/.cognee_system

- name: Wait for PostgreSQL to be ready
run: |
echo "Waiting for PostgreSQL to be ready..."
until pg_isready -h localhost -p 5432 -U cognee; do
sleep 1
done

- name: Run default Neo4j
env:
ENV: 'dev'

@@ -85,14 +54,4 @@ jobs:
GRAPH_DATABASE_URL: ${{ secrets.NEO4J_API_URL }}
GRAPH_DATABASE_PASSWORD: ${{ secrets.NEO4J_API_KEY }}
GRAPH_DATABASE_USERNAME: "neo4j"
DB_USER: cognee
DB_PASSWORD: cognee
DB_NAME: cognee_db
DB_HOST: localhost
DB_PORT: 5432
DESTINATION__POSTGRES__CREDENTIALS__HOST: localhost
DESTINATION__POSTGRES__CREDENTIALS__PORT: 5432
DESTINATION__POSTGRES__CREDENTIALS__USERNAME: cognee
DESTINATION__POSTGRES__CREDENTIALS__PASSWORD: cognee
DESTINATION__POSTGRES__CREDENTIALS__DATABASE: cognee_db
run: poetry run python ./cognee/tests/test_neo4j.py
.github/workflows/test_python_3_10.yml (vendored), 49 changed lines

@@ -18,15 +18,6 @@ jobs:
name: docs changes
uses: ./.github/workflows/get_docs_changes.yml

setup_docker:
name: Set up Docker Buildx
runs-on: ubuntu-latest
strategy:
fail-fast: false
steps:
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3

run_common:
name: test
needs: get_docs_changes

@@ -38,19 +29,6 @@ jobs:
run:
shell: bash

services:
postgres:
image: postgres:latest
env:
PGUSER: cognee
POSTGRES_USER: cognee
POSTGRES_PASSWORD: cognee
POSTGRES_DB: cognee_db
volumes:
- postgres_data:/var/lib/postgresql/data
ports:
- 5432:5432

steps:
- name: Check out
uses: actions/checkout@master

@@ -71,23 +49,6 @@ jobs:
- name: Install dependencies
run: poetry install --no-interaction

- name: Create .cognee_system directory and print path
run: |
mkdir .cognee_system
echo $(pwd)/.cognee_system

- name: Wait for PostgreSQL to be ready
env:
PGUSER: cognee
POSTGRES_USER: cognee
POSTGRES_PASSWORD: cognee
POSTGRES_DB: cognee_db
run: |
echo "Waiting for PostgreSQL to be ready..."
until pg_isready -h localhost -p 5432 -U cognee; do
sleep 1
done

- name: Run tests
run: poetry run pytest tests/

@@ -95,16 +56,6 @@ jobs:
env:
ENV: 'dev'
LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
DB_HOST: localhost
DB_USERNAME: cognee
DB_PASSWORD: cognee
DB_DATABASE: cognee_db
DB_PORT: 5432
DESTINATION__POSTGRES__CREDENTIALS__HOST: localhost
DESTINATION__POSTGRES__CREDENTIALS__PORT: 5432
DESTINATION__POSTGRES__CREDENTIALS__USERNAME: cognee
DESTINATION__POSTGRES__CREDENTIALS__PASSWORD: cognee
DESTINATION__POSTGRES__CREDENTIALS__DATABASE: cognee_db
run: poetry run python ./cognee/tests/test_library.py

- name: Clean up disk space
.github/workflows/test_python_3_11.yml (vendored), 49 changed lines

@@ -18,15 +18,6 @@ jobs:
name: docs changes
uses: ./.github/workflows/get_docs_changes.yml

setup_docker:
name: Set up Docker Buildx
runs-on: ubuntu-latest
strategy:
fail-fast: false
steps:
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3

run_common:
name: test
needs: get_docs_changes

@@ -38,19 +29,6 @@ jobs:
run:
shell: bash

services:
postgres:
image: postgres:latest
env:
PGUSER: cognee
POSTGRES_USER: cognee
POSTGRES_PASSWORD: cognee
POSTGRES_DB: cognee_db
volumes:
- postgres_data:/var/lib/postgresql/data
ports:
- 5432:5432

steps:
- name: Check out
uses: actions/checkout@master

@@ -71,23 +49,6 @@ jobs:
- name: Install dependencies
run: poetry install --no-interaction

- name: Create .cognee_system directory and print path
run: |
mkdir .cognee_system
echo $(pwd)/.cognee_system

- name: Wait for PostgreSQL to be ready
env:
PGUSER: cognee
POSTGRES_USER: cognee
POSTGRES_PASSWORD: cognee
POSTGRES_DB: cognee_db
run: |
echo "Waiting for PostgreSQL to be ready..."
until pg_isready -h localhost -p 5432 -U cognee; do
sleep 1
done

- name: Run tests
run: poetry run pytest tests/

@@ -95,16 +56,6 @@ jobs:
env:
ENV: 'dev'
LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
DB_HOST: localhost
DB_USERNAME: cognee
DB_PASSWORD: cognee
DB_DATABASE: cognee_db
DB_PORT: 5432
DESTINATION__POSTGRES__CREDENTIALS__HOST: localhost
DESTINATION__POSTGRES__CREDENTIALS__PORT: 5432
DESTINATION__POSTGRES__CREDENTIALS__USERNAME: cognee
DESTINATION__POSTGRES__CREDENTIALS__PASSWORD: cognee
DESTINATION__POSTGRES__CREDENTIALS__DATABASE: cognee_db
run: poetry run python ./cognee/tests/test_library.py

- name: Clean up disk space
.github/workflows/test_python_3_9.yml (vendored), 49 changed lines

@@ -18,15 +18,6 @@ jobs:
name: docs changes
uses: ./.github/workflows/get_docs_changes.yml

setup_docker:
name: Set up Docker Buildx
runs-on: ubuntu-latest
strategy:
fail-fast: false
steps:
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3

run_common:
name: test
needs: get_docs_changes

@@ -38,19 +29,6 @@ jobs:
run:
shell: bash

services:
postgres:
image: postgres:latest
env:
PGUSER: cognee
POSTGRES_USER: cognee
POSTGRES_PASSWORD: cognee
POSTGRES_DB: cognee_db
volumes:
- postgres_data:/var/lib/postgresql/data
ports:
- 5432:5432

steps:
- name: Check out
uses: actions/checkout@master

@@ -71,23 +49,6 @@ jobs:
- name: Install dependencies
run: poetry install --no-interaction

- name: Create .cognee_system directory and print path
run: |
mkdir .cognee_system
echo $(pwd)/.cognee_system

- name: Wait for PostgreSQL to be ready
env:
PGUSER: cognee
POSTGRES_USER: cognee
POSTGRES_PASSWORD: cognee
POSTGRES_DB: cognee_db
run: |
echo "Waiting for PostgreSQL to be ready..."
until pg_isready -h localhost -p 5432 -U cognee; do
sleep 1
done

- name: Run tests
run: poetry run pytest tests/

@@ -95,16 +56,6 @@ jobs:
env:
ENV: 'dev'
LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
DB_HOST: localhost
DB_USERNAME: cognee
DB_PASSWORD: cognee
DB_DATABASE: cognee_db
DB_PORT: 5432
DESTINATION__POSTGRES__CREDENTIALS__HOST: localhost
DESTINATION__POSTGRES__CREDENTIALS__PORT: 5432
DESTINATION__POSTGRES__CREDENTIALS__USERNAME: cognee
DESTINATION__POSTGRES__CREDENTIALS__PASSWORD: cognee
DESTINATION__POSTGRES__CREDENTIALS__DATABASE: cognee_db
run: poetry run python ./cognee/tests/test_library.py

- name: Clean up disk space
.github/workflows/test_qdrant.yml (vendored), 41 changed lines

@@ -18,13 +18,6 @@ jobs:
name: docs changes
uses: ./.github/workflows/get_docs_changes.yml

setup_docker:
name: Set up Docker Buildx
runs-on: ubuntu-latest
steps:
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3

run_qdrant_integration_test:
name: test
needs: get_docs_changes

@@ -35,18 +28,6 @@ jobs:
run:
shell: bash

services:
postgres:
image: postgres:latest
env:
POSTGRES_USER: cognee
POSTGRES_PASSWORD: cognee
POSTGRES_DB: cognee_db
volumes:
- postgres_data:/var/lib/postgresql/data
ports:
- 5432:5432

steps:
- name: Check out
uses: actions/checkout@master

@@ -66,32 +47,10 @@ jobs:
- name: Install dependencies
run: poetry install --no-interaction

- name: Create .cognee_system directory and print path
run: |
mkdir .cognee_system
echo $(pwd)/.cognee_system

- name: Wait for PostgreSQL to be ready
run: |
echo "Waiting for PostgreSQL to be ready..."
until pg_isready -h localhost -p 5432 -U cognee; do
sleep 1
done

- name: Run default Qdrant
env:
ENV: 'dev'
LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
VECTOR_DB_URL: ${{ secrets.QDRANT_API_URL }}
VECTOR_DB_KEY: ${{ secrets.QDRANT_API_KEY }}
DB_USER: cognee
DB_PASSWORD: cognee
DB_NAME: cognee_db
DB_HOST: localhost
DB_PORT: 5432
DESTINATION__POSTGRES__CREDENTIALS__HOST: localhost
DESTINATION__POSTGRES__CREDENTIALS__PORT: 5432
DESTINATION__POSTGRES__CREDENTIALS__USERNAME: cognee
DESTINATION__POSTGRES__CREDENTIALS__PASSWORD: cognee
DESTINATION__POSTGRES__CREDENTIALS__DATABASE: cognee_db
run: poetry run python ./cognee/tests/test_qdrant.py
.github/workflows/test_weaviate.yml (vendored), 41 changed lines

@@ -18,13 +18,6 @@ jobs:
name: docs changes
uses: ./.github/workflows/get_docs_changes.yml

setup_docker:
name: Set up Docker Buildx
runs-on: ubuntu-latest
steps:
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3

run_weaviate_integration_test:
name: test
needs: get_docs_changes

@@ -35,18 +28,6 @@ jobs:
run:
shell: bash

services:
postgres:
image: postgres:latest
env:
POSTGRES_USER: cognee
POSTGRES_PASSWORD: cognee
POSTGRES_DB: cognee_db
volumes:
- postgres_data:/var/lib/postgresql/data
ports:
- 5432:5432

steps:
- name: Check out
uses: actions/checkout@master

@@ -66,32 +47,10 @@ jobs:
- name: Install dependencies
run: poetry install --no-interaction

- name: Create .cognee_system directory and print path
run: |
mkdir .cognee_system
echo $(pwd)/.cognee_system

- name: Wait for PostgreSQL to be ready
run: |
echo "Waiting for PostgreSQL to be ready..."
until pg_isready -h localhost -p 5432 -U cognee; do
sleep 1
done

- name: Run default Weaviate
env:
ENV: 'dev'
LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
VECTOR_DB_URL: ${{ secrets.WEAVIATE_API_URL }}
VECTOR_DB_KEY: ${{ secrets.WEAVIATE_API_KEY }}
DB_USER: cognee
DB_PASSWORD: cognee
DB_NAME: cognee_db
DB_HOST: localhost
DB_PORT: 5432
DESTINATION__POSTGRES__CREDENTIALS__HOST: localhost
DESTINATION__POSTGRES__CREDENTIALS__PORT: 5432
DESTINATION__POSTGRES__CREDENTIALS__USERNAME: cognee
DESTINATION__POSTGRES__CREDENTIALS__PASSWORD: cognee
DESTINATION__POSTGRES__CREDENTIALS__DATABASE: cognee_db
run: poetry run python ./cognee/tests/test_weaviate.py
README.md, 172 changed lines

@@ -18,24 +18,12 @@ We build for developers who need a reliable, production-ready data layer for AI
</a>
</p>

cognee implements scalable, modular data pipelines that allow for creating the LLM-enriched data layer using graph and vector stores.

<p>
<i> cognee aims to be dbt for LLMOps</i>
</p>

## What is cognee?

cognee implements scalable, modular ECL (Extract, Cognify, Load) pipelines that allow you to interconnect and retrieve past conversations, documents, and audio transcriptions, while also reducing hallucinations, developer effort, and cost.
Try it in a Google Colab <a href="https://colab.research.google.com/drive/1jayZ5JRwDaUGFvCw9UZySBG-iB9gpYfu?usp=sharing">notebook</a> or have a look at our <a href="https://topoteretes.github.io/cognee">documentation</a>.

If you have questions, join our <a href="https://discord.gg/NQPKmU5CCg">Discord</a> community.

@@ -58,7 +46,7 @@ poetry add cognee
```

## 💻 Usage
## 💻 Basic Usage

### Setup

@@ -75,24 +63,6 @@ cognee.config.llm_api_key = "YOUR_OPENAI_API_KEY"
```
You can use different LLM providers; for more info check out our <a href="https://topoteretes.github.io/cognee">documentation</a>.

In the next step, make sure to launch a Postgres instance. Here is an example from our docker-compose:
```
postgres:
image: postgres:latest
container_name: postgres
environment:
POSTGRES_USER: cognee
POSTGRES_PASSWORD: cognee
POSTGRES_DB: cognee_db
volumes:
- postgres_data:/var/lib/postgresql/data
ports:
- 5432:5432
networks:
- cognee-network
```

If you are using Networkx, create an account on Graphistry to visualize results:
```

@@ -106,12 +76,7 @@ docker-compose up cognee
```
Then navigate to localhost:3000/wizard

### Run the default example

Make sure to launch the Postgres instance first. Navigate to the cognee folder and run:
```
docker compose up postgres
```
### Simple example

Run the default cognee pipeline:
@@ -123,7 +88,7 @@ text = """Natural language processing (NLP) is an interdisciplinary

await cognee.add([text], "example_dataset") # Add a new piece of information

await cognee.cognify() # Use LLMs and cognee to create knowledge
await cognee.cognify() # Use LLMs and cognee to create a semantic graph

search_results = await cognee.search("SIMILARITY", {'query': 'Tell me about NLP'}) # Query cognee for the knowledge

@@ -132,19 +97,20 @@ print(search_results)
```

### Create your pipelines
### Create your own memory store

cognee framework consists of tasks that can be grouped into pipelines.
Each task can be an independent part of business logic that can be tied to other tasks to form a pipeline.
These tasks persist data into your memory store, enabling you to search for relevant context of past conversations, documents, or any other data you have stored.

### Example: Classify your documents

cognee framework consists of tasks that can be grouped into pipelines. Each task can be an independent part of business logic that can be tied to other tasks to form a pipeline.
Here is an example of how it looks for a default cognify pipeline:

1. To prepare the data for the pipeline run, first we need to add it to our metastore and normalize it:

Start with:
```
docker compose up postgres
```
And then run:
Start with:
```
text = """Natural language processing (NLP) is an interdisciplinary
subfield of computer science and information retrieval"""

@@ -158,90 +124,62 @@ Here we show an example of creating a naive LLM classifier that takes a Pydantic
We provided just a snippet for reference, but feel free to check out the implementation in our repo.

```
async def chunk_naive_llm_classifier(data_chunks: list[DocumentChunk], classification_model: Type[BaseModel]):
if len(data_chunks) == 0:
return data_chunks

async def chunk_naive_llm_classifier(
data_chunks: list[DocumentChunk],
classification_model: Type[BaseModel]
):
# Extract classifications asynchronously
chunk_classifications = await asyncio.gather(
*[extract_categories(chunk.text, classification_model) for chunk in data_chunks],
*(extract_categories(chunk.text, classification_model) for chunk in data_chunks)
)

classification_data_points = []

for chunk_index, chunk in enumerate(data_chunks):
chunk_classification = chunk_classifications[chunk_index]
classification_data_points.append(uuid5(NAMESPACE_OID, chunk_classification.label.type))
classification_data_points.append(uuid5(NAMESPACE_OID, chunk_classification.label.type))

for classification_subclass in chunk_classification.label.subclass:
classification_data_points.append(uuid5(NAMESPACE_OID, classification_subclass.value))
# Collect classification data points using a set to avoid duplicates
classification_data_points = {
uuid5(NAMESPACE_OID, cls.label.type)
for cls in chunk_classifications
} | {
uuid5(NAMESPACE_OID, subclass.value)
for cls in chunk_classifications
for subclass in cls.label.subclass
}

vector_engine = get_vector_engine()
collection_name = "classification"

# Define the payload schema
class Keyword(BaseModel):
uuid: str
text: str
chunk_id: str
document_id: str

collection_name = "classification"

if await vector_engine.has_collection(collection_name):
existing_data_points = await vector_engine.retrieve(
collection_name,
list(set(classification_data_points)),
) if len(classification_data_points) > 0 else []

existing_points_map = {point.id: True for point in existing_data_points}
# Ensure the collection exists and retrieve existing data points
if not await vector_engine.has_collection(collection_name):
await vector_engine.create_collection(collection_name, payload_schema=Keyword)
existing_points_map = {}
else:
existing_points_map = {}
await vector_engine.create_collection(collection_name, payload_schema=Keyword)

data_points = []
nodes = []
edges = []

for (chunk_index, data_chunk) in enumerate(data_chunks):
chunk_classification = chunk_classifications[chunk_index]
classification_type_label = chunk_classification.label.type
classification_type_id = uuid5(NAMESPACE_OID, classification_type_label)
return data_chunks

...

```

To see existing tasks, have a look at cognee.tasks.
We have a large number of tasks that can be used in your pipelines, and you can also create your own tasks to fit your business logic.

3. Once we have our tasks, it is time to group them into a pipeline.
This snippet shows how a group of tasks can be added to a pipeline, and how they can pass the information forward from one to another.
This simplified snippet demonstrates how tasks can be added to a pipeline, and how they can pass the information forward from one to another.

```
tasks = [
Task(document_to_ontology, root_node_id = root_node_id),
Task(source_documents_to_chunks, parent_node_id = root_node_id), # Classify documents and save them as a nodes in graph db, extract text chunks based on the document type
Task(chunk_to_graph_decomposition, topology_model = KnowledgeGraph, task_config = { "batch_size": 10 }), # Set the graph topology for the document chunk data
Task(chunks_into_graph, graph_model = KnowledgeGraph, collection_name = "entities"), # Generate knowledge graphs from the document chunks and attach it to chunk nodes
Task(chunk_update_check, collection_name = "chunks"), # Find all affected chunks, so we don't process unchanged chunks
Task(
save_chunks_to_store,
collection_name = "chunks",
), # Save the document chunks in vector db and as nodes in graph db (connected to the document node and between each other)
run_tasks_parallel([
Task(
chunk_extract_summary,
summarization_model = cognee_config.summarization_model,
collection_name = "chunk_summaries",
), # Summarize the document chunks
Task(
chunk_naive_llm_classifier,
classification_model = cognee_config.classification_model,
),
]),
Task(chunk_remove_disconnected), # Remove the obsolete document chunks.
]

pipeline = run_tasks(tasks, documents)
Task(
chunk_naive_llm_classifier,
classification_model = cognee_config.classification_model,
)

pipeline = run_tasks(tasks, documents)

```
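Beyond the built-in tasks above, a pipeline step is just a callable wrapped in `Task`. Below is a minimal sketch of a custom task, assuming the `Task`/`run_tasks` API shown in the snippet above and assuming a task receives the previous task's output; the exact calling convention is not spelled out in this diff.

```
async def count_chunks(data_chunks: list):
    # Hypothetical custom step: log how many chunks flow through and pass them on.
    print(f"Processing {len(data_chunks)} chunks")
    return data_chunks

tasks = [
    Task(count_chunks),  # assumed to compose with the built-in tasks above
]

pipeline = run_tasks(tasks, documents)
```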

@@ -277,3 +215,23 @@ Check out our demo notebook [here](https://github.com/topoteretes/cognee/blob/ma

[](https://star-history.com/#topoteretes/cognee&Date)

## Get Started

### Install Server

Please see the [cognee Quick Start Guide](https://topoteretes.github.io/cognee/quickstart/) for important configuration information.

```bash
docker compose up
```

### Install SDK

Please see the cognee [Development Guide](https://topoteretes.github.io/cognee/quickstart/) for important beta information and usage instructions.

```bash
pip install cognee
```
@@ -14,8 +14,6 @@ from cognee.modules.users.models import User
from cognee.modules.users.methods import get_authenticated_user

from cognee.infrastructure.databases.relational import create_db_and_tables

# Set up logging
logging.basicConfig(
level=logging.INFO, # Set the logging level (e.g., DEBUG, INFO, WARNING, ERROR, CRITICAL)

@@ -34,8 +32,12 @@ from contextlib import asynccontextmanager

@asynccontextmanager
async def lifespan(app: FastAPI):
from cognee.infrastructure.databases.relational import create_db_and_tables
from cognee.modules.users.methods import get_default_user

# Not needed if you setup a migration system like Alembic
await create_db_and_tables()
await get_default_user()
yield

app = FastAPI(debug = os.getenv("ENV") != "prod", lifespan = lifespan)

@@ -394,10 +396,10 @@ def start_api_server(host: str = "0.0.0.0", port: int = 8000):
try:
logger.info("Starting server at %s:%s", host, port)

import asyncio
from cognee.modules.data.deletion import prune_system, prune_data
asyncio.run(prune_data())
asyncio.run(prune_system(metadata = True))
# import asyncio
# from cognee.modules.data.deletion import prune_system, prune_data
# asyncio.run(prune_data())
# asyncio.run(prune_system(metadata = True))

uvicorn.run(app, host = host, port = port)
except Exception as e:
@@ -2,7 +2,6 @@ from typing import List, Union, BinaryIO
from os import path
import asyncio
import dlt
import duckdb

import cognee.modules.ingestion as ingestion
from cognee.infrastructure.files.storage import LocalStorage

@@ -81,22 +80,16 @@ async def add_files(file_paths: List[str], dataset_name: str, user: User = None)

relational_config = get_relational_config()

if relational_config.db_provider == "duckdb":
db = duckdb.connect(relational_config.db_file_path)

destination = dlt.destinations.duckdb(
credentials = db,
)
else:
destination = dlt.destinations.postgres(
credentials = {
"host": relational_config.db_host,
"port": relational_config.db_port,
"user": relational_config.db_user,
"password": relational_config.db_password,
"database": relational_config.db_name,
},
)
destination = dlt.destinations.sqlalchemy(
credentials = {
"host": relational_config.db_host,
"port": relational_config.db_port,
"username": relational_config.db_username,
"password": relational_config.db_password,
"database": relational_config.db_name,
"drivername": relational_config.db_provider,
},
)

pipeline = dlt.pipeline(
pipeline_name = "file_load_from_filesystem",
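The credential keys passed to `dlt.destinations.sqlalchemy` above (drivername, username, password, host, port, database) are the standard components of a SQLAlchemy connection URL. As a rough illustration only (this helper is not part of the commit, and the field names are taken from `RelationalConfig` as changed later in this diff), the same settings could be assembled into a URL like this:

```python
from sqlalchemy.engine import URL

def build_sqlalchemy_url(config) -> URL:
    # Sketch: mirrors the credentials dict handed to dlt.destinations.sqlalchemy().
    # `config` is assumed to be a RelationalConfig instance from this commit.
    return URL.create(
        drivername = config.db_provider,  # e.g. "sqlite" or "postgres"
        username = config.db_username,
        password = config.db_password,
        host = config.db_host,
        port = int(config.db_port) if config.db_port else None,
        database = config.db_name,
    )
```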
@@ -46,72 +46,6 @@ async def cognify(datasets: Union[str, list[str]] = None, user: User = None):
if type(datasets[0]) == str:
datasets = await get_datasets_by_name(datasets, user.id)

async def run_cognify_pipeline(dataset: Dataset):
data_documents: list[Data] = await get_dataset_data(dataset_id = dataset.id)

document_ids_str = [str(document.id) for document in data_documents]

dataset_id = dataset.id
dataset_name = generate_dataset_name(dataset.name)

async with update_status_lock:
task_status = await get_pipeline_status([dataset_id])

if dataset_id in task_status and task_status[dataset_id] == "DATASET_PROCESSING_STARTED":
logger.info("Dataset %s is already being processed.", dataset_name)
return

await log_pipeline_status(dataset_id, "DATASET_PROCESSING_STARTED", {
"dataset_name": dataset_name,
"files": document_ids_str,
})
try:
cognee_config = get_cognify_config()

root_node_id = None

tasks = [
Task(classify_documents),
Task(check_permissions_on_documents, user = user, permissions = ["write"]),
Task(infer_data_ontology, root_node_id = root_node_id, ontology_model = KnowledgeGraph),
Task(source_documents_to_chunks, parent_node_id = root_node_id), # Classify documents and save them as a nodes in graph db, extract text chunks based on the document type
Task(chunks_into_graph, graph_model = KnowledgeGraph, collection_name = "entities", task_config = { "batch_size": 10 }), # Generate knowledge graphs from the document chunks and attach it to chunk nodes
Task(chunk_update_check, collection_name = "chunks"), # Find all affected chunks, so we don't process unchanged chunks
Task(
save_chunks_to_store,
collection_name = "chunks",
), # Save the document chunks in vector db and as nodes in graph db (connected to the document node and between each other)
run_tasks_parallel([
Task(
chunk_extract_summary,
summarization_model = cognee_config.summarization_model,
collection_name = "chunk_summaries",
), # Summarize the document chunks
Task(
chunk_naive_llm_classifier,
classification_model = cognee_config.classification_model,
),
]),
Task(chunk_remove_disconnected), # Remove the obsolete document chunks.
]

pipeline = run_tasks(tasks, data_documents)

async for result in pipeline:
print(result)

await log_pipeline_status(dataset_id, "DATASET_PROCESSING_FINISHED", {
"dataset_name": dataset_name,
"files": document_ids_str,
})
except Exception as error:
await log_pipeline_status(dataset_id, "DATASET_PROCESSING_ERROR", {
"dataset_name": dataset_name,
"files": document_ids_str,
})
raise error

existing_datasets_map = {
generate_dataset_name(dataset.name): True for dataset in existing_datasets
}

@@ -122,10 +56,76 @@ async def cognify(datasets: Union[str, list[str]] = None, user: User = None):
dataset_name = generate_dataset_name(dataset.name)

if dataset_name in existing_datasets_map:
awaitables.append(run_cognify_pipeline(dataset))
awaitables.append(run_cognify_pipeline(dataset, user))

return await asyncio.gather(*awaitables)

async def run_cognify_pipeline(dataset: Dataset, user: User):
data_documents: list[Data] = await get_dataset_data(dataset_id = dataset.id)

document_ids_str = [str(document.id) for document in data_documents]

dataset_id = dataset.id
dataset_name = generate_dataset_name(dataset.name)

async with update_status_lock:
task_status = await get_pipeline_status([dataset_id])

if dataset_id in task_status and task_status[dataset_id] == "DATASET_PROCESSING_STARTED":
logger.info("Dataset %s is already being processed.", dataset_name)
return

await log_pipeline_status(dataset_id, "DATASET_PROCESSING_STARTED", {
"dataset_name": dataset_name,
"files": document_ids_str,
})
try:
cognee_config = get_cognify_config()

root_node_id = None

tasks = [
Task(classify_documents),
Task(check_permissions_on_documents, user = user, permissions = ["write"]),
Task(infer_data_ontology, root_node_id = root_node_id, ontology_model = KnowledgeGraph),
Task(source_documents_to_chunks, parent_node_id = root_node_id), # Classify documents and save them as a nodes in graph db, extract text chunks based on the document type
Task(chunks_into_graph, graph_model = KnowledgeGraph, collection_name = "entities", task_config = { "batch_size": 10 }), # Generate knowledge graphs from the document chunks and attach it to chunk nodes
Task(chunk_update_check, collection_name = "chunks"), # Find all affected chunks, so we don't process unchanged chunks
Task(
save_chunks_to_store,
collection_name = "chunks",
), # Save the document chunks in vector db and as nodes in graph db (connected to the document node and between each other)
run_tasks_parallel([
Task(
chunk_extract_summary,
summarization_model = cognee_config.summarization_model,
collection_name = "chunk_summaries",
), # Summarize the document chunks
Task(
chunk_naive_llm_classifier,
classification_model = cognee_config.classification_model,
),
]),
Task(chunk_remove_disconnected), # Remove the obsolete document chunks.
]

pipeline = run_tasks(tasks, data_documents)

async for result in pipeline:
print(result)

await log_pipeline_status(dataset_id, "DATASET_PROCESSING_FINISHED", {
"dataset_name": dataset_name,
"files": document_ids_str,
})
except Exception as error:
await log_pipeline_status(dataset_id, "DATASET_PROCESSING_ERROR", {
"dataset_name": dataset_name,
"files": document_ids_str,
})
raise error

def generate_dataset_name(dataset_name: str) -> str:
return dataset_name.replace(".", "_").replace(" ", "_")
@@ -1,23 +0,0 @@
from typing import Protocol

class DatabaseEngine(Protocol):
async def ensure_tables(self):
pass

def database_exists(self, db_name: str) -> bool:
pass

def create_database(self, db_name: str):
pass

def drop_database(self, db_name: str):
pass

async def table_exists(self, table_name: str) -> bool:
pass

async def create_tables(self):
pass

async def create(self, data):
pass
@@ -1,29 +0,0 @@
import inspect
from typing import Any

from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.orm import Session

class FakeAsyncSession:
def __init__(self, session: Session):
self.session = session

def run_sync(self, *args, **kwargs):
return self.execute(*args, **kwargs)

def __getattr__(self, name: str) -> Any:
"""
If the method being called is async in AsyncSession, create a fake async version
for Session so callers can `await` as usual. Think `commit`, `refresh`,
`delete`, etc.
"""
async_session_attr = getattr(AsyncSession, name, None)
session_attr = getattr(self.session, name)

if not inspect.iscoroutinefunction(async_session_attr):
return session_attr

async def async_wrapper(*args, **kwargs):
return session_attr(*args, **kwargs)

return async_wrapper
@@ -1,7 +1,7 @@
from .ModelBase import Base
from .DatabaseEngine import DatabaseEngine
from .sqlite.SqliteEngine import SqliteEngine
from .duckdb.DuckDBAdapter import DuckDBAdapter
from .config import get_relational_config
from .create_db_and_tables import create_db_and_tables
from .get_relational_engine import get_relational_engine

# Global data types
from .data_types.UUID import UUID
@@ -1,4 +1,5 @@
import os
from typing import Union
from functools import lru_cache
from pydantic_settings import BaseSettings, SettingsConfigDict
from cognee.root_dir import get_absolute_path

@@ -6,13 +7,11 @@ from cognee.root_dir import get_absolute_path
class RelationalConfig(BaseSettings):
db_path: str = os.path.join(get_absolute_path(".cognee_system"), "databases")
db_name: str = "cognee_db"
db_host: str = "localhost"
db_port: str = "5432"
db_user: str = "cognee"
db_password: str = "cognee"
db_provider: str = "postgresql+asyncpg"
db_file_path: str = os.path.join(db_path, db_name)

db_host: Union[str, None] = None # "localhost"
db_port: Union[str, None] = None # "5432"
db_username: Union[str, None] = None # "cognee"
db_password: Union[str, None] = None # "cognee"
db_provider: str = "sqlite"

model_config = SettingsConfigDict(env_file = ".env", extra = "allow")

@@ -22,7 +21,7 @@ class RelationalConfig(BaseSettings):
"db_name": self.db_name,
"db_host": self.db_host,
"db_port": self.db_port,
"db_user": self.db_user,
"db_username": self.db_username,
"db_password": self.db_password,
"db_provider": self.db_provider,
}
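Since `RelationalConfig` is a pydantic `BaseSettings` that reads a `.env` file, each field can presumably be overridden through an environment variable of the same name; the CI workflows in this commit set `DB_HOST`, `DB_PORT`, `DB_USERNAME`, `DB_PASSWORD`, and `DB_NAME`. A hypothetical sketch, not part of the commit, of switching the new sqlite default back to Postgres:

```python
import os

# Assumed env-var names, matching the workflow files above; pydantic-settings
# maps them to RelationalConfig fields case-insensitively.
os.environ.update({
    "DB_PROVIDER": "postgres",
    "DB_HOST": "localhost",
    "DB_PORT": "5432",
    "DB_USERNAME": "cognee",
    "DB_PASSWORD": "cognee",
    "DB_NAME": "cognee_db",
})

from cognee.infrastructure.databases.relational import get_relational_config

config = get_relational_config()
print(config.db_provider, config.db_host, config.db_name)
```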
@@ -1,9 +1,14 @@
from cognee.infrastructure.files.storage import LocalStorage
from .ModelBase import Base
from .get_relational_engine import get_relational_engine
from .get_relational_engine import get_relational_engine, get_relational_config

async def create_db_and_tables():
relational_config = get_relational_config()
relational_engine = get_relational_engine()

if relational_engine.engine.dialect.name == "sqlite":
LocalStorage.ensure_directory_exists(relational_config.db_path)

async with relational_engine.engine.begin() as connection:
if len(Base.metadata.tables.keys()) > 0:
await connection.run_sync(Base.metadata.create_all)
@@ -3,18 +3,16 @@ from .sqlalchemy.SqlAlchemyAdapter import SQLAlchemyAdapter
def create_relational_engine(
db_path: str,
db_name: str,
db_provider: str,
db_host: str,
db_port: str,
db_user: str,
db_username: str,
db_password: str,
db_provider: str,
):
return SQLAlchemyAdapter(
db_name = db_name,
db_path = db_path,
db_type = db_provider,
db_host = db_host,
db_port = db_port,
db_user = db_user,
db_password = db_password
)
if db_provider == "sqlite":
connection_string = f"sqlite+aiosqlite:///{db_path}/{db_name}"

if db_provider == "postgres":
connection_string = f"postgresql+asyncpg://{db_username}:{db_password}@{db_host}:{db_port}/{db_name}"

return SQLAlchemyAdapter(connection_string)
@@ -0,0 +1,43 @@
import uuid

from sqlalchemy.types import TypeDecorator, BINARY
from sqlalchemy.dialects.postgresql import UUID as psqlUUID

class UUID(TypeDecorator):
"""Platform-independent GUID type.

Uses Postgresql's UUID type, otherwise uses
BINARY(16), to store UUID.

"""
impl = BINARY

def load_dialect_impl(self, dialect):
if dialect.name == 'postgresql':
return dialect.type_descriptor(psqlUUID())
else:
return dialect.type_descriptor(BINARY(16))

def process_bind_param(self, value, dialect):
if value is None:
return value
else:
if not isinstance(value, uuid.UUID):
if isinstance(value, bytes):
value = uuid.UUID(bytes = value)
elif isinstance(value, int):
value = uuid.UUID(int = value)
elif isinstance(value, str):
value = uuid.UUID(value)
if dialect.name == 'postgresql':
return str(value)
else:
return value.bytes

def process_result_value(self, value, dialect):
if value is None:
return value
if dialect.name == 'postgresql':
return uuid.UUID(value)
else:
return uuid.UUID(bytes = value)
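Usage sketch for the new type (it mirrors the model changes later in this commit, where `UUID(as_uuid = True)` columns become plain `UUID` columns so the same models work on Postgres, which has a native UUID type, and on SQLite, which stores it as BINARY(16)). The example model below is hypothetical and not part of the commit:

```python
from uuid import uuid4
from sqlalchemy import Column, String
from cognee.infrastructure.databases.relational import Base, UUID

class ExampleRecord(Base):  # hypothetical model for illustration only
    __tablename__ = "example_records"

    id = Column(UUID, primary_key = True, default = uuid4)
    name = Column(String)
```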
@@ -1,169 +0,0 @@
import duckdb
import os
class DuckDBAdapter():
def __init__(self, db_path: str, db_name: str):
self.db_location = os.path.abspath(os.path.join(db_path, db_name))

self.get_connection = lambda: duckdb.connect(self.db_location)

def get_datasets(self):
with self.get_connection() as connection:
tables = connection.sql("SELECT DISTINCT schema_name FROM duckdb_tables();").to_df().to_dict("list")

return list(
filter(
lambda schema_name: not schema_name.endswith("staging") and schema_name != "cognee",
tables["schema_name"]
)
)

def get_files_metadata(self, dataset_name: str):
with self.get_connection() as connection:
return connection.sql(f"SELECT id, name, file_path, extension, mime_type FROM {dataset_name}.file_metadata;").to_df().to_dict("records")

def create_table(self, schema_name: str, table_name: str, table_config: list[dict]):
fields_query_parts = []

for table_config_item in table_config:
fields_query_parts.append(f"{table_config_item['name']} {table_config_item['type']}")

with self.get_connection() as connection:
query = f"CREATE SCHEMA IF NOT EXISTS {schema_name};"
connection.execute(query)

with self.get_connection() as connection:
query = f"CREATE TABLE IF NOT EXISTS {schema_name}.{table_name} ({', '.join(fields_query_parts)});"
connection.execute(query)

def delete_table(self, table_name: str):
with self.get_connection() as connection:
query = f"DROP TABLE IF EXISTS {table_name};"
connection.execute(query)

def insert_data(self, schema_name: str, table_name: str, data: list[dict]):
def get_values(data_entry: list):
return ", ".join([f"'{value}'" if isinstance(value, str) else value for value in data_entry])

columns = ", ".join(data[0].keys())
values = ", ".join([f"({get_values(data_entry.values())})" for data_entry in data])

with self.get_connection() as connection:
query = f"INSERT INTO {schema_name}.{table_name} ({columns}) VALUES {values};"
connection.execute(query)

def get_data(self, table_name: str, filters: dict = None):
with self.get_connection() as connection:
def get_values(values: list):
return ", ".join([f"'{value}'" for value in values])

def get_filters(filters: dict):
return " AND ".join([
f"{key} IN ({get_values(value)})" if isinstance(value, list)
else f"{key} = '{value}'" for (key, value) in filters.items()
])

query = f"SELECT * FROM {table_name}" + (";" if filters is None else f" WHERE {get_filters(filters)};")
results = connection.sql(query).to_df().to_dict("records")

return {
result["data_id"]: result["status"] for result in results
}

def execute_query(self, query):
with self.get_connection() as connection:
return connection.sql(query).to_df().to_dict("records")

def load_cognify_data(self, data):
with self.get_connection() as connection:
# Ensure the "cognify" table exists
connection.execute("""
CREATE TABLE IF NOT EXISTS cognify (
document_id STRING,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT NULL,
processed BOOLEAN DEFAULT FALSE,
document_id_target STRING NULL
);
""")

# Prepare the insert statement
insert_query = """
INSERT INTO cognify (document_id)
VALUES (?);
"""

# Insert each record into the "cognify" table
for record in data:
with self.get_connection() as connection:
connection.execute(insert_query, [
record.get("document_id"),
])

def fetch_cognify_data(self, excluded_document_id: str):
# SQL command to create the "cognify" table with the specified columns
create_table_sql = """
CREATE TABLE IF NOT EXISTS cognify (
document_id STRING,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT NULL,
processed BOOLEAN DEFAULT FALSE,
document_id_target STRING NULL
);
"""
with self.get_connection() as connection:
# Execute the SQL command to create the table
connection.execute(create_table_sql)

# SQL command to select data from the "cognify" table
select_data_sql = f"SELECT document_id, created_at, updated_at, processed FROM cognify WHERE document_id != '{excluded_document_id}' AND processed = FALSE;"

with self.get_connection() as connection:
# Execute the query and fetch the results
records = connection.sql(select_data_sql).to_df().to_dict("records")

# If records are fetched, update the "processed" column to "True"
if records:
# Fetching document_ids from the records to update the "processed" column
document_ids = tuple(record["document_id"] for record in records)
# SQL command to update the "processed" column to "True" for fetched records
update_data_sql = f"UPDATE cognify SET processed = TRUE WHERE document_id IN {document_ids};"

with self.get_connection() as connection:
# Execute the update query
connection.execute(update_data_sql)

# Return the fetched records
return records

def delete_cognify_data(self):
# SQL command to create the "cognify" table with the specified columns
create_table_sql = """
CREATE TABLE IF NOT EXISTS cognify (
document_id STRING,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT NULL,
processed BOOLEAN DEFAULT FALSE,
document_id_target STRING NULL
);
"""

with self.get_connection() as connection:
# Execute the SQL command to create the table
connection.execute(create_table_sql)

with self.get_connection() as connection:
# SQL command to select data from the "cognify" table
select_data_sql = "DELETE FROM cognify;"
connection.sql(select_data_sql)
drop_data_sql = "DROP TABLE cognify;"
connection.sql(drop_data_sql)

def delete_database(self):
from cognee.infrastructure.files.storage import LocalStorage

if LocalStorage.file_exists(self.db_location):
LocalStorage.remove(self.db_location)

if LocalStorage.file_exists(self.db_location + ".wal"):
LocalStorage.remove(self.db_location + ".wal")
@@ -1,26 +0,0 @@
from abc import abstractmethod
from typing import Protocol, TypeVar, Type, List

RowDataType = TypeVar('RowDataType')

class RelationalDBInterface(Protocol):
@abstractmethod
async def create_database(self, database_name: str, database_path: str): raise NotImplementedError

@abstractmethod
async def create_table(self, table_name: str, table_config: object): raise NotImplementedError

@abstractmethod
async def add_row(self, table_name: str, row_data: Type[RowDataType]): raise NotImplementedError

@abstractmethod
async def add_rows(self, table_name: str, rows_data: List[Type[RowDataType]]): raise NotImplementedError

@abstractmethod
async def get_row(self, table_name: str, row_id: str): raise NotImplementedError

@abstractmethod
async def update_row(self, table_name: str, row_id: str, row_data: Type[RowDataType]): raise NotImplementedError

@abstractmethod
async def delete_row(self, table_name: str, row_id: str): raise NotImplementedError
@@ -1,39 +1,18 @@
import os
import asyncio
from typing import AsyncGenerator
from contextlib import asynccontextmanager
from sqlalchemy import create_engine, text, select
from sqlalchemy.orm import sessionmaker, joinedload
from sqlalchemy import text, select
from sqlalchemy.orm import joinedload
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine, async_sessionmaker
from cognee.infrastructure.files.storage import LocalStorage
from cognee.infrastructure.databases.relational.FakeAsyncSession import FakeAsyncSession

from ..ModelBase import Base

def make_async_sessionmaker(sessionmaker):
@asynccontextmanager
async def async_session_maker():
await asyncio.sleep(0.1)
session = FakeAsyncSession(sessionmaker())
try:
yield session
finally:
await session.close() # Ensure the session is closed

return async_session_maker

class SQLAlchemyAdapter():
def __init__(self, db_type: str, db_path: str, db_name: str, db_user: str, db_password: str, db_host: str, db_port: str):
self.db_location = os.path.abspath(os.path.join(db_path, db_name))
self.db_name = db_name
def __init__(self, connection_string: str):
self.engine = create_async_engine(connection_string)
self.sessionmaker = async_sessionmaker(bind=self.engine, expire_on_commit=False)

if db_type == "duckdb":
LocalStorage.ensure_directory_exists(db_path)

self.engine = create_engine(f"duckdb:///{self.db_location}")
self.sessionmaker = make_async_sessionmaker(sessionmaker(bind=self.engine))
else:
self.engine = create_async_engine(f"postgresql+asyncpg://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}")
self.sessionmaker = async_sessionmaker(bind=self.engine, expire_on_commit=False)
if self.engine.dialect.name == "sqlite":
self.db_path = connection_string.split("///")[1]

@asynccontextmanager
async def get_async_session(self) -> AsyncGenerator[AsyncSession, None]:

@@ -72,6 +51,7 @@ class SQLAlchemyAdapter():
await connection.execute(text(f"DROP TABLE IF EXISTS {table_name} CASCADE;"))

await connection.close()

async def insert_data(self, schema_name: str, table_name: str, data: list[dict]):
columns = ", ".join(data[0].keys())
values = ", ".join([f"({', '.join([f':{key}' for key in row.keys()])})" for row in data])

@@ -80,6 +60,7 @@ class SQLAlchemyAdapter():
async with self.engine.begin() as connection:
await connection.execute(insert_query, data)
await connection.close()

async def get_data(self, table_name: str, filters: dict = None):
async with self.engine.begin() as connection:
query = f"SELECT * FROM {table_name}"

@@ -113,11 +94,19 @@ class SQLAlchemyAdapter():
print(f"Error dropping database tables: {e}")

async def delete_database(self):
async with self.engine.begin() as connection:
try:
for table in Base.metadata.sorted_tables:
drop_table_query = text(f'DROP TABLE IF EXISTS {table.name} CASCADE')
await connection.execute(drop_table_query)
print("Database deleted successfully.")
except Exception as e:
print(f"Error deleting database: {e}")
try:
if self.engine.dialect.name == "sqlite":
from cognee.infrastructure.files.storage import LocalStorage

LocalStorage.remove(self.db_path)
self.db_path = None
else:
async with self.engine.begin() as connection:
for table in Base.metadata.sorted_tables:
drop_table_query = text(f'DROP TABLE IF EXISTS {table.name} CASCADE')
await connection.execute(drop_table_query)

except Exception as e:
print(f"Error deleting database: {e}")

print("Database deleted successfully.")
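A sketch of how the refactored adapter appears to be consumed elsewhere in this commit (see the `get_datasets_by_name` hunk further down): sessions now come from a real SQLAlchemy `async_sessionmaker` regardless of provider, so `FakeAsyncSession` is no longer needed. The `Dataset` import path below is an assumption for illustration:

```python
from sqlalchemy import select
from cognee.infrastructure.databases.relational import get_relational_engine
from cognee.modules.data.models import Dataset  # import path assumed, not shown in this diff

async def list_datasets():
    db_engine = get_relational_engine()

    # get_async_session() yields an AsyncSession backed by the configured engine.
    async with db_engine.get_async_session() as session:
        return (await session.scalars(select(Dataset))).all()
```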
@@ -1,82 +0,0 @@
import os
import asyncio
from typing import Callable
from sqlalchemy.inspection import inspect
from sqlalchemy.ext.asyncio import create_async_engine, async_sessionmaker, AsyncEngine, AsyncSession, async_scoped_session
from sqlalchemy.future import select
from cognee.infrastructure.files.storage.LocalStorage import LocalStorage
from ..DatabaseEngine import DatabaseEngine
from ..ModelBase import Base
from ..utils import with_rollback

class SqliteEngine(DatabaseEngine):
db_path: str = None
db_name: str = None
engine: AsyncEngine = None
session_maker: Callable[[], async_scoped_session[AsyncSession]] = None
is_db_done: bool = False

def __init__(self, db_path: str, db_name: str):
self.db_path = db_path
self.db_name = db_name
self.db_location = db_path + "/" + db_name
self.engine = create_async_engine(
f"sqlite+aiosqlite:///{self.db_location}",
pool_recycle = 3600,
echo = False
)
self.session_maker = lambda: async_scoped_session(
async_sessionmaker(
bind = self.engine,
class_ = AsyncSession
),
scopefunc = asyncio.current_task
)

async def ensure_tables(self):
if not self.database_exists(self.db_name):
self.create_database(self.db_name)

await self.create_tables()

self.is_db_done = True

return True

def database_exists(self, db_name: str) -> bool:
return os.path.exists(self.db_path + "/" + db_name)

def create_database(self, db_name: str):
LocalStorage.ensure_directory_exists(self.db_path)

with open(self.db_path + "/" + db_name, mode = "w+", encoding = "utf-8") as file:
file.write("")

def drop_database(self, db_name: str):
os.remove(self.db_location)

async def table_exists(self, table_name: str) -> bool:
return inspect(self.engine).has_table(table_name)

async def create_tables(self):
async with self.engine.begin() as connection:
return await connection.run_sync(Base.metadata.create_all)

async def create(self, data):
async with with_rollback(self.session_maker()) as session:
session.add(data)

async def query(self, query_term):
async with with_rollback(self.session_maker()) as session:
return await session.execute(query_term)

async def query_entity(self, entity):
async with with_rollback(self.session_maker()) as session:
return await session.execute(
select(type(entity))
.where(type(entity).id == entity.id)
)

async def update(self, data_update_fn):
async with with_rollback(self.session_maker()):
data_update_fn()
@@ -1 +0,0 @@
from .with_rollback import with_rollback

@@ -1,18 +0,0 @@
import logging
from contextlib import asynccontextmanager
from sqlalchemy.ext.asyncio import async_scoped_session
logger = logging.getLogger(__name__)

@asynccontextmanager
async def with_rollback(session: async_scoped_session):
"""Provide a transactional scope around a series of operations."""

try:
# async with session.begin():
yield session
await session.commit()
await session.remove()
except Exception as exception:
await session.rollback()
logger.error("Session rolled back due to: %s", str(exception))
raise exception
@@ -4,7 +4,7 @@ import litellm
from litellm import aembedding
from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine

litellm.set_verbose = True
litellm.set_verbose = False

class LiteLLMEmbeddingEngine(EmbeddingEngine):
api_key: str
@@ -106,11 +106,10 @@ class QDrantAdapter(VectorDBInterface):
points = [convert_to_qdrant_point(point) for point in data_points]

try:
result = await client.upload_points(
client.upload_points(
collection_name = collection_name,
points = points
)
return result
except Exception as error:
logger.error("Error uploading data points to Qdrant: %s", str(error))
raise error
@@ -1,7 +1,7 @@
from datetime import datetime
from sqlalchemy.orm import Mapped, MappedColumn
-from sqlalchemy import Column, String, DateTime, ForeignKey, Enum, UUID, JSON
-from cognee.infrastructure.databases.relational import ModelBase
+from sqlalchemy import Column, DateTime, ForeignKey, Enum, JSON
+from cognee.infrastructure.databases.relational import Base, UUID

class OperationType(Enum):
    MERGE_DATA = "MERGE_DATA"

@@ -14,10 +14,10 @@ class OperationStatus(Enum):
    ERROR = "OPERATION_ERROR"
    CANCELLED = "OPERATION_CANCELLED"

-class Operation(ModelBase):
+class Operation(Base):
    __tablename__ = "operation"

-    id = Column(String, primary_key = True)
+    id = Column(UUID, primary_key = True)
    status = Column(Enum(OperationStatus))
    operation_type = Column(Enum(OperationType))
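The import change above (repeated in the model files that follow) swaps SQLAlchemy's dialect-bound UUID for a UUID type exported from cognee.infrastructure.databases.relational. That type's definition is not part of this diff; a plausible sketch, assuming it is the standard backend-agnostic TypeDecorator recipe (native UUIDs on PostgreSQL, 36-character strings elsewhere, so the same models also work on the SQLite-backed dlt sqlalchemy destination):

import uuid
from sqlalchemy.types import TypeDecorator, CHAR
from sqlalchemy.dialects.postgresql import UUID as PG_UUID

class UUID(TypeDecorator):
    """Platform-independent UUID column (hypothetical sketch)."""
    impl = CHAR
    cache_ok = True

    def load_dialect_impl(self, dialect):
        # Native uuid type on PostgreSQL, CHAR(36) everywhere else.
        if dialect.name == "postgresql":
            return dialect.type_descriptor(PG_UUID(as_uuid = True))
        return dialect.type_descriptor(CHAR(36))

    def process_bind_param(self, value, dialect):
        if value is None or dialect.name == "postgresql":
            return value
        return str(value)

    def process_result_value(self, value, dialect):
        if value is None or isinstance(value, uuid.UUID):
            return value
        return uuid.UUID(value)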
@@ -7,6 +7,8 @@ async def get_datasets_by_name(dataset_names: list[str], user_id: UUID) -> list[
    db_engine = get_relational_engine()

    async with db_engine.get_async_session() as session:
+        if isinstance(dataset_names, str):
+            dataset_names = [dataset_names]
        datasets = (await session.scalars(
            select(Dataset)
            .filter(Dataset.owner_id == user_id)
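With the added isinstance guard, callers may pass a single dataset name or a list of names; for example:

# Both calls now behave the same (user.id obtained elsewhere):
datasets = await get_datasets_by_name("example", user.id)
datasets = await get_datasets_by_name(["example"], user.id)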
@@ -2,14 +2,14 @@ from uuid import uuid4
from typing import List
from datetime import datetime, timezone
from sqlalchemy.orm import relationship, Mapped
-from sqlalchemy import Column, String, DateTime, UUID
-from cognee.infrastructure.databases.relational import Base
+from sqlalchemy import Column, String, DateTime
+from cognee.infrastructure.databases.relational import Base, UUID
from .DatasetData import DatasetData

class Data(Base):
    __tablename__ = "data"

-    id = Column(UUID(as_uuid = True), primary_key = True, default = uuid4)
+    id = Column(UUID, primary_key = True, default = uuid4)

    name = Column(String)
    extension = Column(String)
@@ -2,14 +2,14 @@ from uuid import uuid4
from typing import List
from datetime import datetime, timezone
from sqlalchemy.orm import relationship, Mapped
-from sqlalchemy import Column, Text, DateTime, UUID
-from cognee.infrastructure.databases.relational import Base
+from sqlalchemy import Column, Text, DateTime
+from cognee.infrastructure.databases.relational import Base, UUID
from .DatasetData import DatasetData

class Dataset(Base):
    __tablename__ = "datasets"

-    id = Column(UUID(as_uuid = True), primary_key = True, default = uuid4)
+    id = Column(UUID, primary_key = True, default = uuid4)

    name = Column(Text)
@@ -1,11 +1,11 @@
from datetime import datetime, timezone
-from sqlalchemy import Column, DateTime, UUID, ForeignKey
-from cognee.infrastructure.databases.relational import Base
+from sqlalchemy import Column, DateTime, ForeignKey
+from cognee.infrastructure.databases.relational import Base, UUID

class DatasetData(Base):
    __tablename__ = "dataset_data"

    created_at = Column(DateTime(timezone = True), default = lambda: datetime.now(timezone.utc))

-    dataset_id = Column(UUID(as_uuid = True), ForeignKey("datasets.id"), primary_key = True)
-    data_id = Column(UUID(as_uuid = True), ForeignKey("data.id"), primary_key = True)
+    dataset_id = Column(UUID, ForeignKey("datasets.id"), primary_key = True)
+    data_id = Column(UUID, ForeignKey("data.id"), primary_key = True)
@@ -6,18 +6,18 @@ from .Document import Document
class AudioDocument(Document):
    type: str = "audio"
    title: str
-    file_path: str
-    chunking_strategy:str
+    raw_data_location: str
+    chunking_strategy: str

-    def __init__(self, id: UUID, title: str, file_path: str, chunking_strategy:str="paragraph"):
+    def __init__(self, id: UUID, title: str, raw_data_location: str, chunking_strategy:str="paragraph"):
        self.id = id or uuid5(NAMESPACE_OID, title)
        self.title = title
-        self.file_path = file_path
+        self.raw_data_location = raw_data_location
        self.chunking_strategy = chunking_strategy

    def read(self):
        # Transcribe the audio file
-        result = get_llm_client().create_transcript(self.file_path)
+        result = get_llm_client().create_transcript(self.raw_data_location)
        text = result.text

        chunker = TextChunker(self.id, get_text = lambda: text)

@@ -30,5 +30,5 @@ class AudioDocument(Document):
            id=str(self.id),
            type=self.type,
            title=self.title,
-            file_path=self.file_path,
+            raw_data_location=self.raw_data_location,
        )
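Constructor usage after the rename, as a quick sketch (the path below is illustrative):

from uuid import uuid4

audio_document = AudioDocument(
    id = uuid4(),
    title = "interview.mp3",
    raw_data_location = "/data/interview.mp3",  # previously passed as file_path
    chunking_strategy = "paragraph",
)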
@@ -5,7 +5,7 @@ class Document(Protocol):
    id: UUID
    type: str
    title: str
-    file_path: str
+    raw_data_location: str

    def read(self) -> str:
        pass
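Any class exposing these attributes and a read() method satisfies the protocol; a minimal hypothetical implementation for plain text kept on disk (not part of this commit):

from uuid import UUID, uuid5, NAMESPACE_OID

class PlainTextDocument:
    type: str = "plaintext"

    def __init__(self, id: UUID, title: str, raw_data_location: str):
        self.id = id or uuid5(NAMESPACE_OID, title)
        self.title = title
        self.raw_data_location = raw_data_location

    def read(self) -> str:
        # Return the whole file as a string, matching the protocol signature.
        with open(self.raw_data_location, encoding = "utf-8") as file:
            return file.read()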
@@ -7,16 +7,16 @@ from .Document import Document
class ImageDocument(Document):
    type: str = "image"
    title: str
-    file_path: str
+    raw_data_location: str

-    def __init__(self, id: UUID, title: str, file_path: str):
+    def __init__(self, id: UUID, title: str, raw_data_location: str):
        self.id = id or uuid5(NAMESPACE_OID, title)
        self.title = title
-        self.file_path = file_path
+        self.raw_data_location = raw_data_location

    def read(self):
        # Transcribe the image file
-        result = get_llm_client().transcribe_image(self.file_path)
+        result = get_llm_client().transcribe_image(self.raw_data_location)
        text = result.choices[0].message.content

        chunker = TextChunker(self.id, get_text = lambda: text)

@@ -29,5 +29,5 @@ class ImageDocument(Document):
            id=str(self.id),
            type=self.type,
            title=self.title,
-            file_path=self.file_path,
+            raw_data_location=self.raw_data_location,
        )
@@ -6,15 +6,15 @@ from .Document import Document
class PdfDocument(Document):
    type: str = "pdf"
    title: str
-    file_path: str
+    raw_data_location: str

-    def __init__(self, id: UUID, title: str, file_path: str):
+    def __init__(self, id: UUID, title: str, raw_data_location: str):
        self.id = id or uuid5(NAMESPACE_OID, title)
        self.title = title
-        self.file_path = file_path
+        self.raw_data_location = raw_data_location

    def read(self) -> PdfReader:
-        file = PdfReader(self.file_path)
+        file = PdfReader(self.raw_data_location)

        def get_text():
            for page in file.pages:

@@ -32,5 +32,5 @@ class PdfDocument(Document):
            id = str(self.id),
            type = self.type,
            title = self.title,
-            file_path = self.file_path,
+            raw_data_location = self.raw_data_location,
        )
@@ -5,16 +5,16 @@ from .Document import Document
class TextDocument(Document):
    type: str = "text"
    title: str
-    file_path: str
+    raw_data_location: str

-    def __init__(self, id: UUID, title: str, file_path: str):
+    def __init__(self, id: UUID, title: str, raw_data_location: str):
        self.id = id or uuid5(NAMESPACE_OID, title)
        self.title = title
-        self.file_path = file_path
+        self.raw_data_location = raw_data_location

    def read(self):
        def get_text():
-            with open(self.file_path, mode = "r", encoding = "utf-8") as file:
+            with open(self.raw_data_location, mode = "r", encoding = "utf-8") as file:
                while True:
                    text = file.read(1024)

@@ -34,5 +34,5 @@ class TextDocument(Document):
            id = str(self.id),
            type = self.type,
            title = self.title,
-            file_path = self.file_path,
+            raw_data_location = self.raw_data_location,
        )
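TextDocument.read() above streams the file in fixed 1024-character blocks rather than loading it at once; the same pattern in isolation looks like this (standalone sketch, not cognee code):

def stream_text(path: str, block_size: int = 1024):
    # Yield successive text blocks until EOF, mirroring the loop in TextDocument.read().
    with open(path, mode = "r", encoding = "utf-8") as file:
        while True:
            text = file.read(block_size)
            if not text:
                break
            yield text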
@@ -1,14 +1,14 @@
from uuid import uuid4
from datetime import datetime, timezone
-from sqlalchemy import Column, UUID, DateTime, String, Text
+from sqlalchemy import Column, DateTime, String, Text
from sqlalchemy.orm import relationship, Mapped
-from cognee.infrastructure.databases.relational import Base
+from cognee.infrastructure.databases.relational import Base, UUID
from .PipelineTask import PipelineTask

class Pipeline(Base):
    __tablename__ = "pipelines"

-    id = Column(UUID(as_uuid = True), primary_key = True, default = uuid4)
+    id = Column(UUID, primary_key = True, default = uuid4)

    name = Column(String)
    description = Column(Text, nullable = True)
@@ -1,16 +1,16 @@
from uuid import uuid4
from datetime import datetime, timezone
-from sqlalchemy import Column, UUID, DateTime, String, JSON
-from cognee.infrastructure.databases.relational import Base
+from sqlalchemy import Column, DateTime, String, JSON
+from cognee.infrastructure.databases.relational import Base, UUID

class PipelineRun(Base):
    __tablename__ = "pipeline_runs"

-    id = Column(UUID(as_uuid = True), primary_key = True, default = uuid4)
+    id = Column(UUID, primary_key = True, default = uuid4)

    created_at = Column(DateTime(timezone = True), default = lambda: datetime.now(timezone.utc))

    status = Column(String)

-    run_id = Column(UUID(as_uuid = True), index = True)
+    run_id = Column(UUID, index = True)
    run_info = Column(JSON)
@@ -1,11 +1,11 @@
from datetime import datetime, timezone
-from sqlalchemy import Column, DateTime, UUID, ForeignKey
-from cognee.infrastructure.databases.relational import Base
+from sqlalchemy import Column, DateTime, ForeignKey
+from cognee.infrastructure.databases.relational import Base, UUID

class PipelineTask(Base):
    __tablename__ = "pipeline_task"

    created_at = Column(DateTime(timezone = True), default = lambda: datetime.now(timezone.utc))

-    pipeline_id = Column("pipeline", UUID(as_uuid = True), ForeignKey("pipeline.id"), primary_key = True)
-    task_id = Column("task", UUID(as_uuid = True), ForeignKey("task.id"), primary_key = True)
+    pipeline_id = Column("pipeline", UUID, ForeignKey("pipeline.id"), primary_key = True)
+    task_id = Column("task", UUID, ForeignKey("task.id"), primary_key = True)
@@ -1,14 +1,14 @@
from uuid import uuid4
from datetime import datetime, timezone
from sqlalchemy.orm import relationship, Mapped
-from sqlalchemy import Column, String, DateTime, UUID, Text
-from cognee.infrastructure.databases.relational import Base
+from sqlalchemy import Column, String, DateTime, Text
+from cognee.infrastructure.databases.relational import Base, UUID
from .PipelineTask import PipelineTask

class Task(Base):
    __tablename__ = "tasks"

-    id = Column(UUID(as_uuid = True), primary_key = True, default = uuid4)
+    id = Column(UUID, primary_key = True, default = uuid4)

    name = Column(String)
    description = Column(Text, nullable = True)
@@ -1,12 +1,12 @@
from uuid import uuid4
from datetime import datetime, timezone
-from sqlalchemy import Column, UUID, DateTime, String, JSON
-from cognee.infrastructure.databases.relational import Base
+from sqlalchemy import Column, DateTime, String, JSON
+from cognee.infrastructure.databases.relational import Base, UUID

class TaskRun(Base):
    __tablename__ = "task_runs"

-    id = Column(UUID(as_uuid = True), primary_key = True, default = uuid4)
+    id = Column(UUID, primary_key = True, default = uuid4)

    task_name = Column(String)
@@ -1,20 +1,20 @@
from uuid import uuid4
from datetime import datetime, timezone
from sqlalchemy.orm import relationship, Mapped
-from sqlalchemy import Column, ForeignKey, DateTime, UUID
-from cognee.infrastructure.databases.relational import Base
+from sqlalchemy import Column, ForeignKey, DateTime
+from cognee.infrastructure.databases.relational import Base, UUID
from .ACLResources import ACLResources

class ACL(Base):
    __tablename__ = "acls"

-    id = Column(UUID(as_uuid = True), primary_key = True, default = uuid4)
+    id = Column(UUID, primary_key = True, default = uuid4)

    created_at = Column(DateTime(timezone = True), default = lambda: datetime.now(timezone.utc))
    updated_at = Column(DateTime(timezone = True), onupdate = lambda: datetime.now(timezone.utc))

-    principal_id = Column(UUID(as_uuid = True), ForeignKey("principals.id"))
-    permission_id = Column(UUID(as_uuid = True), ForeignKey("permissions.id"))
+    principal_id = Column(UUID, ForeignKey("principals.id"))
+    permission_id = Column(UUID, ForeignKey("permissions.id"))

    principal = relationship("Principal")
    permission = relationship("Permission")
@@ -1,11 +1,11 @@
from datetime import datetime, timezone
-from sqlalchemy import Column, ForeignKey, UUID, DateTime
-from cognee.infrastructure.databases.relational import Base
+from sqlalchemy import Column, ForeignKey, DateTime
+from cognee.infrastructure.databases.relational import Base, UUID

class ACLResources(Base):
    __tablename__ = "acl_resources"

    created_at = Column(DateTime(timezone = True), default = lambda: datetime.now(timezone.utc))

-    acl_id = Column(UUID(as_uuid = True), ForeignKey("acls.id"), primary_key = True)
-    resource_id = Column(UUID(as_uuid = True), ForeignKey("resources.id"), primary_key = True)
+    acl_id = Column(UUID, ForeignKey("acls.id"), primary_key = True)
+    resource_id = Column(UUID, ForeignKey("resources.id"), primary_key = True)
@@ -1,12 +1,13 @@
from sqlalchemy.orm import relationship, Mapped
-from sqlalchemy import Column, String, ForeignKey, UUID
+from sqlalchemy import Column, String, ForeignKey
+from cognee.infrastructure.databases.relational import UUID
from .Principal import Principal
from .UserGroup import UserGroup

class Group(Principal):
    __tablename__ = "groups"

-    id = Column(UUID(as_uuid = True), ForeignKey("principals.id"), primary_key = True)
+    id = Column(UUID, ForeignKey("principals.id"), primary_key = True)

    name = Column(String, unique = True, nullable = False, index = True)
@@ -1,8 +1,8 @@
from uuid import uuid4
from datetime import datetime, timezone
# from sqlalchemy.orm import relationship
-from sqlalchemy import Column, DateTime, UUID, String
-from cognee.infrastructure.databases.relational import Base
+from sqlalchemy import Column, DateTime, String
+from cognee.infrastructure.databases.relational import Base, UUID

class Permission(Base):
    __tablename__ = "permissions"
@@ -1,12 +1,12 @@
from uuid import uuid4
from datetime import datetime, timezone
-from sqlalchemy import Column, String, DateTime, UUID
-from cognee.infrastructure.databases.relational import Base
+from sqlalchemy import Column, String, DateTime
+from cognee.infrastructure.databases.relational import Base, UUID

class Principal(Base):
    __tablename__ = "principals"

-    id = Column(UUID(as_uuid = True), primary_key = True, index = True, default = uuid4)
+    id = Column(UUID, primary_key = True, index = True, default = uuid4)

    created_at = Column(DateTime(timezone = True), default = lambda: datetime.now(timezone.utc))
    updated_at = Column(DateTime(timezone = True), onupdate = lambda: datetime.now(timezone.utc))
@@ -1,18 +1,18 @@
from uuid import uuid4
from datetime import datetime, timezone
from sqlalchemy.orm import relationship
-from sqlalchemy import Column, DateTime, UUID
-from cognee.infrastructure.databases.relational import Base
+from sqlalchemy import Column, DateTime
+from cognee.infrastructure.databases.relational import Base, UUID
from .ACLResources import ACLResources

class Resource(Base):
    __tablename__ = "resources"

-    id = Column(UUID(as_uuid = True), primary_key = True, default = uuid4)
+    id = Column(UUID, primary_key = True, default = uuid4)

    created_at = Column(DateTime(timezone = True), default = lambda: datetime.now(timezone.utc))
    updated_at = Column(DateTime(timezone = True), onupdate = lambda: datetime.now(timezone.utc))

-    resource_id = Column(UUID(as_uuid = True), nullable = False)
+    resource_id = Column(UUID, nullable = False)

    acls = relationship("ACL", secondary = ACLResources.__tablename__, back_populates = "resources")
@@ -1,14 +1,15 @@
from uuid import UUID as uuid_UUID
-from sqlalchemy import ForeignKey, UUID, Column
+from sqlalchemy import ForeignKey, Column
from sqlalchemy.orm import relationship, Mapped
from fastapi_users.db import SQLAlchemyBaseUserTableUUID
+from cognee.infrastructure.databases.relational import UUID
from .Principal import Principal
from .UserGroup import UserGroup

class User(SQLAlchemyBaseUserTableUUID, Principal):
    __tablename__ = "users"

-    id = Column(UUID(as_uuid = True), ForeignKey("principals.id"), primary_key = True)
+    id = Column(UUID, ForeignKey("principals.id"), primary_key = True)

    groups: Mapped[list["Group"]] = relationship(
        secondary = UserGroup.__tablename__,
@@ -1,11 +1,11 @@
from datetime import datetime, timezone
-from sqlalchemy import Column, ForeignKey, DateTime, UUID
-from cognee.infrastructure.databases.relational import Base
+from sqlalchemy import Column, ForeignKey, DateTime
+from cognee.infrastructure.databases.relational import Base, UUID

class UserGroup(Base):
    __tablename__ = "user_groups"

    created_at = Column(DateTime(timezone = True), default = lambda: datetime.now(timezone.utc))

-    user_id = Column(UUID(as_uuid = True), ForeignKey("users.id"), primary_key = True)
-    group_id = Column(UUID(as_uuid = True), ForeignKey("groups.id"), primary_key = True)
+    user_id = Column(UUID, ForeignKey("users.id"), primary_key = True)
+    group_id = Column(UUID, ForeignKey("groups.id"), primary_key = True)
@@ -24,7 +24,7 @@ def chunk_by_word(data: str):
        while next_character is not None and (re.match(paragraph_endings, next_character) or next_character == " "):
            j += 1
            next_character = data[j] if j < len(data) else None
-        if next_character.isupper():
+        if next_character and next_character.isupper():
            return True

    return False
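The added "next_character and" guard matters once the scan runs past the end of the input, where next_character becomes None; a minimal reproduction of the crash the fix prevents:

next_character = None  # what the loop above yields at end of input

# old check: next_character.isupper() raises AttributeError on None
# new check: short-circuits to False instead
starts_new_sentence = bool(next_character and next_character.isupper())
print(starts_new_sentence)  # False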
@@ -3,10 +3,10 @@ from cognee.modules.data.processing.document_types import Document, PdfDocument,

def classify_documents(data_documents: list[Data]) -> list[Document]:
    documents = [
-        PdfDocument(id = data_item.id, title=f"{data_item.name}.{data_item.extension}", file_path=data_item.raw_data_location) if data_item.extension == "pdf" else
-        AudioDocument(id = data_item.id, title=f"{data_item.name}.{data_item.extension}", file_path=data_item.raw_data_location) if data_item.extension == "audio" else
-        ImageDocument(id = data_item.id, title=f"{data_item.name}.{data_item.extension}", file_path=data_item.raw_data_location) if data_item.extension == "image" else
-        TextDocument(id = data_item.id, title=f"{data_item.name}.{data_item.extension}", file_path=data_item.raw_data_location)
+        PdfDocument(id = data_item.id, title=f"{data_item.name}.{data_item.extension}", raw_data_location=data_item.raw_data_location) if data_item.extension == "pdf" else
+        AudioDocument(id = data_item.id, title=f"{data_item.name}.{data_item.extension}", raw_data_location=data_item.raw_data_location) if data_item.extension == "audio" else
+        ImageDocument(id = data_item.id, title=f"{data_item.name}.{data_item.extension}", raw_data_location=data_item.raw_data_location) if data_item.extension == "image" else
+        TextDocument(id = data_item.id, title=f"{data_item.name}.{data_item.extension}", raw_data_location=data_item.raw_data_location)
        for data_item in data_documents
    ]
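The dispatch above keys purely on Data.extension; as a usage sketch (assuming data_documents was loaded via get_dataset_data, as elsewhere in this commit):

documents = classify_documents(data_documents)

for document in documents:
    # "pdf" -> PdfDocument, "audio" -> AudioDocument,
    # "image" -> ImageDocument, everything else -> TextDocument
    print(type(document).__name__, document.raw_data_location)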
@@ -89,7 +89,7 @@ class OntologyEngine:
        chunk_strategy = chunk_config.chunk_strategy

        for base_file in documents:
-            with open(base_file.file_path, "rb") as file:
+            with open(base_file.raw_data_location, "rb") as file:
                try:
                    file_type = guess_file_type(file)
                    text = extract_text_from_file(file, file_type)

@@ -175,7 +175,7 @@ async def infer_data_ontology(documents, ontology_model = KnowledgeGraph, root_n
        ontology_engine = OntologyEngine()
        root_node_id = await ontology_engine.add_graph_ontology(documents = documents)
    else:
-        graph_engine = get_graph_engine()
+        graph_engine = await get_graph_engine()
        await add_model_class_to_graph(ontology_model, graph_engine)

    yield (documents, root_node_id)
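The second hunk fixes a missing await: get_graph_engine() is a coroutine, so the old code handed a coroutine object to add_model_class_to_graph. Correct usage inside async code (helper names taken from the surrounding diff):

from cognee.infrastructure.databases.graph import get_graph_engine

graph_engine = await get_graph_engine()  # returns the engine, not a coroutine
await add_model_class_to_graph(ontology_model, graph_engine)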
notebooks/cognee_demo_1.5.ipynb (new file, 512 lines)

@@ -0,0 +1,512 @@
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "958375a6ffc0c2e4",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2024-09-20T14:02:47.336283Z",
|
||||
"start_time": "2024-09-20T14:02:43.652444Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import asyncio\n",
|
||||
"import logging\n",
|
||||
"from typing import Union\n",
|
||||
"\n",
|
||||
"from cognee.modules.cognify.config import get_cognify_config\n",
|
||||
"from cognee.shared.data_models import KnowledgeGraph\n",
|
||||
"from cognee.modules.data.models import Dataset, Data\n",
|
||||
"from cognee.modules.data.methods.get_dataset_data import get_dataset_data\n",
|
||||
"from cognee.modules.data.methods import get_datasets, get_datasets_by_name\n",
|
||||
"from cognee.modules.pipelines.tasks.Task import Task\n",
|
||||
"from cognee.modules.pipelines import run_tasks, run_tasks_parallel\n",
|
||||
"from cognee.modules.users.models import User\n",
|
||||
"from cognee.modules.users.methods import get_default_user\n",
|
||||
"from cognee.modules.pipelines.operations.get_pipeline_status import get_pipeline_status\n",
|
||||
"from cognee.modules.pipelines.operations.log_pipeline_status import log_pipeline_status\n",
|
||||
"from cognee.tasks import chunk_extract_summary, \\\n",
|
||||
" chunk_naive_llm_classifier, \\\n",
|
||||
" chunk_remove_disconnected, \\\n",
|
||||
" infer_data_ontology, \\\n",
|
||||
" save_chunks_to_store, \\\n",
|
||||
" chunk_update_check, \\\n",
|
||||
" chunks_into_graph, \\\n",
|
||||
" source_documents_to_chunks, \\\n",
|
||||
" check_permissions_on_documents, \\\n",
|
||||
" classify_documents"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "df16431d0f48b006",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2024-09-20T14:02:48.519686Z",
|
||||
"start_time": "2024-09-20T14:02:48.515589Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"job_position = \"\"\"Senior Data Scientist (Machine Learning)\n",
|
||||
"\n",
|
||||
"Company: TechNova Solutions\n",
|
||||
"Location: San Francisco, CA\n",
|
||||
"\n",
|
||||
"Job Description:\n",
|
||||
"\n",
|
||||
"TechNova Solutions is seeking a Senior Data Scientist specializing in Machine Learning to join our dynamic analytics team. The ideal candidate will have a strong background in developing and deploying machine learning models, working with large datasets, and translating complex data into actionable insights.\n",
|
||||
"\n",
|
||||
"Responsibilities:\n",
|
||||
"\n",
|
||||
"Develop and implement advanced machine learning algorithms and models.\n",
|
||||
"Analyze large, complex datasets to extract meaningful patterns and insights.\n",
|
||||
"Collaborate with cross-functional teams to integrate predictive models into products.\n",
|
||||
"Stay updated with the latest advancements in machine learning and data science.\n",
|
||||
"Mentor junior data scientists and provide technical guidance.\n",
|
||||
"Qualifications:\n",
|
||||
"\n",
|
||||
"Master’s or Ph.D. in Data Science, Computer Science, Statistics, or a related field.\n",
|
||||
"5+ years of experience in data science and machine learning.\n",
|
||||
"Proficient in Python, R, and SQL.\n",
|
||||
"Experience with deep learning frameworks (e.g., TensorFlow, PyTorch).\n",
|
||||
"Strong problem-solving skills and attention to detail.\n",
|
||||
"Candidate CVs\n",
|
||||
"\"\"\"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "9086abf3af077ab4",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2024-09-20T14:02:49.120838Z",
|
||||
"start_time": "2024-09-20T14:02:49.118294Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"job_1 = \"\"\"\n",
|
||||
"CV 1: Relevant\n",
|
||||
"Name: Dr. Emily Carter\n",
|
||||
"Contact Information:\n",
|
||||
"\n",
|
||||
"Email: emily.carter@example.com\n",
|
||||
"Phone: (555) 123-4567\n",
|
||||
"Summary:\n",
|
||||
"\n",
|
||||
"Senior Data Scientist with over 8 years of experience in machine learning and predictive analytics. Expertise in developing advanced algorithms and deploying scalable models in production environments.\n",
|
||||
"\n",
|
||||
"Education:\n",
|
||||
"\n",
|
||||
"Ph.D. in Computer Science, Stanford University (2014)\n",
|
||||
"B.S. in Mathematics, University of California, Berkeley (2010)\n",
|
||||
"Experience:\n",
|
||||
"\n",
|
||||
"Senior Data Scientist, InnovateAI Labs (2016 – Present)\n",
|
||||
"Led a team in developing machine learning models for natural language processing applications.\n",
|
||||
"Implemented deep learning algorithms that improved prediction accuracy by 25%.\n",
|
||||
"Collaborated with cross-functional teams to integrate models into cloud-based platforms.\n",
|
||||
"Data Scientist, DataWave Analytics (2014 – 2016)\n",
|
||||
"Developed predictive models for customer segmentation and churn analysis.\n",
|
||||
"Analyzed large datasets using Hadoop and Spark frameworks.\n",
|
||||
"Skills:\n",
|
||||
"\n",
|
||||
"Programming Languages: Python, R, SQL\n",
|
||||
"Machine Learning: TensorFlow, Keras, Scikit-Learn\n",
|
||||
"Big Data Technologies: Hadoop, Spark\n",
|
||||
"Data Visualization: Tableau, Matplotlib\n",
|
||||
"\"\"\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "a9de0cc07f798b7f",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2024-09-20T14:02:49.675003Z",
|
||||
"start_time": "2024-09-20T14:02:49.671615Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"job_2 = \"\"\"\n",
|
||||
"CV 2: Relevant\n",
|
||||
"Name: Michael Rodriguez\n",
|
||||
"Contact Information:\n",
|
||||
"\n",
|
||||
"Email: michael.rodriguez@example.com\n",
|
||||
"Phone: (555) 234-5678\n",
|
||||
"Summary:\n",
|
||||
"\n",
|
||||
"Data Scientist with a strong background in machine learning and statistical modeling. Skilled in handling large datasets and translating data into actionable business insights.\n",
|
||||
"\n",
|
||||
"Education:\n",
|
||||
"\n",
|
||||
"M.S. in Data Science, Carnegie Mellon University (2013)\n",
|
||||
"B.S. in Computer Science, University of Michigan (2011)\n",
|
||||
"Experience:\n",
|
||||
"\n",
|
||||
"Senior Data Scientist, Alpha Analytics (2017 – Present)\n",
|
||||
"Developed machine learning models to optimize marketing strategies.\n",
|
||||
"Reduced customer acquisition cost by 15% through predictive modeling.\n",
|
||||
"Data Scientist, TechInsights (2013 – 2017)\n",
|
||||
"Analyzed user behavior data to improve product features.\n",
|
||||
"Implemented A/B testing frameworks to evaluate product changes.\n",
|
||||
"Skills:\n",
|
||||
"\n",
|
||||
"Programming Languages: Python, Java, SQL\n",
|
||||
"Machine Learning: Scikit-Learn, XGBoost\n",
|
||||
"Data Visualization: Seaborn, Plotly\n",
|
||||
"Databases: MySQL, MongoDB\n",
|
||||
"\"\"\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "185ff1c102d06111",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2024-09-20T14:02:50.286828Z",
|
||||
"start_time": "2024-09-20T14:02:50.284369Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"job_3 = \"\"\"\n",
|
||||
"CV 3: Relevant\n",
|
||||
"Name: Sarah Nguyen\n",
|
||||
"Contact Information:\n",
|
||||
"\n",
|
||||
"Email: sarah.nguyen@example.com\n",
|
||||
"Phone: (555) 345-6789\n",
|
||||
"Summary:\n",
|
||||
"\n",
|
||||
"Data Scientist specializing in machine learning with 6 years of experience. Passionate about leveraging data to drive business solutions and improve product performance.\n",
|
||||
"\n",
|
||||
"Education:\n",
|
||||
"\n",
|
||||
"M.S. in Statistics, University of Washington (2014)\n",
|
||||
"B.S. in Applied Mathematics, University of Texas at Austin (2012)\n",
|
||||
"Experience:\n",
|
||||
"\n",
|
||||
"Data Scientist, QuantumTech (2016 – Present)\n",
|
||||
"Designed and implemented machine learning algorithms for financial forecasting.\n",
|
||||
"Improved model efficiency by 20% through algorithm optimization.\n",
|
||||
"Junior Data Scientist, DataCore Solutions (2014 – 2016)\n",
|
||||
"Assisted in developing predictive models for supply chain optimization.\n",
|
||||
"Conducted data cleaning and preprocessing on large datasets.\n",
|
||||
"Skills:\n",
|
||||
"\n",
|
||||
"Programming Languages: Python, R\n",
|
||||
"Machine Learning Frameworks: PyTorch, Scikit-Learn\n",
|
||||
"Statistical Analysis: SAS, SPSS\n",
|
||||
"Cloud Platforms: AWS, Azure\n",
|
||||
"\"\"\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "d55ce4c58f8efb67",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2024-09-20T14:02:50.950343Z",
|
||||
"start_time": "2024-09-20T14:02:50.946378Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"job_4 = \"\"\"\n",
|
||||
"CV 4: Not Relevant\n",
|
||||
"Name: David Thompson\n",
|
||||
"Contact Information:\n",
|
||||
"\n",
|
||||
"Email: david.thompson@example.com\n",
|
||||
"Phone: (555) 456-7890\n",
|
||||
"Summary:\n",
|
||||
"\n",
|
||||
"Creative Graphic Designer with over 8 years of experience in visual design and branding. Proficient in Adobe Creative Suite and passionate about creating compelling visuals.\n",
|
||||
"\n",
|
||||
"Education:\n",
|
||||
"\n",
|
||||
"B.F.A. in Graphic Design, Rhode Island School of Design (2012)\n",
|
||||
"Experience:\n",
|
||||
"\n",
|
||||
"Senior Graphic Designer, CreativeWorks Agency (2015 – Present)\n",
|
||||
"Led design projects for clients in various industries.\n",
|
||||
"Created branding materials that increased client engagement by 30%.\n",
|
||||
"Graphic Designer, Visual Innovations (2012 – 2015)\n",
|
||||
"Designed marketing collateral, including brochures, logos, and websites.\n",
|
||||
"Collaborated with the marketing team to develop cohesive brand strategies.\n",
|
||||
"Skills:\n",
|
||||
"\n",
|
||||
"Design Software: Adobe Photoshop, Illustrator, InDesign\n",
|
||||
"Web Design: HTML, CSS\n",
|
||||
"Specialties: Branding and Identity, Typography\n",
|
||||
"\"\"\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "ca4ecc32721ad332",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2024-09-20T14:02:51.548191Z",
|
||||
"start_time": "2024-09-20T14:02:51.545520Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"job_5 = \"\"\"\n",
|
||||
"CV 5: Not Relevant\n",
|
||||
"Name: Jessica Miller\n",
|
||||
"Contact Information:\n",
|
||||
"\n",
|
||||
"Email: jessica.miller@example.com\n",
|
||||
"Phone: (555) 567-8901\n",
|
||||
"Summary:\n",
|
||||
"\n",
|
||||
"Experienced Sales Manager with a strong track record in driving sales growth and building high-performing teams. Excellent communication and leadership skills.\n",
|
||||
"\n",
|
||||
"Education:\n",
|
||||
"\n",
|
||||
"B.A. in Business Administration, University of Southern California (2010)\n",
|
||||
"Experience:\n",
|
||||
"\n",
|
||||
"Sales Manager, Global Enterprises (2015 – Present)\n",
|
||||
"Managed a sales team of 15 members, achieving a 20% increase in annual revenue.\n",
|
||||
"Developed sales strategies that expanded customer base by 25%.\n",
|
||||
"Sales Representative, Market Leaders Inc. (2010 – 2015)\n",
|
||||
"Consistently exceeded sales targets and received the 'Top Salesperson' award in 2013.\n",
|
||||
"Skills:\n",
|
||||
"\n",
|
||||
"Sales Strategy and Planning\n",
|
||||
"Team Leadership and Development\n",
|
||||
"CRM Software: Salesforce, Zoho\n",
|
||||
"Negotiation and Relationship Building\n",
|
||||
"\"\"\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "904df61ba484a8e5",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2024-09-20T14:02:54.243987Z",
|
||||
"start_time": "2024-09-20T14:02:52.498195Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import cognee\n",
|
||||
"from os import listdir, path\n",
|
||||
"\n",
|
||||
"data_path = path.abspath(\".data\")\n",
|
||||
"\n",
|
||||
"results = await cognee.add([job_1, job_2,job_3,job_4,job_5,job_position], \"example\")\n",
|
||||
"\n",
|
||||
"for result in results:\n",
|
||||
" print(result)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "6f9b564de121713d",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2024-09-20T14:02:55.564445Z",
|
||||
"start_time": "2024-09-20T14:02:55.562784Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "8911f8bd4f8c440a",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2024-09-20T14:02:56.714408Z",
|
||||
"start_time": "2024-09-20T14:02:56.711812Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# from enum import Enum, auto\n",
|
||||
"# from typing import Optional, List, Union, Dict, Any\n",
|
||||
"# from pydantic import BaseModel, Field\n",
|
||||
"# \n",
|
||||
"# class Node(BaseModel):\n",
|
||||
"# \"\"\"Node in a knowledge graph.\"\"\"\n",
|
||||
"# id: str\n",
|
||||
"# name: str\n",
|
||||
"# type: str\n",
|
||||
"# description: str\n",
|
||||
"# properties: Optional[Dict[str, Any]] = Field(None, description = \"A dictionary of properties associated with the node.\")\n",
|
||||
"# \n",
|
||||
"# class Edge(BaseModel):\n",
|
||||
"# \"\"\"Edge in a knowledge graph.\"\"\"\n",
|
||||
"# source_node_id: str\n",
|
||||
"# target_node_id: str\n",
|
||||
"# relationship_name: str\n",
|
||||
"# properties: Optional[Dict[str, Any]] = Field(None, description = \"A dictionary of properties associated with the edge.\")\n",
|
||||
"# \n",
|
||||
"# class KnowledgeGraph(BaseModel):\n",
|
||||
"# \"\"\"Knowledge graph.\"\"\"\n",
|
||||
"# nodes: List[Node] = Field(..., default_factory=list)\n",
|
||||
"# edges: List[Edge] = Field(..., default_factory=list)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "7c431fdef4921ae0",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2024-09-20T14:02:57.925667Z",
|
||||
"start_time": "2024-09-20T14:02:57.922353Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"async def run_cognify_pipeline(dataset: Dataset, user: User = None):\n",
|
||||
" data_documents: list[Data] = await get_dataset_data(dataset_id = dataset.id)\n",
|
||||
"\n",
|
||||
" try:\n",
|
||||
"\n",
|
||||
" root_node_id = None\n",
|
||||
"\n",
|
||||
" tasks = [\n",
|
||||
" Task(classify_documents),\n",
|
||||
" Task(check_permissions_on_documents, user = user, permissions = [\"write\"]),\n",
|
||||
" Task(infer_data_ontology, root_node_id = root_node_id, ontology_model = KnowledgeGraph),\n",
|
||||
" Task(source_documents_to_chunks, parent_node_id = root_node_id), # Classify documents and save them as a nodes in graph db, extract text chunks based on the document type\n",
|
||||
" Task(chunks_into_graph, graph_model = KnowledgeGraph, collection_name = \"entities\", task_config = { \"batch_size\": 10 }), # Generate knowledge graphs from the document chunks and attach it to chunk nodes\n",
|
||||
" Task(chunk_update_check, collection_name = \"chunks\"), # Find all affected chunks, so we don't process unchanged chunks\n",
|
||||
" Task(\n",
|
||||
" save_chunks_to_store,\n",
|
||||
" collection_name = \"chunks\",\n",
|
||||
" ), \n",
|
||||
" Task(chunk_remove_disconnected), # Remove the obsolete document chunks.\n",
|
||||
" ]\n",
|
||||
"\n",
|
||||
" pipeline = run_tasks(tasks, data_documents)\n",
|
||||
"\n",
|
||||
" async for result in pipeline:\n",
|
||||
" print(result)\n",
|
||||
" except Exception as error:\n",
|
||||
" raise error"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "f0a91b99c6215e09",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2024-09-20T14:02:58.905774Z",
|
||||
"start_time": "2024-09-20T14:02:58.625915Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"user = await get_default_user()\n",
|
||||
"datasets = await get_datasets_by_name([\"example\"], user.id)\n",
|
||||
"await run_cognify_pipeline(datasets[0], user)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "080389e5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"from cognee.shared.utils import render_graph\n",
|
||||
"from cognee.infrastructure.databases.graph import get_graph_engine\n",
|
||||
"import graphistry\n",
|
||||
"\n",
|
||||
"# # Setting an environment variable\n",
|
||||
"# os.environ[\"GRAPHISTRY_USERNAME\"] = placeholder\n",
|
||||
"# os.environ[\"GRAPHISTRY_PASSWORD\"] = placeholder\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"graphistry.login(username=os.getenv(\"GRAPHISTRY_USERNAME\"), password=os.getenv(\"GRAPHISTRY_PASSWORD\"))\n",
|
||||
"\n",
|
||||
"graph_engine = await get_graph_engine()\n",
|
||||
"\n",
|
||||
"graph_url = await render_graph(graph_engine.graph)\n",
|
||||
"print(graph_url)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e5e7dfc8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"async def search(\n",
|
||||
" vector_engine,\n",
|
||||
" collection_name: str,\n",
|
||||
" query_text: str = None,\n",
|
||||
"):\n",
|
||||
" query_vector = (await vector_engine.embedding_engine.embed_text([query_text]))[0]\n",
|
||||
"\n",
|
||||
" connection = await vector_engine.get_connection()\n",
|
||||
" collection = await connection.open_table(collection_name)\n",
|
||||
"\n",
|
||||
" results = await collection.vector_search(query_vector).limit(10).to_pandas()\n",
|
||||
"\n",
|
||||
" result_values = list(results.to_dict(\"index\").values())\n",
|
||||
"\n",
|
||||
" return [dict(\n",
|
||||
" id = str(result[\"id\"]),\n",
|
||||
" payload = result[\"payload\"],\n",
|
||||
" score = result[\"_distance\"],\n",
|
||||
" ) for result in result_values]\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"from cognee.infrastructure.databases.vector import get_vector_engine\n",
|
||||
"\n",
|
||||
"vector_engine = get_vector_engine()\n",
|
||||
"results = await search(vector_engine, \"entities\", \"sarah.nguyen@example.com\")\n",
|
||||
"for result in results:\n",
|
||||
" print(result)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.18"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
poetry.lock (generated, 3167 lines changed)
File diff suppressed because it is too large.
@@ -38,12 +38,10 @@ greenlet = "^3.0.3"
ruff = "^0.2.2"
filetype = "^1.2.0"
nltk = "^3.8.1"
-dlt = {extras = ["postgres"], version = "^0.5.2"}
-duckdb = {version = "^0.10.0", extras = ["dlt"]}
+dlt = {extras = ["sqlalchemy"], version = "^1.0.0"}
overrides = "^7.7.0"
aiofiles = "^23.2.1"
qdrant-client = "^1.9.0"
-duckdb-engine = "0.13.0"
graphistry = "^0.33.5"
tenacity = "^8.2.3"
weaviate-client = "4.6.7"

@@ -75,14 +73,12 @@ asyncpg = "^0.29.0"

[tool.poetry.extras]
-duckdb = ["duckdb"]
filesystem = ["s3fs", "botocore"]
-motherduck = ["duckdb"]
cli = ["pipdeptree", "cron-descriptor"]
weaviate = ["weaviate-client"]
qdrant = ["qdrant-client"]
-neo4j = ["neo4j", "py2neo"]
-notebook = ["ipykernel","overrides", "ipywidgets", "jupyterlab", "jupyterlab_widgets", "jupyterlab-server", "jupyterlab-git"]
+neo4j = ["neo4j"]
+notebook = ["ipykernel", "overrides", "ipywidgets", "jupyterlab", "jupyterlab_widgets", "jupyterlab-server", "jupyterlab-git"]

[tool.poetry.group.dev.dependencies]
pytest = "^7.4.0"
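The dependency change above is the core of this commit: dlt moves from the postgres/duckdb destinations to ^1.0.0 with the sqlalchemy extra. A hedged sketch of how a dlt pipeline might target that destination (pipeline, dataset, and table names are illustrative; cognee's actual wiring lives in its ingestion code, which is not shown in this section):

import dlt

# The sqlalchemy destination (dlt >= 1.0) accepts a SQLAlchemy URL,
# e.g. a local SQLite file, replacing the removed postgres/duckdb setup.
pipeline = dlt.pipeline(
    pipeline_name = "cognee_demo",          # illustrative name
    destination = dlt.destinations.sqlalchemy("sqlite:///cognee.db"),
    dataset_name = "example",
)

load_info = pipeline.run(
    [{"id": 1, "name": "document.pdf"}],    # illustrative rows
    table_name = "file_metadata",
)
print(load_info)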