Merge branch 'dev' into feat/csv-ingestion

Signed-off-by: EricXiao <taoiaox@gmail.com>

Commit 4c609d6074: 212 changed files with 11740 additions and 6509 deletions
@@ -169,8 +169,9 @@ REQUIRE_AUTHENTICATION=False
# Vector: LanceDB
# Graph: KuzuDB
#
- # It enforces LanceDB and KuzuDB use and uses them to create databases per Cognee user + dataset
- ENABLE_BACKEND_ACCESS_CONTROL=False
+ # It enforces creation of databases per Cognee user + dataset. Does not work with some graph and database providers.
+ # Disable mode when using not supported graph/vector databases.
+ ENABLE_BACKEND_ACCESS_CONTROL=True

################################################################################
# ☁️ Cloud Sync Settings

@@ -242,13 +243,14 @@ LITELLM_LOG="ERROR"

########## Local LLM via Ollama ###############################################

#LLM_API_KEY ="ollama"
#LLM_MODEL="llama3.1:8b"
#LLM_PROVIDER="ollama"
#LLM_ENDPOINT="http://localhost:11434/v1"
#EMBEDDING_PROVIDER="ollama"
#EMBEDDING_MODEL="nomic-embed-text:latest"
- #EMBEDDING_ENDPOINT="http://localhost:11434/api/embeddings"
+ #EMBEDDING_ENDPOINT="http://localhost:11434/api/embed"
#EMBEDDING_DIMENSIONS=768
#HUGGINGFACE_TOKENIZER="nomic-ai/nomic-embed-text-v1.5"
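The commented Ollama defaults above track Ollama's move from the legacy `/api/embeddings` route to the newer `/api/embed` route. A quick way to confirm a local instance serves the new route (a minimal sketch, assuming Ollama is running on the default port with `nomic-embed-text` pulled):

```python
import requests

# Hits Ollama's newer batch embedding route; the legacy route was /api/embeddings.
response = requests.post(
    "http://localhost:11434/api/embed",
    json={"model": "nomic-embed-text:latest", "input": "hello cognee"},
    timeout=30,
)
response.raise_for_status()
embeddings = response.json()["embeddings"]
print(len(embeddings[0]))  # should match EMBEDDING_DIMENSIONS=768
```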
.github/actions/cognee_setup/action.yml (5 changes)

@@ -42,3 +42,8 @@ runs:
          done
        fi
        uv sync --extra api --extra docs --extra evals --extra codegraph --extra ollama --extra dev --extra neo4j --extra redis $EXTRA_ARGS

    - name: Add telemetry identifier for telemetry test and in case telemetry is enabled by accident
      shell: bash
      run: |
        echo "test-machine" > .anon_id
.github/workflows/basic_tests.yml (16 changes)

@@ -75,6 +75,7 @@ jobs:
    name: Run Unit Tests
    runs-on: ubuntu-22.04
    env:
      ENV: 'dev'
      LLM_PROVIDER: openai
      LLM_MODEL: ${{ secrets.LLM_MODEL }}
      LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}

@@ -104,6 +105,7 @@ jobs:
    name: Run Integration Tests
    runs-on: ubuntu-22.04
    env:
      ENV: 'dev'
      LLM_PROVIDER: openai
      LLM_MODEL: ${{ secrets.LLM_MODEL }}
      LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}

@@ -123,6 +125,7 @@ jobs:
      uses: ./.github/actions/cognee_setup
      with:
        python-version: ${{ inputs.python-version }}
        extra-dependencies: "scraping"

    - name: Run Integration Tests
      run: uv run pytest cognee/tests/integration/

@@ -131,6 +134,7 @@ jobs:
    name: Run Simple Examples
    runs-on: ubuntu-22.04
    env:
      ENV: 'dev'
      LLM_PROVIDER: openai
      LLM_MODEL: ${{ secrets.LLM_MODEL }}
      LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}

@@ -160,12 +164,13 @@ jobs:
    name: Run Simple Examples BAML
    runs-on: ubuntu-22.04
    env:
      ENV: 'dev'
      STRUCTURED_OUTPUT_FRAMEWORK: "BAML"
-     BAML_LLM_PROVIDER: azure-openai
-     BAML_LLM_MODEL: ${{ secrets.LLM_MODEL }}
-     BAML_LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
-     BAML_LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
-     BAML_LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+     BAML_LLM_PROVIDER: openai
+     BAML_LLM_MODEL: ${{ secrets.OPENAI_MODEL }}
+     BAML_LLM_ENDPOINT: ${{ secrets.OPENAI_ENDPOINT }}
+     BAML_LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+     # BAML_LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}

      LLM_PROVIDER: openai
      LLM_MODEL: ${{ secrets.LLM_MODEL }}

@@ -197,6 +202,7 @@ jobs:
    name: Run Basic Graph Tests
    runs-on: ubuntu-22.04
    env:
      ENV: 'dev'
      LLM_PROVIDER: openai
      LLM_MODEL: ${{ secrets.LLM_MODEL }}
      LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
.github/workflows/cli_tests.yml (3 changes)

@@ -39,6 +39,7 @@ jobs:
    name: CLI Unit Tests
    runs-on: ubuntu-22.04
    env:
      ENV: 'dev'
      LLM_PROVIDER: openai
      LLM_MODEL: ${{ secrets.LLM_MODEL }}
      LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}

@@ -66,6 +67,7 @@ jobs:
    name: CLI Integration Tests
    runs-on: ubuntu-22.04
    env:
      ENV: 'dev'
      LLM_PROVIDER: openai
      LLM_MODEL: ${{ secrets.LLM_MODEL }}
      LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}

@@ -93,6 +95,7 @@ jobs:
    name: CLI Functionality Tests
    runs-on: ubuntu-22.04
    env:
      ENV: 'dev'
      LLM_PROVIDER: openai
      LLM_MODEL: ${{ secrets.LLM_MODEL }}
      LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
.github/workflows/db_examples_tests.yml (6 changes)

@@ -60,7 +60,7 @@ jobs:
    - name: Run Neo4j Example
      env:
-       ENV: dev
+       ENV: 'dev'
        LLM_MODEL: ${{ secrets.LLM_MODEL }}
        LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
        LLM_API_KEY: ${{ secrets.LLM_API_KEY }}

@@ -95,7 +95,7 @@ jobs:
    - name: Run Kuzu Example
      env:
-       ENV: dev
+       ENV: 'dev'
        LLM_MODEL: ${{ secrets.LLM_MODEL }}
        LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
        LLM_API_KEY: ${{ secrets.LLM_API_KEY }}

@@ -141,7 +141,7 @@ jobs:
    - name: Run PGVector Example
      env:
-       ENV: dev
+       ENV: 'dev'
        LLM_MODEL: ${{ secrets.LLM_MODEL }}
        LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
        LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
.github/workflows/dockerhub-mcp.yml (20 changes)

@@ -7,14 +7,29 @@ on:

jobs:
  docker-build-and-push:
-   runs-on: ubuntu-latest
+   runs-on:
+     group: Default
+     labels:
+       - docker_build_runner

    steps:
      - name: Check and free disk space before build
        run: |
          echo "=== Before cleanup ==="
          df -h
          echo "Removing unused preinstalled SDKs to free space..."
          sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc || true
          docker system prune -af || true
          echo "=== After cleanup ==="
          df -h

      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
        with:
          buildkitd-flags: --root /tmp/buildkit

      - name: Log in to Docker Hub
        uses: docker/login-action@v3

@@ -34,7 +49,7 @@ jobs:
      - name: Build and push
        id: build
-       uses: docker/build-push-action@v5
+       uses: docker/build-push-action@v6
        with:
          context: .
          platforms: linux/amd64,linux/arm64

@@ -45,5 +60,6 @@ jobs:
          cache-from: type=registry,ref=cognee/cognee-mcp:buildcache
          cache-to: type=registry,ref=cognee/cognee-mcp:buildcache,mode=max


      - name: Image digest
        run: echo ${{ steps.build.outputs.digest }}
.github/workflows/e2e_tests.yml (57 changes)

@@ -226,7 +226,7 @@ jobs:
      - name: Dependencies already installed
        run: echo "Dependencies already installed in setup"

-     - name: Run parallel databases test
+     - name: Run permissions test
        env:
          ENV: 'dev'
          LLM_MODEL: ${{ secrets.LLM_MODEL }}

@@ -239,6 +239,31 @@ jobs:
          EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
        run: uv run python ./cognee/tests/test_permissions.py

  test-multi-tenancy:
    name: Test multi tenancy with different situations in Cognee
    runs-on: ubuntu-22.04
    steps:
      - name: Check out repository
        uses: actions/checkout@v4

      - name: Cognee Setup
        uses: ./.github/actions/cognee_setup
        with:
          python-version: '3.11.x'

      - name: Run multi tenancy test
        env:
          ENV: 'dev'
          LLM_MODEL: ${{ secrets.LLM_MODEL }}
          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
          LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
          EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
          EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
          EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
          EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
        run: uv run python ./cognee/tests/test_multi_tenancy.py

  test-graph-edges:
    name: Test graph edge ingestion
    runs-on: ubuntu-22.04

@@ -358,6 +383,34 @@ jobs:
          EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
        run: uv run python ./cognee/tests/tasks/entity_extraction/entity_extraction_test.py

  test-feedback-enrichment:
    name: Test Feedback Enrichment
    runs-on: ubuntu-22.04
    steps:
      - name: Check out repository
        uses: actions/checkout@v4

      - name: Cognee Setup
        uses: ./.github/actions/cognee_setup
        with:
          python-version: '3.11.x'

      - name: Dependencies already installed
        run: echo "Dependencies already installed in setup"

      - name: Run Feedback Enrichment Test
        env:
          ENV: 'dev'
          LLM_MODEL: ${{ secrets.LLM_MODEL }}
          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
          LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
          EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
          EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
          EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
          EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
        run: uv run python ./cognee/tests/test_feedback_enrichment.py

  run_conversation_sessions_test:
    name: Conversation sessions test
    runs-on: ubuntu-latest

@@ -401,7 +454,7 @@ jobs:
      - name: Run Conversation session tests
        env:
-         ENV: dev
+         ENV: 'dev'
          LLM_MODEL: ${{ secrets.LLM_MODEL }}
          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
.github/workflows/examples_tests.yml (36 changes)

@@ -21,6 +21,7 @@ jobs:
    - name: Run Multimedia Example
      env:
        ENV: 'dev'
        LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      run: uv run python ./examples/python/multimedia_example.py

@@ -40,6 +41,7 @@ jobs:
    - name: Run Evaluation Framework Example
      env:
        ENV: 'dev'
        LLM_MODEL: ${{ secrets.LLM_MODEL }}
        LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
        LLM_API_KEY: ${{ secrets.LLM_API_KEY }}

@@ -69,6 +71,7 @@ jobs:
    - name: Run Descriptive Graph Metrics Example
      env:
        ENV: 'dev'
        LLM_MODEL: ${{ secrets.LLM_MODEL }}
        LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
        LLM_API_KEY: ${{ secrets.LLM_API_KEY }}

@@ -99,6 +102,7 @@ jobs:
    - name: Run Dynamic Steps Tests
      env:
        ENV: 'dev'
        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        LLM_MODEL: ${{ secrets.LLM_MODEL }}
        LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}

@@ -124,6 +128,7 @@ jobs:
    - name: Run Temporal Example
      env:
        ENV: 'dev'
        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        LLM_MODEL: ${{ secrets.LLM_MODEL }}
        LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}

@@ -149,6 +154,7 @@ jobs:
    - name: Run Ontology Demo Example
      env:
        ENV: 'dev'
        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        LLM_MODEL: ${{ secrets.LLM_MODEL }}
        LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}

@@ -174,6 +180,7 @@ jobs:
    - name: Run Agentic Reasoning Example
      env:
        ENV: 'dev'
        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        LLM_MODEL: ${{ secrets.LLM_MODEL }}
        LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}

@@ -199,6 +206,7 @@ jobs:
    - name: Run Memify Tests
      env:
        ENV: 'dev'
        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        LLM_MODEL: ${{ secrets.LLM_MODEL }}
        LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}

@@ -210,6 +218,32 @@ jobs:
          EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
        run: uv run python ./examples/python/memify_coding_agent_example.py

  test-custom-pipeline:
    name: Run Custom Pipeline Example
    runs-on: ubuntu-22.04
    steps:
      - name: Check out repository
        uses: actions/checkout@v4

      - name: Cognee Setup
        uses: ./.github/actions/cognee_setup
        with:
          python-version: '3.11.x'

      - name: Run Custom Pipeline Example
        env:
          ENV: 'dev'
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          LLM_MODEL: ${{ secrets.LLM_MODEL }}
          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
          LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
          EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
          EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
          EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
          EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
        run: uv run python ./examples/python/run_custom_pipeline_example.py

  test-permissions-example:
    name: Run Permissions Example
    runs-on: ubuntu-22.04

@@ -224,6 +258,7 @@ jobs:
    - name: Run Memify Tests
      env:
        ENV: 'dev'
        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        LLM_MODEL: ${{ secrets.LLM_MODEL }}
        LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}

@@ -249,6 +284,7 @@ jobs:
    - name: Run Docling Test
      env:
        ENV: 'dev'
        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        LLM_MODEL: ${{ secrets.LLM_MODEL }}
        LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}

.github/workflows/load_tests.yml (70 lines, new file)

@@ -0,0 +1,70 @@
name: Load tests

permissions:
  contents: read

on:
  workflow_dispatch:
  workflow_call:
    secrets:
      LLM_MODEL:
        required: true
      LLM_ENDPOINT:
        required: true
      LLM_API_KEY:
        required: true
      LLM_API_VERSION:
        required: true
      EMBEDDING_MODEL:
        required: true
      EMBEDDING_ENDPOINT:
        required: true
      EMBEDDING_API_KEY:
        required: true
      EMBEDDING_API_VERSION:
        required: true
      OPENAI_API_KEY:
        required: true
      AWS_ACCESS_KEY_ID:
        required: true
      AWS_SECRET_ACCESS_KEY:
        required: true

jobs:
  test-load:
    name: Test Load
    runs-on: ubuntu-22.04
    timeout-minutes: 60
    steps:
      - name: Check out repository
        uses: actions/checkout@v4

      - name: Cognee Setup
        uses: ./.github/actions/cognee_setup
        with:
          python-version: '3.11.x'
          extra-dependencies: "aws"

      - name: Verify File Descriptor Limit
        run: ulimit -n

      - name: Run Load Test
        env:
          ENV: 'dev'
          ENABLE_BACKEND_ACCESS_CONTROL: True
          LLM_MODEL: ${{ secrets.LLM_MODEL }}
          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
          LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
          EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
          EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
          EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
          EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
          STORAGE_BACKEND: s3
          AWS_REGION: eu-west-1
          AWS_ENDPOINT_URL: https://s3-eu-west-1.amazonaws.com
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_S3_DEV_USER_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_S3_DEV_USER_SECRET_KEY }}
        run: uv run python ./cognee/tests/test_load.py

.github/workflows/release_test.yml (17 lines, new file)

@@ -0,0 +1,17 @@
# Long-running, heavy and resource-consuming tests for release validation
name: Release Test Workflow

permissions:
  contents: read

on:
  workflow_dispatch:
  pull_request:
    branches:
      - main

jobs:
  load-tests:
    name: Load Tests
    uses: ./.github/workflows/load_tests.yml
    secrets: inherit

.github/workflows/scorecard.yml (78 lines, new file)

@@ -0,0 +1,78 @@
# This workflow uses actions that are not certified by GitHub. They are provided
# by a third-party and are governed by separate terms of service, privacy
# policy, and support documentation.

name: Scorecard supply-chain security
on:
  # For Branch-Protection check. Only the default branch is supported. See
  # https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection
  branch_protection_rule:
  # To guarantee Maintained check is occasionally updated. See
  # https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained
  schedule:
    - cron: '35 8 * * 2'
  push:
    branches: [ "main" ]

# Declare default permissions as read only.
permissions: read-all

jobs:
  analysis:
    name: Scorecard analysis
    runs-on: ubuntu-latest
    # `publish_results: true` only works when run from the default branch. conditional can be removed if disabled.
    if: github.event.repository.default_branch == github.ref_name || github.event_name == 'pull_request'
    permissions:
      # Needed to upload the results to code-scanning dashboard.
      security-events: write
      # Needed to publish results and get a badge (see publish_results below).
      id-token: write
      # Uncomment the permissions below if installing in a private repository.
      # contents: read
      # actions: read

    steps:
      - name: "Checkout code"
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          persist-credentials: false

      - name: "Run analysis"
        uses: ossf/scorecard-action@f49aabe0b5af0936a0987cfb85d86b75731b0186 # v2.4.1
        with:
          results_file: results.sarif
          results_format: sarif
          # (Optional) "write" PAT token. Uncomment the `repo_token` line below if:
          # - you want to enable the Branch-Protection check on a *public* repository, or
          # - you are installing Scorecard on a *private* repository
          # To create the PAT, follow the steps in https://github.com/ossf/scorecard-action?tab=readme-ov-file#authentication-with-fine-grained-pat-optional.
          # repo_token: ${{ secrets.SCORECARD_TOKEN }}

          # Public repositories:
          # - Publish results to OpenSSF REST API for easy access by consumers
          # - Allows the repository to include the Scorecard badge.
          # - See https://github.com/ossf/scorecard-action#publishing-results.
          # For private repositories:
          # - `publish_results` will always be set to `false`, regardless
          #   of the value entered here.
          publish_results: true

          # (Optional) Uncomment file_mode if you have a .gitattributes with files marked export-ignore
          # file_mode: git

      # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF
      # format to the repository Actions tab.
      - name: "Upload artifact"
        uses: actions/upload-artifact@4cec3d8aa04e39d1a68397de0c4cd6fb9dce8ec1 # v4.6.1
        with:
          name: SARIF file
          path: results.sarif
          retention-days: 5

      # Upload the results to GitHub's code scanning dashboard (optional).
      # Commenting out will disable upload of results to your repo's Code Scanning dashboard
      - name: "Upload to code-scanning"
        uses: github/codeql-action/upload-sarif@v3
        with:
          sarif_file: results.sarif

.github/workflows/search_db_tests.yml (3 changes)
@@ -84,6 +84,7 @@ jobs:
      GRAPH_DATABASE_PROVIDER: 'neo4j'
      VECTOR_DB_PROVIDER: 'lancedb'
      DB_PROVIDER: 'sqlite'
      ENABLE_BACKEND_ACCESS_CONTROL: 'false'
      GRAPH_DATABASE_URL: ${{ steps.neo4j.outputs.neo4j-url }}
      GRAPH_DATABASE_USERNAME: ${{ steps.neo4j.outputs.neo4j-username }}
      GRAPH_DATABASE_PASSWORD: ${{ steps.neo4j.outputs.neo4j-password }}

@@ -135,6 +136,7 @@ jobs:
      EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
      GRAPH_DATABASE_PROVIDER: 'kuzu'
      VECTOR_DB_PROVIDER: 'pgvector'
      ENABLE_BACKEND_ACCESS_CONTROL: 'false'
      DB_PROVIDER: 'postgres'
      DB_NAME: 'cognee_db'
      DB_HOST: '127.0.0.1'

@@ -197,6 +199,7 @@ jobs:
      GRAPH_DATABASE_URL: ${{ steps.neo4j.outputs.neo4j-url }}
      GRAPH_DATABASE_USERNAME: ${{ steps.neo4j.outputs.neo4j-username }}
      GRAPH_DATABASE_PASSWORD: ${{ steps.neo4j.outputs.neo4j-password }}
      ENABLE_BACKEND_ACCESS_CONTROL: 'false'
      DB_NAME: cognee_db
      DB_HOST: 127.0.0.1
      DB_PORT: 5432

.github/workflows/temporal_graph_tests.yml (28 changes)
@@ -34,10 +34,9 @@ jobs:
    - name: Run Temporal Graph with Kuzu (lancedb + sqlite)
      env:
        ENV: 'dev'
-       LLM_MODEL: ${{ secrets.LLM_MODEL }}
-       LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
-       LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
-       LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+       LLM_MODEL: ${{ secrets.OPENAI_MODEL }}
+       LLM_ENDPOINT: ${{ secrets.OPENAI_ENDPOINT }}
+       LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
        EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
        EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}

@@ -73,10 +72,9 @@ jobs:
    - name: Run Temporal Graph with Neo4j (lancedb + sqlite)
      env:
        ENV: 'dev'
-       LLM_MODEL: ${{ secrets.LLM_MODEL }}
-       LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
-       LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
-       LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+       LLM_MODEL: ${{ secrets.OPENAI_MODEL }}
+       LLM_ENDPOINT: ${{ secrets.OPENAI_ENDPOINT }}
+       LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
        EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
        EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}

@@ -125,10 +123,9 @@ jobs:
    - name: Run Temporal Graph with Kuzu (postgres + pgvector)
      env:
        ENV: dev
-       LLM_MODEL: ${{ secrets.LLM_MODEL }}
-       LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
-       LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
-       LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+       LLM_MODEL: ${{ secrets.OPENAI_MODEL }}
+       LLM_ENDPOINT: ${{ secrets.OPENAI_ENDPOINT }}
+       LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
        EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
        EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}

@@ -192,10 +189,9 @@ jobs:
    - name: Run Temporal Graph with Neo4j (postgres + pgvector)
      env:
        ENV: dev
-       LLM_MODEL: ${{ secrets.LLM_MODEL }}
-       LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
-       LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
-       LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+       LLM_MODEL: ${{ secrets.OPENAI_MODEL }}
+       LLM_ENDPOINT: ${{ secrets.OPENAI_ENDPOINT }}
+       LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
        EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
        EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
@@ -10,6 +10,10 @@ on:
      required: false
      type: string
      default: '["3.10.x", "3.12.x", "3.13.x"]'
+   os:
+     required: false
+     type: string
+     default: '["ubuntu-22.04", "macos-15", "windows-latest"]'
  secrets:
    LLM_PROVIDER:
      required: true

@@ -40,10 +44,11 @@ jobs:
  run-unit-tests:
    name: Unit tests ${{ matrix.python-version }} on ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
    timeout-minutes: 60
    strategy:
      matrix:
        python-version: ${{ fromJSON(inputs.python-versions) }}
-       os: [ubuntu-22.04, macos-15, windows-latest]
+       os: ${{ fromJSON(inputs.os) }}
      fail-fast: false
    steps:
      - name: Check out

@@ -76,10 +81,11 @@ jobs:
  run-integration-tests:
    name: Integration tests ${{ matrix.python-version }} on ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
    timeout-minutes: 60
    strategy:
      matrix:
        python-version: ${{ fromJSON(inputs.python-versions) }}
-       os: [ ubuntu-22.04, macos-15, windows-latest ]
+       os: ${{ fromJSON(inputs.os) }}
      fail-fast: false
    steps:
      - name: Check out

@@ -112,10 +118,11 @@ jobs:
  run-library-test:
    name: Library test ${{ matrix.python-version }} on ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
    timeout-minutes: 60
    strategy:
      matrix:
        python-version: ${{ fromJSON(inputs.python-versions) }}
-       os: [ ubuntu-22.04, macos-15, windows-latest ]
+       os: ${{ fromJSON(inputs.os) }}
      fail-fast: false
    steps:
      - name: Check out

@@ -148,10 +155,11 @@ jobs:
  run-build-test:
    name: Build test ${{ matrix.python-version }} on ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
    timeout-minutes: 60
    strategy:
      matrix:
        python-version: ${{ fromJSON(inputs.python-versions) }}
-       os: [ ubuntu-22.04, macos-15, windows-latest ]
+       os: ${{ fromJSON(inputs.os) }}
      fail-fast: false
    steps:
      - name: Check out

@@ -177,10 +185,11 @@ jobs:
  run-soft-deletion-test:
    name: Soft Delete test ${{ matrix.python-version }} on ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
    timeout-minutes: 60
    strategy:
      matrix:
        python-version: ${{ fromJSON(inputs.python-versions) }}
-       os: [ ubuntu-22.04, macos-15, windows-latest ]
+       os: ${{ fromJSON(inputs.os) }}
      fail-fast: false
    steps:
      - name: Check out

@@ -214,10 +223,11 @@ jobs:
  run-hard-deletion-test:
    name: Hard Delete test ${{ matrix.python-version }} on ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
    timeout-minutes: 60
    strategy:
      matrix:
        python-version: ${{ fromJSON(inputs.python-versions) }}
-       os: [ ubuntu-22.04, macos-15, windows-latest ]
+       os: ${{ fromJSON(inputs.os) }}
      fail-fast: false
    steps:
      - name: Check out

.github/workflows/test_ollama.yml (4 changes)
@@ -75,7 +75,7 @@ jobs:
            { "role": "user", "content": "Whatever I say, answer with Yes." }
          ]
        }'
-       curl -X POST http://127.0.0.1:11434/v1/embeddings \
+       curl -X POST http://127.0.0.1:11434/api/embed \
        -H "Content-Type: application/json" \
        -d '{
          "model": "avr/sfr-embedding-mistral:latest",

@@ -98,7 +98,7 @@ jobs:
        LLM_MODEL: "phi4"
        EMBEDDING_PROVIDER: "ollama"
        EMBEDDING_MODEL: "avr/sfr-embedding-mistral:latest"
-       EMBEDDING_ENDPOINT: "http://localhost:11434/api/embeddings"
+       EMBEDDING_ENDPOINT: "http://localhost:11434/api/embed"
        EMBEDDING_DIMENSIONS: "4096"
        HUGGINGFACE_TOKENIZER: "Salesforce/SFR-Embedding-Mistral"
      run: uv run python ./examples/python/simple_example.py

.github/workflows/test_suites.yml (25 changes)
@@ -1,4 +1,6 @@
name: Test Suites
+ permissions:
+   contents: read

on:
  push:

@@ -80,12 +82,22 @@ jobs:
    uses: ./.github/workflows/notebooks_tests.yml
    secrets: inherit

- different-operating-systems-tests:
-   name: Operating System and Python Tests
+ different-os-tests-basic:
+   name: OS and Python Tests Ubuntu
    needs: [basic-tests, e2e-tests]
    uses: ./.github/workflows/test_different_operating_systems.yml
+   with:
+     python-versions: '["3.10.x", "3.11.x", "3.12.x", "3.13.x"]'
+     os: '["ubuntu-22.04"]'
    secrets: inherit

+ different-os-tests-extended:
+   name: OS and Python Tests Extended
+   needs: [basic-tests, e2e-tests]
+   uses: ./.github/workflows/test_different_operating_systems.yml
+   with:
+     python-versions: '["3.13.x"]'
+     os: '["macos-15", "windows-latest"]'
+   secrets: inherit

  # Matrix-based vector database tests

@@ -135,7 +147,8 @@ jobs:
      e2e-tests,
      graph-db-tests,
      notebook-tests,
-     different-operating-systems-tests,
+     different-os-tests-basic,
+     different-os-tests-extended,
      vector-db-tests,
      example-tests,
      llm-tests,

@@ -155,7 +168,8 @@ jobs:
      cli-tests,
      graph-db-tests,
      notebook-tests,
-     different-operating-systems-tests,
+     different-os-tests-basic,
+     different-os-tests-extended,
      vector-db-tests,
      example-tests,
      db-examples-tests,

@@ -176,7 +190,8 @@ jobs:
      "${{ needs.cli-tests.result }}" == "success" &&
      "${{ needs.graph-db-tests.result }}" == "success" &&
      "${{ needs.notebook-tests.result }}" == "success" &&
-     "${{ needs.different-operating-systems-tests.result }}" == "success" &&
+     "${{ needs.different-os-tests-basic.result }}" == "success" &&
+     "${{ needs.different-os-tests-extended.result }}" == "success" &&
      "${{ needs.vector-db-tests.result }}" == "success" &&
      "${{ needs.example-tests.result }}" == "success" &&
      "${{ needs.db-examples-tests.result }}" == "success" &&

.github/workflows/weighted_edges_tests.yml (33 changes)
@@ -2,7 +2,7 @@ name: Weighted Edges Tests

on:
  push:
-   branches: [ main, weighted_edges ]
+   branches: [ main, dev, weighted_edges ]
    paths:
      - 'cognee/modules/graph/utils/get_graph_from_model.py'
      - 'cognee/infrastructure/engine/models/Edge.py'

@@ -10,7 +10,7 @@ on:
      - 'examples/python/weighted_edges_example.py'
      - '.github/workflows/weighted_edges_tests.yml'
  pull_request:
-   branches: [ main ]
+   branches: [ main, dev ]
    paths:
      - 'cognee/modules/graph/utils/get_graph_from_model.py'
      - 'cognee/infrastructure/engine/models/Edge.py'

@@ -32,7 +32,7 @@ jobs:
    env:
      LLM_PROVIDER: openai
      LLM_MODEL: gpt-5-mini
-     LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+     LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}

    steps:
      - name: Check out repository

@@ -67,14 +67,13 @@ jobs:
    env:
      LLM_PROVIDER: openai
      LLM_MODEL: gpt-5-mini
-     LLM_ENDPOINT: https://api.openai.com/v1/
-     LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+     LLM_ENDPOINT: https://api.openai.com/v1
+     LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      LLM_API_VERSION: "2024-02-01"
      EMBEDDING_PROVIDER: openai
-     EMBEDDING_MODEL: text-embedding-3-small
-     EMBEDDING_ENDPOINT: https://api.openai.com/v1/
-     EMBEDDING_API_KEY: ${{ secrets.LLM_API_KEY }}
-     EMBEDDING_API_VERSION: "2024-02-01"
+     EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+     EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+     EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+     EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
    steps:
      - name: Check out repository
        uses: actions/checkout@v4

@@ -108,14 +107,14 @@ jobs:
    env:
      LLM_PROVIDER: openai
      LLM_MODEL: gpt-5-mini
-     LLM_ENDPOINT: https://api.openai.com/v1/
-     LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+     LLM_ENDPOINT: https://api.openai.com/v1
+     LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      LLM_API_VERSION: "2024-02-01"
      EMBEDDING_PROVIDER: openai
-     EMBEDDING_MODEL: text-embedding-3-small
-     EMBEDDING_ENDPOINT: https://api.openai.com/v1/
-     EMBEDDING_API_KEY: ${{ secrets.LLM_API_KEY }}
-     EMBEDDING_API_VERSION: "2024-02-01"
+     EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+     EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+     EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+     EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}

    steps:
      - name: Check out repository
        uses: actions/checkout@v4

AGENTS.md (132 lines, new file)

@@ -0,0 +1,132 @@
## Repository Guidelines

This document summarizes how to work with the cognee repository: how it’s organized, how to build, test, lint, and contribute. It mirrors our actual tooling and CI while providing quick commands for local development.

## Project Structure & Module Organization

- `cognee/`: Core Python library and API.
  - `api/`: FastAPI application and versioned routers (add, cognify, memify, search, delete, users, datasets, responses, visualize, settings, sync, update, checks).
  - `cli/`: CLI entry points and subcommands invoked via `cognee` / `cognee-cli`.
  - `infrastructure/`: Databases, LLM providers, embeddings, loaders, and storage adapters.
  - `modules/`: Domain logic (graph, retrieval, ontology, users, processing, observability, etc.).
  - `tasks/`: Reusable tasks (e.g., code graph, web scraping, storage). Extend with new tasks here.
  - `eval_framework/`: Evaluation utilities and adapters.
  - `shared/`: Cross-cutting helpers (logging, settings, utils).
  - `tests/`: Unit, integration, CLI, and end-to-end tests organized by feature.
  - `__main__.py`: Entrypoint to route to CLI.
- `cognee-mcp/`: Model Context Protocol server exposing cognee as MCP tools (SSE/HTTP/stdio). Contains its own README and Dockerfile.
- `cognee-frontend/`: Next.js UI for local development and demos.
- `distributed/`: Utilities for distributed execution (Modal, workers, queues).
- `examples/`: Example scripts demonstrating the public APIs and features (graph, code graph, multimodal, permissions, etc.).
- `notebooks/`: Jupyter notebooks for demos and tutorials.
- `alembic/`: Database migrations for relational backends.

Notes:
- Co-locate feature-specific helpers under their respective package (`modules/`, `infrastructure/`, or `tasks/`).
- Extend the system by adding new tasks, loaders, or retrievers rather than modifying core pipeline mechanisms.

## Build, Test, and Development Commands

Python (root) requires Python >= 3.10 and < 3.14. We recommend `uv` for speed and reproducibility.

- Create/refresh env and install dev deps:
  ```bash
  uv sync --dev --all-extras --reinstall
  ```

- Run the CLI (examples):
  ```bash
  uv run cognee-cli add "Cognee turns documents into AI memory."
  uv run cognee-cli cognify
  uv run cognee-cli search "What does cognee do?"
  uv run cognee-cli -ui  # Launches UI, backend API, and MCP server together
  ```

- Start the FastAPI server directly:
  ```bash
  uv run python -m cognee.api.client
  ```

- Run tests (CI mirrors these commands):
  ```bash
  uv run pytest cognee/tests/unit/ -v
  uv run pytest cognee/tests/integration/ -v
  ```

- Lint and format (ruff):
  ```bash
  uv run ruff check .
  uv run ruff format .
  ```

- Optional static type checks (mypy):
  ```bash
  uv run mypy cognee/
  ```

MCP Server (`cognee-mcp/`):

- Install and run locally:
  ```bash
  cd cognee-mcp
  uv sync --dev --all-extras --reinstall
  uv run python src/server.py                  # stdio (default)
  uv run python src/server.py --transport sse
  uv run python src/server.py --transport http --host 127.0.0.1 --port 8000 --path /mcp
  ```

- API Mode (connect to a running Cognee API):
  ```bash
  uv run python src/server.py --transport sse --api-url http://localhost:8000 --api-token YOUR_TOKEN
  ```

- Docker quickstart (examples): see `cognee-mcp/README.md` for full details
  ```bash
  docker run -e TRANSPORT_MODE=http --env-file ./.env -p 8000:8000 --rm -it cognee/cognee-mcp:main
  ```

Frontend (`cognee-frontend/`):
```bash
cd cognee-frontend
npm install
npm run dev    # Next.js dev server
npm run lint   # ESLint
npm run build && npm start
```

## Coding Style & Naming Conventions

Python:
- 4-space indentation, modules and functions in `snake_case`, classes in `PascalCase`.
- Public APIs should be type-annotated where practical.
- Use `ruff format` before committing; `ruff check` enforces import hygiene and style (line-length 100 configured in `pyproject.toml`).
- Prefer explicit, structured error handling. Use shared logging utilities in `cognee.shared.logging_utils`.
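For example, a small module following these conventions might look like this (an illustrative sketch; it assumes `cognee.shared.logging_utils` exposes a `get_logger` helper):

```python
from cognee.shared.logging_utils import get_logger  # assumed helper name

logger = get_logger(__name__)


def load_lines(path: str) -> list[str]:
    """Read a UTF-8 text file and return its lines."""
    try:
        with open(path, encoding="utf-8") as file:
            return file.read().splitlines()
    except FileNotFoundError:
        # Structured, explicit error handling with shared logging.
        logger.error("Input file missing: %s", path)
        raise
```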
MCP server and Frontend:
- Follow the local `README.md` and ESLint/TypeScript configuration in `cognee-frontend/`.

## Testing Guidelines

- Place Python tests under `cognee/tests/`.
  - Unit tests: `cognee/tests/unit/`
  - Integration tests: `cognee/tests/integration/`
  - CLI tests: `cognee/tests/cli_tests/`
- Name test files `test_*.py`. Use `pytest.mark.asyncio` for async tests; a minimal example follows this list.
- Avoid external state; rely on test fixtures and the CI-provided env vars when LLM/embedding providers are required. See CI workflows under `.github/workflows/` for expected environment variables.
- When adding public APIs, provide/update targeted examples under `examples/python/`.
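For instance, a minimal async test in this layout might look like the following (an illustrative sketch; the coroutine under test is a stand-in, not a real cognee API, and the file path is hypothetical):

```python
# cognee/tests/unit/test_example_async.py  (hypothetical file)
import pytest


async def normalize(text: str) -> str:
    # Stand-in for an async helper under test.
    return text.strip().lower()


@pytest.mark.asyncio
async def test_normalize_strips_and_lowercases():
    assert await normalize("  Cognee  ") == "cognee"
```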
## Commit & Pull Request Guidelines

- Use clear, imperative subjects (≤ 72 chars) and conventional commit styling in PR titles. Our CI validates semantic PR titles (see `.github/workflows/pr_lint`). Examples:
  - `feat(graph): add temporal edge weighting`
  - `fix(api): handle missing auth cookie`
  - `docs: update installation instructions`
- Reference related issues/discussions in the PR body and provide brief context.
- PRs should describe scope, list local test commands run, and mention any impacts on the MCP server or UI if applicable.
- Sign commits and affirm the DCO (see `CONTRIBUTING.md`).

## CI Mirrors Local Commands

Our GitHub Actions run the same ruff checks and pytest suites shown above (`.github/workflows/basic_tests.yml` and related workflows). Use the commands in this document locally to minimize CI surprises.
@@ -97,7 +97,7 @@ Hosted platform:

### 📦 Installation

- You can install Cognee using either **pip**, **poetry**, **uv** or any other python package manager.
+ You can install Cognee using either **pip**, **poetry**, **uv** or any other python package manager..

Cognee supports Python 3.10 to 3.12
@@ -87,11 +87,6 @@ db_engine = get_relational_engine()

print("Using database:", db_engine.db_uri)

- if "sqlite" in db_engine.db_uri:
-     from cognee.infrastructure.utils.run_sync import run_sync
-
-     run_sync(db_engine.create_database())
-
config.set_section_option(
    config.config_ini_section,
    "SQLALCHEMY_DATABASE_URI",
@@ -10,6 +10,7 @@ from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa
+ from sqlalchemy.dialects import postgresql


# revision identifiers, used by Alembic.

@@ -26,7 +27,34 @@ def upgrade() -> None:
    connection = op.get_bind()
    inspector = sa.inspect(connection)

+   if op.get_context().dialect.name == "postgresql":
+       syncstatus_enum = postgresql.ENUM(
+           "STARTED", "IN_PROGRESS", "COMPLETED", "FAILED", "CANCELLED", name="syncstatus"
+       )
+       syncstatus_enum.create(op.get_bind(), checkfirst=True)
+
    if "sync_operations" not in inspector.get_table_names():
+       if op.get_context().dialect.name == "postgresql":
+           syncstatus = postgresql.ENUM(
+               "STARTED",
+               "IN_PROGRESS",
+               "COMPLETED",
+               "FAILED",
+               "CANCELLED",
+               name="syncstatus",
+               create_type=False,
+           )
+       else:
+           syncstatus = sa.Enum(
+               "STARTED",
+               "IN_PROGRESS",
+               "COMPLETED",
+               "FAILED",
+               "CANCELLED",
+               name="syncstatus",
+               create_type=False,
+           )
+
        # Table doesn't exist, create it normally
        op.create_table(
            "sync_operations",

@@ -34,15 +62,7 @@ def upgrade() -> None:
            sa.Column("run_id", sa.Text(), nullable=True),
            sa.Column(
                "status",
-               sa.Enum(
-                   "STARTED",
-                   "IN_PROGRESS",
-                   "COMPLETED",
-                   "FAILED",
-                   "CANCELLED",
-                   name="syncstatus",
-                   create_type=False,
-               ),
+               syncstatus,
                nullable=True,
            ),
            sa.Column("progress_percentage", sa.Integer(), nullable=True),

@@ -23,11 +23,8 @@ depends_on: Union[str, Sequence[str], None] = "8057ae7329c2"


def upgrade() -> None:
    try:
        await_only(create_default_user())
    except UserAlreadyExists:
-       pass  # It's fine if the default user already exists
+       pass


def downgrade() -> None:
-   await_only(delete_user("default_user@example.com"))
+   pass

@@ -0,0 +1,98 @@
"""Expand dataset database for multi user
|
||||
|
||||
Revision ID: 76625596c5c3
|
||||
Revises: 211ab850ef3d
|
||||
Create Date: 2025-10-30 12:55:20.239562
|
||||
|
||||
"""
|
||||
|
||||
from typing import Sequence, Union
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision: str = "76625596c5c3"
|
||||
down_revision: Union[str, None] = "c946955da633"
|
||||
branch_labels: Union[str, Sequence[str], None] = None
|
||||
depends_on: Union[str, Sequence[str], None] = None
|
||||
|
||||
|
||||
def _get_column(inspector, table, name, schema=None):
|
||||
for col in inspector.get_columns(table, schema=schema):
|
||||
if col["name"] == name:
|
||||
return col
|
||||
return None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
conn = op.get_bind()
|
||||
insp = sa.inspect(conn)
|
||||
|
||||
vector_database_provider_column = _get_column(
|
||||
insp, "dataset_database", "vector_database_provider"
|
||||
)
|
||||
if not vector_database_provider_column:
|
||||
op.add_column(
|
||||
"dataset_database",
|
||||
sa.Column(
|
||||
"vector_database_provider",
|
||||
sa.String(),
|
||||
unique=False,
|
||||
nullable=False,
|
||||
server_default="lancedb",
|
||||
),
|
||||
)
|
||||
|
||||
graph_database_provider_column = _get_column(
|
||||
insp, "dataset_database", "graph_database_provider"
|
||||
)
|
||||
if not graph_database_provider_column:
|
||||
op.add_column(
|
||||
"dataset_database",
|
||||
sa.Column(
|
||||
"graph_database_provider",
|
||||
sa.String(),
|
||||
unique=False,
|
||||
nullable=False,
|
||||
server_default="kuzu",
|
||||
),
|
||||
)
|
||||
|
||||
vector_database_url_column = _get_column(insp, "dataset_database", "vector_database_url")
|
||||
if not vector_database_url_column:
|
||||
op.add_column(
|
||||
"dataset_database",
|
||||
sa.Column("vector_database_url", sa.String(), unique=False, nullable=True),
|
||||
)
|
||||
|
||||
graph_database_url_column = _get_column(insp, "dataset_database", "graph_database_url")
|
||||
if not graph_database_url_column:
|
||||
op.add_column(
|
||||
"dataset_database",
|
||||
sa.Column("graph_database_url", sa.String(), unique=False, nullable=True),
|
||||
)
|
||||
|
||||
vector_database_key_column = _get_column(insp, "dataset_database", "vector_database_key")
|
||||
if not vector_database_key_column:
|
||||
op.add_column(
|
||||
"dataset_database",
|
||||
sa.Column("vector_database_key", sa.String(), unique=False, nullable=True),
|
||||
)
|
||||
|
||||
graph_database_key_column = _get_column(insp, "dataset_database", "graph_database_key")
|
||||
if not graph_database_key_column:
|
||||
op.add_column(
|
||||
"dataset_database",
|
||||
sa.Column("graph_database_key", sa.String(), unique=False, nullable=True),
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.drop_column("dataset_database", "vector_database_provider")
|
||||
op.drop_column("dataset_database", "graph_database_provider")
|
||||
op.drop_column("dataset_database", "vector_database_url")
|
||||
op.drop_column("dataset_database", "graph_database_url")
|
||||
op.drop_column("dataset_database", "vector_database_key")
|
||||
op.drop_column("dataset_database", "graph_database_key")
@@ -18,11 +18,8 @@ depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
-   db_engine = get_relational_engine()
-   # we might want to delete this
-   await_only(db_engine.create_database())
+   pass


def downgrade() -> None:
-   db_engine = get_relational_engine()
-   await_only(db_engine.delete_database())
+   pass
@@ -144,44 +144,58 @@ def _create_data_permission(conn, user_id, data_id, permission_name):
    )


+ def _get_column(inspector, table, name, schema=None):
+     for col in inspector.get_columns(table, schema=schema):
+         if col["name"] == name:
+             return col
+     return None
+
+
def upgrade() -> None:
    conn = op.get_bind()
    insp = sa.inspect(conn)

-   # Recreate ACLs table with default permissions set to datasets instead of documents
-   op.drop_table("acls")
+   dataset_id_column = _get_column(insp, "acls", "dataset_id")
+   if not dataset_id_column:
+       # Recreate ACLs table with default permissions set to datasets instead of documents
+       op.drop_table("acls")

-   acls_table = op.create_table(
-       "acls",
-       sa.Column("id", UUID, primary_key=True, default=uuid4),
-       sa.Column(
-           "created_at", sa.DateTime(timezone=True), default=lambda: datetime.now(timezone.utc)
-       ),
-       sa.Column(
-           "updated_at", sa.DateTime(timezone=True), onupdate=lambda: datetime.now(timezone.utc)
-       ),
-       sa.Column("principal_id", UUID, sa.ForeignKey("principals.id")),
-       sa.Column("permission_id", UUID, sa.ForeignKey("permissions.id")),
-       sa.Column("dataset_id", UUID, sa.ForeignKey("datasets.id", ondelete="CASCADE")),
-   )
+       acls_table = op.create_table(
+           "acls",
+           sa.Column("id", UUID, primary_key=True, default=uuid4),
+           sa.Column(
+               "created_at", sa.DateTime(timezone=True), default=lambda: datetime.now(timezone.utc)
+           ),
+           sa.Column(
+               "updated_at",
+               sa.DateTime(timezone=True),
+               onupdate=lambda: datetime.now(timezone.utc),
+           ),
+           sa.Column("principal_id", UUID, sa.ForeignKey("principals.id")),
+           sa.Column("permission_id", UUID, sa.ForeignKey("permissions.id")),
+           sa.Column("dataset_id", UUID, sa.ForeignKey("datasets.id", ondelete="CASCADE")),
+       )

-   # Note: We can't use any Cognee model info to gather data (as it can change) in database so we must use our own table
-   # definition or load what is in the database
-   dataset_table = _define_dataset_table()
-   datasets = conn.execute(sa.select(dataset_table)).fetchall()
+       # Note: We can't use any Cognee model info to gather data (as it can change) in database so we must use our own table
+       # definition or load what is in the database
+       dataset_table = _define_dataset_table()
+       datasets = conn.execute(sa.select(dataset_table)).fetchall()

-   if not datasets:
-       return
+       if not datasets:
+           return

-   acl_list = []
+       acl_list = []

-   for dataset in datasets:
-       acl_list.append(_create_dataset_permission(conn, dataset.owner_id, dataset.id, "read"))
-       acl_list.append(_create_dataset_permission(conn, dataset.owner_id, dataset.id, "write"))
-       acl_list.append(_create_dataset_permission(conn, dataset.owner_id, dataset.id, "share"))
-       acl_list.append(_create_dataset_permission(conn, dataset.owner_id, dataset.id, "delete"))
+       for dataset in datasets:
+           acl_list.append(_create_dataset_permission(conn, dataset.owner_id, dataset.id, "read"))
+           acl_list.append(_create_dataset_permission(conn, dataset.owner_id, dataset.id, "write"))
+           acl_list.append(_create_dataset_permission(conn, dataset.owner_id, dataset.id, "share"))
+           acl_list.append(
+               _create_dataset_permission(conn, dataset.owner_id, dataset.id, "delete")
+           )

-   if acl_list:
-       op.bulk_insert(acls_table, acl_list)
+       if acl_list:
+           op.bulk_insert(acls_table, acl_list)


def downgrade() -> None:

alembic/versions/c946955da633_multi_tenant_support.py (137 lines, new file)
@@ -0,0 +1,137 @@
"""Multi Tenant Support

Revision ID: c946955da633
Revises: 211ab850ef3d
Create Date: 2025-11-04 18:11:09.325158

"""

from typing import Sequence, Union
from datetime import datetime, timezone
from uuid import uuid4

from alembic import op
import sqlalchemy as sa

# revision identifiers, used by Alembic.
revision: str = "c946955da633"
down_revision: Union[str, None] = "211ab850ef3d"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def _now():
    return datetime.now(timezone.utc)


def _define_user_table() -> sa.Table:
    table = sa.Table(
        "users",
        sa.MetaData(),
        sa.Column(
            "id",
            sa.UUID,
            sa.ForeignKey("principals.id", ondelete="CASCADE"),
            primary_key=True,
            nullable=False,
        ),
        sa.Column("tenant_id", sa.UUID, sa.ForeignKey("tenants.id"), index=True, nullable=True),
    )
    return table


def _define_dataset_table() -> sa.Table:
    # Note: We can't use any Cognee model info to gather data (as it can change) in database so we must use our own table
    # definition or load what is in the database
    table = sa.Table(
        "datasets",
        sa.MetaData(),
        sa.Column("id", sa.UUID, primary_key=True, default=uuid4),
        sa.Column("name", sa.Text),
        sa.Column(
            "created_at",
            sa.DateTime(timezone=True),
            default=lambda: datetime.now(timezone.utc),
        ),
        sa.Column(
            "updated_at",
            sa.DateTime(timezone=True),
            onupdate=lambda: datetime.now(timezone.utc),
        ),
        sa.Column("owner_id", sa.UUID(), sa.ForeignKey("principals.id"), index=True),
        sa.Column("tenant_id", sa.UUID(), sa.ForeignKey("tenants.id"), index=True, nullable=True),
    )

    return table


def _get_column(inspector, table, name, schema=None):
    for col in inspector.get_columns(table, schema=schema):
        if col["name"] == name:
            return col
    return None


def upgrade() -> None:
    conn = op.get_bind()
    insp = sa.inspect(conn)

    dataset = _define_dataset_table()
    user = _define_user_table()

    if "user_tenants" not in insp.get_table_names():
        # Define table with all necessary columns including primary key
        user_tenants = op.create_table(
            "user_tenants",
            sa.Column("user_id", sa.UUID, sa.ForeignKey("users.id"), primary_key=True),
            sa.Column("tenant_id", sa.UUID, sa.ForeignKey("tenants.id"), primary_key=True),
            sa.Column(
                "created_at", sa.DateTime(timezone=True), default=lambda: datetime.now(timezone.utc)
            ),
        )

        # Get all users with their tenant_id
        user_data = conn.execute(
            sa.select(user.c.id, user.c.tenant_id).where(user.c.tenant_id.isnot(None))
        ).fetchall()

        # Insert into user_tenants table
        if user_data:
            op.bulk_insert(
                user_tenants,
                [
                    {"user_id": user_id, "tenant_id": tenant_id, "created_at": _now()}
                    for user_id, tenant_id in user_data
                ],
            )

    tenant_id_column = _get_column(insp, "datasets", "tenant_id")
    if not tenant_id_column:
        op.add_column("datasets", sa.Column("tenant_id", sa.UUID(), nullable=True))

    # Build subquery, select users.tenant_id for each dataset.owner_id
    tenant_id_from_dataset_owner = (
        sa.select(user.c.tenant_id).where(user.c.id == dataset.c.owner_id).scalar_subquery()
    )

    if op.get_context().dialect.name == "sqlite":
        # SQLite needs batch_alter_table to apply the tenant_id backfill update
        with op.batch_alter_table("datasets") as batch_op:
            batch_op.execute(
                dataset.update().values(
                    tenant_id=tenant_id_from_dataset_owner,
                )
            )
    else:
        conn = op.get_bind()
        conn.execute(dataset.update().values(tenant_id=tenant_id_from_dataset_owner))

    op.create_index(op.f("ix_datasets_tenant_id"), "datasets", ["tenant_id"])


def downgrade() -> None:
    # ### commands auto generated by Alembic - please adjust! ###
    op.drop_table("user_tenants")
    op.drop_index(op.f("ix_datasets_tenant_id"), table_name="datasets")
    op.drop_column("datasets", "tenant_id")
    # ### end Alembic commands ###
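The tenant backfill above relies on a correlated scalar subquery. A self-contained sketch of what that update compiles to (table definitions trimmed to the relevant columns; these are not cognee's actual models):

```python
import sqlalchemy as sa

metadata = sa.MetaData()
users = sa.Table(
    "users", metadata,
    sa.Column("id", sa.UUID), sa.Column("tenant_id", sa.UUID),
)
datasets = sa.Table(
    "datasets", metadata,
    sa.Column("owner_id", sa.UUID), sa.Column("tenant_id", sa.UUID),
)

# For each dataset row, look up the owner's tenant_id.
owner_tenant = (
    sa.select(users.c.tenant_id).where(users.c.id == datasets.c.owner_id).scalar_subquery()
)
print(datasets.update().values(tenant_id=owner_tenant))
# UPDATE datasets SET tenant_id=(SELECT users.tenant_id FROM users WHERE users.id = datasets.owner_id)
```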
@ -110,6 +110,47 @@ If you'd rather run cognee-mcp in a container, you have two options:
# For stdio transport (default)
docker run -e TRANSPORT_MODE=stdio --env-file ./.env --rm -it cognee/cognee-mcp:main
```

**Installing optional dependencies at runtime:**

You can install optional dependencies when running the container by setting the `EXTRAS` environment variable:
```bash
# Install a single optional dependency group at runtime
docker run \
  -e TRANSPORT_MODE=http \
  -e EXTRAS=aws \
  --env-file ./.env \
  -p 8000:8000 \
  --rm -it cognee/cognee-mcp:main

# Install multiple optional dependency groups at runtime (comma-separated)
docker run \
  -e TRANSPORT_MODE=sse \
  -e EXTRAS=aws,postgres,neo4j \
  --env-file ./.env \
  -p 8000:8000 \
  --rm -it cognee/cognee-mcp:main
```

**Available optional dependency groups:**
- `aws` - S3 storage support
- `postgres` / `postgres-binary` - PostgreSQL database support
- `neo4j` - Neo4j graph database support
- `neptune` - AWS Neptune support
- `chromadb` - ChromaDB vector store support
- `scraping` - Web scraping capabilities
- `distributed` - Modal distributed execution
- `langchain` - LangChain integration
- `llama-index` - LlamaIndex integration
- `anthropic` - Anthropic models
- `groq` - Groq models
- `mistral` - Mistral models
- `ollama` / `huggingface` - Local model support
- `docs` - Document processing
- `codegraph` - Code analysis
- `monitoring` - Sentry & Langfuse monitoring
- `redis` - Redis support
- And more (see [pyproject.toml](https://github.com/topoteretes/cognee/blob/main/pyproject.toml) for full list)
2. **Pull from Docker Hub** (no build required):
```bash
# With HTTP transport (recommended for web deployments)
@ -119,6 +160,17 @@ If you'd rather run cognee-mcp in a container, you have two options:
# With stdio transport (default)
docker run -e TRANSPORT_MODE=stdio --env-file ./.env --rm -it cognee/cognee-mcp:main
```

**With runtime installation of optional dependencies:**
```bash
# Install optional dependencies from Docker Hub image
docker run \
  -e TRANSPORT_MODE=http \
  -e EXTRAS=aws,postgres \
  --env-file ./.env \
  -p 8000:8000 \
  --rm -it cognee/cognee-mcp:main
```

### **Important: Docker vs Direct Usage**
**Docker uses environment variables**, not command line arguments:
@ -4,6 +4,42 @@ set -e  # Exit on error
echo "Debug mode: $DEBUG"
echo "Environment: $ENVIRONMENT"

# Install optional dependencies if EXTRAS is set
if [ -n "$EXTRAS" ]; then
    echo "Installing optional dependencies: $EXTRAS"

    # Get the cognee version that's currently installed
    COGNEE_VERSION=$(uv pip show cognee | grep "Version:" | awk '{print $2}')
    echo "Current cognee version: $COGNEE_VERSION"

    # Build the extras list for cognee
    IFS=',' read -ra EXTRA_ARRAY <<< "$EXTRAS"
    # Combine base extras from pyproject.toml with requested extras
    ALL_EXTRAS=""
    for extra in "${EXTRA_ARRAY[@]}"; do
        # Trim whitespace
        extra=$(echo "$extra" | xargs)
        # Add to extras list if not already present
        if [[ ! "$ALL_EXTRAS" =~ (^|,)"$extra"(,|$) ]]; then
            if [ -z "$ALL_EXTRAS" ]; then
                ALL_EXTRAS="$extra"
            else
                ALL_EXTRAS="$ALL_EXTRAS,$extra"
            fi
        fi
    done

    echo "Installing cognee with extras: $ALL_EXTRAS"
    echo "Running: uv pip install 'cognee[$ALL_EXTRAS]==$COGNEE_VERSION'"
    uv pip install "cognee[$ALL_EXTRAS]==$COGNEE_VERSION"

    # Verify installation
    echo ""
    echo "✓ Optional dependencies installation completed"
else
    echo "No optional dependencies specified"
fi

# Set default transport mode if not specified
TRANSPORT_MODE=${TRANSPORT_MODE:-"stdio"}
echo "Transport mode: $TRANSPORT_MODE"
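The trim-and-dedupe loop above is easy to misread in shell syntax; a hedged sketch of the same normalization in Python (illustrative only, not part of the entrypoint script):

```python
# Mirrors the shell logic: split on commas, trim whitespace,
# and drop duplicates while preserving order.
def normalize_extras(extras: str) -> str:
    seen: list[str] = []
    for extra in extras.split(","):
        extra = extra.strip()
        if extra and extra not in seen:
            seen.append(extra)
    return ",".join(seen)

assert normalize_extras(" aws, postgres ,aws") == "aws,postgres"
```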
@ -9,7 +9,7 @@ dependencies = [
# For local cognee repo usage remove comment below and add absolute path to cognee. Then run `uv sync --reinstall` in the mcp folder on local cognee changes.
#"cognee[postgres,codegraph,gemini,huggingface,docs,neo4j] @ file:/Users/igorilic/Desktop/cognee",
# TODO: Remove gemini from optional dependencies for new Cognee version after 0.3.4
"cognee[postgres,codegraph,gemini,huggingface,docs,neo4j]==0.3.4",
"cognee[postgres,docs,neo4j]==0.3.7",
"fastmcp>=2.10.0,<3.0.0",
"mcp>=1.12.0,<2.0.0",
"uv>=0.6.3,<1.0.0",
@ -37,12 +37,10 @@ async def run():
toolResult = await session.call_tool("prune", arguments={})

toolResult = await session.call_tool(
    "codify", arguments={"repo_path": "SOME_REPO_PATH"}
)
toolResult = await session.call_tool("cognify", arguments={})

toolResult = await session.call_tool(
    "search", arguments={"search_type": "CODE", "search_query": "exceptions"}
    "search", arguments={"search_type": "GRAPH_COMPLETION"}
)

print(f"Cognify result: {toolResult.content}")
@ -1096,6 +1096,10 @@ async def main():
# Skip migrations when in API mode (the API server handles its own database)
if not args.no_migration and not args.api_url:
    from cognee.modules.engine.operations.setup import setup

    await setup()

    # Run Alembic migrations from the main cognee directory where alembic.ini is located
    logger.info("Running database migrations...")
    migration_result = subprocess.run(
8575 cognee-mcp/uv.lock (generated): file diff suppressed because it is too large
@ -19,6 +19,7 @@ from .api.v1.add import add
from .api.v1.delete import delete
from .api.v1.cognify import cognify
from .modules.memify import memify
from .modules.run_custom_pipeline import run_custom_pipeline
from .api.v1.update import update
from .api.v1.config.config import config
from .api.v1.datasets.datasets import datasets
@ -39,6 +39,8 @@ from cognee.api.v1.users.routers import (
)
from cognee.modules.users.methods.get_authenticated_user import REQUIRE_AUTHENTICATION

# Ensure application logging is configured for container stdout/stderr
setup_logging()
logger = get_logger()

if os.getenv("ENV", "prod") == "prod":

@ -74,6 +76,9 @@ async def lifespan(app: FastAPI):
await get_default_user()

# Emit a clear startup message for docker logs
logger.info("Backend server has started")

yield
@ -1,8 +1,5 @@
from uuid import UUID
import os
from typing import Union, BinaryIO, List, Optional, Dict, Any
from pydantic import BaseModel
from urllib.parse import urlparse
from typing import Union, BinaryIO, List, Optional, Any
from cognee.modules.users.models import User
from cognee.modules.pipelines import Task, run_pipeline
from cognee.modules.pipelines.layers.resolve_authorized_user_dataset import (

@ -17,16 +14,6 @@ from cognee.shared.logging_utils import get_logger
logger = get_logger()

try:
    from cognee.tasks.web_scraper.config import TavilyConfig, SoupCrawlerConfig
    from cognee.context_global_variables import (
        tavily_config as tavily,
        soup_crawler_config as soup_crawler,
    )
except ImportError:
    logger.debug(f"Unable to import {str(ImportError)}")
    pass


async def add(
    data: Union[BinaryIO, list[BinaryIO], str, list[str]],

@ -36,11 +23,8 @@ async def add(
    vector_db_config: dict = None,
    graph_db_config: dict = None,
    dataset_id: Optional[UUID] = None,
    preferred_loaders: List[str] = None,
    preferred_loaders: Optional[List[Union[str, dict[str, dict[str, Any]]]]] = None,
    incremental_loading: bool = True,
    extraction_rules: Optional[Dict[str, Any]] = None,
    tavily_config: Optional[BaseModel] = None,
    soup_crawler_config: Optional[BaseModel] = None,
    data_per_batch: Optional[int] = 20,
):
    """

@ -180,28 +164,14 @@ async def add(
    - TAVILY_API_KEY: YOUR_TAVILY_API_KEY

    """

    try:
        if not soup_crawler_config and extraction_rules:
            soup_crawler_config = SoupCrawlerConfig(extraction_rules=extraction_rules)
        if not tavily_config and os.getenv("TAVILY_API_KEY"):
            tavily_config = TavilyConfig(api_key=os.getenv("TAVILY_API_KEY"))

        soup_crawler.set(soup_crawler_config)
        tavily.set(tavily_config)

        http_schemes = {"http", "https"}

        def _is_http_url(item: Union[str, BinaryIO]) -> bool:
            return isinstance(item, str) and urlparse(item).scheme in http_schemes

        if _is_http_url(data):
            node_set = ["web_content"] if not node_set else node_set + ["web_content"]
        elif isinstance(data, list) and any(_is_http_url(item) for item in data):
            node_set = ["web_content"] if not node_set else node_set + ["web_content"]
    except NameError:
        logger.debug(f"Unable to import {str(ImportError)}")
        pass
    if preferred_loaders is not None:
        transformed = {}
        for item in preferred_loaders:
            if isinstance(item, dict):
                transformed.update(item)
            else:
                transformed[item] = {}
        preferred_loaders = transformed

    tasks = [
        Task(resolve_data_directories, include_subdirectories=True),
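To make the normalization above concrete, a minimal sketch (the loader names are hypothetical, not from this commit) of how a mixed `preferred_loaders` list is flattened into a single config dict:

```python
# A mixed list of bare loader names and {name: config} dicts becomes
# one dict keyed by loader name; bare names get an empty config.
preferred = ["pdf_loader", {"csv_loader": {"delimiter": ";"}}]
transformed = {}
for item in preferred:
    if isinstance(item, dict):
        transformed.update(item)
    else:
        transformed[item] = {}
assert transformed == {"pdf_loader": {}, "csv_loader": {"delimiter": ";"}}
```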
@ -10,6 +10,7 @@ from cognee.modules.users.methods import get_authenticated_user
from cognee.shared.utils import send_telemetry
from cognee.modules.pipelines.models import PipelineRunErrored
from cognee.shared.logging_utils import get_logger
from cognee import __version__ as cognee_version

logger = get_logger()

@ -63,7 +64,11 @@ def get_add_router() -> APIRouter:
send_telemetry(
    "Add API Endpoint Invoked",
    user.id,
    additional_properties={"endpoint": "POST /v1/add", "node_set": node_set},
    additional_properties={
        "endpoint": "POST /v1/add",
        "node_set": node_set,
        "cognee_version": cognee_version,
    },
)

from cognee.api.v1.add import add as cognee_add

@ -77,7 +82,9 @@ def get_add_router() -> APIRouter:
datasetName,
user=user,
dataset_id=datasetId,
node_set=node_set if node_set else None,
node_set=node_set
if node_set != [""]
else None,  # Transform default node_set endpoint value to None
)

if isinstance(add_run, PipelineRunErrored):
@ -29,7 +29,7 @@ from cognee.modules.pipelines.queues.pipeline_run_info_queues import (
)
from cognee.shared.logging_utils import get_logger
from cognee.shared.utils import send_telemetry

from cognee import __version__ as cognee_version

logger = get_logger("api.cognify")

@ -98,6 +98,7 @@ def get_cognify_router() -> APIRouter:
user.id,
additional_properties={
    "endpoint": "POST /v1/cognify",
    "cognee_version": cognee_version,
},
)
@ -1,4 +1,5 @@
from uuid import UUID
from cognee.modules.data.methods import has_dataset_data
from cognee.modules.users.methods import get_default_user
from cognee.modules.ingestion import discover_directory_datasets
from cognee.modules.pipelines.operations.get_pipeline_status import get_pipeline_status

@ -26,6 +27,16 @@ class datasets:
    return await get_dataset_data(dataset.id)

@staticmethod
async def has_data(dataset_id: str) -> bool:
    from cognee.modules.data.methods import get_dataset

    user = await get_default_user()

    dataset = await get_dataset(user.id, dataset_id)

    return await has_dataset_data(dataset.id)

@staticmethod
async def get_status(dataset_ids: list[UUID]) -> dict:
    return await get_pipeline_status(dataset_ids, pipeline_name="cognify_pipeline")
@ -24,6 +24,7 @@ from cognee.modules.users.permissions.methods import (
from cognee.modules.graph.methods import get_formatted_graph_data
from cognee.modules.pipelines.models import PipelineRunStatus
from cognee.shared.utils import send_telemetry
from cognee import __version__ as cognee_version

logger = get_logger()

@ -100,6 +101,7 @@ def get_datasets_router() -> APIRouter:
user.id,
additional_properties={
    "endpoint": "GET /v1/datasets",
    "cognee_version": cognee_version,
},
)

@ -147,6 +149,7 @@ def get_datasets_router() -> APIRouter:
user.id,
additional_properties={
    "endpoint": "POST /v1/datasets",
    "cognee_version": cognee_version,
},
)

@ -201,6 +204,7 @@ def get_datasets_router() -> APIRouter:
additional_properties={
    "endpoint": f"DELETE /v1/datasets/{str(dataset_id)}",
    "dataset_id": str(dataset_id),
    "cognee_version": cognee_version,
},
)

@ -246,6 +250,7 @@ def get_datasets_router() -> APIRouter:
    "endpoint": f"DELETE /v1/datasets/{str(dataset_id)}/data/{str(data_id)}",
    "dataset_id": str(dataset_id),
    "data_id": str(data_id),
    "cognee_version": cognee_version,
},
)

@ -327,6 +332,7 @@ def get_datasets_router() -> APIRouter:
additional_properties={
    "endpoint": f"GET /v1/datasets/{str(dataset_id)}/data",
    "dataset_id": str(dataset_id),
    "cognee_version": cognee_version,
},
)

@ -387,6 +393,7 @@ def get_datasets_router() -> APIRouter:
additional_properties={
    "endpoint": "GET /v1/datasets/status",
    "datasets": [str(dataset_id) for dataset_id in datasets],
    "cognee_version": cognee_version,
},
)

@ -433,6 +440,7 @@ def get_datasets_router() -> APIRouter:
    "endpoint": f"GET /v1/datasets/{str(dataset_id)}/data/{str(data_id)}/raw",
    "dataset_id": str(dataset_id),
    "data_id": str(data_id),
    "cognee_version": cognee_version,
},
)
@ -6,6 +6,7 @@ from cognee.shared.logging_utils import get_logger
from cognee.modules.users.models import User
from cognee.modules.users.methods import get_authenticated_user
from cognee.shared.utils import send_telemetry
from cognee import __version__ as cognee_version

logger = get_logger()

@ -39,6 +40,7 @@ def get_delete_router() -> APIRouter:
    "endpoint": "DELETE /v1/delete",
    "dataset_id": str(dataset_id),
    "data_id": str(data_id),
    "cognee_version": cognee_version,
},
)
@ -12,6 +12,7 @@ from cognee.modules.users.methods import get_authenticated_user
from cognee.shared.utils import send_telemetry
from cognee.modules.pipelines.models import PipelineRunErrored
from cognee.shared.logging_utils import get_logger
from cognee import __version__ as cognee_version

logger = get_logger()

@ -73,7 +74,7 @@ def get_memify_router() -> APIRouter:
send_telemetry(
    "Memify API Endpoint Invoked",
    user.id,
    additional_properties={"endpoint": "POST /v1/memify"},
    additional_properties={"endpoint": "POST /v1/memify", "cognee_version": cognee_version},
)

if not payload.dataset_id and not payload.dataset_name:
@ -1,12 +1,18 @@
from uuid import UUID
from typing import List
from typing import List, Union

from fastapi import APIRouter, Depends
from fastapi.responses import JSONResponse

from cognee.modules.users.models import User
from cognee.api.DTO import InDTO
from cognee.modules.users.methods import get_authenticated_user
from cognee.shared.utils import send_telemetry
from cognee import __version__ as cognee_version


class SelectTenantDTO(InDTO):
    tenant_id: UUID | None = None


def get_permissions_router() -> APIRouter:

@ -48,6 +54,7 @@ def get_permissions_router() -> APIRouter:
    "endpoint": f"POST /v1/permissions/datasets/{str(principal_id)}",
    "dataset_ids": str(dataset_ids),
    "principal_id": str(principal_id),
    "cognee_version": cognee_version,
},
)

@ -89,6 +96,7 @@ def get_permissions_router() -> APIRouter:
additional_properties={
    "endpoint": "POST /v1/permissions/roles",
    "role_name": role_name,
    "cognee_version": cognee_version,
},
)

@ -133,6 +141,7 @@ def get_permissions_router() -> APIRouter:
    "endpoint": f"POST /v1/permissions/users/{str(user_id)}/roles",
    "user_id": str(user_id),
    "role_id": str(role_id),
    "cognee_version": cognee_version,
},
)

@ -175,6 +184,7 @@ def get_permissions_router() -> APIRouter:
    "endpoint": f"POST /v1/permissions/users/{str(user_id)}/tenants",
    "user_id": str(user_id),
    "tenant_id": str(tenant_id),
    "cognee_version": cognee_version,
},
)

@ -209,6 +219,7 @@ def get_permissions_router() -> APIRouter:
additional_properties={
    "endpoint": "POST /v1/permissions/tenants",
    "tenant_name": tenant_name,
    "cognee_version": cognee_version,
},
)

@ -220,4 +231,39 @@ def get_permissions_router() -> APIRouter:
    status_code=200, content={"message": "Tenant created.", "tenant_id": str(tenant_id)}
)

@permissions_router.post("/tenants/select")
async def select_tenant(payload: SelectTenantDTO, user: User = Depends(get_authenticated_user)):
    """
    Select the current tenant.

    This endpoint selects the tenant with the specified UUID. Tenants are used
    to organize users and resources in multi-tenant environments, providing
    isolation and access control between different groups or organizations.

    Sending a null/None value as tenant_id selects the user's default single-user tenant.

    ## Request Parameters
    - **tenant_id** (Union[UUID, None]): UUID of the tenant to select. If null/None is provided, the default single-user tenant is used.

    ## Response
    Returns a success message along with the selected tenant id.
    """
    send_telemetry(
        "Permissions API Endpoint Invoked",
        user.id,
        additional_properties={
            "endpoint": f"POST /v1/permissions/tenants/{str(payload.tenant_id)}",
            "tenant_id": str(payload.tenant_id),
        },
    )

    from cognee.modules.users.tenants.methods import select_tenant as select_tenant_method

    await select_tenant_method(user_id=user.id, tenant_id=payload.tenant_id)

    return JSONResponse(
        status_code=200,
        content={"message": "Tenant selected.", "tenant_id": str(payload.tenant_id)},
    )

return permissions_router
@ -13,6 +13,7 @@ from cognee.modules.users.models import User
from cognee.modules.search.operations import get_history
from cognee.modules.users.methods import get_authenticated_user
from cognee.shared.utils import send_telemetry
from cognee import __version__ as cognee_version


# Note: Datasets sent by name will only map to datasets owned by the request sender

@ -61,9 +62,7 @@ def get_search_router() -> APIRouter:
send_telemetry(
    "Search API Endpoint Invoked",
    user.id,
    additional_properties={
        "endpoint": "GET /v1/search",
    },
    additional_properties={"endpoint": "GET /v1/search", "cognee_version": cognee_version},
)

try:

@ -118,6 +117,7 @@ def get_search_router() -> APIRouter:
    "top_k": payload.top_k,
    "only_context": payload.only_context,
    "use_combined_context": payload.use_combined_context,
    "cognee_version": cognee_version,
},
)
@ -1,6 +1,7 @@
from uuid import UUID
from typing import Union, Optional, List, Type

from cognee.infrastructure.databases.graph import get_graph_engine
from cognee.modules.engine.models.node_set import NodeSet
from cognee.modules.users.models import User
from cognee.modules.search.types import SearchResult, SearchType, CombinedSearchResult
@ -12,6 +12,7 @@ from cognee.modules.sync.methods import get_running_sync_operations_for_user, ge
from cognee.shared.utils import send_telemetry
from cognee.shared.logging_utils import get_logger
from cognee.api.v1.sync import SyncResponse
from cognee import __version__ as cognee_version
from cognee.context_global_variables import set_database_global_context_variables

logger = get_logger()

@ -99,6 +100,7 @@ def get_sync_router() -> APIRouter:
user.id,
additional_properties={
    "endpoint": "POST /v1/sync",
    "cognee_version": cognee_version,
    "dataset_ids": [str(id) for id in request.dataset_ids]
    if request.dataset_ids
    else "*",

@ -205,6 +207,7 @@ def get_sync_router() -> APIRouter:
user.id,
additional_properties={
    "endpoint": "GET /v1/sync/status",
    "cognee_version": cognee_version,
},
)
@ -503,7 +503,7 @@ def start_ui(
if start_mcp:
    logger.info("Starting Cognee MCP server with Docker...")
    try:
        image = "cognee/cognee-mcp:feature-standalone-mcp"  # TODO: change to "cognee/cognee-mcp:main" right before merging into main
        image = "cognee/cognee-mcp:main"
        subprocess.run(["docker", "pull", image], check=True)

        import uuid

@ -538,9 +538,7 @@ def start_ui(
env_file = os.path.join(cwd, ".env")
docker_cmd.extend(["--env-file", env_file])

docker_cmd.append(
    image
)  # TODO: change to "cognee/cognee-mcp:main" right before merging into main
docker_cmd.append(image)

mcp_process = subprocess.Popen(
    docker_cmd,
@ -9,6 +9,7 @@ from cognee.shared.logging_utils import get_logger
from cognee.modules.users.models import User
from cognee.modules.users.methods import get_authenticated_user
from cognee.shared.utils import send_telemetry
from cognee import __version__ as cognee_version
from cognee.modules.pipelines.models.PipelineRunInfo import (
    PipelineRunErrored,
)

@ -64,6 +65,7 @@ def get_update_router() -> APIRouter:
    "dataset_id": str(dataset_id),
    "data_id": str(data_id),
    "node_set": str(node_set),
    "cognee_version": cognee_version,
},
)
@ -1,5 +1,5 @@
from uuid import UUID
from typing import Union, BinaryIO, List, Optional
from typing import Union, BinaryIO, List, Optional, Any

from cognee.modules.users.models import User
from cognee.api.v1.delete import delete

@ -15,7 +15,7 @@ async def update(
    node_set: Optional[List[str]] = None,
    vector_db_config: dict = None,
    graph_db_config: dict = None,
    preferred_loaders: List[str] = None,
    preferred_loaders: dict[str, dict[str, Any]] = None,
    incremental_loading: bool = True,
):
    """
@ -8,6 +8,7 @@ from cognee.modules.users.models import User
from cognee.context_global_variables import set_database_global_context_variables
from cognee.shared.utils import send_telemetry
from cognee import __version__ as cognee_version

logger = get_logger()

@ -46,6 +47,7 @@ def get_visualize_router() -> APIRouter:
additional_properties={
    "endpoint": "GET /v1/visualize",
    "dataset_id": str(dataset_id),
    "cognee_version": cognee_version,
},
)
@ -1,4 +1,5 @@
import os
from pathlib import Path
from typing import Optional
from functools import lru_cache
from cognee.root_dir import get_absolute_path, ensure_absolute_path

@ -11,6 +12,9 @@ class BaseConfig(BaseSettings):
data_root_directory: str = get_absolute_path(".data_storage")
system_root_directory: str = get_absolute_path(".cognee_system")
cache_root_directory: str = get_absolute_path(".cognee_cache")
logs_root_directory: str = os.getenv(
    "COGNEE_LOGS_DIR", str(os.path.join(os.path.dirname(os.path.dirname(__file__)), "logs"))
)
monitoring_tool: object = Observer.NONE

@pydantic.model_validator(mode="after")

@ -30,6 +34,8 @@ class BaseConfig(BaseSettings):
# Require absolute paths for root directories
self.data_root_directory = ensure_absolute_path(self.data_root_directory)
self.system_root_directory = ensure_absolute_path(self.system_root_directory)
self.logs_root_directory = ensure_absolute_path(self.logs_root_directory)

# Set monitoring tool based on available keys
if self.langfuse_public_key and self.langfuse_secret_key:
    self.monitoring_tool = Observer.LANGFUSE

@ -49,6 +55,7 @@ class BaseConfig(BaseSettings):
"system_root_directory": self.system_root_directory,
"monitoring_tool": self.monitoring_tool,
"cache_root_directory": self.cache_root_directory,
"logs_root_directory": self.logs_root_directory,
}
@ -4,6 +4,8 @@ from typing import Union
from uuid import UUID

from cognee.base_config import get_base_config
from cognee.infrastructure.databases.vector.config import get_vectordb_context_config
from cognee.infrastructure.databases.graph.config import get_graph_context_config
from cognee.infrastructure.databases.utils import get_or_create_dataset_database
from cognee.infrastructure.files.storage.config import file_storage_config
from cognee.modules.users.methods import get_user

@ -13,14 +15,41 @@ from cognee.modules.users.methods import get_user
vector_db_config = ContextVar("vector_db_config", default=None)
graph_db_config = ContextVar("graph_db_config", default=None)
session_user = ContextVar("session_user", default=None)
soup_crawler_config = ContextVar("soup_crawler_config", default=None)
tavily_config = ContextVar("tavily_config", default=None)

VECTOR_DBS_WITH_MULTI_USER_SUPPORT = ["lancedb", "falkor"]
GRAPH_DBS_WITH_MULTI_USER_SUPPORT = ["kuzu", "falkor"]


async def set_session_user_context_variable(user):
    session_user.set(user)


def multi_user_support_possible():
    graph_db_config = get_graph_context_config()
    vector_db_config = get_vectordb_context_config()
    return (
        graph_db_config["graph_database_provider"] in GRAPH_DBS_WITH_MULTI_USER_SUPPORT
        and vector_db_config["vector_db_provider"] in VECTOR_DBS_WITH_MULTI_USER_SUPPORT
    )


def backend_access_control_enabled():
    backend_access_control = os.environ.get("ENABLE_BACKEND_ACCESS_CONTROL", None)
    if backend_access_control is None:
        # If backend access control is not defined in environment variables,
        # enable it by default if graph and vector DBs can support it, otherwise disable it
        return multi_user_support_possible()
    elif backend_access_control.lower() == "true":
        # If enabled, ensure that the current graph and vector DBs can support it
        multi_user_support = multi_user_support_possible()
        if not multi_user_support:
            raise EnvironmentError(
                "ENABLE_BACKEND_ACCESS_CONTROL is set to true but the current graph and/or vector databases do not support multi-user access control. Please use supported databases or disable backend access control."
            )
        return True
    return False


async def set_database_global_context_variables(dataset: Union[str, UUID], user_id: UUID):
    """
    If backend access control is enabled this function will ensure all datasets have their own databases,

@ -40,9 +69,7 @@ async def set_database_global_context_variables(dataset: Union[str, UUID], user_
    """

    base_config = get_base_config()

    if not os.getenv("ENABLE_BACKEND_ACCESS_CONTROL", "false").lower() == "true":
    if not backend_access_control_enabled():
        return

    user = await get_user(user_id)

@ -50,6 +77,7 @@ async def set_database_global_context_variables(dataset: Union[str, UUID], user_
    # To ensure permissions are enforced properly all datasets will have their own databases
    dataset_database = await get_or_create_dataset_database(dataset, user)

    base_config = get_base_config()
    data_root_directory = os.path.join(
        base_config.data_root_directory, str(user.tenant_id or user.id)
    )

@ -59,15 +87,17 @@ async def set_database_global_context_variables(dataset: Union[str, UUID], user_
    # Set vector and graph database configuration based on dataset database information
    vector_config = {
        "vector_db_url": os.path.join(
            databases_directory_path, dataset_database.vector_database_name
        ),
        "vector_db_key": "",
        "vector_db_provider": "lancedb",
        "vector_db_provider": dataset_database.vector_database_provider,
        "vector_db_url": dataset_database.vector_database_url,
        "vector_db_key": dataset_database.vector_database_key,
        "vector_db_name": dataset_database.vector_database_name,
    }

    graph_config = {
        "graph_database_provider": "kuzu",
        "graph_database_provider": dataset_database.graph_database_provider,
        "graph_database_url": dataset_database.graph_database_url,
        "graph_database_name": dataset_database.graph_database_name,
        "graph_database_key": dataset_database.graph_database_key,
        "graph_file_path": os.path.join(
            databases_directory_path, dataset_database.graph_database_name
        ),
@ -56,7 +56,7 @@ class CogneeValidationError(CogneeApiError):
self,
message: str = "A validation error occurred.",
name: str = "CogneeValidationError",
status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
status_code=status.HTTP_422_UNPROCESSABLE_CONTENT,
log=True,
log_level="ERROR",
):

@ -15,7 +15,7 @@ class DatabaseNotCreatedError(CogneeSystemError):
self,
message: str = "The database has not been created yet. Please call `await setup()` first.",
name: str = "DatabaseNotCreatedError",
status_code: int = status.HTTP_422_UNPROCESSABLE_ENTITY,
status_code: int = status.HTTP_422_UNPROCESSABLE_CONTENT,
):
    super().__init__(message, name, status_code)

@ -99,7 +99,7 @@ class EmbeddingException(CogneeConfigurationError):
self,
message: str = "Embedding Exception.",
name: str = "EmbeddingException",
status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
status_code=status.HTTP_422_UNPROCESSABLE_CONTENT,
):
    super().__init__(message, name, status_code)
@ -26,6 +26,7 @@ class GraphConfig(BaseSettings):
- graph_database_username
- graph_database_password
- graph_database_port
- graph_database_key
- graph_file_path
- graph_model
- graph_topology

@ -41,6 +42,7 @@ class GraphConfig(BaseSettings):
graph_database_username: str = ""
graph_database_password: str = ""
graph_database_port: int = 123
graph_database_key: str = ""
graph_file_path: str = ""
graph_filename: str = ""
graph_model: object = KnowledgeGraph

@ -90,6 +92,7 @@ class GraphConfig(BaseSettings):
"graph_database_username": self.graph_database_username,
"graph_database_password": self.graph_database_password,
"graph_database_port": self.graph_database_port,
"graph_database_key": self.graph_database_key,
"graph_file_path": self.graph_file_path,
"graph_model": self.graph_model,
"graph_topology": self.graph_topology,

@ -116,6 +119,7 @@ class GraphConfig(BaseSettings):
"graph_database_username": self.graph_database_username,
"graph_database_password": self.graph_database_password,
"graph_database_port": self.graph_database_port,
"graph_database_key": self.graph_database_key,
"graph_file_path": self.graph_file_path,
}
@ -33,6 +33,7 @@ def create_graph_engine(
graph_database_username="",
graph_database_password="",
graph_database_port="",
graph_database_key="",
):
    """
    Create a graph engine based on the specified provider type.

@ -69,6 +70,7 @@ def create_graph_engine(
graph_database_url=graph_database_url,
graph_database_username=graph_database_username,
graph_database_password=graph_database_password,
database_name=graph_database_name,
)

if graph_database_provider == "neo4j":
@ -159,6 +159,11 @@ class GraphDBInterface(ABC):
- get_connections
"""

@abstractmethod
async def is_empty(self) -> bool:
    logger.warning("is_empty() is not implemented")
    return True

@abstractmethod
async def query(self, query: str, params: dict) -> List[Any]:
    """
@ -198,6 +198,15 @@ class KuzuAdapter(GraphDBInterface):
except FileNotFoundError:
    logger.warning(f"Kuzu S3 storage file not found: {self.db_path}")

async def is_empty(self) -> bool:
    query = """
        MATCH (n)
        RETURN true
        LIMIT 1;
    """
    query_result = await self.query(query)
    return len(query_result) == 0

async def query(self, query: str, params: Optional[dict] = None) -> List[Tuple]:
    """
    Execute a Kuzu query asynchronously with automatic reconnection.

@ -1357,9 +1366,15 @@ class KuzuAdapter(GraphDBInterface):
params[param_name] = values

where_clause = " AND ".join(where_clauses)
nodes_query = (
    f"MATCH (n:Node) WHERE {where_clause} RETURN n.id, {{properties: n.properties}}"
)
nodes_query = f"""
    MATCH (n:Node)
    WHERE {where_clause}
    RETURN n.id, {{
        name: n.name,
        type: n.type,
        properties: n.properties
    }}
"""
edges_query = f"""
    MATCH (n1:Node)-[r:EDGE]->(n2:Node)
    WHERE {where_clause.replace("n.", "n1.")} AND {where_clause.replace("n.", "n2.")}
@ -87,6 +87,15 @@ class Neo4jAdapter(GraphDBInterface):
async with self.driver.session(database=self.graph_database_name) as session:
    yield session

async def is_empty(self) -> bool:
    query = """
        RETURN EXISTS {
            MATCH (n)
        } AS node_exists;
    """
    query_result = await self.query(query)
    return not query_result[0]["node_exists"]

@deadlock_retry()
async def query(
    self,
@ -416,6 +416,15 @@ class NeptuneAnalyticsAdapter(NeptuneGraphDB, VectorDBInterface):
self._client.query(f"MATCH (n :{self._VECTOR_NODE_LABEL}) DETACH DELETE n")
pass

async def is_empty(self) -> bool:
    query = """
        MATCH (n)
        RETURN true
        LIMIT 1;
    """
    query_result = await self._client.query(query)
    return len(query_result) == 0

@staticmethod
def _get_scored_result(
    item: dict, with_vector: bool = False, with_score: bool = False
@ -1,11 +1,15 @@
import os
from uuid import UUID
from typing import Union

from sqlalchemy import select
from sqlalchemy.exc import IntegrityError
from cognee.modules.data.methods import create_dataset

from cognee.base_config import get_base_config
from cognee.modules.data.methods import create_dataset
from cognee.infrastructure.databases.relational import get_relational_engine
from cognee.infrastructure.databases.vector import get_vectordb_config
from cognee.infrastructure.databases.graph.config import get_graph_config
from cognee.modules.data.methods import get_unique_dataset_id
from cognee.modules.users.models import DatasetDatabase
from cognee.modules.users.models import User

@ -32,8 +36,32 @@ async def get_or_create_dataset_database(
dataset_id = await get_unique_dataset_id(dataset, user)

vector_db_name = f"{dataset_id}.lance.db"
graph_db_name = f"{dataset_id}.pkl"
vector_config = get_vectordb_config()
graph_config = get_graph_config()

# Note: for hybrid databases both graph and vector DB name have to be the same
if graph_config.graph_database_provider == "kuzu":
    graph_db_name = f"{dataset_id}.pkl"
else:
    graph_db_name = f"{dataset_id}"

if vector_config.vector_db_provider == "lancedb":
    vector_db_name = f"{dataset_id}.lance.db"
else:
    vector_db_name = f"{dataset_id}"

base_config = get_base_config()
databases_directory_path = os.path.join(
    base_config.system_root_directory, "databases", str(user.id)
)

# Determine vector database URL
if vector_config.vector_db_provider == "lancedb":
    vector_db_url = os.path.join(databases_directory_path, vector_config.vector_db_name)
else:
    vector_db_url = vector_config.vector_database_url

# Determine graph database URL

async with db_engine.get_async_session() as session:
    # Create dataset if it doesn't exist

@ -55,6 +83,12 @@ async def get_or_create_dataset_database(
dataset_id=dataset_id,
vector_database_name=vector_db_name,
graph_database_name=graph_db_name,
vector_database_provider=vector_config.vector_db_provider,
graph_database_provider=graph_config.graph_database_provider,
vector_database_url=vector_db_url,
graph_database_url=graph_config.graph_database_url,
vector_database_key=vector_config.vector_db_key,
graph_database_key=graph_config.graph_database_key,
)

try:
@ -18,12 +18,14 @@ class VectorConfig(BaseSettings):
Instance variables:
- vector_db_url: The URL of the vector database.
- vector_db_port: The port for the vector database.
- vector_db_name: The name of the vector database.
- vector_db_key: The key for accessing the vector database.
- vector_db_provider: The provider for the vector database.
"""

vector_db_url: str = ""
vector_db_port: int = 1234
vector_db_name: str = ""
vector_db_key: str = ""
vector_db_provider: str = "lancedb"

@ -58,6 +60,7 @@ class VectorConfig(BaseSettings):
return {
    "vector_db_url": self.vector_db_url,
    "vector_db_port": self.vector_db_port,
    "vector_db_name": self.vector_db_name,
    "vector_db_key": self.vector_db_key,
    "vector_db_provider": self.vector_db_provider,
}
@ -1,5 +1,6 @@
from .supported_databases import supported_databases
from .embeddings import get_embedding_engine
from cognee.infrastructure.databases.graph.config import get_graph_context_config

from functools import lru_cache

@ -8,6 +9,7 @@ from functools import lru_cache
def create_vector_engine(
    vector_db_provider: str,
    vector_db_url: str,
    vector_db_name: str,
    vector_db_port: str = "",
    vector_db_key: str = "",
):

@ -27,6 +29,7 @@ def create_vector_engine(
- vector_db_url (str): The URL for the vector database instance.
- vector_db_port (str): The port for the vector database instance. Required for some
  providers.
- vector_db_name (str): The name of the vector database instance.
- vector_db_key (str): The API key or access token for the vector database instance.
- vector_db_provider (str): The name of the vector database provider to use (e.g.,
  'pgvector').

@ -45,9 +48,10 @@ def create_vector_engine(
url=vector_db_url,
api_key=vector_db_key,
embedding_engine=embedding_engine,
database_name=vector_db_name,
)

if vector_db_provider == "pgvector":
if vector_db_provider.lower() == "pgvector":
    from cognee.infrastructure.databases.relational import get_relational_config

    # Get configuration for postgres database

@ -78,7 +82,7 @@ def create_vector_engine(
    embedding_engine,
)

elif vector_db_provider == "chromadb":
elif vector_db_provider.lower() == "chromadb":
    try:
        import chromadb
    except ImportError:

@ -94,7 +98,7 @@ def create_vector_engine(
    embedding_engine=embedding_engine,
)

elif vector_db_provider == "neptune_analytics":
elif vector_db_provider.lower() == "neptune_analytics":
    try:
        from langchain_aws import NeptuneAnalyticsGraph
    except ImportError:

@ -122,7 +126,7 @@ def create_vector_engine(
    embedding_engine=embedding_engine,
)

else:
elif vector_db_provider.lower() == "lancedb":
    from .lancedb.LanceDBAdapter import LanceDBAdapter

    return LanceDBAdapter(

@ -130,3 +134,9 @@ def create_vector_engine(
    api_key=vector_db_key,
    embedding_engine=embedding_engine,
)

else:
    raise EnvironmentError(
        f"Unsupported vector database provider: {vector_db_provider}. "
        f"Supported providers are: {', '.join(list(supported_databases.keys()) + ['LanceDB', 'PGVector', 'neptune_analytics', 'ChromaDB'])}"
    )
@ -124,7 +124,7 @@ class OllamaEmbeddingEngine(EmbeddingEngine):
    self.endpoint, json=payload, headers=headers, timeout=60.0
) as response:
    data = await response.json()
    return data["embedding"]
    return data["embeddings"][0]

def get_vector_size(self) -> int:
    """
@ -15,7 +15,7 @@ class CollectionNotFoundError(CogneeValidationError):
self,
message,
name: str = "CollectionNotFoundError",
status_code: int = status.HTTP_422_UNPROCESSABLE_ENTITY,
status_code: int = status.HTTP_422_UNPROCESSABLE_CONTENT,
log=True,
log_level="DEBUG",
):
@ -324,7 +324,6 @@ class LanceDBAdapter(VectorDBInterface):
def get_data_point_schema(self, model_type: BaseModel):
    related_models_fields = []

    for field_name, field_config in model_type.model_fields.items():
        if hasattr(field_config, "model_fields"):
            related_models_fields.append(field_name)
@ -1,4 +1,4 @@
from pydantic import BaseModel
from pydantic import BaseModel, field_validator
from typing import Optional, Any, Dict


@ -18,9 +18,21 @@ class Edge(BaseModel):
# Mixed usage
has_items: (Edge(weight=0.5, weights={"confidence": 0.9}), list[Item])

# With edge_text for rich embedding representation
contains: (Edge(relationship_type="contains", edge_text="relationship_name: contains; entity_description: Alice"), Entity)
"""

weight: Optional[float] = None
weights: Optional[Dict[str, float]] = None
relationship_type: Optional[str] = None
properties: Optional[Dict[str, Any]] = None
edge_text: Optional[str] = None

@field_validator("edge_text", mode="before")
@classmethod
def ensure_edge_text(cls, v, info):
    """Auto-populate edge_text from relationship_type if not explicitly provided."""
    if v is None and info.data.get("relationship_type"):
        return info.data["relationship_type"]
    return v
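A minimal usage sketch of the validator above (assumes the `Edge` model exactly as defined in this file; not part of the commit):

```python
# edge_text falls back to relationship_type when not given explicitly.
edge = Edge(relationship_type="contains")
assert edge.edge_text == "contains"

# An explicit edge_text always wins over the fallback.
edge = Edge(relationship_type="contains", edge_text="relationship_name: contains")
assert edge.edge_text == "relationship_name: contains"
```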
@ -8,6 +8,6 @@ class FileContentHashingError(Exception):
self,
message: str = "Failed to hash content of the file.",
name: str = "FileContentHashingError",
status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
status_code=status.HTTP_422_UNPROCESSABLE_CONTENT,
):
    super().__init__(message, name, status_code)
@ -82,16 +82,16 @@ class LocalFileStorage(Storage):
self.ensure_directory_exists(file_dir_path)

if overwrite or not os.path.exists(full_file_path):
    with open(
        full_file_path,
        mode="w" if isinstance(data, str) else "wb",
        encoding="utf-8" if isinstance(data, str) else None,
    ) as file:
        if hasattr(data, "read"):
            data.seek(0)
            file.write(data.read())
        else:
    if isinstance(data, str):
        with open(full_file_path, mode="w", encoding="utf-8", newline="\n") as file:
            file.write(data)
    else:
        with open(full_file_path, mode="wb") as file:
            if hasattr(data, "read"):
                data.seek(0)
                file.write(data.read())
            else:
                file.write(data)

    file.close()
@ -70,18 +70,18 @@ class S3FileStorage(Storage):
if overwrite or not await self.file_exists(file_path):

    def save_data_to_file():
        with self.s3.open(
            full_file_path,
            mode="w" if isinstance(data, str) else "wb",
            encoding="utf-8" if isinstance(data, str) else None,
        ) as file:
            if hasattr(data, "read"):
                data.seek(0)
                file.write(data.read())
            else:
        if isinstance(data, str):
            with self.s3.open(
                full_file_path, mode="w", encoding="utf-8", newline="\n"
            ) as file:
                file.write(data)

                file.close()
        else:
            with self.s3.open(full_file_path, mode="wb") as file:
                if hasattr(data, "read"):
                    data.seek(0)
                    file.write(data.read())
                else:
                    file.write(data)

    await run_async(save_data_to_file)
@ -1,6 +1,6 @@
import io
import os.path
from typing import BinaryIO, TypedDict
from typing import BinaryIO, TypedDict, Optional
from pathlib import Path

from cognee.shared.logging_utils import get_logger

@ -27,7 +27,7 @@ class FileMetadata(TypedDict):
file_size: int


async def get_file_metadata(file: BinaryIO) -> FileMetadata:
async def get_file_metadata(file: BinaryIO, name: Optional[str] = None) -> FileMetadata:
    """
    Retrieve metadata from a file object.


@ -53,7 +53,7 @@ async def get_file_metadata(file: BinaryIO) -> FileMetadata:
except io.UnsupportedOperation as error:
    logger.error(f"Error retrieving content hash for file: {file.name} \n{str(error)}\n\n")

file_type = guess_file_type(file)
file_type = guess_file_type(file, name)

file_path = getattr(file, "name", None) or getattr(file, "full_name", None)
@ -1,8 +1,9 @@
from typing import BinaryIO
import io
from pathlib import Path
from typing import BinaryIO, Optional, Any
import filetype

from .is_text_content import is_text_content
from .is_csv_content import is_csv_content
from tempfile import SpooledTemporaryFile
from filetype.types.base import Type


class FileTypeException(Exception):

@ -24,90 +25,7 @@ class FileTypeException(Exception):
    self.message = message


class TxtFileType(filetype.Type):
    """
    Represents a text file type with specific MIME and extension properties.

    Public methods:
    - match: Determines whether a given buffer matches the text file type.
    """

    MIME = "text/plain"
    EXTENSION = "txt"

    def __init__(self):
        super(TxtFileType, self).__init__(mime=TxtFileType.MIME, extension=TxtFileType.EXTENSION)

    def match(self, buf):
        """
        Determine if the given buffer contains text content.

        Parameters:
        -----------

        - buf: The buffer to check for text content.

        Returns:
        --------

        Returns True if the buffer is identified as text content, otherwise False.
        """
        return is_text_content(buf)


txt_file_type = TxtFileType()

filetype.add_type(txt_file_type)


class CustomPdfMatcher(filetype.Type):
    """
    Match PDF file types based on MIME type and extension.

    Public methods:
    - match

    Instance variables:
    - MIME: The MIME type of the PDF.
    - EXTENSION: The file extension of the PDF.
    """

    MIME = "application/pdf"
    EXTENSION = "pdf"

    def __init__(self):
        super(CustomPdfMatcher, self).__init__(
            mime=CustomPdfMatcher.MIME, extension=CustomPdfMatcher.EXTENSION
        )

    def match(self, buf):
        """
        Determine if the provided buffer is a PDF file.

        This method checks for the presence of the PDF signature in the buffer.

        Raises:
        - TypeError: If the buffer is not of bytes type.

        Parameters:
        -----------

        - buf: The buffer containing the data to be checked.

        Returns:
        --------

        Returns True if the buffer contains a PDF signature, otherwise returns False.
        """
        return b"PDF-" in buf


custom_pdf_matcher = CustomPdfMatcher()

filetype.add_type(custom_pdf_matcher)


def guess_file_type(file: BinaryIO) -> filetype.Type:
def guess_file_type(file: BinaryIO, name: Optional[str] = None) -> filetype.Type:
    """
    Guess the file type from the given binary file stream.


@ -124,56 +42,26 @@ def guess_file_type(file: BinaryIO) -> filetype.Type:
    - filetype.Type: The guessed file type, represented as filetype.Type.
    """

    # Note: If file has a .txt or .text extension, consider it a plain text file, as filetype.guess may not detect it properly
    # since plain text has no magic number encoding
    ext = None
    if isinstance(file, str):
        ext = Path(file).suffix
    elif name is not None:
        ext = Path(name).suffix

    if ext in [".txt", ".text"]:
        file_type = Type("text/plain", "txt")
        return file_type

    file_type = filetype.guess(file)

    # If the file type could not be determined, consider it a plain text file, as plain text files have no magic number encoding
    if file_type is None:
        from filetype.types.base import Type

        file_type = Type("text/plain", "txt")

    if file_type is None:
        raise FileTypeException(f"Unknown file detected: {file.name}.")

    return file_type


class CsvFileType(filetype.Type):
    """
    Match CSV file types based on MIME type and extension.

    Public methods:
    - match

    Instance variables:
    - MIME: The MIME type of the CSV.
    - EXTENSION: The file extension of the CSV.
    """

    MIME = "text/csv"
    EXTENSION = "csv"

    def __init__(self):
        super().__init__(mime=self.MIME, extension=self.EXTENSION)

    def match(self, buf):
        """
        Determine if the given buffer contains csv content.

        Parameters:
        -----------

        - buf: The buffer to check for csv content.

        Returns:
        --------

        Returns True if the buffer is identified as csv content, otherwise False.
        """

        return is_csv_content(buf)


csv_file_type = CsvFileType()

filetype.add_type(csv_file_type)
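A minimal usage sketch of the updated signature (the file name and contents are hypothetical; not part of the commit):

```python
# The optional `name` hint lets extension-based detection work for binary
# streams whose .name attribute is unavailable or misleading.
with open("notes.txt", "rb") as stream:
    file_type = guess_file_type(stream, name="notes.txt")
    print(file_type.mime, file_type.extension)  # text/plain txt
```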
@ -1,15 +1,13 @@
For the purposes of identifying timestamps in a query, you are tasked with extracting relevant timestamps from the query.
## Timestamp requirements
- If the query contains interval extrack both starts_at and ends_at properties
- If the query contains an instantaneous timestamp, starts_at and ends_at should be the same
- If the query its open-ended (before 2009 or after 2009), the corresponding non defined end of the time should be none
-For example: "before 2009" -- starts_at: None, ends_at: 2009 or "after 2009" -- starts_at: 2009, ends_at: None
- Put always the data that comes first in time as starts_at and the timestamps that comes second in time as ends_at
- If starts_at or ends_at cannot be extracted both of them has to be None
## Output Format
Your reply should be a JSON: list of dictionaries with the following structure:
```python
class QueryInterval(BaseModel):
    starts_at: Optional[Timestamp] = None
    ends_at: Optional[Timestamp] = None
```
You are tasked with identifying relevant time periods where the answer to a given query should be searched.
Current date is: `{{ time_now }}`. Determine relevant period(s) and return structured intervals.

Extraction rules:

1. Query without specific timestamp: use the time period with starts_at set to None and ends_at set to now.
2. Explicit time intervals: If the query specifies a range (e.g., from 2010 to 2020, between January and March 2023), extract both start and end dates. Always assign the earlier date to starts_at and the later date to ends_at.
3. Single timestamp: If the query refers to one specific moment (e.g., in 2015, on March 5, 2022), set starts_at and ends_at to that same timestamp.
4. Open-ended time references: For phrases such as "before X" or "after X", represent the unspecified side as None. For example: before 2009 → starts_at: None, ends_at: 2009; after 2009 → starts_at: 2009, ends_at: None.
5. Current-time references ("now", "current", "today"): If the query explicitly refers to the present, set both starts_at and ends_at to now (the ingestion timestamp).
6. "Who is" and "Who was" questions: These imply a general identity or biographical inquiry without a specific temporal scope. Set both starts_at and ends_at to None.
7. Ordering rule: Always ensure the earlier date is assigned to starts_at and the later date to ends_at.
8. No temporal information: If no valid or inferable time reference is found, set both starts_at and ends_at to None.
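As a quick illustration of rules 2, 4, and 6 above (hedged: the exact output schema is whatever structured model the pipeline binds to this prompt; timestamps are simplified to strings here):

```python
# Hypothetical extractions under the rules above.
examples = {
    "from 2010 to 2020": {"starts_at": "2010", "ends_at": "2020"},  # rule 2
    "before 2009": {"starts_at": None, "ends_at": "2009"},          # rule 4
    "Who was Nikola Tesla?": {"starts_at": None, "ends_at": None},  # rule 6
}
```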
@ -0,0 +1,14 @@
A question was previously answered, but the answer received negative feedback.
Please reconsider and improve the response.

Question: {question}
Context originally used: {context}
Previous answer: {wrong_answer}
Feedback on that answer: {negative_feedback}

Task: Provide a better response. The new answer should be short and direct.
Then explain briefly why this answer is better.

Format your reply as:
Answer: <improved answer>
Explanation: <short explanation>

13 cognee/infrastructure/llm/prompts/feedback_report_prompt.txt (new file)
@ -0,0 +1,13 @@
Write a concise, stand-alone paragraph that explains the correct answer to the question below.
The paragraph should read naturally on its own, providing all necessary context and reasoning
so the answer is clear and well-supported.

Question: {question}
Correct answer: {improved_answer}
Supporting context: {new_context}

Your paragraph should:
- Have a first sentence that clearly states the correct answer as a full sentence
- Flow from that first sentence and provide an explanation based on the context
- Use simple, direct language that is easy to follow
- Use shorter sentences, with no long-winded explanations
@ -0,0 +1,5 @@
Question: {question}
Context: {context}

Provide a one-paragraph, human-readable summary of this interaction context,
listing all the relevant facts and information in a simple and direct way.
@ -1,6 +1,7 @@
|
|||
import filetype
|
||||
from typing import Dict, List, Optional, Any
|
||||
from .LoaderInterface import LoaderInterface
|
||||
from cognee.infrastructure.files.utils.guess_file_type import guess_file_type
|
||||
from cognee.shared.logging_utils import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
|
@ -65,7 +66,9 @@ class LoaderEngine:
|
|||
return True
|
||||
|
||||
def get_loader(
|
||||
self, file_path: str, preferred_loaders: List[str] = None
|
||||
self,
|
||||
file_path: str,
|
||||
preferred_loaders: dict[str, dict[str, Any]],
|
||||
) -> Optional[LoaderInterface]:
|
||||
"""
|
||||
Get appropriate loader for a file.
|
||||
|
|
@ -77,14 +80,21 @@ class LoaderEngine:
|
|||
Returns:
|
||||
LoaderInterface that can handle the file, or None if not found
|
||||
"""
|
||||
from pathlib import Path
|
||||
|
||||
file_info = filetype.guess(file_path)
|
||||
file_info = guess_file_type(file_path)
|
||||
|
||||
path_extension = Path(file_path).suffix.lstrip(".")
|
||||
|
||||
# Try preferred loaders first
|
||||
if preferred_loaders:
|
||||
for loader_name in preferred_loaders:
|
||||
if loader_name in self._loaders:
|
||||
loader = self._loaders[loader_name]
|
||||
# Try with path extension first (for text formats like html)
|
||||
if loader.can_handle(extension=path_extension, mime_type=file_info.mime):
|
||||
return loader
|
||||
# Fall back to content-detected extension
|
||||
if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime):
|
||||
return loader
|
||||
else:
|
||||
|
|
@ -94,6 +104,10 @@ class LoaderEngine:
|
|||
for loader_name in self.default_loader_priority:
|
||||
if loader_name in self._loaders:
|
||||
loader = self._loaders[loader_name]
|
||||
# Try with path extension first (for text formats like html)
|
||||
if loader.can_handle(extension=path_extension, mime_type=file_info.mime):
|
||||
return loader
|
||||
# Fall back to content-detected extension
|
||||
if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime):
|
||||
return loader
|
||||
else:
|
||||
|
|
@ -106,7 +120,7 @@ class LoaderEngine:
|
|||
async def load_file(
|
||||
self,
|
||||
file_path: str,
|
||||
preferred_loaders: Optional[List[str]] = None,
|
||||
preferred_loaders: Optional[dict[str, dict[str, Any]]] = None,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
|
|
@ -114,7 +128,7 @@ class LoaderEngine:
|
|||
|
||||
Args:
|
||||
file_path: Path to the file to be processed
|
||||
preferred_loaders: List of preferred loader names to try first
|
||||
preferred_loaders: Dict of loader names to their configurations
|
||||
**kwargs: Additional loader-specific configuration
|
||||
|
||||
Raises:
|
||||
|
|
@ -126,8 +140,16 @@ class LoaderEngine:
|
|||
raise ValueError(f"No loader found for file: {file_path}")
|
||||
|
||||
logger.debug(f"Loading {file_path} with {loader.loader_name}")
|
||||
# TODO: loading needs to be reworked to work with both file streams and file locations
|
||||
return await loader.load(file_path, **kwargs)
|
||||
|
||||
# Extract loader-specific config from preferred_loaders
|
||||
loader_config = {}
|
||||
if preferred_loaders and loader.loader_name in preferred_loaders:
|
||||
loader_config = preferred_loaders[loader.loader_name]
|
||||
|
||||
# Merge with any additional kwargs (kwargs take precedence)
|
||||
merged_kwargs = {**loader_config, **kwargs}
|
||||
|
||||
return await loader.load(file_path, **merged_kwargs)
|
||||
|
||||
def get_available_loaders(self) -> List[str]:
|
||||
"""
|
||||
|
|
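The precedence in load_file is: per-loader config from preferred_loaders is applied first, and call-site kwargs override it. A standalone sketch of the merge (the loader name and options are illustrative):

preferred_loaders = {"beautiful_soup_loader": {"join_all_matches": True, "timeout": 10}}
loader_name = "beautiful_soup_loader"
kwargs = {"timeout": 30}  # call-site override

loader_config = preferred_loaders.get(loader_name, {})
merged_kwargs = {**loader_config, **kwargs}  # kwargs win on key collisions
assert merged_kwargs == {"join_all_matches": True, "timeout": 30}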
|
|||
|
|
@ -42,6 +42,7 @@ class AudioLoader(LoaderInterface):
|
|||
"audio/wav",
|
||||
"audio/amr",
|
||||
"audio/aiff",
|
||||
"audio/x-wav",
|
||||
]
|
||||
|
||||
@property
|
||||
|
|
|
|||
|
|
@ -27,3 +27,10 @@ try:
|
|||
__all__.append("AdvancedPdfLoader")
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
try:
|
||||
from .beautiful_soup_loader import BeautifulSoupLoader
|
||||
|
||||
__all__.append("BeautifulSoupLoader")
|
||||
except ImportError:
|
||||
pass
|
||||
|
|
|
|||
310
cognee/infrastructure/loaders/external/beautiful_soup_loader.py
vendored
Normal file
|
|
@ -0,0 +1,310 @@
|
|||
"""BeautifulSoup-based web crawler for extracting content from web pages.
|
||||
|
||||
This module provides the BeautifulSoupCrawler class for fetching and extracting content
|
||||
from web pages using BeautifulSoup or Playwright for JavaScript-rendered pages. It
|
||||
supports robots.txt handling, rate limiting, and custom extraction rules.
|
||||
"""
|
||||
|
||||
from typing import Union, Dict, Any, Optional, List
|
||||
from dataclasses import dataclass
|
||||
from bs4 import BeautifulSoup
|
||||
from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface
|
||||
from cognee.shared.logging_utils import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class ExtractionRule:
|
||||
"""Normalized extraction rule for web content.
|
||||
|
||||
Attributes:
|
||||
selector: CSS selector for extraction (if any).
|
||||
xpath: XPath expression for extraction (if any).
|
||||
attr: HTML attribute to extract (if any).
|
||||
all: If True, extract all matching elements; otherwise, extract first.
|
||||
join_with: String to join multiple extracted elements.
|
||||
"""
|
||||
|
||||
selector: Optional[str] = None
|
||||
xpath: Optional[str] = None
|
||||
attr: Optional[str] = None
|
||||
all: bool = False
|
||||
join_with: str = " "
|
||||
|
||||
|
||||
class BeautifulSoupLoader(LoaderInterface):
|
||||
"""Crawler for fetching and extracting web content using BeautifulSoup.
|
||||
|
||||
Supports asynchronous HTTP requests, Playwright for JavaScript rendering, robots.txt
|
||||
compliance, and rate limiting. Extracts content using CSS selectors or XPath rules.
|
||||
|
||||
Attributes:
|
||||
concurrency: Number of concurrent requests allowed.
|
||||
crawl_delay: Minimum seconds between requests to the same domain.
|
||||
max_crawl_delay: Maximum crawl delay to respect from robots.txt (None = no limit).
|
||||
timeout: Per-request timeout in seconds.
|
||||
max_retries: Number of retries for failed requests.
|
||||
retry_delay_factor: Multiplier for exponential backoff on retries.
|
||||
headers: HTTP headers for requests (e.g., User-Agent).
|
||||
robots_cache_ttl: Time-to-live for robots.txt cache in seconds.
|
||||
"""
|
||||
|
||||
@property
|
||||
def supported_extensions(self) -> List[str]:
|
||||
return ["html"]
|
||||
|
||||
@property
|
||||
def supported_mime_types(self) -> List[str]:
|
||||
return ["text/html", "text/plain"]
|
||||
|
||||
@property
|
||||
def loader_name(self) -> str:
|
||||
return "beautiful_soup_loader"
|
||||
|
||||
def can_handle(self, extension: str, mime_type: str) -> bool:
|
||||
return extension in self.supported_extensions and mime_type in self.supported_mime_types
|
||||
|
||||
def _get_default_extraction_rules(self):
|
||||
# Comprehensive default extraction rules for common HTML content
|
||||
return {
|
||||
# Meta information
|
||||
"title": {"selector": "title", "all": False},
|
||||
"meta_description": {
|
||||
"selector": "meta[name='description']",
|
||||
"attr": "content",
|
||||
"all": False,
|
||||
},
|
||||
"meta_keywords": {
|
||||
"selector": "meta[name='keywords']",
|
||||
"attr": "content",
|
||||
"all": False,
|
||||
},
|
||||
# Open Graph meta tags
|
||||
"og_title": {
|
||||
"selector": "meta[property='og:title']",
|
||||
"attr": "content",
|
||||
"all": False,
|
||||
},
|
||||
"og_description": {
|
||||
"selector": "meta[property='og:description']",
|
||||
"attr": "content",
|
||||
"all": False,
|
||||
},
|
||||
# Main content areas (prioritized selectors)
|
||||
"article": {"selector": "article", "all": True, "join_with": "\n\n"},
|
||||
"main": {"selector": "main", "all": True, "join_with": "\n\n"},
|
||||
# Semantic content sections
|
||||
"headers_h1": {"selector": "h1", "all": True, "join_with": "\n"},
|
||||
"headers_h2": {"selector": "h2", "all": True, "join_with": "\n"},
|
||||
"headers_h3": {"selector": "h3", "all": True, "join_with": "\n"},
|
||||
"headers_h4": {"selector": "h4", "all": True, "join_with": "\n"},
|
||||
"headers_h5": {"selector": "h5", "all": True, "join_with": "\n"},
|
||||
"headers_h6": {"selector": "h6", "all": True, "join_with": "\n"},
|
||||
# Text content
|
||||
"paragraphs": {"selector": "p", "all": True, "join_with": "\n\n"},
|
||||
"blockquotes": {"selector": "blockquote", "all": True, "join_with": "\n\n"},
|
||||
"preformatted": {"selector": "pre", "all": True, "join_with": "\n\n"},
|
||||
# Lists
|
||||
"ordered_lists": {"selector": "ol", "all": True, "join_with": "\n"},
|
||||
"unordered_lists": {"selector": "ul", "all": True, "join_with": "\n"},
|
||||
"list_items": {"selector": "li", "all": True, "join_with": "\n"},
|
||||
"definition_lists": {"selector": "dl", "all": True, "join_with": "\n"},
|
||||
# Tables
|
||||
"tables": {"selector": "table", "all": True, "join_with": "\n\n"},
|
||||
"table_captions": {
|
||||
"selector": "caption",
|
||||
"all": True,
|
||||
"join_with": "\n",
|
||||
},
|
||||
# Code blocks
|
||||
"code_blocks": {"selector": "code", "all": True, "join_with": "\n"},
|
||||
# Figures and media descriptions
|
||||
"figures": {"selector": "figure", "all": True, "join_with": "\n\n"},
|
||||
"figcaptions": {"selector": "figcaption", "all": True, "join_with": "\n"},
|
||||
"image_alts": {"selector": "img", "attr": "alt", "all": True, "join_with": " "},
|
||||
# Links (text content, not URLs to avoid clutter)
|
||||
"link_text": {"selector": "a", "all": True, "join_with": " "},
|
||||
# Emphasized text
|
||||
"strong": {"selector": "strong", "all": True, "join_with": " "},
|
||||
"emphasis": {"selector": "em", "all": True, "join_with": " "},
|
||||
"marked": {"selector": "mark", "all": True, "join_with": " "},
|
||||
# Time and data elements
|
||||
"time": {"selector": "time", "all": True, "join_with": " "},
|
||||
"data": {"selector": "data", "all": True, "join_with": " "},
|
||||
# Sections and semantic structure
|
||||
"sections": {"selector": "section", "all": True, "join_with": "\n\n"},
|
||||
"asides": {"selector": "aside", "all": True, "join_with": "\n\n"},
|
||||
"details": {"selector": "details", "all": True, "join_with": "\n"},
|
||||
"summary": {"selector": "summary", "all": True, "join_with": "\n"},
|
||||
# Navigation (may contain important links/structure)
|
||||
"nav": {"selector": "nav", "all": True, "join_with": "\n"},
|
||||
# Footer information
|
||||
"footer": {"selector": "footer", "all": True, "join_with": "\n"},
|
||||
# Divs with specific content roles
|
||||
"content_divs": {
|
||||
"selector": "div[role='main'], div[role='article'], div.content, div#content",
|
||||
"all": True,
|
||||
"join_with": "\n\n",
|
||||
},
|
||||
}
|
||||
|
||||
async def load(
|
||||
self,
|
||||
file_path: str,
|
||||
extraction_rules: dict[str, Any] = None,
|
||||
join_all_matches: bool = False,
|
||||
**kwargs,
|
||||
):
|
||||
"""Load an HTML file, extract content, and save to storage.
|
||||
|
||||
Args:
|
||||
file_path: Path to the HTML file
|
||||
extraction_rules: Dict of CSS selector rules for content extraction
|
||||
join_all_matches: If True, extract all matching elements for each rule
|
||||
**kwargs: Additional arguments
|
||||
|
||||
Returns:
|
||||
Path to the stored extracted text file
|
||||
"""
|
||||
if extraction_rules is None:
|
||||
extraction_rules = self._get_default_extraction_rules()
|
||||
logger.info("Using default comprehensive extraction rules for HTML content")
|
||||
|
||||
logger.info(f"Processing HTML file: {file_path}")
|
||||
|
||||
from cognee.infrastructure.files.utils.get_file_metadata import get_file_metadata
|
||||
from cognee.infrastructure.files.storage import get_file_storage, get_storage_config
|
||||
|
||||
with open(file_path, "rb") as f:
|
||||
file_metadata = await get_file_metadata(f)
|
||||
f.seek(0)
|
||||
html = f.read()
|
||||
|
||||
storage_file_name = "text_" + file_metadata["content_hash"] + ".txt"
|
||||
|
||||
# Normalize extraction rules
|
||||
normalized_rules: List[ExtractionRule] = []
|
||||
for _, rule in extraction_rules.items():
|
||||
r = self._normalize_rule(rule)
|
||||
if join_all_matches:
|
||||
r.all = True
|
||||
normalized_rules.append(r)
|
||||
|
||||
pieces = []
|
||||
for rule in normalized_rules:
|
||||
text = self._extract_from_html(html, rule)
|
||||
if text:
|
||||
pieces.append(text)
|
||||
|
||||
full_content = " ".join(pieces).strip()
|
||||
|
||||
# TODO: remove this fallback once default extraction rules are settled
|
||||
# Fallback: If no content extracted, check if the file is plain text (not HTML)
|
||||
if not full_content:
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
soup = BeautifulSoup(html, "html.parser")
|
||||
# If there are no HTML tags, treat as plain text
|
||||
if not soup.find():
|
||||
logger.warning(
|
||||
f"No HTML tags found in {file_path}. Treating as plain text. "
|
||||
"This may happen when content is pre-extracted (e.g., via Tavily with text format)."
|
||||
)
|
||||
full_content = html.decode("utf-8") if isinstance(html, bytes) else html
|
||||
full_content = full_content.strip()
|
||||
|
||||
if not full_content:
|
||||
logger.warning(f"No content extracted from HTML file: {file_path}")
|
||||
|
||||
# Store the extracted content
|
||||
storage_config = get_storage_config()
|
||||
data_root_directory = storage_config["data_root_directory"]
|
||||
storage = get_file_storage(data_root_directory)
|
||||
|
||||
full_file_path = await storage.store(storage_file_name, full_content)
|
||||
|
||||
logger.info(f"Extracted {len(full_content)} characters from HTML")
|
||||
return full_file_path
|
||||
|
||||
def _normalize_rule(self, rule: Union[str, Dict[str, Any]]) -> ExtractionRule:
|
||||
"""Normalize an extraction rule to an ExtractionRule dataclass.
|
||||
|
||||
Args:
|
||||
rule: A string (CSS selector) or dict with extraction parameters.
|
||||
|
||||
Returns:
|
||||
ExtractionRule: Normalized extraction rule.
|
||||
|
||||
Raises:
|
||||
ValueError: If the rule is invalid.
|
||||
"""
|
||||
if isinstance(rule, str):
|
||||
return ExtractionRule(selector=rule)
|
||||
if isinstance(rule, dict):
|
||||
return ExtractionRule(
|
||||
selector=rule.get("selector"),
|
||||
xpath=rule.get("xpath"),
|
||||
attr=rule.get("attr"),
|
||||
all=bool(rule.get("all", False)),
|
||||
join_with=rule.get("join_with", " "),
|
||||
)
|
||||
raise ValueError(f"Invalid extraction rule: {rule}")
|
||||
|
||||
def _extract_from_html(self, html: str, rule: ExtractionRule) -> str:
|
||||
"""Extract content from HTML using BeautifulSoup or lxml XPath.
|
||||
|
||||
Args:
|
||||
html: The HTML content to extract from.
|
||||
rule: The extraction rule to apply.
|
||||
|
||||
Returns:
|
||||
str: The extracted content.
|
||||
|
||||
Raises:
|
||||
RuntimeError: If XPath is used but lxml is not installed.
|
||||
"""
|
||||
soup = BeautifulSoup(html, "html.parser")
|
||||
|
||||
if rule.xpath:
|
||||
try:
|
||||
from lxml import html as lxml_html
|
||||
except ImportError:
|
||||
raise RuntimeError(
|
||||
"XPath requested but lxml is not available. Install lxml or use CSS selectors."
|
||||
)
|
||||
doc = lxml_html.fromstring(html)
|
||||
nodes = doc.xpath(rule.xpath)
|
||||
texts = []
|
||||
for n in nodes:
|
||||
if hasattr(n, "text_content"):
|
||||
texts.append(n.text_content().strip())
|
||||
else:
|
||||
texts.append(str(n).strip())
|
||||
return rule.join_with.join(t for t in texts if t)
|
||||
|
||||
if not rule.selector:
|
||||
return ""
|
||||
|
||||
if rule.all:
|
||||
nodes = soup.select(rule.selector)
|
||||
pieces = []
|
||||
for el in nodes:
|
||||
if rule.attr:
|
||||
val = el.get(rule.attr)
|
||||
if val:
|
||||
pieces.append(val.strip())
|
||||
else:
|
||||
text = el.get_text(strip=True)
|
||||
if text:
|
||||
pieces.append(text)
|
||||
return rule.join_with.join(pieces).strip()
|
||||
else:
|
||||
el = soup.select_one(rule.selector)
|
||||
if el is None:
|
||||
return ""
|
||||
if rule.attr:
|
||||
val = el.get(rule.attr)
|
||||
return (val or "").strip()
|
||||
return el.get_text(strip=True)
|
||||
|
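For reference, the two rule shapes the loader normalizes behave like the following direct BeautifulSoup calls (the HTML is illustrative):

from bs4 import BeautifulSoup

html = "<html><head><title>Docs</title></head><body><p>First.</p><p>Second.</p></body></html>"
soup = BeautifulSoup(html, "html.parser")

# {"selector": "title", "all": False} -> first match only, text content
title = soup.select_one("title").get_text(strip=True)  # "Docs"

# {"selector": "p", "all": True, "join_with": "\n\n"} -> all matches, joined
paragraphs = "\n\n".join(p.get_text(strip=True) for p in soup.select("p"))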
|
@ -24,3 +24,10 @@ try:
|
|||
supported_loaders[AdvancedPdfLoader.loader_name] = AdvancedPdfLoader
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
try:
|
||||
from cognee.infrastructure.loaders.external import BeautifulSoupLoader
|
||||
|
||||
supported_loaders[BeautifulSoupLoader.loader_name] = BeautifulSoupLoader
|
||||
except ImportError:
|
||||
pass
|
||||
|
|
|
|||
|
|
@ -0,0 +1,55 @@
|
|||
from typing import Optional, List
|
||||
|
||||
from cognee import memify
|
||||
from cognee.context_global_variables import (
|
||||
set_database_global_context_variables,
|
||||
set_session_user_context_variable,
|
||||
)
|
||||
from cognee.exceptions import CogneeValidationError
|
||||
from cognee.modules.data.methods import get_authorized_existing_datasets
|
||||
from cognee.shared.logging_utils import get_logger
|
||||
from cognee.modules.pipelines.tasks.task import Task
|
||||
from cognee.modules.users.models import User
|
||||
from cognee.tasks.memify import extract_user_sessions, cognify_session
|
||||
|
||||
|
||||
logger = get_logger("persist_sessions_in_knowledge_graph")
|
||||
|
||||
|
||||
async def persist_sessions_in_knowledge_graph_pipeline(
|
||||
user: User,
|
||||
session_ids: Optional[List[str]] = None,
|
||||
dataset: str = "main_dataset",
|
||||
run_in_background: bool = False,
|
||||
):
|
||||
await set_session_user_context_variable(user)
|
||||
dataset_to_write = await get_authorized_existing_datasets(
|
||||
user=user, datasets=[dataset], permission_type="write"
|
||||
)
|
||||
|
||||
if not dataset_to_write:
|
||||
raise CogneeValidationError(
|
||||
message=f"User (id: {str(user.id)}) does not have write access to dataset: {dataset}",
|
||||
log=False,
|
||||
)
|
||||
|
||||
await set_database_global_context_variables(
|
||||
dataset_to_write[0].id, dataset_to_write[0].owner_id
|
||||
)
|
||||
|
||||
extraction_tasks = [Task(extract_user_sessions, session_ids=session_ids)]
|
||||
|
||||
enrichment_tasks = [
|
||||
Task(cognify_session, dataset_id=dataset_to_write[0].id),
|
||||
]
|
||||
|
||||
result = await memify(
|
||||
extraction_tasks=extraction_tasks,
|
||||
enrichment_tasks=enrichment_tasks,
|
||||
dataset=dataset_to_write[0].id,
|
||||
data=[{}],
|
||||
run_in_background=run_in_background,
|
||||
)
|
||||
|
||||
logger.info("Session persistence pipeline completed")
|
||||
return result
|
||||
|
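A hedged usage sketch of the pipeline; the user object and session IDs are illustrative and would come from the application's auth and session layers:

import asyncio

async def run(user):
    await persist_sessions_in_knowledge_graph_pipeline(
        user=user,
        session_ids=["session-1", "session-2"],  # None would select all sessions
        dataset="main_dataset",
        run_in_background=False,
    )

# asyncio.run(run(current_user))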
|
@ -1,6 +1,7 @@
|
|||
from typing import List, Union
|
||||
|
||||
from cognee.infrastructure.engine import DataPoint
|
||||
from cognee.infrastructure.engine.models.Edge import Edge
|
||||
from cognee.modules.data.processing.document_types import Document
|
||||
from cognee.modules.engine.models import Entity
|
||||
from cognee.tasks.temporal_graph.models import Event
|
||||
|
|
@ -31,6 +32,6 @@ class DocumentChunk(DataPoint):
|
|||
chunk_index: int
|
||||
cut_type: str
|
||||
is_part_of: Document
|
||||
contains: List[Union[Entity, Event]] = None
|
||||
contains: List[Union[Entity, Event, tuple[Edge, Entity]]] = None
|
||||
|
||||
metadata: dict = {"index_fields": ["text"]}
|
||||
|
|
|
|||
124
cognee/modules/chunking/text_chunker_with_overlap.py
Normal file
|
|
@ -0,0 +1,124 @@
|
|||
from cognee.shared.logging_utils import get_logger
|
||||
from uuid import NAMESPACE_OID, uuid5
|
||||
|
||||
from cognee.tasks.chunks import chunk_by_paragraph
|
||||
from cognee.modules.chunking.Chunker import Chunker
|
||||
from .models.DocumentChunk import DocumentChunk
|
||||
|
||||
logger = get_logger()
|
||||
|
||||
|
||||
class TextChunkerWithOverlap(Chunker):
|
||||
def __init__(
|
||||
self,
|
||||
document,
|
||||
get_text: callable,
|
||||
max_chunk_size: int,
|
||||
chunk_overlap_ratio: float = 0.0,
|
||||
get_chunk_data: callable = None,
|
||||
):
|
||||
super().__init__(document, get_text, max_chunk_size)
|
||||
self._accumulated_chunk_data = []
|
||||
self._accumulated_size = 0
|
||||
self.chunk_overlap_ratio = chunk_overlap_ratio
|
||||
self.chunk_overlap = int(max_chunk_size * chunk_overlap_ratio)
|
||||
|
||||
if get_chunk_data is not None:
|
||||
self.get_chunk_data = get_chunk_data
|
||||
elif chunk_overlap_ratio > 0:
|
||||
paragraph_max_size = int(0.5 * chunk_overlap_ratio * max_chunk_size)
|
||||
self.get_chunk_data = lambda text: chunk_by_paragraph(
|
||||
text, paragraph_max_size, batch_paragraphs=True
|
||||
)
|
||||
else:
|
||||
self.get_chunk_data = lambda text: chunk_by_paragraph(
|
||||
text, self.max_chunk_size, batch_paragraphs=True
|
||||
)
|
||||
|
||||
def _accumulation_overflows(self, chunk_data):
|
||||
"""Check if adding chunk_data would exceed max_chunk_size."""
|
||||
return self._accumulated_size + chunk_data["chunk_size"] > self.max_chunk_size
|
||||
|
||||
def _accumulate_chunk_data(self, chunk_data):
|
||||
"""Add chunk_data to the current accumulation."""
|
||||
self._accumulated_chunk_data.append(chunk_data)
|
||||
self._accumulated_size += chunk_data["chunk_size"]
|
||||
|
||||
def _clear_accumulation(self):
|
||||
"""Reset accumulation, keeping overlap chunk_data based on chunk_overlap_ratio."""
|
||||
if self.chunk_overlap == 0:
|
||||
self._accumulated_chunk_data = []
|
||||
self._accumulated_size = 0
|
||||
return
|
||||
|
||||
# Keep chunk_data from the end that fit in overlap
|
||||
overlap_chunk_data = []
|
||||
overlap_size = 0
|
||||
|
||||
for chunk_data in reversed(self._accumulated_chunk_data):
|
||||
if overlap_size + chunk_data["chunk_size"] <= self.chunk_overlap:
|
||||
overlap_chunk_data.insert(0, chunk_data)
|
||||
overlap_size += chunk_data["chunk_size"]
|
||||
else:
|
||||
break
|
||||
|
||||
self._accumulated_chunk_data = overlap_chunk_data
|
||||
self._accumulated_size = overlap_size
|
||||
|
||||
def _create_chunk(self, text, size, cut_type, chunk_id=None):
|
||||
"""Create a DocumentChunk with standard metadata."""
|
||||
try:
|
||||
return DocumentChunk(
|
||||
id=chunk_id or uuid5(NAMESPACE_OID, f"{str(self.document.id)}-{self.chunk_index}"),
|
||||
text=text,
|
||||
chunk_size=size,
|
||||
is_part_of=self.document,
|
||||
chunk_index=self.chunk_index,
|
||||
cut_type=cut_type,
|
||||
contains=[],
|
||||
metadata={"index_fields": ["text"]},
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(e)
|
||||
raise e
|
||||
|
||||
def _create_chunk_from_accumulation(self):
|
||||
"""Create a DocumentChunk from current accumulated chunk_data."""
|
||||
chunk_text = " ".join(chunk["text"] for chunk in self._accumulated_chunk_data)
|
||||
return self._create_chunk(
|
||||
text=chunk_text,
|
||||
size=self._accumulated_size,
|
||||
cut_type=self._accumulated_chunk_data[-1]["cut_type"],
|
||||
)
|
||||
|
||||
def _emit_chunk(self, chunk_data):
|
||||
"""Emit a chunk when accumulation overflows."""
|
||||
if len(self._accumulated_chunk_data) > 0:
|
||||
chunk = self._create_chunk_from_accumulation()
|
||||
self._clear_accumulation()
|
||||
self._accumulate_chunk_data(chunk_data)
|
||||
else:
|
||||
# Handle single chunk_data exceeding max_chunk_size
|
||||
chunk = self._create_chunk(
|
||||
text=chunk_data["text"],
|
||||
size=chunk_data["chunk_size"],
|
||||
cut_type=chunk_data["cut_type"],
|
||||
chunk_id=chunk_data["chunk_id"],
|
||||
)
|
||||
|
||||
self.chunk_index += 1
|
||||
return chunk
|
||||
|
||||
async def read(self):
|
||||
async for content_text in self.get_text():
|
||||
for chunk_data in self.get_chunk_data(content_text):
|
||||
if not self._accumulation_overflows(chunk_data):
|
||||
self._accumulate_chunk_data(chunk_data)
|
||||
continue
|
||||
|
||||
yield self._emit_chunk(chunk_data)
|
||||
|
||||
if len(self._accumulated_chunk_data) == 0:
|
||||
return
|
||||
|
||||
yield self._create_chunk_from_accumulation()
|
||||
|
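The overlap retention step in isolation: after a chunk is emitted, trailing sub-chunks are kept while their combined size still fits within max_chunk_size * chunk_overlap_ratio. A self-contained sketch with illustrative sizes:

max_chunk_size = 100
chunk_overlap = int(max_chunk_size * 0.2)  # 20

accumulated = [
    {"text": "a", "chunk_size": 40},
    {"text": "b", "chunk_size": 45},
    {"text": "c", "chunk_size": 12},
]

# Walk backwards, keeping sub-chunks while the overlap budget allows
overlap, overlap_size = [], 0
for chunk in reversed(accumulated):
    if overlap_size + chunk["chunk_size"] > chunk_overlap:
        break
    overlap.insert(0, chunk)
    overlap_size += chunk["chunk_size"]

# Only "c" (size 12) fits the 20-unit budget, so the next chunk starts with it.
assert [c["text"] for c in overlap] == ["c"]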
|
@ -10,7 +10,7 @@ class UnstructuredLibraryImportError(CogneeConfigurationError):
|
|||
self,
|
||||
message: str = "Import error. Unstructured library is not installed.",
|
||||
name: str = "UnstructuredModuleImportError",
|
||||
status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
|
||||
status_code=status.HTTP_422_UNPROCESSABLE_CONTENT,
|
||||
):
|
||||
super().__init__(message, name, status_code)
|
||||
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@ from .get_authorized_dataset import get_authorized_dataset
|
|||
from .get_authorized_dataset_by_name import get_authorized_dataset_by_name
|
||||
from .get_data import get_data
|
||||
from .get_unique_dataset_id import get_unique_dataset_id
|
||||
from .get_unique_data_id import get_unique_data_id
|
||||
from .get_authorized_existing_datasets import get_authorized_existing_datasets
|
||||
from .get_dataset_ids import get_dataset_ids
|
||||
|
||||
|
|
@ -23,3 +24,6 @@ from .create_authorized_dataset import create_authorized_dataset
|
|||
|
||||
# Check
|
||||
from .check_dataset_name import check_dataset_name
|
||||
|
||||
# Boolean check
|
||||
from .has_dataset_data import has_dataset_data
|
||||
|
|
|
|||
|
|
@ -16,14 +16,16 @@ async def create_dataset(dataset_name: str, user: User, session: AsyncSession) -
|
|||
.options(joinedload(Dataset.data))
|
||||
.filter(Dataset.name == dataset_name)
|
||||
.filter(Dataset.owner_id == owner_id)
|
||||
.filter(Dataset.tenant_id == user.tenant_id)
|
||||
)
|
||||
).first()
|
||||
|
||||
if dataset is None:
|
||||
# Dataset id should be generated based on dataset_name and owner_id/user so multiple users can use the same dataset_name
|
||||
dataset_id = await get_unique_dataset_id(dataset_name=dataset_name, user=user)
|
||||
dataset = Dataset(id=dataset_id, name=dataset_name, data=[])
|
||||
dataset.owner_id = owner_id
|
||||
dataset = Dataset(
|
||||
id=dataset_id, name=dataset_name, data=[], owner_id=owner_id, tenant_id=user.tenant_id
|
||||
)
|
||||
|
||||
session.add(dataset)
|
||||
|
||||
|
|
|
|||
|
|
@ -27,7 +27,11 @@ async def get_dataset_ids(datasets: Union[list[str], list[UUID]], user):
|
|||
# Get all user-owned dataset objects. (To write to a dataset the user does not own, it must be provided through its UUID.)
|
||||
user_datasets = await get_datasets(user.id)
|
||||
# Keep only the datasets that were mentioned by name
|
||||
dataset_ids = [dataset.id for dataset in user_datasets if dataset.name in datasets]
|
||||
dataset_ids = [dataset for dataset in user_datasets if dataset.name in datasets]
|
||||
# Filter out datasets that belong to other tenants
|
||||
dataset_ids = [
|
||||
dataset.id for dataset in dataset_ids if dataset.tenant_id == user.tenant_id
|
||||
]
|
||||
else:
|
||||
raise DatasetTypeError(
|
||||
f"One or more of the provided dataset types is not handled: f{datasets}"
|
||||
|
|
|
|||
68
cognee/modules/data/methods/get_unique_data_id.py
Normal file
|
|
@ -0,0 +1,68 @@
|
|||
from uuid import uuid5, NAMESPACE_OID, UUID
|
||||
from sqlalchemy import select
|
||||
|
||||
from cognee.modules.data.models.Data import Data
|
||||
from cognee.infrastructure.databases.relational import get_relational_engine
|
||||
from cognee.modules.users.models import User
|
||||
|
||||
|
||||
async def get_unique_data_id(data_identifier: str, user: User) -> UUID:
|
||||
"""
|
||||
Function returns a unique UUID for data based on data identifier, user id and tenant id.
|
||||
If data with legacy ID exists, return that ID to maintain compatibility.
|
||||
|
||||
Args:
|
||||
data_identifier: A way to uniquely identify data (e.g. file hash, data name, etc.)
|
||||
user: User object adding the data; tenant scope is taken from user.tenant_id
|
||||
|
||||
Returns:
|
||||
UUID: Unique identifier for the data
|
||||
"""
|
||||
|
||||
def _get_deprecated_unique_data_id(data_identifier: str, user: User) -> UUID:
|
||||
"""
|
||||
Deprecated function; returns a unique UUID for data based on data identifier and user id.
|
||||
Needed to support legacy data without tenant information.
|
||||
Args:
|
||||
data_identifier: A way to uniquely identify data (e.g. file hash, data name, etc.)
|
||||
user: User object adding the data
|
||||
|
||||
Returns:
|
||||
UUID: Unique identifier for the data
|
||||
"""
|
||||
# return UUID hash of data identifier + owner id (no tenant; legacy scheme)
|
||||
return uuid5(NAMESPACE_OID, f"{data_identifier}{str(user.id)}")
|
||||
|
||||
def _get_modern_unique_data_id(data_identifier: str, user: User) -> UUID:
|
||||
"""
|
||||
Function returns a unique UUID for data based on data identifier, user id and tenant id.
|
||||
Args:
|
||||
data_identifier: A way to uniquely identify data (e.g. file hash, data name, etc.)
|
||||
user: User object adding the data; tenant scope is taken from user.tenant_id
|
||||
|
||||
Returns:
|
||||
UUID: Unique identifier for the data
|
||||
"""
|
||||
# return UUID hash of data identifier + owner id + tenant id
|
||||
return uuid5(NAMESPACE_OID, f"{data_identifier}{str(user.id)}{str(user.tenant_id)}")
|
||||
|
||||
# Get all possible data_id values
|
||||
data_id = {
|
||||
"modern_data_id": _get_modern_unique_data_id(data_identifier=data_identifier, user=user),
|
||||
"legacy_data_id": _get_deprecated_unique_data_id(
|
||||
data_identifier=data_identifier, user=user
|
||||
),
|
||||
}
|
||||
|
||||
# Check if data item with legacy_data_id exists, if so use that one, else use modern_data_id
|
||||
db_engine = get_relational_engine()
|
||||
async with db_engine.get_async_session() as session:
|
||||
legacy_data_point = (
|
||||
await session.execute(select(Data).filter(Data.id == data_id["legacy_data_id"]))
|
||||
).scalar_one_or_none()
|
||||
|
||||
if not legacy_data_point:
|
||||
return data_id["modern_data_id"]
|
||||
return data_id["legacy_data_id"]
|
||||
|
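Both IDs are deterministic uuid5 hashes over the same namespace, differing only in whether tenant_id is mixed in, which is what makes the legacy lookup possible. A sketch with illustrative values:

from uuid import uuid5, NAMESPACE_OID, UUID

user_id = UUID(int=1)
tenant_id = UUID(int=2)
content_hash = "abc123"

legacy_id = uuid5(NAMESPACE_OID, f"{content_hash}{user_id}")
modern_id = uuid5(NAMESPACE_OID, f"{content_hash}{user_id}{tenant_id}")

# Determinism: the same inputs always produce the same UUID, so a single
# lookup on legacy_id decides whether pre-tenant data already exists.
assert legacy_id == uuid5(NAMESPACE_OID, f"{content_hash}{user_id}")
assert legacy_id != modern_id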
|
@ -1,9 +1,71 @@
|
|||
from uuid import UUID, uuid5, NAMESPACE_OID
|
||||
from cognee.modules.users.models import User
|
||||
from typing import Union
|
||||
from sqlalchemy import select
|
||||
|
||||
from cognee.modules.data.models.Dataset import Dataset
|
||||
from cognee.modules.users.models import User
|
||||
from cognee.infrastructure.databases.relational import get_relational_engine
|
||||
|
||||
|
||||
async def get_unique_dataset_id(dataset_name: Union[str, UUID], user: User) -> UUID:
|
||||
if isinstance(dataset_name, UUID):
|
||||
return dataset_name
|
||||
return uuid5(NAMESPACE_OID, f"{dataset_name}{str(user.id)}")
|
||||
"""
|
||||
Function returns a unique UUID for dataset based on dataset name, user id and tenant id.
|
||||
If dataset with legacy ID exists, return that ID to maintain compatibility.
|
||||
|
||||
Args:
|
||||
dataset_name: string representing the dataset name
|
||||
user: User object adding the dataset; tenant scope is taken from user.tenant_id
|
||||
|
||||
Returns:
|
||||
UUID: Unique identifier for the dataset
|
||||
"""
|
||||
|
||||
def _get_legacy_unique_dataset_id(dataset_name: Union[str, UUID], user: User) -> UUID:
|
||||
"""
|
||||
Legacy function; returns a unique UUID for dataset based on dataset name and user id.
|
||||
Needed to support legacy datasets without tenant information.
|
||||
Args:
|
||||
dataset_name: string representing the dataset name
|
||||
user: Current User object adding the dataset
|
||||
|
||||
Returns:
|
||||
UUID: Unique identifier for the dataset
|
||||
"""
|
||||
if isinstance(dataset_name, UUID):
|
||||
return dataset_name
|
||||
return uuid5(NAMESPACE_OID, f"{dataset_name}{str(user.id)}")
|
||||
|
||||
def _get_modern_unique_dataset_id(dataset_name: Union[str, UUID], user: User) -> UUID:
|
||||
"""
|
||||
Returns a unique UUID for dataset based on dataset name, user id and tenant_id.
|
||||
Args:
|
||||
dataset_name: string representing the dataset name
|
||||
user: Current User object adding the dataset; tenant scope is taken from user.tenant_id
|
||||
|
||||
Returns:
|
||||
UUID: Unique identifier for the dataset
|
||||
"""
|
||||
if isinstance(dataset_name, UUID):
|
||||
return dataset_name
|
||||
return uuid5(NAMESPACE_OID, f"{dataset_name}{str(user.id)}{str(user.tenant_id)}")
|
||||
|
||||
# Get all possible dataset_id values
|
||||
dataset_id = {
|
||||
"modern_dataset_id": _get_modern_unique_dataset_id(dataset_name=dataset_name, user=user),
|
||||
"legacy_dataset_id": _get_legacy_unique_dataset_id(dataset_name=dataset_name, user=user),
|
||||
}
|
||||
|
||||
# Check if dataset with legacy_dataset_id exists, if so use that one, else use modern_dataset_id
|
||||
db_engine = get_relational_engine()
|
||||
async with db_engine.get_async_session() as session:
|
||||
legacy_dataset = (
|
||||
await session.execute(
|
||||
select(Dataset).filter(Dataset.id == dataset_id["legacy_dataset_id"])
|
||||
)
|
||||
).scalar_one_or_none()
|
||||
|
||||
if not legacy_dataset:
|
||||
return dataset_id["modern_dataset_id"]
|
||||
return dataset_id["legacy_dataset_id"]
|
||||
|
|
|
|||
21
cognee/modules/data/methods/has_dataset_data.py
Normal file
|
|
@ -0,0 +1,21 @@
|
|||
from uuid import UUID
|
||||
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.sql import func
|
||||
|
||||
from cognee.infrastructure.databases.relational import get_relational_engine
|
||||
from cognee.modules.data.models import DatasetData
|
||||
|
||||
|
||||
async def has_dataset_data(dataset_id: UUID) -> bool:
|
||||
db_engine = get_relational_engine()
|
||||
|
||||
async with db_engine.get_async_session() as session:
|
||||
count_query = (
|
||||
select(func.count())
|
||||
.select_from(DatasetData)
|
||||
.where(DatasetData.dataset_id == dataset_id)
|
||||
)
|
||||
count = await session.execute(count_query)
|
||||
|
||||
return count.scalar_one() > 0
|
||||
|
|
@ -18,6 +18,7 @@ class Dataset(Base):
|
|||
updated_at = Column(DateTime(timezone=True), onupdate=lambda: datetime.now(timezone.utc))
|
||||
|
||||
owner_id = Column(UUID, index=True)
|
||||
tenant_id = Column(UUID, index=True, nullable=True)
|
||||
|
||||
acls = relationship("ACL", back_populates="dataset", cascade="all, delete-orphan")
|
||||
|
||||
|
|
@ -36,5 +37,6 @@ class Dataset(Base):
|
|||
"createdAt": self.created_at.isoformat(),
|
||||
"updatedAt": self.updated_at.isoformat() if self.updated_at else None,
|
||||
"ownerId": str(self.owner_id),
|
||||
"tenantId": str(self.tenant_id),
|
||||
"data": [data.to_json() for data in self.data],
|
||||
}
|
||||
|
|
|
|||
|
|
@ -171,8 +171,10 @@ class CogneeGraph(CogneeAbstractGraph):
|
|||
embedding_map = {result.payload["text"]: result.score for result in edge_distances}
|
||||
|
||||
for edge in self.edges:
|
||||
relationship_type = edge.attributes.get("relationship_type")
|
||||
distance = embedding_map.get(relationship_type, None)
|
||||
edge_key = edge.attributes.get("edge_text") or edge.attributes.get(
|
||||
"relationship_type"
|
||||
)
|
||||
distance = embedding_map.get(edge_key, None)
|
||||
if distance is not None:
|
||||
edge.attributes["vector_distance"] = distance
|
||||
|
||||
|
|
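The lookup change in isolation: an edge that carries a richer edge_text is scored by that string, while edges without one fall back to the bare relationship_type. A sketch with illustrative scores:

embedding_map = {
    "relationship_name: contains; entity_name: Ada Lovelace": 0.12,
    "contains": 0.45,
}

attributes = {
    "relationship_type": "contains",
    "edge_text": "relationship_name: contains; entity_name: Ada Lovelace",
}

edge_key = attributes.get("edge_text") or attributes.get("relationship_type")
distance = embedding_map.get(edge_key)  # 0.12: the richer key wins when present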
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
from typing import Optional
|
||||
|
||||
from cognee.infrastructure.engine.models.Edge import Edge
|
||||
from cognee.modules.chunking.models import DocumentChunk
|
||||
from cognee.modules.engine.models import Entity, EntityType
|
||||
from cognee.modules.engine.utils import (
|
||||
|
|
@ -243,10 +244,26 @@ def _process_graph_nodes(
|
|||
ontology_relationships,
|
||||
)
|
||||
|
||||
# Add entity to data chunk
|
||||
if data_chunk.contains is None:
|
||||
data_chunk.contains = []
|
||||
data_chunk.contains.append(entity_node)
|
||||
|
||||
edge_text = "; ".join(
|
||||
[
|
||||
"relationship_name: contains",
|
||||
f"entity_name: {entity_node.name}",
|
||||
f"entity_description: {entity_node.description}",
|
||||
]
|
||||
)
|
||||
|
||||
data_chunk.contains.append(
|
||||
(
|
||||
Edge(
|
||||
relationship_type="contains",
|
||||
edge_text=edge_text,
|
||||
),
|
||||
entity_node,
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def _process_graph_edges(
|
||||
|
|
|
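The resulting edge_text is a flat, embeddable string; for an entity named "Ada Lovelace" (values illustrative) it would read:

edge_text = "; ".join([
    "relationship_name: contains",
    "entity_name: Ada Lovelace",
    "entity_description: English mathematician and writer",
])
# "relationship_name: contains; entity_name: Ada Lovelace; entity_description: English mathematician and writer"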
|||
|
|
@ -1,71 +1,70 @@
|
|||
import string
|
||||
from typing import List
|
||||
from collections import Counter
|
||||
|
||||
from cognee.modules.graph.cognee_graph.CogneeGraphElements import Edge
|
||||
from cognee.modules.retrieval.utils.stop_words import DEFAULT_STOP_WORDS
|
||||
|
||||
|
||||
def _get_top_n_frequent_words(
|
||||
text: str, stop_words: set = None, top_n: int = 3, separator: str = ", "
|
||||
) -> str:
|
||||
"""Concatenates the top N frequent words in text."""
|
||||
if stop_words is None:
|
||||
stop_words = DEFAULT_STOP_WORDS
|
||||
|
||||
words = [word.lower().strip(string.punctuation) for word in text.split()]
|
||||
words = [word for word in words if word and word not in stop_words]
|
||||
|
||||
top_words = [word for word, freq in Counter(words).most_common(top_n)]
|
||||
return separator.join(top_words)
|
||||
|
||||
|
||||
def _create_title_from_text(text: str, first_n_words: int = 7, top_n_words: int = 3) -> str:
|
||||
"""Creates a title by combining first words with most frequent words from the text."""
|
||||
first_words = text.split()[:first_n_words]
|
||||
top_words = _get_top_n_frequent_words(text, top_n=top_n_words)
|
||||
return f"{' '.join(first_words)}... [{top_words}]"
|
||||
|
||||
|
||||
def _extract_nodes_from_edges(retrieved_edges: List[Edge]) -> dict:
|
||||
"""Creates a dictionary of nodes with their names and content."""
|
||||
nodes = {}
|
||||
|
||||
for edge in retrieved_edges:
|
||||
for node in (edge.node1, edge.node2):
|
||||
if node.id in nodes:
|
||||
continue
|
||||
|
||||
text = node.attributes.get("text")
|
||||
if text:
|
||||
name = _create_title_from_text(text)
|
||||
content = text
|
||||
else:
|
||||
name = node.attributes.get("name", "Unnamed Node")
|
||||
content = node.attributes.get("description", name)
|
||||
|
||||
nodes[node.id] = {"node": node, "name": name, "content": content}
|
||||
|
||||
return nodes
|
||||
|
||||
|
||||
async def resolve_edges_to_text(retrieved_edges: List[Edge]) -> str:
|
||||
"""
|
||||
Converts retrieved graph edges into a human-readable string format.
|
||||
"""Converts retrieved graph edges into a human-readable string format."""
|
||||
nodes = _extract_nodes_from_edges(retrieved_edges)
|
||||
|
||||
Parameters:
|
||||
-----------
|
||||
|
||||
- retrieved_edges (list): A list of edges retrieved from the graph.
|
||||
|
||||
Returns:
|
||||
--------
|
||||
|
||||
- str: A formatted string representation of the nodes and their connections.
|
||||
"""
|
||||
|
||||
def _get_nodes(retrieved_edges: List[Edge]) -> dict:
|
||||
def _get_title(text: str, first_n_words: int = 7, top_n_words: int = 3) -> str:
|
||||
def _top_n_words(text, stop_words=None, top_n=3, separator=", "):
|
||||
"""Concatenates the top N frequent words in text."""
|
||||
if stop_words is None:
|
||||
from cognee.modules.retrieval.utils.stop_words import DEFAULT_STOP_WORDS
|
||||
|
||||
stop_words = DEFAULT_STOP_WORDS
|
||||
|
||||
import string
|
||||
|
||||
words = [word.lower().strip(string.punctuation) for word in text.split()]
|
||||
|
||||
if stop_words:
|
||||
words = [word for word in words if word and word not in stop_words]
|
||||
|
||||
from collections import Counter
|
||||
|
||||
top_words = [word for word, freq in Counter(words).most_common(top_n)]
|
||||
|
||||
return separator.join(top_words)
|
||||
|
||||
"""Creates a title, by combining first words with most frequent words from the text."""
|
||||
first_words = text.split()[:first_n_words]
|
||||
top_words = _top_n_words(text, top_n=first_n_words)
|
||||
return f"{' '.join(first_words)}... [{top_words}]"
|
||||
|
||||
"""Creates a dictionary of nodes with their names and content."""
|
||||
nodes = {}
|
||||
for edge in retrieved_edges:
|
||||
for node in (edge.node1, edge.node2):
|
||||
if node.id not in nodes:
|
||||
text = node.attributes.get("text")
|
||||
if text:
|
||||
name = _get_title(text)
|
||||
content = text
|
||||
else:
|
||||
name = node.attributes.get("name", "Unnamed Node")
|
||||
content = node.attributes.get("description", name)
|
||||
nodes[node.id] = {"node": node, "name": name, "content": content}
|
||||
return nodes
|
||||
|
||||
nodes = _get_nodes(retrieved_edges)
|
||||
node_section = "\n".join(
|
||||
f"Node: {info['name']}\n__node_content_start__\n{info['content']}\n__node_content_end__\n"
|
||||
for info in nodes.values()
|
||||
)
|
||||
connection_section = "\n".join(
|
||||
f"{nodes[edge.node1.id]['name']} --[{edge.attributes['relationship_type']}]--> {nodes[edge.node2.id]['name']}"
|
||||
for edge in retrieved_edges
|
||||
)
|
||||
|
||||
connections = []
|
||||
for edge in retrieved_edges:
|
||||
source_name = nodes[edge.node1.id]["name"]
|
||||
target_name = nodes[edge.node2.id]["name"]
|
||||
edge_label = edge.attributes.get("edge_text") or edge.attributes.get("relationship_type")
|
||||
connections.append(f"{source_name} --[{edge_label}]--> {target_name}")
|
||||
|
||||
connection_section = "\n".join(connections)
|
||||
|
||||
return f"Nodes:\n{node_section}\n\nConnections:\n{connection_section}"
|
||||
|
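For one edge between a text chunk and an entity, the rendered context would look like this (node names and the relationship label are illustrative):

Nodes:
Node: Quarterly revenue grew strongly this year... [revenue, growth, margin]
__node_content_start__
Quarterly revenue grew strongly this year across all regions.
__node_content_end__

Node: Acme Corp
__node_content_start__
A manufacturing company
__node_content_end__

Connections:
Quarterly revenue grew strongly this year... [revenue, growth, margin] --[relationship_name: mentions; entity_name: Acme Corp]--> Acme Corp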
|
|
|||
|
|
@ -30,7 +30,7 @@ class BinaryData(IngestionData):
|
|||
|
||||
async def ensure_metadata(self):
|
||||
if self.metadata is None:
|
||||
self.metadata = await get_file_metadata(self.data)
|
||||
self.metadata = await get_file_metadata(self.data, name=self.name)
|
||||
|
||||
if self.metadata["name"] is None:
|
||||
self.metadata["name"] = self.name
|
||||
|
|
|
|||
|
|
@ -1,11 +1,11 @@
|
|||
from uuid import uuid5, NAMESPACE_OID
|
||||
from uuid import UUID
|
||||
from .data_types import IngestionData
|
||||
|
||||
from cognee.modules.users.models import User
|
||||
from cognee.modules.data.methods import get_unique_data_id
|
||||
|
||||
|
||||
def identify(data: IngestionData, user: User) -> str:
|
||||
async def identify(data: IngestionData, user: User) -> UUID:
|
||||
data_content_hash: str = data.get_identifier()
|
||||
|
||||
# return UUID hash of file contents + owner id
|
||||
return uuid5(NAMESPACE_OID, f"{data_content_hash}{user.id}")
|
||||
return await get_unique_data_id(data_identifier=data_content_hash, user=user)
|
||||
|
|
|
|||
|
|
@ -1,10 +1,12 @@
|
|||
from typing import BinaryIO, Union
|
||||
from typing import BinaryIO, Union, Optional
|
||||
from cognee.infrastructure.files.storage import get_file_storage, get_storage_config
|
||||
from .classify import classify
|
||||
import hashlib
|
||||
|
||||
|
||||
async def save_data_to_file(data: Union[str, BinaryIO], filename: str = None):
|
||||
async def save_data_to_file(
|
||||
data: Union[str, BinaryIO], filename: str = None, file_extension: Optional[str] = None
|
||||
):
|
||||
storage_config = get_storage_config()
|
||||
|
||||
data_root_directory = storage_config["data_root_directory"]
|
||||
|
|
@ -21,6 +23,11 @@ async def save_data_to_file(data: Union[str, BinaryIO], filename: str = None):
|
|||
|
||||
file_name = file_metadata["name"]
|
||||
|
||||
if file_extension is not None:
|
||||
extension = file_extension.lstrip(".")
|
||||
file_name_without_ext = file_name.rsplit(".", 1)[0]
|
||||
file_name = f"{file_name_without_ext}.{extension}"
|
||||
|
||||
storage = get_file_storage(data_root_directory)
|
||||
|
||||
full_file_path = await storage.store(file_name, data)
|
||||
|
|
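The suffix swap in isolation (filename illustrative): the stem is kept and the requested extension replaces whatever followed the last dot.

file_name = "report.tmp"
file_extension = ".txt"

extension = file_extension.lstrip(".")
file_name = f"{file_name.rsplit('.', 1)[0]}.{extension}"
assert file_name == "report.txt"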
|
|||
|
|
@ -21,7 +21,8 @@ def get_ontology_resolver_from_env(
|
|||
Supported value: "rdflib".
|
||||
matching_strategy (str): The matching strategy to apply.
|
||||
Supported value: "fuzzy".
|
||||
ontology_file_path (str): Path to the ontology file required for the resolver.
|
||||
ontology_file_path (str): Path to the ontology file(s) required for the resolver.
|
||||
Can be a single path or comma-separated paths for multiple files.
|
||||
|
||||
Returns:
|
||||
BaseOntologyResolver: An instance of the requested ontology resolver.
|
||||
|
|
@ -31,8 +32,13 @@ def get_ontology_resolver_from_env(
|
|||
or if required parameters are missing.
|
||||
"""
|
||||
if ontology_resolver == "rdflib" and matching_strategy == "fuzzy" and ontology_file_path:
|
||||
if "," in ontology_file_path:
|
||||
file_paths = [path.strip() for path in ontology_file_path.split(",")]
|
||||
else:
|
||||
file_paths = ontology_file_path
|
||||
|
||||
return RDFLibOntologyResolver(
|
||||
matching_strategy=FuzzyMatchingStrategy(), ontology_file=ontology_file_path
|
||||
matching_strategy=FuzzyMatchingStrategy(), ontology_file=file_paths
|
||||
)
|
||||
else:
|
||||
raise EnvironmentError(
|
||||
|
|
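The env-side convention: a single path stays a plain string, while comma-separated paths become a list, and the resolver's constructor now accepts either. A sketch (paths illustrative):

ontology_file_path = "ontologies/base.owl, ontologies/domain.owl"

if "," in ontology_file_path:
    file_paths = [path.strip() for path in ontology_file_path.split(",")]
else:
    file_paths = ontology_file_path  # single path passed through as a string

assert file_paths == ["ontologies/base.owl", "ontologies/domain.owl"]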
|
|||
|
|
@ -2,7 +2,7 @@ import os
|
|||
import difflib
|
||||
from cognee.shared.logging_utils import get_logger
|
||||
from collections import deque
|
||||
from typing import List, Tuple, Dict, Optional, Any
|
||||
from typing import List, Tuple, Dict, Optional, Any, Union
|
||||
from rdflib import Graph, URIRef, RDF, RDFS, OWL
|
||||
|
||||
from cognee.modules.ontology.exceptions import (
|
||||
|
|
@ -26,22 +26,50 @@ class RDFLibOntologyResolver(BaseOntologyResolver):
|
|||
|
||||
def __init__(
|
||||
self,
|
||||
ontology_file: Optional[str] = None,
|
||||
ontology_file: Optional[Union[str, List[str]]] = None,
|
||||
matching_strategy: Optional[MatchingStrategy] = None,
|
||||
) -> None:
|
||||
super().__init__(matching_strategy)
|
||||
self.ontology_file = ontology_file
|
||||
try:
|
||||
if ontology_file and os.path.exists(ontology_file):
|
||||
files_to_load = []
|
||||
if ontology_file is not None:
|
||||
if isinstance(ontology_file, str):
|
||||
files_to_load = [ontology_file]
|
||||
elif isinstance(ontology_file, list):
|
||||
files_to_load = ontology_file
|
||||
else:
|
||||
raise ValueError(
|
||||
f"ontology_file must be a string, list of strings, or None. Got: {type(ontology_file)}"
|
||||
)
|
||||
|
||||
if files_to_load:
|
||||
self.graph = Graph()
|
||||
self.graph.parse(ontology_file)
|
||||
logger.info("Ontology loaded successfully from file: %s", ontology_file)
|
||||
loaded_files = []
|
||||
for file_path in files_to_load:
|
||||
if os.path.exists(file_path):
|
||||
self.graph.parse(file_path)
|
||||
loaded_files.append(file_path)
|
||||
logger.info("Ontology loaded successfully from file: %s", file_path)
|
||||
else:
|
||||
logger.warning(
|
||||
"Ontology file '%s' not found. Skipping this file.",
|
||||
file_path,
|
||||
)
|
||||
|
||||
if not loaded_files:
|
||||
logger.info(
|
||||
"No valid ontology files found. No owl ontology will be attached to the graph."
|
||||
)
|
||||
self.graph = None
|
||||
else:
|
||||
logger.info("Total ontology files loaded: %d", len(loaded_files))
|
||||
else:
|
||||
logger.info(
|
||||
"Ontology file '%s' not found. No owl ontology will be attached to the graph.",
|
||||
ontology_file,
|
||||
"No ontology file provided. No owl ontology will be attached to the graph."
|
||||
)
|
||||
self.graph = None
|
||||
|
||||
self.build_lookup()
|
||||
except Exception as e:
|
||||
logger.error("Failed to load ontology", exc_info=e)
|
||||
|
|
|
|||
|
|
@ -7,6 +7,6 @@ class PipelineRunFailedError(CogneeSystemError):
|
|||
self,
|
||||
message: str = "Pipeline run failed.",
|
||||
name: str = "PipelineRunFailedError",
|
||||
status_code: int = status.HTTP_422_UNPROCESSABLE_ENTITY,
|
||||
status_code: int = status.HTTP_422_UNPROCESSABLE_CONTENT,
|
||||
):
|
||||
super().__init__(message, name, status_code)
|
||||
|
|
|
|||
Some files were not shown because too many files have changed in this diff.