Merge branch 'dev' into feat/csv-ingestion

Signed-off-by: EricXiao <taoiaox@gmail.com>
2025-11-14 14:46:11 +08:00 · 2025-11-14 14:46:11 +08:00 · 4c609d6074
commit 4c609d6074
parent 8566516cec b6a507b435
212 changed files with 11740 additions and 6509 deletions
--- a/.env.template
+++ b/.env.template
@ -169,8 +169,9 @@ REQUIRE_AUTHENTICATION=False
 #       Vector: LanceDB
 #       Graph: KuzuDB
 #
-# It enforces LanceDB and KuzuDB use and uses them to create databases per Cognee user + dataset
-ENABLE_BACKEND_ACCESS_CONTROL=False
+# It enforces creation of databases per Cognee user + dataset. Does not work with some graph and database providers.
+# Disable mode when using not supported graph/vector databases.
+ENABLE_BACKEND_ACCESS_CONTROL=True

 ################################################################################
 # ☁️ Cloud Sync Settings
@ -242,13 +243,14 @@ LITELLM_LOG="ERROR"

 ########## Local LLM via Ollama ###############################################

+
 #LLM_API_KEY ="ollama"
 #LLM_MODEL="llama3.1:8b"
 #LLM_PROVIDER="ollama"
 #LLM_ENDPOINT="http://localhost:11434/v1"
 #EMBEDDING_PROVIDER="ollama"
 #EMBEDDING_MODEL="nomic-embed-text:latest"
-#EMBEDDING_ENDPOINT="http://localhost:11434/api/embeddings"
+#EMBEDDING_ENDPOINT="http://localhost:11434/api/embed"
 #EMBEDDING_DIMENSIONS=768
 #HUGGINGFACE_TOKENIZER="nomic-ai/nomic-embed-text-v1.5"

--- a/.github/actions/cognee_setup/action.yml
+++ b/.github/actions/cognee_setup/action.yml
@ -42,3 +42,8 @@ runs:
          done
        fi
        uv sync --extra api --extra docs --extra evals --extra codegraph --extra ollama --extra dev --extra neo4j --extra redis $EXTRA_ARGS
+
+    - name: Add telemetry identifier for telemetry test and in case telemetry is enabled by accident
+      shell: bash
+      run: |
+        echo "test-machine" > .anon_id
--- a/.github/workflows/basic_tests.yml
+++ b/.github/workflows/basic_tests.yml
@ -75,6 +75,7 @@ jobs:
    name: Run Unit Tests
    runs-on: ubuntu-22.04
    env:
+      ENV: 'dev'
      LLM_PROVIDER: openai
      LLM_MODEL: ${{ secrets.LLM_MODEL }}
      LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
@ -104,6 +105,7 @@ jobs:
    name: Run Integration Tests
    runs-on: ubuntu-22.04
    env:
+      ENV: 'dev'
      LLM_PROVIDER: openai
      LLM_MODEL: ${{ secrets.LLM_MODEL }}
      LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
@ -123,6 +125,7 @@ jobs:
        uses: ./.github/actions/cognee_setup
        with:
          python-version: ${{ inputs.python-version }}
+          extra-dependencies: "scraping"

      - name: Run Integration Tests
        run: uv run pytest cognee/tests/integration/
@ -131,6 +134,7 @@ jobs:
    name: Run Simple Examples
    runs-on: ubuntu-22.04
    env:
+      ENV: 'dev'
      LLM_PROVIDER: openai
      LLM_MODEL: ${{ secrets.LLM_MODEL }}
      LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
@ -160,12 +164,13 @@ jobs:
    name: Run Simple Examples BAML
    runs-on: ubuntu-22.04
    env:
+      ENV: 'dev'
      STRUCTURED_OUTPUT_FRAMEWORK: "BAML"
-      BAML_LLM_PROVIDER: azure-openai
-      BAML_LLM_MODEL: ${{ secrets.LLM_MODEL }}
-      BAML_LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
-      BAML_LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
-      BAML_LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+      BAML_LLM_PROVIDER: openai
+      BAML_LLM_MODEL: ${{ secrets.OPENAI_MODEL }}
+      BAML_LLM_ENDPOINT: ${{ secrets.OPENAI_ENDPOINT }}
+      BAML_LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+#      BAML_LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}

      LLM_PROVIDER: openai
      LLM_MODEL: ${{ secrets.LLM_MODEL }}
@ -197,6 +202,7 @@ jobs:
    name: Run Basic Graph Tests
    runs-on: ubuntu-22.04
    env:
+      ENV: 'dev'
      LLM_PROVIDER: openai
      LLM_MODEL: ${{ secrets.LLM_MODEL }}
      LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
--- a/.github/workflows/cli_tests.yml
+++ b/.github/workflows/cli_tests.yml
@ -39,6 +39,7 @@ jobs:
    name: CLI Unit Tests
    runs-on: ubuntu-22.04
    env:
+      ENV: 'dev'
      LLM_PROVIDER: openai
      LLM_MODEL: ${{ secrets.LLM_MODEL }}
      LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
@ -66,6 +67,7 @@ jobs:
    name: CLI Integration Tests
    runs-on: ubuntu-22.04
    env:
+      ENV: 'dev'
      LLM_PROVIDER: openai
      LLM_MODEL: ${{ secrets.LLM_MODEL }}
      LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
@ -93,6 +95,7 @@ jobs:
    name: CLI Functionality Tests
    runs-on: ubuntu-22.04
    env:
+      ENV: 'dev'
      LLM_PROVIDER: openai
      LLM_MODEL: ${{ secrets.LLM_MODEL }}
      LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
--- a/.github/workflows/db_examples_tests.yml
+++ b/.github/workflows/db_examples_tests.yml
@ -60,7 +60,7 @@ jobs:

      - name: Run Neo4j Example
        env:
-          ENV: dev
+          ENV: 'dev'
          LLM_MODEL: ${{ secrets.LLM_MODEL }}
          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
@ -95,7 +95,7 @@ jobs:

      - name: Run Kuzu Example
        env:
-          ENV: dev
+          ENV: 'dev'
          LLM_MODEL: ${{ secrets.LLM_MODEL }}
          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
@ -141,7 +141,7 @@ jobs:

      - name: Run PGVector Example
        env:
-          ENV: dev
+          ENV: 'dev'
          LLM_MODEL: ${{ secrets.LLM_MODEL }}
          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
--- a/.github/workflows/dockerhub-mcp.yml
+++ b/.github/workflows/dockerhub-mcp.yml
@ -7,14 +7,29 @@ on:

 jobs:
  docker-build-and-push:
-    runs-on: ubuntu-latest
+    runs-on:
+      group: Default
+      labels:
+        - docker_build_runner

    steps:
+      - name: Check and free disk space before build
+        run: |
+          echo "=== Before cleanup ==="
+          df -h
+          echo "Removing unused preinstalled SDKs to free space..."
+          sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc || true
+          docker system prune -af || true
+          echo "=== After cleanup ==="
+          df -h
+
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
+        with:
+          buildkitd-flags: --root /tmp/buildkit

      - name: Log in to Docker Hub
        uses: docker/login-action@v3
@ -34,7 +49,7 @@ jobs:

      - name: Build and push
        id: build
-        uses: docker/build-push-action@v5
+        uses: docker/build-push-action@v6
        with:
          context: .
          platforms: linux/amd64,linux/arm64
@ -45,5 +60,6 @@ jobs:
          cache-from: type=registry,ref=cognee/cognee-mcp:buildcache
          cache-to: type=registry,ref=cognee/cognee-mcp:buildcache,mode=max

+
      - name: Image digest
        run: echo ${{ steps.build.outputs.digest }}
--- a/.github/workflows/e2e_tests.yml
+++ b/.github/workflows/e2e_tests.yml
@ -226,7 +226,7 @@ jobs:
      - name: Dependencies already installed
        run: echo "Dependencies already installed in setup"

-      - name: Run parallel databases test
+      - name: Run permissions test
        env:
          ENV: 'dev'
          LLM_MODEL: ${{ secrets.LLM_MODEL }}
@ -239,6 +239,31 @@ jobs:
          EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
        run: uv run python ./cognee/tests/test_permissions.py

+  test-multi-tenancy:
+    name: Test multi tenancy with different situations in Cognee
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@v4
+
+      - name: Cognee Setup
+        uses: ./.github/actions/cognee_setup
+        with:
+          python-version: '3.11.x'
+
+      - name: Run multi tenancy test
+        env:
+          ENV: 'dev'
+          LLM_MODEL: ${{ secrets.LLM_MODEL }}
+          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+          EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+          EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+          EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+          EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
+        run: uv run python ./cognee/tests/test_multi_tenancy.py
+
  test-graph-edges:
    name: Test graph edge ingestion
    runs-on: ubuntu-22.04
@ -358,6 +383,34 @@ jobs:
          EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
        run: uv run python ./cognee/tests/tasks/entity_extraction/entity_extraction_test.py

+  test-feedback-enrichment:
+    name: Test Feedback Enrichment
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@v4
+
+      - name: Cognee Setup
+        uses: ./.github/actions/cognee_setup
+        with:
+          python-version: '3.11.x'
+
+      - name: Dependencies already installed
+        run: echo "Dependencies already installed in setup"
+
+      - name: Run Feedback Enrichment Test
+        env:
+          ENV: 'dev'
+          LLM_MODEL: ${{ secrets.LLM_MODEL }}
+          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+          EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+          EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+          EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+          EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
+        run: uv run python ./cognee/tests/test_feedback_enrichment.py
+
  run_conversation_sessions_test:
    name: Conversation sessions test
    runs-on: ubuntu-latest
@ -401,7 +454,7 @@ jobs:

      - name: Run Conversation session tests
        env:
-          ENV: dev
+          ENV: 'dev'
          LLM_MODEL: ${{ secrets.LLM_MODEL }}
          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
--- a/.github/workflows/examples_tests.yml
+++ b/.github/workflows/examples_tests.yml
@ -21,6 +21,7 @@ jobs:

      - name: Run Multimedia Example
        env:
+          ENV: 'dev'
          LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: uv run python ./examples/python/multimedia_example.py
@ -40,6 +41,7 @@ jobs:

      - name: Run Evaluation Framework Example
        env:
+          ENV: 'dev'
          LLM_MODEL: ${{ secrets.LLM_MODEL }}
          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
@ -69,6 +71,7 @@ jobs:

      - name: Run Descriptive Graph Metrics Example
        env:
+          ENV: 'dev'
          LLM_MODEL: ${{ secrets.LLM_MODEL }}
          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
@ -99,6 +102,7 @@ jobs:

      - name: Run Dynamic Steps Tests
        env:
+          ENV: 'dev'
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          LLM_MODEL: ${{ secrets.LLM_MODEL }}
          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
@ -124,6 +128,7 @@ jobs:

      - name: Run Temporal Example
        env:
+          ENV: 'dev'
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          LLM_MODEL: ${{ secrets.LLM_MODEL }}
          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
@ -149,6 +154,7 @@ jobs:

      - name: Run Ontology Demo Example
        env:
+          ENV: 'dev'
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          LLM_MODEL: ${{ secrets.LLM_MODEL }}
          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
@ -174,6 +180,7 @@ jobs:

      - name: Run Agentic Reasoning Example
        env:
+          ENV: 'dev'
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          LLM_MODEL: ${{ secrets.LLM_MODEL }}
          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
@ -199,6 +206,7 @@ jobs:

      - name: Run Memify Tests
        env:
+          ENV: 'dev'
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          LLM_MODEL: ${{ secrets.LLM_MODEL }}
          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
@ -210,6 +218,32 @@ jobs:
          EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
        run: uv run python ./examples/python/memify_coding_agent_example.py

+  test-custom-pipeline:
+    name: Run Custom Pipeline Example
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@v4
+
+      - name: Cognee Setup
+        uses: ./.github/actions/cognee_setup
+        with:
+          python-version: '3.11.x'
+
+      - name: Run Custom Pipeline Example
+        env:
+          ENV: 'dev'
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          LLM_MODEL: ${{ secrets.LLM_MODEL }}
+          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+          EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+          EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+          EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+          EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
+        run: uv run python ./examples/python/run_custom_pipeline_example.py
+
  test-permissions-example:
    name: Run Permissions Example
    runs-on: ubuntu-22.04
@ -224,6 +258,7 @@ jobs:

      - name: Run Memify Tests
        env:
+          ENV: 'dev'
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          LLM_MODEL: ${{ secrets.LLM_MODEL }}
          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
@ -249,6 +284,7 @@ jobs:

      - name: Run Docling Test
        env:
+          ENV: 'dev'
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          LLM_MODEL: ${{ secrets.LLM_MODEL }}
          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
--- a/.github/workflows/load_tests.yml
+++ b/.github/workflows/load_tests.yml
@ -0,0 +1,70 @@
+name: Load tests
+
+permissions:
+  contents: read
+
+on:
+  workflow_dispatch:
+  workflow_call:
+    secrets:
+      LLM_MODEL:
+        required: true
+      LLM_ENDPOINT:
+        required: true
+      LLM_API_KEY:
+        required: true
+      LLM_API_VERSION:
+        required: true
+      EMBEDDING_MODEL:
+        required: true
+      EMBEDDING_ENDPOINT:
+        required: true
+      EMBEDDING_API_KEY:
+        required: true
+      EMBEDDING_API_VERSION:
+        required: true
+      OPENAI_API_KEY:
+        required: true
+      AWS_ACCESS_KEY_ID:
+        required: true
+      AWS_SECRET_ACCESS_KEY:
+        required: true
+
+jobs:
+  test-load:
+    name: Test Load
+    runs-on: ubuntu-22.04
+    timeout-minutes: 60
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@v4
+
+      - name: Cognee Setup
+        uses: ./.github/actions/cognee_setup
+        with:
+          python-version: '3.11.x'
+          extra-dependencies: "aws"
+
+      - name: Verify File Descriptor Limit
+        run: ulimit -n
+
+      - name: Run Load Test
+        env:
+          ENV: 'dev'
+          ENABLE_BACKEND_ACCESS_CONTROL: True
+          LLM_MODEL: ${{ secrets.LLM_MODEL }}
+          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+          EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+          EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+          EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+          EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
+          STORAGE_BACKEND: s3
+          AWS_REGION: eu-west-1
+          AWS_ENDPOINT_URL: https://s3-eu-west-1.amazonaws.com
+          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_S3_DEV_USER_KEY_ID }}
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_S3_DEV_USER_SECRET_KEY }}
+        run: uv run python ./cognee/tests/test_load.py
+  
+    
--- a/.github/workflows/release_test.yml
+++ b/.github/workflows/release_test.yml
@ -0,0 +1,17 @@
+# Long-running, heavy and resource-consuming tests for release validation
+name: Release Test Workflow
+
+permissions:
+  contents: read
+
+on:
+  workflow_dispatch:
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  load-tests:
+    name: Load Tests
+    uses: ./.github/workflows/load_tests.yml
+    secrets: inherit
--- a/.github/workflows/scorecard.yml
+++ b/.github/workflows/scorecard.yml
@ -0,0 +1,78 @@
+# This workflow uses actions that are not certified by GitHub. They are provided
+# by a third-party and are governed by separate terms of service, privacy
+# policy, and support documentation.
+
+name: Scorecard supply-chain security
+on:
+  # For Branch-Protection check. Only the default branch is supported. See
+  # https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection
+  branch_protection_rule:
+  # To guarantee Maintained check is occasionally updated. See
+  # https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained
+  schedule:
+    - cron: '35 8 * * 2'
+  push:
+    branches: [ "main" ]
+
+# Declare default permissions as read only.
+permissions: read-all
+
+jobs:
+  analysis:
+    name: Scorecard analysis
+    runs-on: ubuntu-latest
+    # `publish_results: true` only works when run from the default branch. conditional can be removed if disabled.
+    if: github.event.repository.default_branch == github.ref_name || github.event_name == 'pull_request'
+    permissions:
+      # Needed to upload the results to code-scanning dashboard.
+      security-events: write
+      # Needed to publish results and get a badge (see publish_results below).
+      id-token: write
+      # Uncomment the permissions below if installing in a private repository.
+      # contents: read
+      # actions: read
+
+    steps:
+      - name: "Checkout code"
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          persist-credentials: false
+
+      - name: "Run analysis"
+        uses: ossf/scorecard-action@f49aabe0b5af0936a0987cfb85d86b75731b0186 # v2.4.1
+        with:
+          results_file: results.sarif
+          results_format: sarif
+          # (Optional) "write" PAT token. Uncomment the `repo_token` line below if:
+          # - you want to enable the Branch-Protection check on a *public* repository, or
+          # - you are installing Scorecard on a *private* repository
+          # To create the PAT, follow the steps in https://github.com/ossf/scorecard-action?tab=readme-ov-file#authentication-with-fine-grained-pat-optional.
+          # repo_token: ${{ secrets.SCORECARD_TOKEN }}
+
+          # Public repositories:
+          #   - Publish results to OpenSSF REST API for easy access by consumers
+          #   - Allows the repository to include the Scorecard badge.
+          #   - See https://github.com/ossf/scorecard-action#publishing-results.
+          # For private repositories:
+          #   - `publish_results` will always be set to `false`, regardless
+          #     of the value entered here.
+          publish_results: true
+
+          # (Optional) Uncomment file_mode if you have a .gitattributes with files marked export-ignore
+          # file_mode: git
+
+      # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF
+      # format to the repository Actions tab.
+      - name: "Upload artifact"
+        uses: actions/upload-artifact@4cec3d8aa04e39d1a68397de0c4cd6fb9dce8ec1 # v4.6.1
+        with:
+          name: SARIF file
+          path: results.sarif
+          retention-days: 5
+
+      # Upload the results to GitHub's code scanning dashboard (optional).
+      # Commenting out will disable upload of results to your repo's Code Scanning dashboard
+      - name: "Upload to code-scanning"
+        uses: github/codeql-action/upload-sarif@v3
+        with:
+          sarif_file: results.sarif
--- a/.github/workflows/search_db_tests.yml
+++ b/.github/workflows/search_db_tests.yml
@ -84,6 +84,7 @@ jobs:
          GRAPH_DATABASE_PROVIDER: 'neo4j'
          VECTOR_DB_PROVIDER: 'lancedb'
          DB_PROVIDER: 'sqlite'
+          ENABLE_BACKEND_ACCESS_CONTROL: 'false'
          GRAPH_DATABASE_URL: ${{ steps.neo4j.outputs.neo4j-url }}
          GRAPH_DATABASE_USERNAME: ${{ steps.neo4j.outputs.neo4j-username }}
          GRAPH_DATABASE_PASSWORD: ${{ steps.neo4j.outputs.neo4j-password }}
@ -135,6 +136,7 @@ jobs:
            EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
            GRAPH_DATABASE_PROVIDER: 'kuzu'
            VECTOR_DB_PROVIDER: 'pgvector'
+            ENABLE_BACKEND_ACCESS_CONTROL: 'false'
            DB_PROVIDER: 'postgres'
            DB_NAME: 'cognee_db'
            DB_HOST: '127.0.0.1'
@ -197,6 +199,7 @@ jobs:
          GRAPH_DATABASE_URL: ${{ steps.neo4j.outputs.neo4j-url }}
          GRAPH_DATABASE_USERNAME: ${{ steps.neo4j.outputs.neo4j-username }}
          GRAPH_DATABASE_PASSWORD: ${{ steps.neo4j.outputs.neo4j-password }}
+          ENABLE_BACKEND_ACCESS_CONTROL: 'false'
          DB_NAME: cognee_db
          DB_HOST: 127.0.0.1
          DB_PORT: 5432
--- a/.github/workflows/temporal_graph_tests.yml
+++ b/.github/workflows/temporal_graph_tests.yml
@ -34,10 +34,9 @@ jobs:
      - name: Run Temporal Graph with Kuzu (lancedb + sqlite)
        env:
          ENV: 'dev'
-          LLM_MODEL: ${{ secrets.LLM_MODEL }}
-          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
-          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
-          LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+          LLM_MODEL: ${{ secrets.OPENAI_MODEL }}
+          LLM_ENDPOINT: ${{ secrets.OPENAI_ENDPOINT }}
+          LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
          EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
          EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
@ -73,10 +72,9 @@ jobs:
      - name: Run Temporal Graph with Neo4j (lancedb + sqlite)
        env:
          ENV: 'dev'
-          LLM_MODEL: ${{ secrets.LLM_MODEL }}
-          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
-          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
-          LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+          LLM_MODEL: ${{ secrets.OPENAI_MODEL }}
+          LLM_ENDPOINT: ${{ secrets.OPENAI_ENDPOINT }}
+          LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
          EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
          EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
@ -125,10 +123,9 @@ jobs:
        - name: Run Temporal Graph with Kuzu (postgres + pgvector)
          env:
            ENV: dev
-            LLM_MODEL: ${{ secrets.LLM_MODEL }}
-            LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
-            LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
-            LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+            LLM_MODEL: ${{ secrets.OPENAI_MODEL }}
+            LLM_ENDPOINT: ${{ secrets.OPENAI_ENDPOINT }}
+            LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
            EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
            EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
            EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
@ -192,10 +189,9 @@ jobs:
      - name: Run Temporal Graph with Neo4j (postgres + pgvector)
        env:
          ENV: dev
-          LLM_MODEL: ${{ secrets.LLM_MODEL }}
-          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
-          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
-          LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+          LLM_MODEL: ${{ secrets.OPENAI_MODEL }}
+          LLM_ENDPOINT: ${{ secrets.OPENAI_ENDPOINT }}
+          LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
          EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
          EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
--- a/.github/workflows/test_different_operating_systems.yml
+++ b/.github/workflows/test_different_operating_systems.yml
@ -10,6 +10,10 @@ on:
        required: false
        type: string
        default: '["3.10.x", "3.12.x", "3.13.x"]'
+      os: 
+        required: false
+        type: string
+        default: '["ubuntu-22.04", "macos-15", "windows-latest"]'
    secrets:
      LLM_PROVIDER:
        required: true
@ -40,10 +44,11 @@ jobs:
  run-unit-tests:
    name: Unit tests ${{ matrix.python-version }} on ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
+    timeout-minutes: 60
    strategy:
      matrix:
        python-version: ${{ fromJSON(inputs.python-versions) }}
-        os: [ubuntu-22.04, macos-15, windows-latest]
+        os: ${{ fromJSON(inputs.os) }}
      fail-fast: false
    steps:
      - name: Check out
@ -76,10 +81,11 @@ jobs:
  run-integration-tests:
    name: Integration tests ${{ matrix.python-version }} on ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
+    timeout-minutes: 60
    strategy:
      matrix:
        python-version: ${{ fromJSON(inputs.python-versions) }}
-        os: [ ubuntu-22.04, macos-15, windows-latest ]
+        os: ${{ fromJSON(inputs.os) }}
      fail-fast: false
    steps:
      - name: Check out
@ -112,10 +118,11 @@ jobs:
  run-library-test:
    name: Library test ${{ matrix.python-version }} on ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
+    timeout-minutes: 60
    strategy:
      matrix:
        python-version: ${{ fromJSON(inputs.python-versions) }}
-        os: [ ubuntu-22.04, macos-15, windows-latest ]
+        os: ${{ fromJSON(inputs.os) }}
      fail-fast: false
    steps:
      - name: Check out
@ -148,10 +155,11 @@ jobs:
  run-build-test:
    name: Build test ${{ matrix.python-version }} on ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
+    timeout-minutes: 60
    strategy:
      matrix:
        python-version: ${{ fromJSON(inputs.python-versions) }}
-        os: [ ubuntu-22.04, macos-15, windows-latest ]
+        os: ${{ fromJSON(inputs.os) }}
      fail-fast: false
    steps:
      - name: Check out
@ -177,10 +185,11 @@ jobs:
  run-soft-deletion-test:
    name: Soft Delete test ${{ matrix.python-version }} on ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
+    timeout-minutes: 60
    strategy:
      matrix:
        python-version: ${{ fromJSON(inputs.python-versions) }}
-        os: [ ubuntu-22.04, macos-15, windows-latest ]
+        os: ${{ fromJSON(inputs.os) }}
      fail-fast: false
    steps:
      - name: Check out
@ -214,10 +223,11 @@ jobs:
  run-hard-deletion-test:
    name: Hard Delete test ${{ matrix.python-version }} on ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
+    timeout-minutes: 60
    strategy:
      matrix:
        python-version: ${{ fromJSON(inputs.python-versions) }}
-        os: [ ubuntu-22.04, macos-15, windows-latest ]
+        os: ${{ fromJSON(inputs.os) }}
      fail-fast: false
    steps:
      - name: Check out
--- a/.github/workflows/test_ollama.yml
+++ b/.github/workflows/test_ollama.yml
@ -75,7 +75,7 @@ jobs:
                { "role": "user", "content": "Whatever I say, answer with Yes." }
              ]
            }'
-          curl -X POST http://127.0.0.1:11434/v1/embeddings \
+          curl -X POST http://127.0.0.1:11434/api/embed \
            -H "Content-Type: application/json" \
             -d '{
              "model": "avr/sfr-embedding-mistral:latest",
@ -98,7 +98,7 @@ jobs:
          LLM_MODEL: "phi4"
          EMBEDDING_PROVIDER: "ollama"
          EMBEDDING_MODEL: "avr/sfr-embedding-mistral:latest"
-          EMBEDDING_ENDPOINT: "http://localhost:11434/api/embeddings"
+          EMBEDDING_ENDPOINT: "http://localhost:11434/api/embed"
          EMBEDDING_DIMENSIONS: "4096"
          HUGGINGFACE_TOKENIZER: "Salesforce/SFR-Embedding-Mistral"
        run: uv run python ./examples/python/simple_example.py
--- a/.github/workflows/test_suites.yml
+++ b/.github/workflows/test_suites.yml
@ -1,4 +1,6 @@
 name: Test Suites
+permissions:
+  contents: read

 on:
  push:
@ -80,12 +82,22 @@ jobs:
    uses: ./.github/workflows/notebooks_tests.yml
    secrets: inherit

-  different-operating-systems-tests:
-    name: Operating System and Python Tests
+  different-os-tests-basic:
+    name: OS and Python Tests Ubuntu
    needs: [basic-tests, e2e-tests]
    uses: ./.github/workflows/test_different_operating_systems.yml
    with:
      python-versions: '["3.10.x", "3.11.x", "3.12.x", "3.13.x"]'
+      os: '["ubuntu-22.04"]'
+    secrets: inherit
+
+  different-os-tests-extended:
+    name: OS and Python Tests Extended
+    needs: [basic-tests, e2e-tests]
+    uses: ./.github/workflows/test_different_operating_systems.yml
+    with:
+      python-versions: '["3.13.x"]'
+      os: '["macos-15", "windows-latest"]'
    secrets: inherit

  # Matrix-based vector database tests
@ -135,7 +147,8 @@ jobs:
      e2e-tests,
      graph-db-tests,
      notebook-tests,
-      different-operating-systems-tests,
+      different-os-tests-basic,
+      different-os-tests-extended,
      vector-db-tests,
      example-tests,
      llm-tests,
@ -155,7 +168,8 @@ jobs:
      cli-tests,
      graph-db-tests,
      notebook-tests,
-      different-operating-systems-tests,
+      different-os-tests-basic,
+      different-os-tests-extended,
      vector-db-tests,
      example-tests,
      db-examples-tests,
@ -176,7 +190,8 @@ jobs:
                "${{ needs.cli-tests.result }}" == "success" &&
                "${{ needs.graph-db-tests.result }}" == "success" &&
                "${{ needs.notebook-tests.result }}" == "success" &&
-                "${{ needs.different-operating-systems-tests.result }}" == "success" &&
+                "${{ needs.different-os-tests-basic.result }}" == "success" &&
+                "${{ needs.different-os-tests-extended.result }}" == "success" &&
                "${{ needs.vector-db-tests.result }}" == "success" &&
                "${{ needs.example-tests.result }}" == "success" &&
                "${{ needs.db-examples-tests.result }}" == "success" &&
--- a/.github/workflows/weighted_edges_tests.yml
+++ b/.github/workflows/weighted_edges_tests.yml
@ -2,7 +2,7 @@ name: Weighted Edges Tests

 on:
  push:
-    branches: [ main, weighted_edges ]
+    branches: [ main, dev, weighted_edges ]
    paths:
      - 'cognee/modules/graph/utils/get_graph_from_model.py'
      - 'cognee/infrastructure/engine/models/Edge.py'
@ -10,7 +10,7 @@ on:
      - 'examples/python/weighted_edges_example.py'
      - '.github/workflows/weighted_edges_tests.yml'
  pull_request:
-    branches: [ main ]
+    branches: [ main, dev ]
    paths:
      - 'cognee/modules/graph/utils/get_graph_from_model.py'
      - 'cognee/infrastructure/engine/models/Edge.py'
@ -32,7 +32,7 @@ jobs:
    env:
      LLM_PROVIDER: openai
      LLM_MODEL: gpt-5-mini
-      LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+      LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}

    steps:
      - name: Check out repository
@ -67,14 +67,13 @@ jobs:
    env:
      LLM_PROVIDER: openai
      LLM_MODEL: gpt-5-mini
-      LLM_ENDPOINT: https://api.openai.com/v1/
-      LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+      LLM_ENDPOINT: https://api.openai.com/v1
+      LLM_API_KEY: ${{ secrets.OPENAI_API_KEY  }}
      LLM_API_VERSION: "2024-02-01"
-      EMBEDDING_PROVIDER: openai
-      EMBEDDING_MODEL: text-embedding-3-small
-      EMBEDDING_ENDPOINT: https://api.openai.com/v1/
-      EMBEDDING_API_KEY: ${{ secrets.LLM_API_KEY }}
-      EMBEDDING_API_VERSION: "2024-02-01"
+      EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+      EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+      EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+      EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
    steps:
      - name: Check out repository
        uses: actions/checkout@v4
@ -108,14 +107,14 @@ jobs:
    env:
      LLM_PROVIDER: openai
      LLM_MODEL: gpt-5-mini
-      LLM_ENDPOINT: https://api.openai.com/v1/
-      LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+      LLM_ENDPOINT: https://api.openai.com/v1
+      LLM_API_KEY: ${{ secrets.OPENAI_API_KEY  }}
      LLM_API_VERSION: "2024-02-01"
-      EMBEDDING_PROVIDER: openai
-      EMBEDDING_MODEL: text-embedding-3-small
-      EMBEDDING_ENDPOINT: https://api.openai.com/v1/
-      EMBEDDING_API_KEY: ${{ secrets.LLM_API_KEY }}
-      EMBEDDING_API_VERSION: "2024-02-01"
+      EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+      EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+      EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+      EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
+
    steps:
      - name: Check out repository
        uses: actions/checkout@v4
--- a/AGENTS.md
+++ b/AGENTS.md
@ -0,0 +1,132 @@
+## Repository Guidelines
+
+This document summarizes how to work with the cognee repository: how it’s organized, how to build, test, lint, and contribute. It mirrors our actual tooling and CI while providing quick commands for local development.
+
+## Project Structure & Module Organization
+
+- `cognee/`: Core Python library and API.
+  - `api/`: FastAPI application and versioned routers (add, cognify, memify, search, delete, users, datasets, responses, visualize, settings, sync, update, checks).
+  - `cli/`: CLI entry points and subcommands invoked via `cognee` / `cognee-cli`.
+  - `infrastructure/`: Databases, LLM providers, embeddings, loaders, and storage adapters.
+  - `modules/`: Domain logic (graph, retrieval, ontology, users, processing, observability, etc.).
+  - `tasks/`: Reusable tasks (e.g., code graph, web scraping, storage). Extend with new tasks here.
+  - `eval_framework/`: Evaluation utilities and adapters.
+  - `shared/`: Cross-cutting helpers (logging, settings, utils).
+  - `tests/`: Unit, integration, CLI, and end-to-end tests organized by feature.
+  - `__main__.py`: Entrypoint to route to CLI.
+- `cognee-mcp/`: Model Context Protocol server exposing cognee as MCP tools (SSE/HTTP/stdio). Contains its own README and Dockerfile.
+- `cognee-frontend/`: Next.js UI for local development and demos.
+- `distributed/`: Utilities for distributed execution (Modal, workers, queues).
+- `examples/`: Example scripts demonstrating the public APIs and features (graph, code graph, multimodal, permissions, etc.).
+- `notebooks/`: Jupyter notebooks for demos and tutorials.
+- `alembic/`: Database migrations for relational backends.
+
+Notes:
+- Co-locate feature-specific helpers under their respective package (`modules/`, `infrastructure/`, or `tasks/`).
+- Extend the system by adding new tasks, loaders, or retrievers rather than modifying core pipeline mechanisms.
+
+## Build, Test, and Development Commands
+
+Python (root) – requires Python >= 3.10 and < 3.14. We recommend `uv` for speed and reproducibility.
+
+- Create/refresh env and install dev deps:
+```bash
+uv sync --dev --all-extras --reinstall
+```
+
+- Run the CLI (examples):
+```bash
+uv run cognee-cli add "Cognee turns documents into AI memory."
+uv run cognee-cli cognify
+uv run cognee-cli search "What does cognee do?"
+uv run cognee-cli -ui   # Launches UI, backend API, and MCP server together
+```
+
+- Start the FastAPI server directly:
+```bash
+uv run python -m cognee.api.client
+```
+
+- Run tests (CI mirrors these commands):
+```bash
+uv run pytest cognee/tests/unit/ -v
+uv run pytest cognee/tests/integration/ -v
+```
+
+- Lint and format (ruff):
+```bash
+uv run ruff check .
+uv run ruff format .
+```
+
+- Optional static type checks (mypy):
+```bash
+uv run mypy cognee/
+```
+
+MCP Server (`cognee-mcp/`):
+
+- Install and run locally:
+```bash
+cd cognee-mcp
+uv sync --dev --all-extras --reinstall
+uv run python src/server.py               # stdio (default)
+uv run python src/server.py --transport sse
+uv run python src/server.py --transport http --host 127.0.0.1 --port 8000 --path /mcp
+```
+
+- API Mode (connect to a running Cognee API):
+```bash
+uv run python src/server.py --transport sse --api-url http://localhost:8000 --api-token YOUR_TOKEN
+```
+
+- Docker quickstart (examples): see `cognee-mcp/README.md` for full details
+```bash
+docker run -e TRANSPORT_MODE=http --env-file ./.env -p 8000:8000 --rm -it cognee/cognee-mcp:main
+```
+
+Frontend (`cognee-frontend/`):
+```bash
+cd cognee-frontend
+npm install
+npm run dev     # Next.js dev server
+npm run lint    # ESLint
+npm run build && npm start
+```
+
+## Coding Style & Naming Conventions
+
+Python:
+- 4-space indentation, modules and functions in `snake_case`, classes in `PascalCase`.
+- Public APIs should be type-annotated where practical.
+- Use `ruff format` before committing; `ruff check` enforces import hygiene and style (line-length 100 configured in `pyproject.toml`).
+- Prefer explicit, structured error handling. Use shared logging utilities in `cognee.shared.logging_utils`.
+
+MCP server and Frontend:
+- Follow the local `README.md` and ESLint/TypeScript configuration in `cognee-frontend/`.
+
+## Testing Guidelines
+
+- Place Python tests under `cognee/tests/`.
+  - Unit tests: `cognee/tests/unit/`
+  - Integration tests: `cognee/tests/integration/`
+  - CLI tests: `cognee/tests/cli_tests/`
+- Name test files `test_*.py`. Use `pytest.mark.asyncio` for async tests.
+- Avoid external state; rely on test fixtures and the CI-provided env vars when LLM/embedding providers are required. See CI workflows under `.github/workflows/` for expected environment variables.
+- When adding public APIs, provide/update targeted examples under `examples/python/`.
+
+## Commit & Pull Request Guidelines
+
+- Use clear, imperative subjects (≤ 72 chars) and conventional commit styling in PR titles. Our CI validates semantic PR titles (see `.github/workflows/pr_lint`). Examples:
+  - `feat(graph): add temporal edge weighting`
+  - `fix(api): handle missing auth cookie`
+  - `docs: update installation instructions`
+- Reference related issues/discussions in the PR body and provide brief context.
+- PRs should describe scope, list local test commands run, and mention any impacts on MCP server or UI if applicable.
+- Sign commits and affirm the DCO (see `CONTRIBUTING.md`).
+
+## CI Mirrors Local Commands
+
+Our GitHub Actions run the same ruff checks and pytest suites shown above (`.github/workflows/basic_tests.yml` and related workflows). Use the commands in this document locally to minimize CI surprises.
+
+
--- a/README.md
+++ b/README.md
@ -97,7 +97,7 @@ Hosted platform:

 ### 📦 Installation

-You can install Cognee using either **pip**, **poetry**, **uv** or any other python package manager.
+You can install Cognee using either **pip**, **poetry**, **uv** or any other python package manager..

 Cognee supports Python 3.10 to 3.12

--- a/alembic/env.py
+++ b/alembic/env.py
@ -87,11 +87,6 @@ db_engine = get_relational_engine()

 print("Using database:", db_engine.db_uri)

-if "sqlite" in db_engine.db_uri:
-    from cognee.infrastructure.utils.run_sync import run_sync
-
-    run_sync(db_engine.create_database())
-
 config.set_section_option(
    config.config_ini_section,
    "SQLALCHEMY_DATABASE_URI",
--- a/alembic/versions/211ab850ef3d_add_sync_operations_table.py
+++ b/alembic/versions/211ab850ef3d_add_sync_operations_table.py
@ -10,6 +10,7 @@ from typing import Sequence, Union

 from alembic import op
 import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql


 # revision identifiers, used by Alembic.
@ -26,7 +27,34 @@ def upgrade() -> None:
    connection = op.get_bind()
    inspector = sa.inspect(connection)

+    if op.get_context().dialect.name == "postgresql":
+        syncstatus_enum = postgresql.ENUM(
+            "STARTED", "IN_PROGRESS", "COMPLETED", "FAILED", "CANCELLED", name="syncstatus"
+        )
+        syncstatus_enum.create(op.get_bind(), checkfirst=True)
+
    if "sync_operations" not in inspector.get_table_names():
+        if op.get_context().dialect.name == "postgresql":
+            syncstatus = postgresql.ENUM(
+                "STARTED",
+                "IN_PROGRESS",
+                "COMPLETED",
+                "FAILED",
+                "CANCELLED",
+                name="syncstatus",
+                create_type=False,
+            )
+        else:
+            syncstatus = sa.Enum(
+                "STARTED",
+                "IN_PROGRESS",
+                "COMPLETED",
+                "FAILED",
+                "CANCELLED",
+                name="syncstatus",
+                create_type=False,
+            )
+
        # Table doesn't exist, create it normally
        op.create_table(
            "sync_operations",
@ -34,15 +62,7 @@ def upgrade() -> None:
            sa.Column("run_id", sa.Text(), nullable=True),
            sa.Column(
                "status",
-                sa.Enum(
-                    "STARTED",
-                    "IN_PROGRESS",
-                    "COMPLETED",
-                    "FAILED",
-                    "CANCELLED",
-                    name="syncstatus",
-                    create_type=False,
-                ),
+                syncstatus,
                nullable=True,
            ),
            sa.Column("progress_percentage", sa.Integer(), nullable=True),
--- a/alembic/versions/482cd6517ce4_add_default_user.py
+++ b/alembic/versions/482cd6517ce4_add_default_user.py
@ -23,11 +23,8 @@ depends_on: Union[str, Sequence[str], None] = "8057ae7329c2"


 def upgrade() -> None:
-    try:
-        await_only(create_default_user())
-    except UserAlreadyExists:
-        pass  # It's fine if the default user already exists
+    pass


 def downgrade() -> None:
-    await_only(delete_user("default_user@example.com"))
+    pass
--- a/alembic/versions/76625596c5c3_expand_dataset_database_for_multi_user.py
+++ b/alembic/versions/76625596c5c3_expand_dataset_database_for_multi_user.py
@ -0,0 +1,98 @@
+"""Expand dataset database for multi user
+
+Revision ID: 76625596c5c3
+Revises: 211ab850ef3d
+Create Date: 2025-10-30 12:55:20.239562
+
+"""
+
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision: str = "76625596c5c3"
+down_revision: Union[str, None] = "c946955da633"
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def _get_column(inspector, table, name, schema=None):
+    for col in inspector.get_columns(table, schema=schema):
+        if col["name"] == name:
+            return col
+    return None
+
+
+def upgrade() -> None:
+    conn = op.get_bind()
+    insp = sa.inspect(conn)
+
+    vector_database_provider_column = _get_column(
+        insp, "dataset_database", "vector_database_provider"
+    )
+    if not vector_database_provider_column:
+        op.add_column(
+            "dataset_database",
+            sa.Column(
+                "vector_database_provider",
+                sa.String(),
+                unique=False,
+                nullable=False,
+                server_default="lancedb",
+            ),
+        )
+
+    graph_database_provider_column = _get_column(
+        insp, "dataset_database", "graph_database_provider"
+    )
+    if not graph_database_provider_column:
+        op.add_column(
+            "dataset_database",
+            sa.Column(
+                "graph_database_provider",
+                sa.String(),
+                unique=False,
+                nullable=False,
+                server_default="kuzu",
+            ),
+        )
+
+    vector_database_url_column = _get_column(insp, "dataset_database", "vector_database_url")
+    if not vector_database_url_column:
+        op.add_column(
+            "dataset_database",
+            sa.Column("vector_database_url", sa.String(), unique=False, nullable=True),
+        )
+
+    graph_database_url_column = _get_column(insp, "dataset_database", "graph_database_url")
+    if not graph_database_url_column:
+        op.add_column(
+            "dataset_database",
+            sa.Column("graph_database_url", sa.String(), unique=False, nullable=True),
+        )
+
+    vector_database_key_column = _get_column(insp, "dataset_database", "vector_database_key")
+    if not vector_database_key_column:
+        op.add_column(
+            "dataset_database",
+            sa.Column("vector_database_key", sa.String(), unique=False, nullable=True),
+        )
+
+    graph_database_key_column = _get_column(insp, "dataset_database", "graph_database_key")
+    if not graph_database_key_column:
+        op.add_column(
+            "dataset_database",
+            sa.Column("graph_database_key", sa.String(), unique=False, nullable=True),
+        )
+
+
+def downgrade() -> None:
+    op.drop_column("dataset_database", "vector_database_provider")
+    op.drop_column("dataset_database", "graph_database_provider")
+    op.drop_column("dataset_database", "vector_database_url")
+    op.drop_column("dataset_database", "graph_database_url")
+    op.drop_column("dataset_database", "vector_database_key")
+    op.drop_column("dataset_database", "graph_database_key")
--- a/alembic/versions/8057ae7329c2_initial_migration.py
+++ b/alembic/versions/8057ae7329c2_initial_migration.py
@ -18,11 +18,8 @@ depends_on: Union[str, Sequence[str], None] = None


 def upgrade() -> None:
-    db_engine = get_relational_engine()
-    # we might want to delete this
-    await_only(db_engine.create_database())
+    pass


 def downgrade() -> None:
-    db_engine = get_relational_engine()
-    await_only(db_engine.delete_database())
+    pass
--- a/alembic/versions/ab7e313804ae_permission_system_rework.py
+++ b/alembic/versions/ab7e313804ae_permission_system_rework.py
@ -144,44 +144,58 @@ def _create_data_permission(conn, user_id, data_id, permission_name):
    )


+def _get_column(inspector, table, name, schema=None):
+    for col in inspector.get_columns(table, schema=schema):
+        if col["name"] == name:
+            return col
+    return None
+
+
 def upgrade() -> None:
    conn = op.get_bind()
+    insp = sa.inspect(conn)

-    # Recreate ACLs table with default permissions set to datasets instead of documents
-    op.drop_table("acls")
+    dataset_id_column = _get_column(insp, "acls", "dataset_id")
+    if not dataset_id_column:
+        # Recreate ACLs table with default permissions set to datasets instead of documents
+        op.drop_table("acls")

-    acls_table = op.create_table(
-        "acls",
-        sa.Column("id", UUID, primary_key=True, default=uuid4),
-        sa.Column(
-            "created_at", sa.DateTime(timezone=True), default=lambda: datetime.now(timezone.utc)
-        ),
-        sa.Column(
-            "updated_at", sa.DateTime(timezone=True), onupdate=lambda: datetime.now(timezone.utc)
-        ),
-        sa.Column("principal_id", UUID, sa.ForeignKey("principals.id")),
-        sa.Column("permission_id", UUID, sa.ForeignKey("permissions.id")),
-        sa.Column("dataset_id", UUID, sa.ForeignKey("datasets.id", ondelete="CASCADE")),
-    )
+        acls_table = op.create_table(
+            "acls",
+            sa.Column("id", UUID, primary_key=True, default=uuid4),
+            sa.Column(
+                "created_at", sa.DateTime(timezone=True), default=lambda: datetime.now(timezone.utc)
+            ),
+            sa.Column(
+                "updated_at",
+                sa.DateTime(timezone=True),
+                onupdate=lambda: datetime.now(timezone.utc),
+            ),
+            sa.Column("principal_id", UUID, sa.ForeignKey("principals.id")),
+            sa.Column("permission_id", UUID, sa.ForeignKey("permissions.id")),
+            sa.Column("dataset_id", UUID, sa.ForeignKey("datasets.id", ondelete="CASCADE")),
+        )

-    # Note: We can't use any Cognee model info to gather data (as it can change) in database so we must use our own table
-    #       definition or load what is in the database
-    dataset_table = _define_dataset_table()
-    datasets = conn.execute(sa.select(dataset_table)).fetchall()
+        # Note: We can't use any Cognee model info to gather data (as it can change) in database so we must use our own table
+        #       definition or load what is in the database
+        dataset_table = _define_dataset_table()
+        datasets = conn.execute(sa.select(dataset_table)).fetchall()

-    if not datasets:
-        return
+        if not datasets:
+            return

-    acl_list = []
+        acl_list = []

-    for dataset in datasets:
-        acl_list.append(_create_dataset_permission(conn, dataset.owner_id, dataset.id, "read"))
-        acl_list.append(_create_dataset_permission(conn, dataset.owner_id, dataset.id, "write"))
-        acl_list.append(_create_dataset_permission(conn, dataset.owner_id, dataset.id, "share"))
-        acl_list.append(_create_dataset_permission(conn, dataset.owner_id, dataset.id, "delete"))
+        for dataset in datasets:
+            acl_list.append(_create_dataset_permission(conn, dataset.owner_id, dataset.id, "read"))
+            acl_list.append(_create_dataset_permission(conn, dataset.owner_id, dataset.id, "write"))
+            acl_list.append(_create_dataset_permission(conn, dataset.owner_id, dataset.id, "share"))
+            acl_list.append(
+                _create_dataset_permission(conn, dataset.owner_id, dataset.id, "delete")
+            )

-    if acl_list:
-        op.bulk_insert(acls_table, acl_list)
+        if acl_list:
+            op.bulk_insert(acls_table, acl_list)


 def downgrade() -> None:
--- a/alembic/versions/c946955da633_multi_tenant_support.py
+++ b/alembic/versions/c946955da633_multi_tenant_support.py
@ -0,0 +1,137 @@
+"""Multi Tenant Support
+
+Revision ID: c946955da633
+Revises: 211ab850ef3d
+Create Date: 2025-11-04 18:11:09.325158
+
+"""
+
+from typing import Sequence, Union
+from datetime import datetime, timezone
+from uuid import uuid4
+
+from alembic import op
+import sqlalchemy as sa
+
+# revision identifiers, used by Alembic.
+revision: str = "c946955da633"
+down_revision: Union[str, None] = "211ab850ef3d"
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def _now():
+    return datetime.now(timezone.utc)
+
+
+def _define_user_table() -> sa.Table:
+    table = sa.Table(
+        "users",
+        sa.MetaData(),
+        sa.Column(
+            "id",
+            sa.UUID,
+            sa.ForeignKey("principals.id", ondelete="CASCADE"),
+            primary_key=True,
+            nullable=False,
+        ),
+        sa.Column("tenant_id", sa.UUID, sa.ForeignKey("tenants.id"), index=True, nullable=True),
+    )
+    return table
+
+
+def _define_dataset_table() -> sa.Table:
+    # Note: We can't use any Cognee model info to gather data (as it can change) in database so we must use our own table
+    #       definition or load what is in the database
+    table = sa.Table(
+        "datasets",
+        sa.MetaData(),
+        sa.Column("id", sa.UUID, primary_key=True, default=uuid4),
+        sa.Column("name", sa.Text),
+        sa.Column(
+            "created_at",
+            sa.DateTime(timezone=True),
+            default=lambda: datetime.now(timezone.utc),
+        ),
+        sa.Column(
+            "updated_at",
+            sa.DateTime(timezone=True),
+            onupdate=lambda: datetime.now(timezone.utc),
+        ),
+        sa.Column("owner_id", sa.UUID(), sa.ForeignKey("principals.id"), index=True),
+        sa.Column("tenant_id", sa.UUID(), sa.ForeignKey("tenants.id"), index=True, nullable=True),
+    )
+
+    return table
+
+
+def _get_column(inspector, table, name, schema=None):
+    for col in inspector.get_columns(table, schema=schema):
+        if col["name"] == name:
+            return col
+    return None
+
+
+def upgrade() -> None:
+    conn = op.get_bind()
+    insp = sa.inspect(conn)
+
+    dataset = _define_dataset_table()
+    user = _define_user_table()
+
+    if "user_tenants" not in insp.get_table_names():
+        # Define table with all necessary columns including primary key
+        user_tenants = op.create_table(
+            "user_tenants",
+            sa.Column("user_id", sa.UUID, sa.ForeignKey("users.id"), primary_key=True),
+            sa.Column("tenant_id", sa.UUID, sa.ForeignKey("tenants.id"), primary_key=True),
+            sa.Column(
+                "created_at", sa.DateTime(timezone=True), default=lambda: datetime.now(timezone.utc)
+            ),
+        )
+
+        # Get all users with their tenant_id
+        user_data = conn.execute(
+            sa.select(user.c.id, user.c.tenant_id).where(user.c.tenant_id.isnot(None))
+        ).fetchall()
+
+        # Insert into user_tenants table
+        if user_data:
+            op.bulk_insert(
+                user_tenants,
+                [
+                    {"user_id": user_id, "tenant_id": tenant_id, "created_at": _now()}
+                    for user_id, tenant_id in user_data
+                ],
+            )
+
+    tenant_id_column = _get_column(insp, "datasets", "tenant_id")
+    if not tenant_id_column:
+        op.add_column("datasets", sa.Column("tenant_id", sa.UUID(), nullable=True))
+
+        # Build subquery, select users.tenant_id for each dataset.owner_id
+        tenant_id_from_dataset_owner = (
+            sa.select(user.c.tenant_id).where(user.c.id == dataset.c.owner_id).scalar_subquery()
+        )
+
+        if op.get_context().dialect.name == "sqlite":
+            # If column doesn't exist create new original_extension column and update from values of extension column
+            with op.batch_alter_table("datasets") as batch_op:
+                batch_op.execute(
+                    dataset.update().values(
+                        tenant_id=tenant_id_from_dataset_owner,
+                    )
+                )
+        else:
+            conn = op.get_bind()
+            conn.execute(dataset.update().values(tenant_id=tenant_id_from_dataset_owner))
+
+        op.create_index(op.f("ix_datasets_tenant_id"), "datasets", ["tenant_id"])
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_table("user_tenants")
+    op.drop_index(op.f("ix_datasets_tenant_id"), table_name="datasets")
+    op.drop_column("datasets", "tenant_id")
+    # ### end Alembic commands ###
--- a/cognee-mcp/README.md
+++ b/cognee-mcp/README.md
@ -110,6 +110,47 @@ If you'd rather run cognee-mcp in a container, you have two options:
      # For stdio transport (default)
      docker run -e TRANSPORT_MODE=stdio --env-file ./.env --rm -it cognee/cognee-mcp:main
      ```
+      
+      **Installing optional dependencies at runtime:**
+      
+      You can install optional dependencies when running the container by setting the `EXTRAS` environment variable:
+      ```bash
+      # Install a single optional dependency group at runtime
+      docker run \
+        -e TRANSPORT_MODE=http \
+        -e EXTRAS=aws \
+        --env-file ./.env \
+        -p 8000:8000 \
+        --rm -it cognee/cognee-mcp:main
+      
+      # Install multiple optional dependency groups at runtime (comma-separated)
+      docker run \
+        -e TRANSPORT_MODE=sse \
+        -e EXTRAS=aws,postgres,neo4j \
+        --env-file ./.env \
+        -p 8000:8000 \
+        --rm -it cognee/cognee-mcp:main
+      ```
+      
+      **Available optional dependency groups:**
+      - `aws` - S3 storage support
+      - `postgres` / `postgres-binary` - PostgreSQL database support
+      - `neo4j` - Neo4j graph database support
+      - `neptune` - AWS Neptune support
+      - `chromadb` - ChromaDB vector store support
+      - `scraping` - Web scraping capabilities
+      - `distributed` - Modal distributed execution
+      - `langchain` - LangChain integration
+      - `llama-index` - LlamaIndex integration
+      - `anthropic` - Anthropic models
+      - `groq` - Groq models
+      - `mistral` - Mistral models
+      - `ollama` / `huggingface` - Local model support
+      - `docs` - Document processing
+      - `codegraph` - Code analysis
+      - `monitoring` - Sentry & Langfuse monitoring
+      - `redis` - Redis support
+      - And more (see [pyproject.toml](https://github.com/topoteretes/cognee/blob/main/pyproject.toml) for full list)
 2. **Pull from Docker Hub** (no build required):
   ```bash
   # With HTTP transport (recommended for web deployments)
@ -119,6 +160,17 @@ If you'd rather run cognee-mcp in a container, you have two options:
   # With stdio transport (default)
   docker run -e TRANSPORT_MODE=stdio --env-file ./.env --rm -it cognee/cognee-mcp:main
   ```
+   
+   **With runtime installation of optional dependencies:**
+   ```bash
+   # Install optional dependencies from Docker Hub image
+   docker run \
+     -e TRANSPORT_MODE=http \
+     -e EXTRAS=aws,postgres \
+     --env-file ./.env \
+     -p 8000:8000 \
+     --rm -it cognee/cognee-mcp:main
+   ```

 ### **Important: Docker vs Direct Usage**
 **Docker uses environment variables**, not command line arguments:
--- a/cognee-mcp/entrypoint.sh
+++ b/cognee-mcp/entrypoint.sh
@ -4,6 +4,42 @@ set -e  # Exit on error
 echo "Debug mode: $DEBUG"
 echo "Environment: $ENVIRONMENT"

+# Install optional dependencies if EXTRAS is set
+if [ -n "$EXTRAS" ]; then
+    echo "Installing optional dependencies: $EXTRAS"
+    
+    # Get the cognee version that's currently installed
+    COGNEE_VERSION=$(uv pip show cognee | grep "Version:" | awk '{print $2}')
+    echo "Current cognee version: $COGNEE_VERSION"
+    
+    # Build the extras list for cognee
+    IFS=',' read -ra EXTRA_ARRAY <<< "$EXTRAS"
+    # Combine base extras from pyproject.toml with requested extras
+    ALL_EXTRAS=""
+    for extra in "${EXTRA_ARRAY[@]}"; do
+        # Trim whitespace
+        extra=$(echo "$extra" | xargs)
+        # Add to extras list if not already present
+        if [[ ! "$ALL_EXTRAS" =~ (^|,)"$extra"(,|$) ]]; then
+            if [ -z "$ALL_EXTRAS" ]; then
+                ALL_EXTRAS="$extra"
+            else
+                ALL_EXTRAS="$ALL_EXTRAS,$extra"
+            fi
+        fi
+    done
+    
+    echo "Installing cognee with extras: $ALL_EXTRAS"
+    echo "Running: uv pip install 'cognee[$ALL_EXTRAS]==$COGNEE_VERSION'"
+    uv pip install "cognee[$ALL_EXTRAS]==$COGNEE_VERSION"
+    
+    # Verify installation
+    echo ""
+    echo "✓ Optional dependencies installation completed"
+else
+    echo "No optional dependencies specified"
+fi
+
 # Set default transport mode if not specified
 TRANSPORT_MODE=${TRANSPORT_MODE:-"stdio"}
 echo "Transport mode: $TRANSPORT_MODE"
--- a/cognee-mcp/pyproject.toml
+++ b/cognee-mcp/pyproject.toml
@ -9,7 +9,7 @@ dependencies = [
    # For local cognee repo usage remove comment bellow and add absolute path to cognee. Then run `uv sync --reinstall` in the mcp folder on local cognee changes.
    #"cognee[postgres,codegraph,gemini,huggingface,docs,neo4j] @ file:/Users/igorilic/Desktop/cognee",
    # TODO: Remove gemini from optional dependecnies for new Cognee version after 0.3.4
-    "cognee[postgres,codegraph,gemini,huggingface,docs,neo4j]==0.3.4",
+    "cognee[postgres,docs,neo4j]==0.3.7",
    "fastmcp>=2.10.0,<3.0.0",
    "mcp>=1.12.0,<2.0.0",
    "uv>=0.6.3,<1.0.0",
--- a/cognee-mcp/src/client.py
+++ b/cognee-mcp/src/client.py
@ -37,12 +37,10 @@ async def run():

            toolResult = await session.call_tool("prune", arguments={})

-            toolResult = await session.call_tool(
-                "codify", arguments={"repo_path": "SOME_REPO_PATH"}
-            )
+            toolResult = await session.call_tool("cognify", arguments={})

            toolResult = await session.call_tool(
-                "search", arguments={"search_type": "CODE", "search_query": "exceptions"}
+                "search", arguments={"search_type": "GRAPH_COMPLETION"}
            )

            print(f"Cognify result: {toolResult.content}")
--- a/cognee-mcp/src/server.py
+++ b/cognee-mcp/src/server.py
@ -1096,6 +1096,10 @@ async def main():

    # Skip migrations when in API mode (the API server handles its own database)
    if not args.no_migration and not args.api_url:
+        from cognee.modules.engine.operations.setup import setup
+
+        await setup()
+
        # Run Alembic migrations from the main cognee directory where alembic.ini is located
        logger.info("Running database migrations...")
        migration_result = subprocess.run(
--- a/cognee-mcp/uv.lock
+++ b/cognee-mcp/uv.lock
--- a/cognee/init.py
+++ b/cognee/init.py
@ -19,6 +19,7 @@ from .api.v1.add import add
 from .api.v1.delete import delete
 from .api.v1.cognify import cognify
 from .modules.memify import memify
+from .modules.run_custom_pipeline import run_custom_pipeline
 from .api.v1.update import update
 from .api.v1.config.config import config
 from .api.v1.datasets.datasets import datasets
--- a/cognee/api/client.py
+++ b/cognee/api/client.py
@ -39,6 +39,8 @@ from cognee.api.v1.users.routers import (
 )
 from cognee.modules.users.methods.get_authenticated_user import REQUIRE_AUTHENTICATION

+# Ensure application logging is configured for container stdout/stderr
+setup_logging()
 logger = get_logger()

 if os.getenv("ENV", "prod") == "prod":
@ -74,6 +76,9 @@ async def lifespan(app: FastAPI):

    await get_default_user()

+    # Emit a clear startup message for docker logs
+    logger.info("Backend server has started")
+
    yield


--- a/cognee/api/v1/add/add.py
+++ b/cognee/api/v1/add/add.py
@ -1,8 +1,5 @@
 from uuid import UUID
-import os
-from typing import Union, BinaryIO, List, Optional, Dict, Any
-from pydantic import BaseModel
-from urllib.parse import urlparse
+from typing import Union, BinaryIO, List, Optional, Any
 from cognee.modules.users.models import User
 from cognee.modules.pipelines import Task, run_pipeline
 from cognee.modules.pipelines.layers.resolve_authorized_user_dataset import (
@ -17,16 +14,6 @@ from cognee.shared.logging_utils import get_logger

 logger = get_logger()

-try:
-    from cognee.tasks.web_scraper.config import TavilyConfig, SoupCrawlerConfig
-    from cognee.context_global_variables import (
-        tavily_config as tavily,
-        soup_crawler_config as soup_crawler,
-    )
-except ImportError:
-    logger.debug(f"Unable to import {str(ImportError)}")
-    pass
-

 async def add(
    data: Union[BinaryIO, list[BinaryIO], str, list[str]],
@ -36,11 +23,8 @@ async def add(
    vector_db_config: dict = None,
    graph_db_config: dict = None,
    dataset_id: Optional[UUID] = None,
-    preferred_loaders: List[str] = None,
+    preferred_loaders: Optional[List[Union[str, dict[str, dict[str, Any]]]]] = None,
    incremental_loading: bool = True,
-    extraction_rules: Optional[Dict[str, Any]] = None,
-    tavily_config: Optional[BaseModel] = None,
-    soup_crawler_config: Optional[BaseModel] = None,
    data_per_batch: Optional[int] = 20,
 ):
    """
@ -180,28 +164,14 @@ async def add(
        - TAVILY_API_KEY: YOUR_TAVILY_API_KEY

    """
-
-    try:
-        if not soup_crawler_config and extraction_rules:
-            soup_crawler_config = SoupCrawlerConfig(extraction_rules=extraction_rules)
-        if not tavily_config and os.getenv("TAVILY_API_KEY"):
-            tavily_config = TavilyConfig(api_key=os.getenv("TAVILY_API_KEY"))
-
-        soup_crawler.set(soup_crawler_config)
-        tavily.set(tavily_config)
-
-        http_schemes = {"http", "https"}
-
-        def _is_http_url(item: Union[str, BinaryIO]) -> bool:
-            return isinstance(item, str) and urlparse(item).scheme in http_schemes
-
-        if _is_http_url(data):
-            node_set = ["web_content"] if not node_set else node_set + ["web_content"]
-        elif isinstance(data, list) and any(_is_http_url(item) for item in data):
-            node_set = ["web_content"] if not node_set else node_set + ["web_content"]
-    except NameError:
-        logger.debug(f"Unable to import {str(ImportError)}")
-        pass
+    if preferred_loaders is not None:
+        transformed = {}
+        for item in preferred_loaders:
+            if isinstance(item, dict):
+                transformed.update(item)
+            else:
+                transformed[item] = {}
+        preferred_loaders = transformed

    tasks = [
        Task(resolve_data_directories, include_subdirectories=True),
--- a/cognee/api/v1/add/routers/get_add_router.py
+++ b/cognee/api/v1/add/routers/get_add_router.py
@ -10,6 +10,7 @@ from cognee.modules.users.methods import get_authenticated_user
 from cognee.shared.utils import send_telemetry
 from cognee.modules.pipelines.models import PipelineRunErrored
 from cognee.shared.logging_utils import get_logger
+from cognee import __version__ as cognee_version

 logger = get_logger()

@ -63,7 +64,11 @@ def get_add_router() -> APIRouter:
        send_telemetry(
            "Add API Endpoint Invoked",
            user.id,
-            additional_properties={"endpoint": "POST /v1/add", "node_set": node_set},
+            additional_properties={
+                "endpoint": "POST /v1/add",
+                "node_set": node_set,
+                "cognee_version": cognee_version,
+            },
        )

        from cognee.api.v1.add import add as cognee_add
@ -77,7 +82,9 @@ def get_add_router() -> APIRouter:
                datasetName,
                user=user,
                dataset_id=datasetId,
-                node_set=node_set if node_set else None,
+                node_set=node_set
+                if node_set != [""]
+                else None,  # Transform default node_set endpoint value to None
            )

            if isinstance(add_run, PipelineRunErrored):
--- a/cognee/api/v1/cognify/routers/get_cognify_router.py
+++ b/cognee/api/v1/cognify/routers/get_cognify_router.py
@ -29,7 +29,7 @@ from cognee.modules.pipelines.queues.pipeline_run_info_queues import (
 )
 from cognee.shared.logging_utils import get_logger
 from cognee.shared.utils import send_telemetry
-
+from cognee import __version__ as cognee_version

 logger = get_logger("api.cognify")

@ -98,6 +98,7 @@ def get_cognify_router() -> APIRouter:
            user.id,
            additional_properties={
                "endpoint": "POST /v1/cognify",
+                "cognee_version": cognee_version,
            },
        )

--- a/cognee/api/v1/datasets/datasets.py
+++ b/cognee/api/v1/datasets/datasets.py
@ -1,4 +1,5 @@
 from uuid import UUID
+from cognee.modules.data.methods import has_dataset_data
 from cognee.modules.users.methods import get_default_user
 from cognee.modules.ingestion import discover_directory_datasets
 from cognee.modules.pipelines.operations.get_pipeline_status import get_pipeline_status
@ -26,6 +27,16 @@ class datasets:

        return await get_dataset_data(dataset.id)

+    @staticmethod
+    async def has_data(dataset_id: str) -> bool:
+        from cognee.modules.data.methods import get_dataset
+
+        user = await get_default_user()
+
+        dataset = await get_dataset(user.id, dataset_id)
+
+        return await has_dataset_data(dataset.id)
+
    @staticmethod
    async def get_status(dataset_ids: list[UUID]) -> dict:
        return await get_pipeline_status(dataset_ids, pipeline_name="cognify_pipeline")
--- a/cognee/api/v1/datasets/routers/get_datasets_router.py
+++ b/cognee/api/v1/datasets/routers/get_datasets_router.py
@ -24,6 +24,7 @@ from cognee.modules.users.permissions.methods import (
 from cognee.modules.graph.methods import get_formatted_graph_data
 from cognee.modules.pipelines.models import PipelineRunStatus
 from cognee.shared.utils import send_telemetry
+from cognee import __version__ as cognee_version

 logger = get_logger()

@ -100,6 +101,7 @@ def get_datasets_router() -> APIRouter:
            user.id,
            additional_properties={
                "endpoint": "GET /v1/datasets",
+                "cognee_version": cognee_version,
            },
        )

@ -147,6 +149,7 @@ def get_datasets_router() -> APIRouter:
            user.id,
            additional_properties={
                "endpoint": "POST /v1/datasets",
+                "cognee_version": cognee_version,
            },
        )

@ -201,6 +204,7 @@ def get_datasets_router() -> APIRouter:
            additional_properties={
                "endpoint": f"DELETE /v1/datasets/{str(dataset_id)}",
                "dataset_id": str(dataset_id),
+                "cognee_version": cognee_version,
            },
        )

@ -246,6 +250,7 @@ def get_datasets_router() -> APIRouter:
                "endpoint": f"DELETE /v1/datasets/{str(dataset_id)}/data/{str(data_id)}",
                "dataset_id": str(dataset_id),
                "data_id": str(data_id),
+                "cognee_version": cognee_version,
            },
        )

@ -327,6 +332,7 @@ def get_datasets_router() -> APIRouter:
            additional_properties={
                "endpoint": f"GET /v1/datasets/{str(dataset_id)}/data",
                "dataset_id": str(dataset_id),
+                "cognee_version": cognee_version,
            },
        )

@ -387,6 +393,7 @@ def get_datasets_router() -> APIRouter:
            additional_properties={
                "endpoint": "GET /v1/datasets/status",
                "datasets": [str(dataset_id) for dataset_id in datasets],
+                "cognee_version": cognee_version,
            },
        )

@ -433,6 +440,7 @@ def get_datasets_router() -> APIRouter:
                "endpoint": f"GET /v1/datasets/{str(dataset_id)}/data/{str(data_id)}/raw",
                "dataset_id": str(dataset_id),
                "data_id": str(data_id),
+                "cognee_version": cognee_version,
            },
        )

--- a/cognee/api/v1/delete/routers/get_delete_router.py
+++ b/cognee/api/v1/delete/routers/get_delete_router.py
@ -6,6 +6,7 @@ from cognee.shared.logging_utils import get_logger
 from cognee.modules.users.models import User
 from cognee.modules.users.methods import get_authenticated_user
 from cognee.shared.utils import send_telemetry
+from cognee import __version__ as cognee_version

 logger = get_logger()

@ -39,6 +40,7 @@ def get_delete_router() -> APIRouter:
                "endpoint": "DELETE /v1/delete",
                "dataset_id": str(dataset_id),
                "data_id": str(data_id),
+                "cognee_version": cognee_version,
            },
        )

--- a/cognee/api/v1/memify/routers/get_memify_router.py
+++ b/cognee/api/v1/memify/routers/get_memify_router.py
@ -12,6 +12,7 @@ from cognee.modules.users.methods import get_authenticated_user
 from cognee.shared.utils import send_telemetry
 from cognee.modules.pipelines.models import PipelineRunErrored
 from cognee.shared.logging_utils import get_logger
+from cognee import __version__ as cognee_version

 logger = get_logger()

@ -73,7 +74,7 @@ def get_memify_router() -> APIRouter:
        send_telemetry(
            "Memify API Endpoint Invoked",
            user.id,
-            additional_properties={"endpoint": "POST /v1/memify"},
+            additional_properties={"endpoint": "POST /v1/memify", "cognee_version": cognee_version},
        )

        if not payload.dataset_id and not payload.dataset_name:
--- a/cognee/api/v1/permissions/routers/get_permissions_router.py
+++ b/cognee/api/v1/permissions/routers/get_permissions_router.py
@ -1,12 +1,18 @@
 from uuid import UUID
-from typing import List
+from typing import List, Union

 from fastapi import APIRouter, Depends
 from fastapi.responses import JSONResponse

 from cognee.modules.users.models import User
+from cognee.api.DTO import InDTO
 from cognee.modules.users.methods import get_authenticated_user
 from cognee.shared.utils import send_telemetry
+from cognee import __version__ as cognee_version
+
+
+class SelectTenantDTO(InDTO):
+    tenant_id: UUID | None = None


 def get_permissions_router() -> APIRouter:
@ -48,6 +54,7 @@ def get_permissions_router() -> APIRouter:
                "endpoint": f"POST /v1/permissions/datasets/{str(principal_id)}",
                "dataset_ids": str(dataset_ids),
                "principal_id": str(principal_id),
+                "cognee_version": cognee_version,
            },
        )

@ -89,6 +96,7 @@ def get_permissions_router() -> APIRouter:
            additional_properties={
                "endpoint": "POST /v1/permissions/roles",
                "role_name": role_name,
+                "cognee_version": cognee_version,
            },
        )

@ -133,6 +141,7 @@ def get_permissions_router() -> APIRouter:
                "endpoint": f"POST /v1/permissions/users/{str(user_id)}/roles",
                "user_id": str(user_id),
                "role_id": str(role_id),
+                "cognee_version": cognee_version,
            },
        )

@ -175,6 +184,7 @@ def get_permissions_router() -> APIRouter:
                "endpoint": f"POST /v1/permissions/users/{str(user_id)}/tenants",
                "user_id": str(user_id),
                "tenant_id": str(tenant_id),
+                "cognee_version": cognee_version,
            },
        )

@ -209,6 +219,7 @@ def get_permissions_router() -> APIRouter:
            additional_properties={
                "endpoint": "POST /v1/permissions/tenants",
                "tenant_name": tenant_name,
+                "cognee_version": cognee_version,
            },
        )

@ -220,4 +231,39 @@ def get_permissions_router() -> APIRouter:
            status_code=200, content={"message": "Tenant created.", "tenant_id": str(tenant_id)}
        )

+    @permissions_router.post("/tenants/select")
+    async def select_tenant(payload: SelectTenantDTO, user: User = Depends(get_authenticated_user)):
+        """
+        Select current tenant.
+
+        This endpoint selects a tenant with the specified UUID. Tenants are used
+        to organize users and resources in multi-tenant environments, providing
+        isolation and access control between different groups or organizations.
+
+        Sending a null/None value as tenant_id selects his default single user tenant
+
+        ## Request Parameters
+        - **tenant_id** (Union[UUID, None]): UUID of the tenant to select, If null/None is provided use the default single user tenant
+
+        ## Response
+        Returns a success message along with selected tenant id.
+        """
+        send_telemetry(
+            "Permissions API Endpoint Invoked",
+            user.id,
+            additional_properties={
+                "endpoint": f"POST /v1/permissions/tenants/{str(payload.tenant_id)}",
+                "tenant_id": str(payload.tenant_id),
+            },
+        )
+
+        from cognee.modules.users.tenants.methods import select_tenant as select_tenant_method
+
+        await select_tenant_method(user_id=user.id, tenant_id=payload.tenant_id)
+
+        return JSONResponse(
+            status_code=200,
+            content={"message": "Tenant selected.", "tenant_id": str(payload.tenant_id)},
+        )
+
    return permissions_router
--- a/cognee/api/v1/search/routers/get_search_router.py
+++ b/cognee/api/v1/search/routers/get_search_router.py
@ -13,6 +13,7 @@ from cognee.modules.users.models import User
 from cognee.modules.search.operations import get_history
 from cognee.modules.users.methods import get_authenticated_user
 from cognee.shared.utils import send_telemetry
+from cognee import __version__ as cognee_version


 # Note: Datasets sent by name will only map to datasets owned by the request sender
@ -61,9 +62,7 @@ def get_search_router() -> APIRouter:
        send_telemetry(
            "Search API Endpoint Invoked",
            user.id,
-            additional_properties={
-                "endpoint": "GET /v1/search",
-            },
+            additional_properties={"endpoint": "GET /v1/search", "cognee_version": cognee_version},
        )

        try:
@ -118,6 +117,7 @@ def get_search_router() -> APIRouter:
                "top_k": payload.top_k,
                "only_context": payload.only_context,
                "use_combined_context": payload.use_combined_context,
+                "cognee_version": cognee_version,
            },
        )

--- a/cognee/api/v1/search/search.py
+++ b/cognee/api/v1/search/search.py
@ -1,6 +1,7 @@
 from uuid import UUID
 from typing import Union, Optional, List, Type

+from cognee.infrastructure.databases.graph import get_graph_engine
 from cognee.modules.engine.models.node_set import NodeSet
 from cognee.modules.users.models import User
 from cognee.modules.search.types import SearchResult, SearchType, CombinedSearchResult
--- a/cognee/api/v1/sync/routers/get_sync_router.py
+++ b/cognee/api/v1/sync/routers/get_sync_router.py
@ -12,6 +12,7 @@ from cognee.modules.sync.methods import get_running_sync_operations_for_user, ge
 from cognee.shared.utils import send_telemetry
 from cognee.shared.logging_utils import get_logger
 from cognee.api.v1.sync import SyncResponse
+from cognee import __version__ as cognee_version
 from cognee.context_global_variables import set_database_global_context_variables

 logger = get_logger()
@ -99,6 +100,7 @@ def get_sync_router() -> APIRouter:
            user.id,
            additional_properties={
                "endpoint": "POST /v1/sync",
+                "cognee_version": cognee_version,
                "dataset_ids": [str(id) for id in request.dataset_ids]
                if request.dataset_ids
                else "*",
@ -205,6 +207,7 @@ def get_sync_router() -> APIRouter:
            user.id,
            additional_properties={
                "endpoint": "GET /v1/sync/status",
+                "cognee_version": cognee_version,
            },
        )

--- a/cognee/api/v1/ui/ui.py
+++ b/cognee/api/v1/ui/ui.py
@ -503,7 +503,7 @@ def start_ui(
    if start_mcp:
        logger.info("Starting Cognee MCP server with Docker...")
        try:
-            image = "cognee/cognee-mcp:feature-standalone-mcp"  # TODO: change to "cognee/cognee-mcp:main" right before merging into main
+            image = "cognee/cognee-mcp:main"
            subprocess.run(["docker", "pull", image], check=True)

            import uuid
@ -538,9 +538,7 @@ def start_ui(
                env_file = os.path.join(cwd, ".env")
                docker_cmd.extend(["--env-file", env_file])

-            docker_cmd.append(
-                image
-            )  # TODO: change to "cognee/cognee-mcp:main" right before merging into main
+            docker_cmd.append(image)

            mcp_process = subprocess.Popen(
                docker_cmd,
--- a/cognee/api/v1/update/routers/get_update_router.py
+++ b/cognee/api/v1/update/routers/get_update_router.py
@ -9,6 +9,7 @@ from cognee.shared.logging_utils import get_logger
 from cognee.modules.users.models import User
 from cognee.modules.users.methods import get_authenticated_user
 from cognee.shared.utils import send_telemetry
+from cognee import __version__ as cognee_version
 from cognee.modules.pipelines.models.PipelineRunInfo import (
    PipelineRunErrored,
 )
@ -64,6 +65,7 @@ def get_update_router() -> APIRouter:
                "dataset_id": str(dataset_id),
                "data_id": str(data_id),
                "node_set": str(node_set),
+                "cognee_version": cognee_version,
            },
        )

--- a/cognee/api/v1/update/update.py
+++ b/cognee/api/v1/update/update.py
@ -1,5 +1,5 @@
 from uuid import UUID
-from typing import Union, BinaryIO, List, Optional
+from typing import Union, BinaryIO, List, Optional, Any

 from cognee.modules.users.models import User
 from cognee.api.v1.delete import delete
@ -15,7 +15,7 @@ async def update(
    node_set: Optional[List[str]] = None,
    vector_db_config: dict = None,
    graph_db_config: dict = None,
-    preferred_loaders: List[str] = None,
+    preferred_loaders: dict[str, dict[str, Any]] = None,
    incremental_loading: bool = True,
 ):
    """
--- a/cognee/api/v1/users/routers/get_visualize_router.py
+++ b/cognee/api/v1/users/routers/get_visualize_router.py
@ -8,6 +8,7 @@ from cognee.modules.users.models import User

 from cognee.context_global_variables import set_database_global_context_variables
 from cognee.shared.utils import send_telemetry
+from cognee import __version__ as cognee_version

 logger = get_logger()

@ -46,6 +47,7 @@ def get_visualize_router() -> APIRouter:
            additional_properties={
                "endpoint": "GET /v1/visualize",
                "dataset_id": str(dataset_id),
+                "cognee_version": cognee_version,
            },
        )

--- a/cognee/base_config.py
+++ b/cognee/base_config.py
@ -1,4 +1,5 @@
 import os
+from pathlib import Path
 from typing import Optional
 from functools import lru_cache
 from cognee.root_dir import get_absolute_path, ensure_absolute_path
@ -11,6 +12,9 @@ class BaseConfig(BaseSettings):
    data_root_directory: str = get_absolute_path(".data_storage")
    system_root_directory: str = get_absolute_path(".cognee_system")
    cache_root_directory: str = get_absolute_path(".cognee_cache")
+    logs_root_directory: str = os.getenv(
+        "COGNEE_LOGS_DIR", str(os.path.join(os.path.dirname(os.path.dirname(__file__)), "logs"))
+    )
    monitoring_tool: object = Observer.NONE

    @pydantic.model_validator(mode="after")
@ -30,6 +34,8 @@ class BaseConfig(BaseSettings):
        # Require absolute paths for root directories
        self.data_root_directory = ensure_absolute_path(self.data_root_directory)
        self.system_root_directory = ensure_absolute_path(self.system_root_directory)
+        self.logs_root_directory = ensure_absolute_path(self.logs_root_directory)
+
        # Set monitoring tool based on available keys
        if self.langfuse_public_key and self.langfuse_secret_key:
            self.monitoring_tool = Observer.LANGFUSE
@ -49,6 +55,7 @@ class BaseConfig(BaseSettings):
            "system_root_directory": self.system_root_directory,
            "monitoring_tool": self.monitoring_tool,
            "cache_root_directory": self.cache_root_directory,
+            "logs_root_directory": self.logs_root_directory,
        }


--- a/cognee/context_global_variables.py
+++ b/cognee/context_global_variables.py
@ -4,6 +4,8 @@ from typing import Union
 from uuid import UUID

 from cognee.base_config import get_base_config
+from cognee.infrastructure.databases.vector.config import get_vectordb_context_config
+from cognee.infrastructure.databases.graph.config import get_graph_context_config
 from cognee.infrastructure.databases.utils import get_or_create_dataset_database
 from cognee.infrastructure.files.storage.config import file_storage_config
 from cognee.modules.users.methods import get_user
@ -13,14 +15,41 @@ from cognee.modules.users.methods import get_user
 vector_db_config = ContextVar("vector_db_config", default=None)
 graph_db_config = ContextVar("graph_db_config", default=None)
 session_user = ContextVar("session_user", default=None)
-soup_crawler_config = ContextVar("soup_crawler_config", default=None)
-tavily_config = ContextVar("tavily_config", default=None)
+
+VECTOR_DBS_WITH_MULTI_USER_SUPPORT = ["lancedb", "falkor"]
+GRAPH_DBS_WITH_MULTI_USER_SUPPORT = ["kuzu", "falkor"]


 async def set_session_user_context_variable(user):
    session_user.set(user)


+def multi_user_support_possible():
+    graph_db_config = get_graph_context_config()
+    vector_db_config = get_vectordb_context_config()
+    return (
+        graph_db_config["graph_database_provider"] in GRAPH_DBS_WITH_MULTI_USER_SUPPORT
+        and vector_db_config["vector_db_provider"] in VECTOR_DBS_WITH_MULTI_USER_SUPPORT
+    )
+
+
+def backend_access_control_enabled():
+    backend_access_control = os.environ.get("ENABLE_BACKEND_ACCESS_CONTROL", None)
+    if backend_access_control is None:
+        # If backend access control is not defined in environment variables,
+        # enable it by default if graph and vector DBs can support it, otherwise disable it
+        return multi_user_support_possible()
+    elif backend_access_control.lower() == "true":
+        # If enabled, ensure that the current graph and vector DBs can support it
+        multi_user_support = multi_user_support_possible()
+        if not multi_user_support:
+            raise EnvironmentError(
+                "ENABLE_BACKEND_ACCESS_CONTROL is set to true but the current graph and/or vector databases do not support multi-user access control. Please use supported databases or disable backend access control."
+            )
+        return True
+    return False
+
+
 async def set_database_global_context_variables(dataset: Union[str, UUID], user_id: UUID):
    """
    If backend access control is enabled this function will ensure all datasets have their own databases,
@ -40,9 +69,7 @@ async def set_database_global_context_variables(dataset: Union[str, UUID], user_

    """

-    base_config = get_base_config()
-
-    if not os.getenv("ENABLE_BACKEND_ACCESS_CONTROL", "false").lower() == "true":
+    if not backend_access_control_enabled():
        return

    user = await get_user(user_id)
@ -50,6 +77,7 @@ async def set_database_global_context_variables(dataset: Union[str, UUID], user_
    # To ensure permissions are enforced properly all datasets will have their own databases
    dataset_database = await get_or_create_dataset_database(dataset, user)

+    base_config = get_base_config()
    data_root_directory = os.path.join(
        base_config.data_root_directory, str(user.tenant_id or user.id)
    )
@ -59,15 +87,17 @@ async def set_database_global_context_variables(dataset: Union[str, UUID], user_

    # Set vector and graph database configuration based on dataset database information
    vector_config = {
-        "vector_db_url": os.path.join(
-            databases_directory_path, dataset_database.vector_database_name
-        ),
-        "vector_db_key": "",
-        "vector_db_provider": "lancedb",
+        "vector_db_provider": dataset_database.vector_database_provider,
+        "vector_db_url": dataset_database.vector_database_url,
+        "vector_db_key": dataset_database.vector_database_key,
+        "vector_db_name": dataset_database.vector_database_name,
    }

    graph_config = {
-        "graph_database_provider": "kuzu",
+        "graph_database_provider": dataset_database.graph_database_provider,
+        "graph_database_url": dataset_database.graph_database_url,
+        "graph_database_name": dataset_database.graph_database_name,
+        "graph_database_key": dataset_database.graph_database_key,
        "graph_file_path": os.path.join(
            databases_directory_path, dataset_database.graph_database_name
        ),
--- a/cognee/exceptions/exceptions.py
+++ b/cognee/exceptions/exceptions.py
@ -56,7 +56,7 @@ class CogneeValidationError(CogneeApiError):
        self,
        message: str = "A validation error occurred.",
        name: str = "CogneeValidationError",
-        status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
+        status_code=status.HTTP_422_UNPROCESSABLE_CONTENT,
        log=True,
        log_level="ERROR",
    ):
--- a/cognee/infrastructure/databases/exceptions/exceptions.py
+++ b/cognee/infrastructure/databases/exceptions/exceptions.py
@ -15,7 +15,7 @@ class DatabaseNotCreatedError(CogneeSystemError):
        self,
        message: str = "The database has not been created yet. Please call `await setup()` first.",
        name: str = "DatabaseNotCreatedError",
-        status_code: int = status.HTTP_422_UNPROCESSABLE_ENTITY,
+        status_code: int = status.HTTP_422_UNPROCESSABLE_CONTENT,
    ):
        super().__init__(message, name, status_code)

@ -99,7 +99,7 @@ class EmbeddingException(CogneeConfigurationError):
        self,
        message: str = "Embedding Exception.",
        name: str = "EmbeddingException",
-        status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
+        status_code=status.HTTP_422_UNPROCESSABLE_CONTENT,
    ):
        super().__init__(message, name, status_code)

--- a/cognee/infrastructure/databases/graph/config.py
+++ b/cognee/infrastructure/databases/graph/config.py
@ -26,6 +26,7 @@ class GraphConfig(BaseSettings):
    - graph_database_username
    - graph_database_password
    - graph_database_port
+    - graph_database_key
    - graph_file_path
    - graph_model
    - graph_topology
@ -41,6 +42,7 @@ class GraphConfig(BaseSettings):
    graph_database_username: str = ""
    graph_database_password: str = ""
    graph_database_port: int = 123
+    graph_database_key: str = ""
    graph_file_path: str = ""
    graph_filename: str = ""
    graph_model: object = KnowledgeGraph
@ -90,6 +92,7 @@ class GraphConfig(BaseSettings):
            "graph_database_username": self.graph_database_username,
            "graph_database_password": self.graph_database_password,
            "graph_database_port": self.graph_database_port,
+            "graph_database_key": self.graph_database_key,
            "graph_file_path": self.graph_file_path,
            "graph_model": self.graph_model,
            "graph_topology": self.graph_topology,
@ -116,6 +119,7 @@ class GraphConfig(BaseSettings):
            "graph_database_username": self.graph_database_username,
            "graph_database_password": self.graph_database_password,
            "graph_database_port": self.graph_database_port,
+            "graph_database_key": self.graph_database_key,
            "graph_file_path": self.graph_file_path,
        }

--- a/cognee/infrastructure/databases/graph/get_graph_engine.py
+++ b/cognee/infrastructure/databases/graph/get_graph_engine.py
@ -33,6 +33,7 @@ def create_graph_engine(
    graph_database_username="",
    graph_database_password="",
    graph_database_port="",
+    graph_database_key="",
 ):
    """
    Create a graph engine based on the specified provider type.
@ -69,6 +70,7 @@ def create_graph_engine(
            graph_database_url=graph_database_url,
            graph_database_username=graph_database_username,
            graph_database_password=graph_database_password,
+            database_name=graph_database_name,
        )

    if graph_database_provider == "neo4j":
--- a/cognee/infrastructure/databases/graph/graph_db_interface.py
+++ b/cognee/infrastructure/databases/graph/graph_db_interface.py
@ -159,6 +159,11 @@ class GraphDBInterface(ABC):
    - get_connections
    """

+    @abstractmethod
+    async def is_empty(self) -> bool:
+        logger.warning("is_empty() is not implemented")
+        return True
+
    @abstractmethod
    async def query(self, query: str, params: dict) -> List[Any]:
        """
--- a/cognee/infrastructure/databases/graph/kuzu/adapter.py
+++ b/cognee/infrastructure/databases/graph/kuzu/adapter.py
@ -198,6 +198,15 @@ class KuzuAdapter(GraphDBInterface):
        except FileNotFoundError:
            logger.warning(f"Kuzu S3 storage file not found: {self.db_path}")

+    async def is_empty(self) -> bool:
+        query = """
+        MATCH (n)
+        RETURN true
+        LIMIT 1;
+        """
+        query_result = await self.query(query)
+        return len(query_result) == 0
+
    async def query(self, query: str, params: Optional[dict] = None) -> List[Tuple]:
        """
        Execute a Kuzu query asynchronously with automatic reconnection.
@ -1357,9 +1366,15 @@ class KuzuAdapter(GraphDBInterface):
                params[param_name] = values

        where_clause = " AND ".join(where_clauses)
-        nodes_query = (
-            f"MATCH (n:Node) WHERE {where_clause} RETURN n.id, {{properties: n.properties}}"
-        )
+        nodes_query = f"""
+        MATCH (n:Node)
+        WHERE {where_clause}
+        RETURN n.id, {{
+            name: n.name,
+            type: n.type,
+            properties: n.properties
+        }}
+        """
        edges_query = f"""
        MATCH (n1:Node)-[r:EDGE]->(n2:Node)
        WHERE {where_clause.replace("n.", "n1.")} AND {where_clause.replace("n.", "n2.")}
--- a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py
+++ b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py
@ -87,6 +87,15 @@ class Neo4jAdapter(GraphDBInterface):
        async with self.driver.session(database=self.graph_database_name) as session:
            yield session

+    async def is_empty(self) -> bool:
+        query = """
+        RETURN EXISTS {
+        MATCH (n)
+        } AS node_exists;
+        """
+        query_result = await self.query(query)
+        return not query_result[0]["node_exists"]
+
    @deadlock_retry()
    async def query(
        self,
--- a/cognee/infrastructure/databases/hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py
+++ b/cognee/infrastructure/databases/hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py
@ -416,6 +416,15 @@ class NeptuneAnalyticsAdapter(NeptuneGraphDB, VectorDBInterface):
        self._client.query(f"MATCH (n :{self._VECTOR_NODE_LABEL}) DETACH DELETE n")
        pass

+    async def is_empty(self) -> bool:
+        query = """
+        MATCH (n)
+        RETURN true
+        LIMIT 1;
+        """
+        query_result = await self._client.query(query)
+        return len(query_result) == 0
+
    @staticmethod
    def _get_scored_result(
        item: dict, with_vector: bool = False, with_score: bool = False
--- a/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py
+++ b/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py
@ -1,11 +1,15 @@
+import os
 from uuid import UUID
 from typing import Union

 from sqlalchemy import select
 from sqlalchemy.exc import IntegrityError
-from cognee.modules.data.methods import create_dataset

+from cognee.base_config import get_base_config
+from cognee.modules.data.methods import create_dataset
 from cognee.infrastructure.databases.relational import get_relational_engine
+from cognee.infrastructure.databases.vector import get_vectordb_config
+from cognee.infrastructure.databases.graph.config import get_graph_config
 from cognee.modules.data.methods import get_unique_dataset_id
 from cognee.modules.users.models import DatasetDatabase
 from cognee.modules.users.models import User
@ -32,8 +36,32 @@ async def get_or_create_dataset_database(

    dataset_id = await get_unique_dataset_id(dataset, user)

-    vector_db_name = f"{dataset_id}.lance.db"
-    graph_db_name = f"{dataset_id}.pkl"
+    vector_config = get_vectordb_config()
+    graph_config = get_graph_config()
+
+    # Note: for hybrid databases both graph and vector DB name have to be the same
+    if graph_config.graph_database_provider == "kuzu":
+        graph_db_name = f"{dataset_id}.pkl"
+    else:
+        graph_db_name = f"{dataset_id}"
+
+    if vector_config.vector_db_provider == "lancedb":
+        vector_db_name = f"{dataset_id}.lance.db"
+    else:
+        vector_db_name = f"{dataset_id}"
+
+    base_config = get_base_config()
+    databases_directory_path = os.path.join(
+        base_config.system_root_directory, "databases", str(user.id)
+    )
+
+    # Determine vector database URL
+    if vector_config.vector_db_provider == "lancedb":
+        vector_db_url = os.path.join(databases_directory_path, vector_config.vector_db_name)
+    else:
+        vector_db_url = vector_config.vector_database_url
+
+    # Determine graph database URL

    async with db_engine.get_async_session() as session:
        # Create dataset if it doesn't exist
@ -55,6 +83,12 @@ async def get_or_create_dataset_database(
            dataset_id=dataset_id,
            vector_database_name=vector_db_name,
            graph_database_name=graph_db_name,
+            vector_database_provider=vector_config.vector_db_provider,
+            graph_database_provider=graph_config.graph_database_provider,
+            vector_database_url=vector_db_url,
+            graph_database_url=graph_config.graph_database_url,
+            vector_database_key=vector_config.vector_db_key,
+            graph_database_key=graph_config.graph_database_key,
        )

        try:
--- a/cognee/infrastructure/databases/vector/config.py
+++ b/cognee/infrastructure/databases/vector/config.py
@ -18,12 +18,14 @@ class VectorConfig(BaseSettings):
    Instance variables:
    - vector_db_url: The URL of the vector database.
    - vector_db_port: The port for the vector database.
+    - vector_db_name: The name of the vector database.
    - vector_db_key: The key for accessing the vector database.
    - vector_db_provider: The provider for the vector database.
    """

    vector_db_url: str = ""
    vector_db_port: int = 1234
+    vector_db_name: str = ""
    vector_db_key: str = ""
    vector_db_provider: str = "lancedb"

@ -58,6 +60,7 @@ class VectorConfig(BaseSettings):
        return {
            "vector_db_url": self.vector_db_url,
            "vector_db_port": self.vector_db_port,
+            "vector_db_name": self.vector_db_name,
            "vector_db_key": self.vector_db_key,
            "vector_db_provider": self.vector_db_provider,
        }
--- a/cognee/infrastructure/databases/vector/create_vector_engine.py
+++ b/cognee/infrastructure/databases/vector/create_vector_engine.py
@ -1,5 +1,6 @@
 from .supported_databases import supported_databases
 from .embeddings import get_embedding_engine
+from cognee.infrastructure.databases.graph.config import get_graph_context_config

 from functools import lru_cache

@ -8,6 +9,7 @@ from functools import lru_cache
 def create_vector_engine(
    vector_db_provider: str,
    vector_db_url: str,
+    vector_db_name: str,
    vector_db_port: str = "",
    vector_db_key: str = "",
 ):
@ -27,6 +29,7 @@ def create_vector_engine(
        - vector_db_url (str): The URL for the vector database instance.
        - vector_db_port (str): The port for the vector database instance. Required for some
          providers.
+        - vector_db_name (str): The name of the vector database instance.
        - vector_db_key (str): The API key or access token for the vector database instance.
        - vector_db_provider (str): The name of the vector database provider to use (e.g.,
          'pgvector').
@ -45,9 +48,10 @@ def create_vector_engine(
            url=vector_db_url,
            api_key=vector_db_key,
            embedding_engine=embedding_engine,
+            database_name=vector_db_name,
        )

-    if vector_db_provider == "pgvector":
+    if vector_db_provider.lower() == "pgvector":
        from cognee.infrastructure.databases.relational import get_relational_config

        # Get configuration for postgres database
@ -78,7 +82,7 @@ def create_vector_engine(
            embedding_engine,
        )

-    elif vector_db_provider == "chromadb":
+    elif vector_db_provider.lower() == "chromadb":
        try:
            import chromadb
        except ImportError:
@ -94,7 +98,7 @@ def create_vector_engine(
            embedding_engine=embedding_engine,
        )

-    elif vector_db_provider == "neptune_analytics":
+    elif vector_db_provider.lower() == "neptune_analytics":
        try:
            from langchain_aws import NeptuneAnalyticsGraph
        except ImportError:
@ -122,7 +126,7 @@ def create_vector_engine(
            embedding_engine=embedding_engine,
        )

-    else:
+    elif vector_db_provider.lower() == "lancedb":
        from .lancedb.LanceDBAdapter import LanceDBAdapter

        return LanceDBAdapter(
@ -130,3 +134,9 @@ def create_vector_engine(
            api_key=vector_db_key,
            embedding_engine=embedding_engine,
        )
+
+    else:
+        raise EnvironmentError(
+            f"Unsupported vector database provider: {vector_db_provider}. "
+            f"Supported providers are: {', '.join(list(supported_databases.keys()) + ['LanceDB', 'PGVector', 'neptune_analytics', 'ChromaDB'])}"
+        )
--- a/cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py
+++ b/cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py
@ -124,7 +124,7 @@ class OllamaEmbeddingEngine(EmbeddingEngine):
                self.endpoint, json=payload, headers=headers, timeout=60.0
            ) as response:
                data = await response.json()
-                return data["embedding"]
+                return data["embeddings"][0]

    def get_vector_size(self) -> int:
        """
--- a/cognee/infrastructure/databases/vector/exceptions/exceptions.py
+++ b/cognee/infrastructure/databases/vector/exceptions/exceptions.py
@ -15,7 +15,7 @@ class CollectionNotFoundError(CogneeValidationError):
        self,
        message,
        name: str = "CollectionNotFoundError",
-        status_code: int = status.HTTP_422_UNPROCESSABLE_ENTITY,
+        status_code: int = status.HTTP_422_UNPROCESSABLE_CONTENT,
        log=True,
        log_level="DEBUG",
    ):
--- a/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py
+++ b/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py
@ -324,7 +324,6 @@ class LanceDBAdapter(VectorDBInterface):

    def get_data_point_schema(self, model_type: BaseModel):
        related_models_fields = []
-
        for field_name, field_config in model_type.model_fields.items():
            if hasattr(field_config, "model_fields"):
                related_models_fields.append(field_name)
--- a/cognee/infrastructure/engine/models/Edge.py
+++ b/cognee/infrastructure/engine/models/Edge.py
@ -1,4 +1,4 @@
-from pydantic import BaseModel
+from pydantic import BaseModel, field_validator
 from typing import Optional, Any, Dict


@ -18,9 +18,21 @@ class Edge(BaseModel):

        # Mixed usage
        has_items: (Edge(weight=0.5, weights={"confidence": 0.9}), list[Item])
+
+        # With edge_text for rich embedding representation
+        contains: (Edge(relationship_type="contains", edge_text="relationship_name: contains; entity_description: Alice"), Entity)
    """

    weight: Optional[float] = None
    weights: Optional[Dict[str, float]] = None
    relationship_type: Optional[str] = None
    properties: Optional[Dict[str, Any]] = None
+    edge_text: Optional[str] = None
+
+    @field_validator("edge_text", mode="before")
+    @classmethod
+    def ensure_edge_text(cls, v, info):
+        """Auto-populate edge_text from relationship_type if not explicitly provided."""
+        if v is None and info.data.get("relationship_type"):
+            return info.data["relationship_type"]
+        return v
--- a/cognee/infrastructure/files/exceptions.py
+++ b/cognee/infrastructure/files/exceptions.py
@ -8,6 +8,6 @@ class FileContentHashingError(Exception):
        self,
        message: str = "Failed to hash content of the file.",
        name: str = "FileContentHashingError",
-        status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
+        status_code=status.HTTP_422_UNPROCESSABLE_CONTENT,
    ):
        super().__init__(message, name, status_code)
--- a/cognee/infrastructure/files/storage/LocalFileStorage.py
+++ b/cognee/infrastructure/files/storage/LocalFileStorage.py
@ -82,16 +82,16 @@ class LocalFileStorage(Storage):
        self.ensure_directory_exists(file_dir_path)

        if overwrite or not os.path.exists(full_file_path):
-            with open(
-                full_file_path,
-                mode="w" if isinstance(data, str) else "wb",
-                encoding="utf-8" if isinstance(data, str) else None,
-            ) as file:
-                if hasattr(data, "read"):
-                    data.seek(0)
-                    file.write(data.read())
-                else:
+            if isinstance(data, str):
+                with open(full_file_path, mode="w", encoding="utf-8", newline="\n") as file:
                    file.write(data)
+            else:
+                with open(full_file_path, mode="wb") as file:
+                    if hasattr(data, "read"):
+                        data.seek(0)
+                        file.write(data.read())
+                    else:
+                        file.write(data)

                file.close()

--- a/cognee/infrastructure/files/storage/S3FileStorage.py
+++ b/cognee/infrastructure/files/storage/S3FileStorage.py
@ -70,18 +70,18 @@ class S3FileStorage(Storage):
        if overwrite or not await self.file_exists(file_path):

            def save_data_to_file():
-                with self.s3.open(
-                    full_file_path,
-                    mode="w" if isinstance(data, str) else "wb",
-                    encoding="utf-8" if isinstance(data, str) else None,
-                ) as file:
-                    if hasattr(data, "read"):
-                        data.seek(0)
-                        file.write(data.read())
-                    else:
+                if isinstance(data, str):
+                    with self.s3.open(
+                        full_file_path, mode="w", encoding="utf-8", newline="\n"
+                    ) as file:
                        file.write(data)
-
-                    file.close()
+                else:
+                    with self.s3.open(full_file_path, mode="wb") as file:
+                        if hasattr(data, "read"):
+                            data.seek(0)
+                            file.write(data.read())
+                        else:
+                            file.write(data)

            await run_async(save_data_to_file)

--- a/cognee/infrastructure/files/utils/get_file_metadata.py
+++ b/cognee/infrastructure/files/utils/get_file_metadata.py
@ -1,6 +1,6 @@
 import io
 import os.path
-from typing import BinaryIO, TypedDict
+from typing import BinaryIO, TypedDict, Optional
 from pathlib import Path

 from cognee.shared.logging_utils import get_logger
@ -27,7 +27,7 @@ class FileMetadata(TypedDict):
    file_size: int


-async def get_file_metadata(file: BinaryIO) -> FileMetadata:
+async def get_file_metadata(file: BinaryIO, name: Optional[str] = None) -> FileMetadata:
    """
    Retrieve metadata from a file object.

@ -53,7 +53,7 @@ async def get_file_metadata(file: BinaryIO) -> FileMetadata:
    except io.UnsupportedOperation as error:
        logger.error(f"Error retrieving content hash for file: {file.name} \n{str(error)}\n\n")

-    file_type = guess_file_type(file)
+    file_type = guess_file_type(file, name)

    file_path = getattr(file, "name", None) or getattr(file, "full_name", None)

--- a/cognee/infrastructure/files/utils/guess_file_type.py
+++ b/cognee/infrastructure/files/utils/guess_file_type.py
@ -1,8 +1,9 @@
-from typing import BinaryIO
+import io
+from pathlib import Path
+from typing import BinaryIO, Optional, Any
 import filetype
-
-from .is_text_content import is_text_content
-from .is_csv_content import is_csv_content
+from tempfile import SpooledTemporaryFile
+from filetype.types.base import Type


 class FileTypeException(Exception):
@ -24,90 +25,7 @@ class FileTypeException(Exception):
        self.message = message


-class TxtFileType(filetype.Type):
-    """
-    Represents a text file type with specific MIME and extension properties.
-
-    Public methods:
-    - match: Determines whether a given buffer matches the text file type.
-    """
-
-    MIME = "text/plain"
-    EXTENSION = "txt"
-
-    def __init__(self):
-        super(TxtFileType, self).__init__(mime=TxtFileType.MIME, extension=TxtFileType.EXTENSION)
-
-    def match(self, buf):
-        """
-        Determine if the given buffer contains text content.
-
-        Parameters:
-        -----------
-
-            - buf: The buffer to check for text content.
-
-        Returns:
-        --------
-
-            Returns True if the buffer is identified as text content, otherwise False.
-        """
-        return is_text_content(buf)
-
-
-txt_file_type = TxtFileType()
-
-filetype.add_type(txt_file_type)
-
-
-class CustomPdfMatcher(filetype.Type):
-    """
-    Match PDF file types based on MIME type and extension.
-
-    Public methods:
-    - match
-
-    Instance variables:
-    - MIME: The MIME type of the PDF.
-    - EXTENSION: The file extension of the PDF.
-    """
-
-    MIME = "application/pdf"
-    EXTENSION = "pdf"
-
-    def __init__(self):
-        super(CustomPdfMatcher, self).__init__(
-            mime=CustomPdfMatcher.MIME, extension=CustomPdfMatcher.EXTENSION
-        )
-
-    def match(self, buf):
-        """
-        Determine if the provided buffer is a PDF file.
-
-        This method checks for the presence of the PDF signature in the buffer.
-
-        Raises:
-        - TypeError: If the buffer is not of bytes type.
-
-        Parameters:
-        -----------
-
-            - buf: The buffer containing the data to be checked.
-
-        Returns:
-        --------
-
-            Returns True if the buffer contains a PDF signature, otherwise returns False.
-        """
-        return b"PDF-" in buf
-
-
-custom_pdf_matcher = CustomPdfMatcher()
-
-filetype.add_type(custom_pdf_matcher)
-
-
-def guess_file_type(file: BinaryIO) -> filetype.Type:
+def guess_file_type(file: BinaryIO, name: Optional[str] = None) -> filetype.Type:
    """
    Guess the file type from the given binary file stream.

@ -124,56 +42,26 @@ def guess_file_type(file: BinaryIO) -> filetype.Type:

        - filetype.Type: The guessed file type, represented as filetype.Type.
    """
+
+    # Note: If file has .txt or .text extension, consider it a plain text file as filetype.guess may not detect it properly
+    # as it contains no magic number encoding
+    ext = None
+    if isinstance(file, str):
+        ext = Path(file).suffix
+    elif name is not None:
+        ext = Path(name).suffix
+
+    if ext in [".txt", ".text"]:
+        file_type = Type("text/plain", "txt")
+        return file_type
+
    file_type = filetype.guess(file)

    # If file type could not be determined consider it a plain text file as they don't have magic number encoding
    if file_type is None:
-        from filetype.types.base import Type
-
        file_type = Type("text/plain", "txt")

    if file_type is None:
        raise FileTypeException(f"Unknown file detected: {file.name}.")

    return file_type
-
-
-class CsvFileType(filetype.Type):
-    """
-    Match CSV file types based on MIME type and extension.
-
-    Public methods:
-    - match
-
-    Instance variables:
-    - MIME: The MIME type of the CSV.
-    - EXTENSION: The file extension of the CSV.
-    """
-
-    MIME = "text/csv"
-    EXTENSION = "csv"
-
-    def __init__(self):
-        super().__init__(mime=self.MIME, extension=self.EXTENSION)
-
-    def match(self, buf):
-        """
-        Determine if the given buffer contains csv content.
-
-        Parameters:
-        -----------
-
-            - buf: The buffer to check for csv content.
-
-        Returns:
-        --------
-
-            Returns True if the buffer is identified as csv content, otherwise False.
-        """
-
-        return is_csv_content(buf)
-
-
-csv_file_type = CsvFileType()
-
-filetype.add_type(csv_file_type)
--- a/cognee/infrastructure/llm/prompts/extract_query_time.txt
+++ b/cognee/infrastructure/llm/prompts/extract_query_time.txt
@ -1,15 +1,13 @@
-For the purposes of identifying timestamps in a query, you are tasked with extracting relevant timestamps from the query.
-## Timestamp requirements
- If the query contains interval extrack both starts_at and ends_at  properties
- If the query contains an instantaneous timestamp, starts_at and ends_at should be the same
- If the query its open-ended (before 2009 or after 2009), the corresponding non defined end of the time should be none
-    -For example: "before 2009" -- starts_at: None, ends_at: 2009 or  "after 2009" -- starts_at: 2009, ends_at: None
- Put always the data that comes first in time as starts_at and the timestamps that comes second in time as ends_at
- If starts_at or ends_at cannot be extracted both of them has to be None
-## Output Format
-Your reply should be a JSON: list of dictionaries with the following structure:
-```python
-class QueryInterval(BaseModel):
-    starts_at: Optional[Timestamp] = None
-    ends_at: Optional[Timestamp] = None
-```
+You are tasked with identifying relevant time periods where the answer to a given query should be searched.
+Current date is:  `{{ time_now }}`. Determine relevant period(s) and return structured intervals.
+
+Extraction rules:
+
+1. Query without specific timestamp: use the time period with starts_at set to None and ends_at set to now.
+2. Explicit time intervals: If the query specifies a range (e.g., from 2010 to 2020, between January and March 2023), extract both start and end dates. Always assign the earlier date to starts_at and the later date to ends_at.
+3. Single timestamp: If the query refers to one specific moment (e.g., in 2015, on March 5, 2022), set starts_at and ends_at to that same timestamp.
+4. Open-ended time references: For phrases such as "before X" or "after X", represent the unspecified side as None. For example: before 2009 → starts_at: None, ends_at: 2009; after 2009 → starts_at: 2009, ends_at: None.
+5. Current-time references ("now", "current", "today"): If the query explicitly refers to the present, set both starts_at and ends_at to now (the ingestion timestamp).
+6. "Who is" and "Who was" questions: These imply a general identity or biographical inquiry without a specific temporal scope. Set both starts_at and ends_at to None.
+7. Ordering rule: Always ensure the earlier date is assigned to starts_at and the later date to ends_at.
+8. No temporal information: If no valid or inferable time reference is found, set both starts_at and ends_at to None.
--- a/cognee/infrastructure/llm/prompts/feedback_reaction_prompt.txt
+++ b/cognee/infrastructure/llm/prompts/feedback_reaction_prompt.txt
@ -0,0 +1,14 @@
+A question was previously answered, but the answer received negative feedback.
+Please reconsider and improve the response.
+
+Question: {question}
+Context originally used: {context}
+Previous answer: {wrong_answer}
+Feedback on that answer: {negative_feedback}
+
+Task: Provide a better response. The new answer should be short and direct.
+Then explain briefly why this answer is better.
+
+Format your reply as:
+Answer: <improved answer>
+Explanation: <short explanation>
--- a/cognee/infrastructure/llm/prompts/feedback_report_prompt.txt
+++ b/cognee/infrastructure/llm/prompts/feedback_report_prompt.txt
@ -0,0 +1,13 @@
+Write a concise, stand-alone paragraph that explains the correct answer to the question below.
+The paragraph should read naturally on its own, providing all necessary context and reasoning
+so the answer is clear and well-supported.
+
+Question: {question}
+Correct answer: {improved_answer}
+Supporting context: {new_context}
+
+Your paragraph should:
+- First sentence clearly states the correct answer as a full sentence
+- Remainder flows from first sentence and provides explanation based on context
+- Use simple, direct language that is easy to follow
+- Use shorter sentences, no long-winded explanations
--- a/cognee/infrastructure/llm/prompts/feedback_user_context_prompt.txt
+++ b/cognee/infrastructure/llm/prompts/feedback_user_context_prompt.txt
@ -0,0 +1,5 @@
+Question: {question}
+Context: {context}
+
+Provide a one paragraph human readable summary of this interaction context,
+listing all the relevant facts and information in a simple and direct way.
--- a/cognee/infrastructure/loaders/LoaderEngine.py
+++ b/cognee/infrastructure/loaders/LoaderEngine.py
@ -1,6 +1,7 @@
 import filetype
 from typing import Dict, List, Optional, Any
 from .LoaderInterface import LoaderInterface
+from cognee.infrastructure.files.utils.guess_file_type import guess_file_type
 from cognee.shared.logging_utils import get_logger

 logger = get_logger(__name__)
@ -65,7 +66,9 @@ class LoaderEngine:
        return True

    def get_loader(
-        self, file_path: str, preferred_loaders: List[str] = None
+        self,
+        file_path: str,
+        preferred_loaders: dict[str, dict[str, Any]],
    ) -> Optional[LoaderInterface]:
        """
        Get appropriate loader for a file.
@ -77,14 +80,21 @@ class LoaderEngine:
        Returns:
            LoaderInterface that can handle the file, or None if not found
        """
+        from pathlib import Path

-        file_info = filetype.guess(file_path)
+        file_info = guess_file_type(file_path)
+
+        path_extension = Path(file_path).suffix.lstrip(".")

        # Try preferred loaders first
        if preferred_loaders:
            for loader_name in preferred_loaders:
                if loader_name in self._loaders:
                    loader = self._loaders[loader_name]
+                    # Try with path extension first (for text formats like html)
+                    if loader.can_handle(extension=path_extension, mime_type=file_info.mime):
+                        return loader
+                    # Fall back to content-detected extension
                    if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime):
                        return loader
                else:
@ -94,6 +104,10 @@ class LoaderEngine:
        for loader_name in self.default_loader_priority:
            if loader_name in self._loaders:
                loader = self._loaders[loader_name]
+                # Try with path extension first (for text formats like html)
+                if loader.can_handle(extension=path_extension, mime_type=file_info.mime):
+                    return loader
+                # Fall back to content-detected extension
                if loader.can_handle(extension=file_info.extension, mime_type=file_info.mime):
                    return loader
            else:
@ -106,7 +120,7 @@ class LoaderEngine:
    async def load_file(
        self,
        file_path: str,
-        preferred_loaders: Optional[List[str]] = None,
+        preferred_loaders: dict[str, dict[str, Any]] = None,
        **kwargs,
    ):
        """
@ -114,7 +128,7 @@ class LoaderEngine:

        Args:
            file_path: Path to the file to be processed
-            preferred_loaders: List of preferred loader names to try first
+            preferred_loaders: Dict of loader names to their configurations
            **kwargs: Additional loader-specific configuration

        Raises:
@ -126,8 +140,16 @@ class LoaderEngine:
            raise ValueError(f"No loader found for file: {file_path}")

        logger.debug(f"Loading {file_path} with {loader.loader_name}")
-        # TODO: loading needs to be reworked to work with both file streams and file locations
-        return await loader.load(file_path, **kwargs)
+
+        # Extract loader-specific config from preferred_loaders
+        loader_config = {}
+        if preferred_loaders and loader.loader_name in preferred_loaders:
+            loader_config = preferred_loaders[loader.loader_name]
+
+        # Merge with any additional kwargs (kwargs take precedence)
+        merged_kwargs = {**loader_config, **kwargs}
+
+        return await loader.load(file_path, **merged_kwargs)

    def get_available_loaders(self) -> List[str]:
        """
--- a/cognee/infrastructure/loaders/core/audio_loader.py
+++ b/cognee/infrastructure/loaders/core/audio_loader.py
@ -42,6 +42,7 @@ class AudioLoader(LoaderInterface):
            "audio/wav",
            "audio/amr",
            "audio/aiff",
+            "audio/x-wav",
        ]

    @property
--- a/cognee/infrastructure/loaders/external/init.py
+++ b/cognee/infrastructure/loaders/external/init.py
@ -27,3 +27,10 @@ try:
    __all__.append("AdvancedPdfLoader")
 except ImportError:
    pass
+
+try:
+    from .beautiful_soup_loader import BeautifulSoupLoader
+
+    __all__.append("BeautifulSoupLoader")
+except ImportError:
+    pass
--- a/cognee/infrastructure/loaders/external/beautiful_soup_loader.py
+++ b/cognee/infrastructure/loaders/external/beautiful_soup_loader.py
@ -0,0 +1,310 @@
+"""BeautifulSoup-based web crawler for extracting content from web pages.
+
+This module provides the BeautifulSoupCrawler class for fetching and extracting content
+from web pages using BeautifulSoup or Playwright for JavaScript-rendered pages. It
+supports robots.txt handling, rate limiting, and custom extraction rules.
+"""
+
+from typing import Union, Dict, Any, Optional, List
+from dataclasses import dataclass
+from bs4 import BeautifulSoup
+from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface
+from cognee.shared.logging_utils import get_logger
+
+logger = get_logger(__name__)
+
+
+@dataclass
+class ExtractionRule:
+    """Normalized extraction rule for web content.
+
+    Attributes:
+        selector: CSS selector for extraction (if any).
+        xpath: XPath expression for extraction (if any).
+        attr: HTML attribute to extract (if any).
+        all: If True, extract all matching elements; otherwise, extract first.
+        join_with: String to join multiple extracted elements.
+    """
+
+    selector: Optional[str] = None
+    xpath: Optional[str] = None
+    attr: Optional[str] = None
+    all: bool = False
+    join_with: str = " "
+
+
+class BeautifulSoupLoader(LoaderInterface):
+    """Crawler for fetching and extracting web content using BeautifulSoup.
+
+    Supports asynchronous HTTP requests, Playwright for JavaScript rendering, robots.txt
+    compliance, and rate limiting. Extracts content using CSS selectors or XPath rules.
+
+    Attributes:
+        concurrency: Number of concurrent requests allowed.
+        crawl_delay: Minimum seconds between requests to the same domain.
+        max_crawl_delay: Maximum crawl delay to respect from robots.txt (None = no limit).
+        timeout: Per-request timeout in seconds.
+        max_retries: Number of retries for failed requests.
+        retry_delay_factor: Multiplier for exponential backoff on retries.
+        headers: HTTP headers for requests (e.g., User-Agent).
+        robots_cache_ttl: Time-to-live for robots.txt cache in seconds.
+    """
+
+    @property
+    def supported_extensions(self) -> List[str]:
+        return ["html"]
+
+    @property
+    def supported_mime_types(self) -> List[str]:
+        return ["text/html", "text/plain"]
+
+    @property
+    def loader_name(self) -> str:
+        return "beautiful_soup_loader"
+
+    def can_handle(self, extension: str, mime_type: str) -> bool:
+        can = extension in self.supported_extensions and mime_type in self.supported_mime_types
+        return can
+
+    def _get_default_extraction_rules(self):
+        # Comprehensive default extraction rules for common HTML content
+        return {
+            # Meta information
+            "title": {"selector": "title", "all": False},
+            "meta_description": {
+                "selector": "meta[name='description']",
+                "attr": "content",
+                "all": False,
+            },
+            "meta_keywords": {
+                "selector": "meta[name='keywords']",
+                "attr": "content",
+                "all": False,
+            },
+            # Open Graph meta tags
+            "og_title": {
+                "selector": "meta[property='og:title']",
+                "attr": "content",
+                "all": False,
+            },
+            "og_description": {
+                "selector": "meta[property='og:description']",
+                "attr": "content",
+                "all": False,
+            },
+            # Main content areas (prioritized selectors)
+            "article": {"selector": "article", "all": True, "join_with": "\n\n"},
+            "main": {"selector": "main", "all": True, "join_with": "\n\n"},
+            # Semantic content sections
+            "headers_h1": {"selector": "h1", "all": True, "join_with": "\n"},
+            "headers_h2": {"selector": "h2", "all": True, "join_with": "\n"},
+            "headers_h3": {"selector": "h3", "all": True, "join_with": "\n"},
+            "headers_h4": {"selector": "h4", "all": True, "join_with": "\n"},
+            "headers_h5": {"selector": "h5", "all": True, "join_with": "\n"},
+            "headers_h6": {"selector": "h6", "all": True, "join_with": "\n"},
+            # Text content
+            "paragraphs": {"selector": "p", "all": True, "join_with": "\n\n"},
+            "blockquotes": {"selector": "blockquote", "all": True, "join_with": "\n\n"},
+            "preformatted": {"selector": "pre", "all": True, "join_with": "\n\n"},
+            # Lists
+            "ordered_lists": {"selector": "ol", "all": True, "join_with": "\n"},
+            "unordered_lists": {"selector": "ul", "all": True, "join_with": "\n"},
+            "list_items": {"selector": "li", "all": True, "join_with": "\n"},
+            "definition_lists": {"selector": "dl", "all": True, "join_with": "\n"},
+            # Tables
+            "tables": {"selector": "table", "all": True, "join_with": "\n\n"},
+            "table_captions": {
+                "selector": "caption",
+                "all": True,
+                "join_with": "\n",
+            },
+            # Code blocks
+            "code_blocks": {"selector": "code", "all": True, "join_with": "\n"},
+            # Figures and media descriptions
+            "figures": {"selector": "figure", "all": True, "join_with": "\n\n"},
+            "figcaptions": {"selector": "figcaption", "all": True, "join_with": "\n"},
+            "image_alts": {"selector": "img", "attr": "alt", "all": True, "join_with": " "},
+            # Links (text content, not URLs to avoid clutter)
+            "link_text": {"selector": "a", "all": True, "join_with": " "},
+            # Emphasized text
+            "strong": {"selector": "strong", "all": True, "join_with": " "},
+            "emphasis": {"selector": "em", "all": True, "join_with": " "},
+            "marked": {"selector": "mark", "all": True, "join_with": " "},
+            # Time and data elements
+            "time": {"selector": "time", "all": True, "join_with": " "},
+            "data": {"selector": "data", "all": True, "join_with": " "},
+            # Sections and semantic structure
+            "sections": {"selector": "section", "all": True, "join_with": "\n\n"},
+            "asides": {"selector": "aside", "all": True, "join_with": "\n\n"},
+            "details": {"selector": "details", "all": True, "join_with": "\n"},
+            "summary": {"selector": "summary", "all": True, "join_with": "\n"},
+            # Navigation (may contain important links/structure)
+            "nav": {"selector": "nav", "all": True, "join_with": "\n"},
+            # Footer information
+            "footer": {"selector": "footer", "all": True, "join_with": "\n"},
+            # Divs with specific content roles
+            "content_divs": {
+                "selector": "div[role='main'], div[role='article'], div.content, div#content",
+                "all": True,
+                "join_with": "\n\n",
+            },
+        }
+
+    async def load(
+        self,
+        file_path: str,
+        extraction_rules: dict[str, Any] = None,
+        join_all_matches: bool = False,
+        **kwargs,
+    ):
+        """Load an HTML file, extract content, and save to storage.
+
+        Args:
+            file_path: Path to the HTML file
+            extraction_rules: Dict of CSS selector rules for content extraction
+            join_all_matches: If True, extract all matching elements for each rule
+            **kwargs: Additional arguments
+
+        Returns:
+            Path to the stored extracted text file
+        """
+        if extraction_rules is None:
+            extraction_rules = self._get_default_extraction_rules()
+            logger.info("Using default comprehensive extraction rules for HTML content")
+
+        logger.info(f"Processing HTML file: {file_path}")
+
+        from cognee.infrastructure.files.utils.get_file_metadata import get_file_metadata
+        from cognee.infrastructure.files.storage import get_file_storage, get_storage_config
+
+        with open(file_path, "rb") as f:
+            file_metadata = await get_file_metadata(f)
+            f.seek(0)
+            html = f.read()
+
+        storage_file_name = "text_" + file_metadata["content_hash"] + ".txt"
+
+        # Normalize extraction rules
+        normalized_rules: List[ExtractionRule] = []
+        for _, rule in extraction_rules.items():
+            r = self._normalize_rule(rule)
+            if join_all_matches:
+                r.all = True
+            normalized_rules.append(r)
+
+        pieces = []
+        for rule in normalized_rules:
+            text = self._extract_from_html(html, rule)
+            if text:
+                pieces.append(text)
+
+        full_content = " ".join(pieces).strip()
+
+        # remove after defaults for extraction rules
+        # Fallback: If no content extracted, check if the file is plain text (not HTML)
+        if not full_content:
+            from bs4 import BeautifulSoup
+
+            soup = BeautifulSoup(html, "html.parser")
+            # If there are no HTML tags, treat as plain text
+            if not soup.find():
+                logger.warning(
+                    f"No HTML tags found in {file_path}. Treating as plain text. "
+                    "This may happen when content is pre-extracted (e.g., via Tavily with text format)."
+                )
+                full_content = html.decode("utf-8") if isinstance(html, bytes) else html
+                full_content = full_content.strip()
+
+        if not full_content:
+            logger.warning(f"No content extracted from HTML file: {file_path}")
+
+        # Store the extracted content
+        storage_config = get_storage_config()
+        data_root_directory = storage_config["data_root_directory"]
+        storage = get_file_storage(data_root_directory)
+
+        full_file_path = await storage.store(storage_file_name, full_content)
+
+        logger.info(f"Extracted {len(full_content)} characters from HTML")
+        return full_file_path
+
+    def _normalize_rule(self, rule: Union[str, Dict[str, Any]]) -> ExtractionRule:
+        """Normalize an extraction rule to an ExtractionRule dataclass.
+
+        Args:
+            rule: A string (CSS selector) or dict with extraction parameters.
+
+        Returns:
+            ExtractionRule: Normalized extraction rule.
+
+        Raises:
+            ValueError: If the rule is invalid.
+        """
+        if isinstance(rule, str):
+            return ExtractionRule(selector=rule)
+        if isinstance(rule, dict):
+            return ExtractionRule(
+                selector=rule.get("selector"),
+                xpath=rule.get("xpath"),
+                attr=rule.get("attr"),
+                all=bool(rule.get("all", False)),
+                join_with=rule.get("join_with", " "),
+            )
+        raise ValueError(f"Invalid extraction rule: {rule}")
+
+    def _extract_from_html(self, html: str, rule: ExtractionRule) -> str:
+        """Extract content from HTML using BeautifulSoup or lxml XPath.
+
+        Args:
+            html: The HTML content to extract from.
+            rule: The extraction rule to apply.
+
+        Returns:
+            str: The extracted content.
+
+        Raises:
+            RuntimeError: If XPath is used but lxml is not installed.
+        """
+        soup = BeautifulSoup(html, "html.parser")
+
+        if rule.xpath:
+            try:
+                from lxml import html as lxml_html
+            except ImportError:
+                raise RuntimeError(
+                    "XPath requested but lxml is not available. Install lxml or use CSS selectors."
+                )
+            doc = lxml_html.fromstring(html)
+            nodes = doc.xpath(rule.xpath)
+            texts = []
+            for n in nodes:
+                if hasattr(n, "text_content"):
+                    texts.append(n.text_content().strip())
+                else:
+                    texts.append(str(n).strip())
+            return rule.join_with.join(t for t in texts if t)
+
+        if not rule.selector:
+            return ""
+
+        if rule.all:
+            nodes = soup.select(rule.selector)
+            pieces = []
+            for el in nodes:
+                if rule.attr:
+                    val = el.get(rule.attr)
+                    if val:
+                        pieces.append(val.strip())
+                else:
+                    text = el.get_text(strip=True)
+                    if text:
+                        pieces.append(text)
+            return rule.join_with.join(pieces).strip()
+        else:
+            el = soup.select_one(rule.selector)
+            if el is None:
+                return ""
+            if rule.attr:
+                val = el.get(rule.attr)
+                return (val or "").strip()
+            return el.get_text(strip=True)
--- a/cognee/infrastructure/loaders/supported_loaders.py
+++ b/cognee/infrastructure/loaders/supported_loaders.py
@ -24,3 +24,10 @@ try:
    supported_loaders[AdvancedPdfLoader.loader_name] = AdvancedPdfLoader
 except ImportError:
    pass
+
+try:
+    from cognee.infrastructure.loaders.external import BeautifulSoupLoader
+
+    supported_loaders[BeautifulSoupLoader.loader_name] = BeautifulSoupLoader
+except ImportError:
+    pass
--- a/cognee/memify_pipelines/persist_sessions_in_knowledge_graph.py
+++ b/cognee/memify_pipelines/persist_sessions_in_knowledge_graph.py
@ -0,0 +1,55 @@
+from typing import Optional, List
+
+from cognee import memify
+from cognee.context_global_variables import (
+    set_database_global_context_variables,
+    set_session_user_context_variable,
+)
+from cognee.exceptions import CogneeValidationError
+from cognee.modules.data.methods import get_authorized_existing_datasets
+from cognee.shared.logging_utils import get_logger
+from cognee.modules.pipelines.tasks.task import Task
+from cognee.modules.users.models import User
+from cognee.tasks.memify import extract_user_sessions, cognify_session
+
+
+logger = get_logger("persist_sessions_in_knowledge_graph")
+
+
+async def persist_sessions_in_knowledge_graph_pipeline(
+    user: User,
+    session_ids: Optional[List[str]] = None,
+    dataset: str = "main_dataset",
+    run_in_background: bool = False,
+):
+    await set_session_user_context_variable(user)
+    dataset_to_write = await get_authorized_existing_datasets(
+        user=user, datasets=[dataset], permission_type="write"
+    )
+
+    if not dataset_to_write:
+        raise CogneeValidationError(
+            message=f"User (id: {str(user.id)}) does not have write access to dataset: {dataset}",
+            log=False,
+        )
+
+    await set_database_global_context_variables(
+        dataset_to_write[0].id, dataset_to_write[0].owner_id
+    )
+
+    extraction_tasks = [Task(extract_user_sessions, session_ids=session_ids)]
+
+    enrichment_tasks = [
+        Task(cognify_session, dataset_id=dataset_to_write[0].id),
+    ]
+
+    result = await memify(
+        extraction_tasks=extraction_tasks,
+        enrichment_tasks=enrichment_tasks,
+        dataset=dataset_to_write[0].id,
+        data=[{}],
+        run_in_background=run_in_background,
+    )
+
+    logger.info("Session persistence pipeline completed")
+    return result
--- a/cognee/modules/chunking/models/DocumentChunk.py
+++ b/cognee/modules/chunking/models/DocumentChunk.py
@ -1,6 +1,7 @@
 from typing import List, Union

 from cognee.infrastructure.engine import DataPoint
+from cognee.infrastructure.engine.models.Edge import Edge
 from cognee.modules.data.processing.document_types import Document
 from cognee.modules.engine.models import Entity
 from cognee.tasks.temporal_graph.models import Event
@ -31,6 +32,6 @@ class DocumentChunk(DataPoint):
    chunk_index: int
    cut_type: str
    is_part_of: Document
-    contains: List[Union[Entity, Event]] = None
+    contains: List[Union[Entity, Event, tuple[Edge, Entity]]] = None

    metadata: dict = {"index_fields": ["text"]}
--- a/cognee/modules/chunking/text_chunker_with_overlap.py
+++ b/cognee/modules/chunking/text_chunker_with_overlap.py
@ -0,0 +1,124 @@
+from cognee.shared.logging_utils import get_logger
+from uuid import NAMESPACE_OID, uuid5
+
+from cognee.tasks.chunks import chunk_by_paragraph
+from cognee.modules.chunking.Chunker import Chunker
+from .models.DocumentChunk import DocumentChunk
+
+logger = get_logger()
+
+
+class TextChunkerWithOverlap(Chunker):
+    def __init__(
+        self,
+        document,
+        get_text: callable,
+        max_chunk_size: int,
+        chunk_overlap_ratio: float = 0.0,
+        get_chunk_data: callable = None,
+    ):
+        super().__init__(document, get_text, max_chunk_size)
+        self._accumulated_chunk_data = []
+        self._accumulated_size = 0
+        self.chunk_overlap_ratio = chunk_overlap_ratio
+        self.chunk_overlap = int(max_chunk_size * chunk_overlap_ratio)
+
+        if get_chunk_data is not None:
+            self.get_chunk_data = get_chunk_data
+        elif chunk_overlap_ratio > 0:
+            paragraph_max_size = int(0.5 * chunk_overlap_ratio * max_chunk_size)
+            self.get_chunk_data = lambda text: chunk_by_paragraph(
+                text, paragraph_max_size, batch_paragraphs=True
+            )
+        else:
+            self.get_chunk_data = lambda text: chunk_by_paragraph(
+                text, self.max_chunk_size, batch_paragraphs=True
+            )
+
+    def _accumulation_overflows(self, chunk_data):
+        """Check if adding chunk_data would exceed max_chunk_size."""
+        return self._accumulated_size + chunk_data["chunk_size"] > self.max_chunk_size
+
+    def _accumulate_chunk_data(self, chunk_data):
+        """Add chunk_data to the current accumulation."""
+        self._accumulated_chunk_data.append(chunk_data)
+        self._accumulated_size += chunk_data["chunk_size"]
+
+    def _clear_accumulation(self):
+        """Reset accumulation, keeping overlap chunk_data based on chunk_overlap_ratio."""
+        if self.chunk_overlap == 0:
+            self._accumulated_chunk_data = []
+            self._accumulated_size = 0
+            return
+
+        # Keep chunk_data from the end that fit in overlap
+        overlap_chunk_data = []
+        overlap_size = 0
+
+        for chunk_data in reversed(self._accumulated_chunk_data):
+            if overlap_size + chunk_data["chunk_size"] <= self.chunk_overlap:
+                overlap_chunk_data.insert(0, chunk_data)
+                overlap_size += chunk_data["chunk_size"]
+            else:
+                break
+
+        self._accumulated_chunk_data = overlap_chunk_data
+        self._accumulated_size = overlap_size
+
+    def _create_chunk(self, text, size, cut_type, chunk_id=None):
+        """Create a DocumentChunk with standard metadata."""
+        try:
+            return DocumentChunk(
+                id=chunk_id or uuid5(NAMESPACE_OID, f"{str(self.document.id)}-{self.chunk_index}"),
+                text=text,
+                chunk_size=size,
+                is_part_of=self.document,
+                chunk_index=self.chunk_index,
+                cut_type=cut_type,
+                contains=[],
+                metadata={"index_fields": ["text"]},
+            )
+        except Exception as e:
+            logger.error(e)
+            raise e
+
+    def _create_chunk_from_accumulation(self):
+        """Create a DocumentChunk from current accumulated chunk_data."""
+        chunk_text = " ".join(chunk["text"] for chunk in self._accumulated_chunk_data)
+        return self._create_chunk(
+            text=chunk_text,
+            size=self._accumulated_size,
+            cut_type=self._accumulated_chunk_data[-1]["cut_type"],
+        )
+
+    def _emit_chunk(self, chunk_data):
+        """Emit a chunk when accumulation overflows."""
+        if len(self._accumulated_chunk_data) > 0:
+            chunk = self._create_chunk_from_accumulation()
+            self._clear_accumulation()
+            self._accumulate_chunk_data(chunk_data)
+        else:
+            # Handle single chunk_data exceeding max_chunk_size
+            chunk = self._create_chunk(
+                text=chunk_data["text"],
+                size=chunk_data["chunk_size"],
+                cut_type=chunk_data["cut_type"],
+                chunk_id=chunk_data["chunk_id"],
+            )
+
+        self.chunk_index += 1
+        return chunk
+
+    async def read(self):
+        async for content_text in self.get_text():
+            for chunk_data in self.get_chunk_data(content_text):
+                if not self._accumulation_overflows(chunk_data):
+                    self._accumulate_chunk_data(chunk_data)
+                    continue
+
+                yield self._emit_chunk(chunk_data)
+
+        if len(self._accumulated_chunk_data) == 0:
+            return
+
+        yield self._create_chunk_from_accumulation()
--- a/cognee/modules/data/exceptions/exceptions.py
+++ b/cognee/modules/data/exceptions/exceptions.py
@ -10,7 +10,7 @@ class UnstructuredLibraryImportError(CogneeConfigurationError):
        self,
        message: str = "Import error. Unstructured library is not installed.",
        name: str = "UnstructuredModuleImportError",
-        status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
+        status_code=status.HTTP_422_UNPROCESSABLE_CONTENT,
    ):
        super().__init__(message, name, status_code)

--- a/cognee/modules/data/methods/init.py
+++ b/cognee/modules/data/methods/init.py
@ -10,6 +10,7 @@ from .get_authorized_dataset import get_authorized_dataset
 from .get_authorized_dataset_by_name import get_authorized_dataset_by_name
 from .get_data import get_data
 from .get_unique_dataset_id import get_unique_dataset_id
+from .get_unique_data_id import get_unique_data_id
 from .get_authorized_existing_datasets import get_authorized_existing_datasets
 from .get_dataset_ids import get_dataset_ids

@ -23,3 +24,6 @@ from .create_authorized_dataset import create_authorized_dataset

 # Check
 from .check_dataset_name import check_dataset_name
+
+# Boolean check
+from .has_dataset_data import has_dataset_data
--- a/cognee/modules/data/methods/create_dataset.py
+++ b/cognee/modules/data/methods/create_dataset.py
@ -16,14 +16,16 @@ async def create_dataset(dataset_name: str, user: User, session: AsyncSession) -
            .options(joinedload(Dataset.data))
            .filter(Dataset.name == dataset_name)
            .filter(Dataset.owner_id == owner_id)
+            .filter(Dataset.tenant_id == user.tenant_id)
        )
    ).first()

    if dataset is None:
        # Dataset id should be generated based on dataset_name and owner_id/user so multiple users can use the same dataset_name
        dataset_id = await get_unique_dataset_id(dataset_name=dataset_name, user=user)
-        dataset = Dataset(id=dataset_id, name=dataset_name, data=[])
-        dataset.owner_id = owner_id
+        dataset = Dataset(
+            id=dataset_id, name=dataset_name, data=[], owner_id=owner_id, tenant_id=user.tenant_id
+        )

        session.add(dataset)

--- a/cognee/modules/data/methods/get_dataset_ids.py
+++ b/cognee/modules/data/methods/get_dataset_ids.py
@ -27,7 +27,11 @@ async def get_dataset_ids(datasets: Union[list[str], list[UUID]], user):
            # Get all user owned dataset objects (If a user wants to write to a dataset he is not the owner of it must be provided through UUID.)
            user_datasets = await get_datasets(user.id)
            # Filter out non name mentioned datasets
-            dataset_ids = [dataset.id for dataset in user_datasets if dataset.name in datasets]
+            dataset_ids = [dataset for dataset in user_datasets if dataset.name in datasets]
+            # Filter out non current tenant datasets
+            dataset_ids = [
+                dataset.id for dataset in dataset_ids if dataset.tenant_id == user.tenant_id
+            ]
        else:
            raise DatasetTypeError(
                f"One or more of the provided dataset types is not handled: f{datasets}"
--- a/cognee/modules/data/methods/get_unique_data_id.py
+++ b/cognee/modules/data/methods/get_unique_data_id.py
@ -0,0 +1,68 @@
+from uuid import uuid5, NAMESPACE_OID, UUID
+from sqlalchemy import select
+
+from cognee.modules.data.models.Data import Data
+from cognee.infrastructure.databases.relational import get_relational_engine
+from cognee.modules.users.models import User
+
+
+async def get_unique_data_id(data_identifier: str, user: User) -> UUID:
+    """
+    Function returns a unique UUID for data based on data identifier, user id and tenant id.
+    If data with legacy ID exists, return that ID to maintain compatibility.
+
+    Args:
+        data_identifier: A way to uniquely identify data (e.g. file hash, data name, etc.)
+        user: User object adding the data
+        tenant_id: UUID of the tenant for which data is being added
+
+    Returns:
+        UUID: Unique identifier for the data
+    """
+
+    def _get_deprecated_unique_data_id(data_identifier: str, user: User) -> UUID:
+        """
+        Deprecated function, returns a unique UUID for data based on data identifier and user id.
+        Needed to support legacy data without tenant information.
+        Args:
+            data_identifier: A way to uniquely identify data (e.g. file hash, data name, etc.)
+            user: User object adding the data
+
+        Returns:
+            UUID: Unique identifier for the data
+        """
+        # return UUID hash of file contents + owner id + tenant_id
+        return uuid5(NAMESPACE_OID, f"{data_identifier}{str(user.id)}")
+
+    def _get_modern_unique_data_id(data_identifier: str, user: User) -> UUID:
+        """
+        Function returns a unique UUID for data based on data identifier, user id and tenant id.
+        Args:
+            data_identifier: A way to uniquely identify data (e.g. file hash, data name, etc.)
+            user: User object adding the data
+            tenant_id: UUID of the tenant for which data is being added
+
+        Returns:
+            UUID: Unique identifier for the data
+        """
+        # return UUID hash of file contents + owner id + tenant_id
+        return uuid5(NAMESPACE_OID, f"{data_identifier}{str(user.id)}{str(user.tenant_id)}")
+
+    # Get all possible data_id values
+    data_id = {
+        "modern_data_id": _get_modern_unique_data_id(data_identifier=data_identifier, user=user),
+        "legacy_data_id": _get_deprecated_unique_data_id(
+            data_identifier=data_identifier, user=user
+        ),
+    }
+
+    # Check if data item with legacy_data_id exists, if so use that one, else use modern_data_id
+    db_engine = get_relational_engine()
+    async with db_engine.get_async_session() as session:
+        legacy_data_point = (
+            await session.execute(select(Data).filter(Data.id == data_id["legacy_data_id"]))
+        ).scalar_one_or_none()
+
+        if not legacy_data_point:
+            return data_id["modern_data_id"]
+        return data_id["legacy_data_id"]
--- a/cognee/modules/data/methods/get_unique_dataset_id.py
+++ b/cognee/modules/data/methods/get_unique_dataset_id.py
@ -1,9 +1,71 @@
 from uuid import UUID, uuid5, NAMESPACE_OID
-from cognee.modules.users.models import User
 from typing import Union
+from sqlalchemy import select
+
+from cognee.modules.data.models.Dataset import Dataset
+from cognee.modules.users.models import User
+from cognee.infrastructure.databases.relational import get_relational_engine


 async def get_unique_dataset_id(dataset_name: Union[str, UUID], user: User) -> UUID:
-    if isinstance(dataset_name, UUID):
-        return dataset_name
-    return uuid5(NAMESPACE_OID, f"{dataset_name}{str(user.id)}")
+    """
+    Function returns a unique UUID for dataset based on dataset name, user id and tenant id.
+    If dataset with legacy ID exists, return that ID to maintain compatibility.
+
+    Args:
+        dataset_name: string representing the dataset name
+        user: User object adding the dataset
+        tenant_id: UUID of the tenant for which dataset is being added
+
+    Returns:
+        UUID: Unique identifier for the dataset
+    """
+
+    def _get_legacy_unique_dataset_id(dataset_name: Union[str, UUID], user: User) -> UUID:
+        """
+        Legacy function, returns a unique UUID for dataset based on dataset name and user id.
+        Needed to support legacy datasets without tenant information.
+        Args:
+            dataset_name: string representing the dataset name
+            user: Current User object adding the dataset
+
+        Returns:
+            UUID: Unique identifier for the dataset
+        """
+        if isinstance(dataset_name, UUID):
+            return dataset_name
+        return uuid5(NAMESPACE_OID, f"{dataset_name}{str(user.id)}")
+
+    def _get_modern_unique_dataset_id(dataset_name: Union[str, UUID], user: User) -> UUID:
+        """
+        Returns a unique UUID for dataset based on dataset name, user id and tenant_id.
+        Args:
+            dataset_name: string representing the dataset name
+            user: Current User object adding the dataset
+            tenant_id: UUID of the tenant for which dataset is being added
+
+        Returns:
+            UUID: Unique identifier for the dataset
+        """
+        if isinstance(dataset_name, UUID):
+            return dataset_name
+        return uuid5(NAMESPACE_OID, f"{dataset_name}{str(user.id)}{str(user.tenant_id)}")
+
+    # Get all possible dataset_id values
+    dataset_id = {
+        "modern_dataset_id": _get_modern_unique_dataset_id(dataset_name=dataset_name, user=user),
+        "legacy_dataset_id": _get_legacy_unique_dataset_id(dataset_name=dataset_name, user=user),
+    }
+
+    # Check if dataset with legacy_dataset_id exists, if so use that one, else use modern_dataset_id
+    db_engine = get_relational_engine()
+    async with db_engine.get_async_session() as session:
+        legacy_dataset = (
+            await session.execute(
+                select(Dataset).filter(Dataset.id == dataset_id["legacy_dataset_id"])
+            )
+        ).scalar_one_or_none()
+
+        if not legacy_dataset:
+            return dataset_id["modern_dataset_id"]
+        return dataset_id["legacy_dataset_id"]
--- a/cognee/modules/data/methods/has_dataset_data.py
+++ b/cognee/modules/data/methods/has_dataset_data.py
@ -0,0 +1,21 @@
+from uuid import UUID
+
+from sqlalchemy import select
+from sqlalchemy.sql import func
+
+from cognee.infrastructure.databases.relational import get_relational_engine
+from cognee.modules.data.models import DatasetData
+
+
+async def has_dataset_data(dataset_id: UUID) -> bool:
+    db_engine = get_relational_engine()
+
+    async with db_engine.get_async_session() as session:
+        count_query = (
+            select(func.count())
+            .select_from(DatasetData)
+            .where(DatasetData.dataset_id == dataset_id)
+        )
+        count = await session.execute(count_query)
+
+        return count.scalar_one() > 0
--- a/cognee/modules/data/models/Dataset.py
+++ b/cognee/modules/data/models/Dataset.py
@ -18,6 +18,7 @@ class Dataset(Base):
    updated_at = Column(DateTime(timezone=True), onupdate=lambda: datetime.now(timezone.utc))

    owner_id = Column(UUID, index=True)
+    tenant_id = Column(UUID, index=True, nullable=True)

    acls = relationship("ACL", back_populates="dataset", cascade="all, delete-orphan")

@ -36,5 +37,6 @@ class Dataset(Base):
            "createdAt": self.created_at.isoformat(),
            "updatedAt": self.updated_at.isoformat() if self.updated_at else None,
            "ownerId": str(self.owner_id),
+            "tenantId": str(self.tenant_id),
            "data": [data.to_json() for data in self.data],
        }
--- a/cognee/modules/graph/cognee_graph/CogneeGraph.py
+++ b/cognee/modules/graph/cognee_graph/CogneeGraph.py
@ -171,8 +171,10 @@ class CogneeGraph(CogneeAbstractGraph):
            embedding_map = {result.payload["text"]: result.score for result in edge_distances}

            for edge in self.edges:
-                relationship_type = edge.attributes.get("relationship_type")
-                distance = embedding_map.get(relationship_type, None)
+                edge_key = edge.attributes.get("edge_text") or edge.attributes.get(
+                    "relationship_type"
+                )
+                distance = embedding_map.get(edge_key, None)
                if distance is not None:
                    edge.attributes["vector_distance"] = distance

--- a/cognee/modules/graph/utils/expand_with_nodes_and_edges.py
+++ b/cognee/modules/graph/utils/expand_with_nodes_and_edges.py
@ -1,5 +1,6 @@
 from typing import Optional

+from cognee.infrastructure.engine.models.Edge import Edge
 from cognee.modules.chunking.models import DocumentChunk
 from cognee.modules.engine.models import Entity, EntityType
 from cognee.modules.engine.utils import (
@ -243,10 +244,26 @@ def _process_graph_nodes(
            ontology_relationships,
        )

-        # Add entity to data chunk
        if data_chunk.contains is None:
            data_chunk.contains = []
-        data_chunk.contains.append(entity_node)
+
+        edge_text = "; ".join(
+            [
+                "relationship_name: contains",
+                f"entity_name: {entity_node.name}",
+                f"entity_description: {entity_node.description}",
+            ]
+        )
+
+        data_chunk.contains.append(
+            (
+                Edge(
+                    relationship_type="contains",
+                    edge_text=edge_text,
+                ),
+                entity_node,
+            )
+        )


 def _process_graph_edges(
--- a/cognee/modules/graph/utils/resolve_edges_to_text.py
+++ b/cognee/modules/graph/utils/resolve_edges_to_text.py
@ -1,71 +1,70 @@
+import string
 from typing import List
+from collections import Counter
+
 from cognee.modules.graph.cognee_graph.CogneeGraphElements import Edge
+from cognee.modules.retrieval.utils.stop_words import DEFAULT_STOP_WORDS
+
+
+def _get_top_n_frequent_words(
+    text: str, stop_words: set = None, top_n: int = 3, separator: str = ", "
+) -> str:
+    """Concatenates the top N frequent words in text."""
+    if stop_words is None:
+        stop_words = DEFAULT_STOP_WORDS
+
+    words = [word.lower().strip(string.punctuation) for word in text.split()]
+    words = [word for word in words if word and word not in stop_words]
+
+    top_words = [word for word, freq in Counter(words).most_common(top_n)]
+    return separator.join(top_words)
+
+
+def _create_title_from_text(text: str, first_n_words: int = 7, top_n_words: int = 3) -> str:
+    """Creates a title by combining first words with most frequent words from the text."""
+    first_words = text.split()[:first_n_words]
+    top_words = _get_top_n_frequent_words(text, top_n=top_n_words)
+    return f"{' '.join(first_words)}... [{top_words}]"
+
+
+def _extract_nodes_from_edges(retrieved_edges: List[Edge]) -> dict:
+    """Creates a dictionary of nodes with their names and content."""
+    nodes = {}
+
+    for edge in retrieved_edges:
+        for node in (edge.node1, edge.node2):
+            if node.id in nodes:
+                continue
+
+            text = node.attributes.get("text")
+            if text:
+                name = _create_title_from_text(text)
+                content = text
+            else:
+                name = node.attributes.get("name", "Unnamed Node")
+                content = node.attributes.get("description", name)
+
+            nodes[node.id] = {"node": node, "name": name, "content": content}
+
+    return nodes


 async def resolve_edges_to_text(retrieved_edges: List[Edge]) -> str:
-    """
-    Converts retrieved graph edges into a human-readable string format.
+    """Converts retrieved graph edges into a human-readable string format."""
+    nodes = _extract_nodes_from_edges(retrieved_edges)

-    Parameters:
-    -----------
-
-        - retrieved_edges (list): A list of edges retrieved from the graph.
-
-    Returns:
-    --------
-
-        - str: A formatted string representation of the nodes and their connections.
-    """
-
-    def _get_nodes(retrieved_edges: List[Edge]) -> dict:
-        def _get_title(text: str, first_n_words: int = 7, top_n_words: int = 3) -> str:
-            def _top_n_words(text, stop_words=None, top_n=3, separator=", "):
-                """Concatenates the top N frequent words in text."""
-                if stop_words is None:
-                    from cognee.modules.retrieval.utils.stop_words import DEFAULT_STOP_WORDS
-
-                    stop_words = DEFAULT_STOP_WORDS
-
-                import string
-
-                words = [word.lower().strip(string.punctuation) for word in text.split()]
-
-                if stop_words:
-                    words = [word for word in words if word and word not in stop_words]
-
-                from collections import Counter
-
-                top_words = [word for word, freq in Counter(words).most_common(top_n)]
-
-                return separator.join(top_words)
-
-            """Creates a title, by combining first words with most frequent words from the text."""
-            first_words = text.split()[:first_n_words]
-            top_words = _top_n_words(text, top_n=first_n_words)
-            return f"{' '.join(first_words)}... [{top_words}]"
-
-        """Creates a dictionary of nodes with their names and content."""
-        nodes = {}
-        for edge in retrieved_edges:
-            for node in (edge.node1, edge.node2):
-                if node.id not in nodes:
-                    text = node.attributes.get("text")
-                    if text:
-                        name = _get_title(text)
-                        content = text
-                    else:
-                        name = node.attributes.get("name", "Unnamed Node")
-                        content = node.attributes.get("description", name)
-                    nodes[node.id] = {"node": node, "name": name, "content": content}
-        return nodes
-
-    nodes = _get_nodes(retrieved_edges)
    node_section = "\n".join(
        f"Node: {info['name']}\n__node_content_start__\n{info['content']}\n__node_content_end__\n"
        for info in nodes.values()
    )
-    connection_section = "\n".join(
-        f"{nodes[edge.node1.id]['name']} --[{edge.attributes['relationship_type']}]--> {nodes[edge.node2.id]['name']}"
-        for edge in retrieved_edges
-    )
+
+    connections = []
+    for edge in retrieved_edges:
+        source_name = nodes[edge.node1.id]["name"]
+        target_name = nodes[edge.node2.id]["name"]
+        edge_label = edge.attributes.get("edge_text") or edge.attributes.get("relationship_type")
+        connections.append(f"{source_name} --[{edge_label}]--> {target_name}")
+
+    connection_section = "\n".join(connections)
+
    return f"Nodes:\n{node_section}\n\nConnections:\n{connection_section}"
--- a/cognee/modules/ingestion/data_types/BinaryData.py
+++ b/cognee/modules/ingestion/data_types/BinaryData.py
@ -30,7 +30,7 @@ class BinaryData(IngestionData):

    async def ensure_metadata(self):
        if self.metadata is None:
-            self.metadata = await get_file_metadata(self.data)
+            self.metadata = await get_file_metadata(self.data, name=self.name)

            if self.metadata["name"] is None:
                self.metadata["name"] = self.name
--- a/cognee/modules/ingestion/identify.py
+++ b/cognee/modules/ingestion/identify.py
@ -1,11 +1,11 @@
-from uuid import uuid5, NAMESPACE_OID
+from uuid import UUID
 from .data_types import IngestionData

 from cognee.modules.users.models import User
+from cognee.modules.data.methods import get_unique_data_id


-def identify(data: IngestionData, user: User) -> str:
+async def identify(data: IngestionData, user: User) -> UUID:
    data_content_hash: str = data.get_identifier()

-    # return UUID hash of file contents + owner id
-    return uuid5(NAMESPACE_OID, f"{data_content_hash}{user.id}")
+    return await get_unique_data_id(data_identifier=data_content_hash, user=user)
--- a/cognee/modules/ingestion/save_data_to_file.py
+++ b/cognee/modules/ingestion/save_data_to_file.py
@ -1,10 +1,12 @@
-from typing import BinaryIO, Union
+from typing import BinaryIO, Union, Optional
 from cognee.infrastructure.files.storage import get_file_storage, get_storage_config
 from .classify import classify
 import hashlib


-async def save_data_to_file(data: Union[str, BinaryIO], filename: str = None):
+async def save_data_to_file(
+    data: Union[str, BinaryIO], filename: str = None, file_extension: Optional[str] = None
+):
    storage_config = get_storage_config()

    data_root_directory = storage_config["data_root_directory"]
@ -21,6 +23,11 @@ async def save_data_to_file(data: Union[str, BinaryIO], filename: str = None):

        file_name = file_metadata["name"]

+        if file_extension is not None:
+            extension = file_extension.lstrip(".")
+            file_name_without_ext = file_name.rsplit(".", 1)[0]
+            file_name = f"{file_name_without_ext}.{extension}"
+
        storage = get_file_storage(data_root_directory)

        full_file_path = await storage.store(file_name, data)
--- a/cognee/modules/ontology/get_default_ontology_resolver.py
+++ b/cognee/modules/ontology/get_default_ontology_resolver.py
@ -21,7 +21,8 @@ def get_ontology_resolver_from_env(
            Supported value: "rdflib".
        matching_strategy (str): The matching strategy to apply.
            Supported value: "fuzzy".
-        ontology_file_path (str): Path to the ontology file required for the resolver.
+        ontology_file_path (str): Path to the ontology file(s) required for the resolver.
+            Can be a single path or comma-separated paths for multiple files.

    Returns:
        BaseOntologyResolver: An instance of the requested ontology resolver.
@ -31,8 +32,13 @@ def get_ontology_resolver_from_env(
            or if required parameters are missing.
    """
    if ontology_resolver == "rdflib" and matching_strategy == "fuzzy" and ontology_file_path:
+        if "," in ontology_file_path:
+            file_paths = [path.strip() for path in ontology_file_path.split(",")]
+        else:
+            file_paths = ontology_file_path
+
        return RDFLibOntologyResolver(
-            matching_strategy=FuzzyMatchingStrategy(), ontology_file=ontology_file_path
+            matching_strategy=FuzzyMatchingStrategy(), ontology_file=file_paths
        )
    else:
        raise EnvironmentError(
--- a/cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py
+++ b/cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py
@ -2,7 +2,7 @@ import os
 import difflib
 from cognee.shared.logging_utils import get_logger
 from collections import deque
-from typing import List, Tuple, Dict, Optional, Any
+from typing import List, Tuple, Dict, Optional, Any, Union
 from rdflib import Graph, URIRef, RDF, RDFS, OWL

 from cognee.modules.ontology.exceptions import (
@ -26,22 +26,50 @@ class RDFLibOntologyResolver(BaseOntologyResolver):

    def __init__(
        self,
-        ontology_file: Optional[str] = None,
+        ontology_file: Optional[Union[str, List[str]]] = None,
        matching_strategy: Optional[MatchingStrategy] = None,
    ) -> None:
        super().__init__(matching_strategy)
        self.ontology_file = ontology_file
        try:
-            if ontology_file and os.path.exists(ontology_file):
+            files_to_load = []
+            if ontology_file is not None:
+                if isinstance(ontology_file, str):
+                    files_to_load = [ontology_file]
+                elif isinstance(ontology_file, list):
+                    files_to_load = ontology_file
+                else:
+                    raise ValueError(
+                        f"ontology_file must be a string, list of strings, or None. Got: {type(ontology_file)}"
+                    )
+
+            if files_to_load:
                self.graph = Graph()
-                self.graph.parse(ontology_file)
-                logger.info("Ontology loaded successfully from file: %s", ontology_file)
+                loaded_files = []
+                for file_path in files_to_load:
+                    if os.path.exists(file_path):
+                        self.graph.parse(file_path)
+                        loaded_files.append(file_path)
+                        logger.info("Ontology loaded successfully from file: %s", file_path)
+                    else:
+                        logger.warning(
+                            "Ontology file '%s' not found. Skipping this file.",
+                            file_path,
+                        )
+
+                if not loaded_files:
+                    logger.info(
+                        "No valid ontology files found. No owl ontology will be attached to the graph."
+                    )
+                    self.graph = None
+                else:
+                    logger.info("Total ontology files loaded: %d", len(loaded_files))
            else:
                logger.info(
-                    "Ontology file '%s' not found. No owl ontology will be attached to the graph.",
-                    ontology_file,
+                    "No ontology file provided. No owl ontology will be attached to the graph."
                )
                self.graph = None
+
            self.build_lookup()
        except Exception as e:
            logger.error("Failed to load ontology", exc_info=e)
--- a/cognee/modules/pipelines/exceptions/exceptions.py
+++ b/cognee/modules/pipelines/exceptions/exceptions.py
@ -7,6 +7,6 @@ class PipelineRunFailedError(CogneeSystemError):
        self,
        message: str = "Pipeline run failed.",
        name: str = "PipelineRunFailedError",
-        status_code: int = status.HTTP_422_UNPROCESSABLE_ENTITY,
+        status_code: int = status.HTTP_422_UNPROCESSABLE_CONTENT,
    ):
        super().__init__(message, name, status_code)
--- a/Show more
+++ b/Show more