Merge branch 'dev' into merge-main-vol3

This commit is contained in:
Igor Ilic 2025-07-08 14:32:19 +02:00 committed by GitHub
commit 0d80ec49a2
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
31 changed files with 1126 additions and 251 deletions

View file

@ -1,189 +1,112 @@
###############################################################################
# NOTE: With default settings Cognee only needs an OpenAI LLM_API_KEY to be set.
# The rest of the settings don't have to be set.
# Default relational database: SQLite
# Default vector database : LanceDB
# Default graph database : Kuzu
#
# These default databases are all file-based, so no extra setup is needed
# for local use.
###############################################################################
### ################################################################################
### DEV # 🧠 LLM Settings
### ################################################################################
TOKENIZERS_PARALLELISM="false"
###
### LLM
###
###
### simple, "expensive", an OpenAPI key
###
LLM_API_KEY="your_api_key" LLM_API_KEY="your_api_key"
LLM_MODEL="openai/gpt-4o-mini"
### LLM_PROVIDER="openai"
### DEV LLM, cheap with content filters LLM_ENDPOINT=""
### LLM_API_VERSION=""
LLM_MODEL="azure/gpt-4o-mini"
LLM_ENDPOINT="https://DNS.azure.com/openai/deployments/gpt-4o-mini"
LLM_API_KEY="<<TALK TO YOUR AZURE GUY"
LLM_API_VERSION="2024-12-01-preview"
#llm api version might not be relevant
LLM_MAX_TOKENS="16384" LLM_MAX_TOKENS="16384"
EMBEDDING_MODEL="azure/text-embedding-3-large"
EMBEDDING_ENDPOINT="https://DNS.openai.azure.com/openai/deployments/text-embedding-3-large"
EMBEDDING_API_KEY="<<TALK TO YOUR AZURE GUY>"
EMBEDDING_API_VERSION="2024-12-01-preview"
EMBEDDING_DIMENSIONS=3072
EMBEDDING_MAX_TOKENS=8191
###
### free local LLM, install it
###
LLM_API_KEY = "ollama"
LLM_MODEL = "llama3.1:8b"
LLM_PROVIDER = "ollama"
LLM_ENDPOINT = "http://localhost:11434/v1"
EMBEDDING_PROVIDER = "ollama"
EMBEDDING_MODEL = "avr/sfr-embedding-mistral:latest"
EMBEDDING_ENDPOINT = "http://localhost:11434/api/embeddings"
EMBEDDING_DIMENSIONS = 4096
HUGGINGFACE_TOKENIZER = "Salesforce/SFR-Embedding-Mistral"
###
### openrouter, also free
###
LLM_API_KEY="<<go-get-one-yourself"
LLM_PROVIDER="custom"
LLM_MODEL="openrouter/google/gemini-2.0-flash-lite-preview-02-05:free"
LLM_ENDPOINT="https://openrouter.ai/api/v1"
###
### deepinfra
###
LLM_API_KEY="<<>>"
LLM_PROVIDER="custom"
LLM_MODEL="deepinfra/meta-llama/Meta-Llama-3-8B-Instruct"
LLM_ENDPOINT="https://api.deepinfra.com/v1/openai"
EMBEDDING_PROVIDER="openai" EMBEDDING_PROVIDER="openai"
EMBEDDING_API_KEY="<<>>" EMBEDDING_MODEL="openai/text-embedding-3-large"
EMBEDDING_MODEL="deepinfra/BAAI/bge-base-en-v1.5"
EMBEDDING_ENDPOINT="" EMBEDDING_ENDPOINT=""
EMBEDDING_API_VERSION="" EMBEDDING_API_VERSION=""
EMBEDDING_DIMENSIONS=3072 EMBEDDING_DIMENSIONS=3072
EMBEDDING_MAX_TOKENS=8191 EMBEDDING_MAX_TOKENS=8191
# If embedding key is not provided same key set for LLM_API_KEY will be used
#EMBEDDING_API_KEY="your_api_key"
### ################################################################################
### DB # 🗄️ Relational database settings
### ################################################################################
###
### db minimal/default
###
GRAPH_DATABASE_PROVIDER="networkx"
VECTOR_DB_PROVIDER="lancedb"
DB_PROVIDER=sqlite
DB_NAME=cognee_db
###
### Relational options
###
DB_PROVIDER="sqlite" DB_PROVIDER="sqlite"
DB_NAME=cognee_db DB_NAME=cognee_db
DB_PROVIDER=postgres # -- To switch to Postgres / PGVector, uncomment and fill these: -------------
DB_NAME=cognee_db #DB_PROVIDER=postgres
DB_HOST=127.0.0.1 #DB_NAME=cognee_db
DB_PORT=5432 # To use Postgres with the Cognee backend in Docker compose use the following instead: DB_HOST=host.docker.internal
DB_USERNAME=cognee #DB_HOST=127.0.0.1
DB_PASSWORD=cognee #DB_PORT=5432
#DB_USERNAME=cognee
#DB_PASSWORD=cognee
### ################################################################################
### Graph options # 🕸️ Graph Database settings
### ################################################################################
#Default
# Default (local file-based)
GRAPH_DATABASE_PROVIDER="kuzu" GRAPH_DATABASE_PROVIDER="kuzu"
#or if using remote # -- To switch to Remote Kuzu uncomment and fill these: -------------------------------------------------------------
#GRAPH_DATABASE_PROVIDER="kuzu"
#GRAPH_DATABASE_PROVIDER="kuzu-remote"
#GRAPH_DATABASE_URL="http://localhost:8000"
#GRAPH_DATABASE_USERNAME=XXX
#GRAPH_DATABASE_PASSWORD=YYY
GRAPH_DATABASE_PROVIDER="kuzu" # -- To switch to Neo4j uncomment and fill these: -------------------------------------------------------------------
GRAPH_DATABASE_PROVIDER="kuzu-remote" #GRAPH_DATABASE_PROVIDER="neo4j"
GRAPH_DATABASE_URL="http://localhost:8000" #GRAPH_DATABASE_URL=bolt://localhost:7687
GRAPH_DATABASE_USERNAME=XXX #GRAPH_DATABASE_USERNAME=neo4j
GRAPH_DATABASE_PASSWORD=YYY #GRAPH_DATABASE_PASSWORD=localneo4j
# or if using neo4j ################################################################################
# 📐 Vector Database settings
GRAPH_DATABASE_PROVIDER="neo4j" ################################################################################
GRAPH_DATABASE_URL=bolt://localhost:7687
GRAPH_DATABASE_USERNAME=neo4j
GRAPH_DATABASE_PASSWORD=localneo4j
###
### Vector options
###
# Supported providers: pgvector | qdrant | weaviate | milvus | lancedb | chromadb
VECTOR_DB_PROVIDER="lancedb" VECTOR_DB_PROVIDER="lancedb"
# Not needed if a cloud vector database is not used
VECTOR_DB_URL=
VECTOR_DB_KEY=
VECTOR_DB_PROVIDER="pgvector" ################################################################################
# 📂 ROOT DIRECTORY IF USING COGNEE LIB INSIDE A DOCKER
################################################################################
# Set up the Cognee system directory. Cognee will store system files and databases here.
DATA_ROOT_DIRECTORY='/cognee_data/data'
SYSTEM_ROOT_DIRECTORY='/cognee_data/system'
###
### for release test
###
LLM_API_KEY="..." ################################################################################
# 🔄 MIGRATION (RELATIONAL → GRAPH) SETTINGS
################################################################################
OPENAI_API_KEY="..." MIGRATION_DB_PATH="/path/to/migration/directory"
MIGRATION_DB_NAME="migration_database.sqlite"
MIGRATION_DB_PATH="~/Downloads/"
MIGRATION_DB_NAME="Chinook_Sqlite.sqlite"
MIGRATION_DB_PROVIDER="sqlite" MIGRATION_DB_PROVIDER="sqlite"
GRAPH_DATABASE_URL="bolt://54.246.89.112:7687" # -- Postgres-specific migration params --------------------------------------
GRAPH_DATABASE_USERNAME="neo4j"
GRAPH_DATABASE_PASSWORD="pleaseletmein"
###
### ROOT DIRECTORY IF USING COGNEE LIB INSIDE A DOCKER
###
# Set up the Cognee system directory. Cognee will store system files and databases here.
DATA_ROOT_DIRECTORY ='/cognee_data/data'
SYSTEM_ROOT_DIRECTORY= '/cognee_data/system'
# Postgres specific parameters (Only if Postgres or PGVector is used). Do not use for cognee default simplest setup of SQLite-NetworkX-LanceDB
# DB_USERNAME=cognee
# DB_PASSWORD=cognee
# To use Postgres with the Cognee backend in Docker compose use the following instead: DB_HOST=host.docker.internal
# DB_HOST=127.0.0.1
# DB_PORT=5432
# Params for migrating relational database data to graph / Cognee ( PostgreSQL and SQLite supported )
# MIGRATION_DB_PATH="/path/to/migration/directory"
# MIGRATION_DB_NAME="migration_database.sqlite"
# MIGRATION_DB_PROVIDER="sqlite"
# Postgres specific parameters for migration
# MIGRATION_DB_USERNAME=cognee # MIGRATION_DB_USERNAME=cognee
# MIGRATION_DB_PASSWORD=cognee # MIGRATION_DB_PASSWORD=cognee
# MIGRATION_DB_HOST="127.0.0.1" # MIGRATION_DB_HOST="127.0.0.1"
# MIGRATION_DB_PORT=5432 # MIGRATION_DB_PORT=5432
# LITELLM Logging Level. Set to quiten down logging ################################################################################
LITELLM_LOG="ERROR" # 🔒 Security Settings
################################################################################
# Set this environment variable to disable sending telemetry data # When set to false don't allow adding of local system files to Cognee. Should be set to False when Cognee is used as a backend.
# TELEMETRY_DISABLED=1 ACCEPT_LOCAL_FILE_PATH=True
# When set to false don't allow HTTP requests to be sent from Cognee.
# This protects against Server Side Request Forgery when proper infrastructure is not in place.
ALLOW_HTTP_REQUESTS=True
# Set this variable to True to enforce usage of backend access control for Cognee # Set this variable to True to enforce usage of backend access control for Cognee
# Note: This is only currently supported by the following databases: # Note: This is only currently supported by the following databases:
@ -194,3 +117,94 @@ LITELLM_LOG="ERROR"
# It enforces LanceDB and KuzuDB use and uses them to create databases per Cognee user + dataset # It enforces LanceDB and KuzuDB use and uses them to create databases per Cognee user + dataset
ENABLE_BACKEND_ACCESS_CONTROL=False ENABLE_BACKEND_ACCESS_CONTROL=False
################################################################################
# 🛠️ DEV Settings
################################################################################
ENV="local"
TOKENIZERS_PARALLELISM="false"
# LITELLM Logging Level. Set to quiet down logging
LITELLM_LOG="ERROR"
# Set this environment variable to disable sending telemetry data
# TELEMETRY_DISABLED=1
# Default User Configuration
# DEFAULT_USER_EMAIL=""
# DEFAULT_USER_PASSWORD=""
------------------------------- END OF POSSIBLE SETTINGS -------------------------------
###############################################################################
# 🧪 EXAMPLE OVERRIDES (commented out)
###############################################################################
# The blocks below show how to configure alternative providers.
# Uncomment + fill values to switch.
########## Azure OpenAI #######################################################
#LLM_MODEL="azure/gpt-4o-mini"
#LLM_ENDPOINT="https://DNS.azure.com/openai/deployments/gpt-4o-mini"
#LLM_API_KEY="<<TALK TO YOUR AZURE GUY>>"
#LLM_API_VERSION="2024-12-01-preview"
## llm api version might not be relevant
#LLM_MAX_TOKENS="16384"
#EMBEDDING_MODEL="azure/text-embedding-3-large"
#EMBEDDING_ENDPOINT="https://DNS.openai.azure.com/openai/deployments/text-embedding-3-large"
#EMBEDDING_API_KEY="<<TALK TO YOUR AZURE GUY>>"
#EMBEDDING_API_VERSION="2024-12-01-preview"
#EMBEDDING_DIMENSIONS=3072
#EMBEDDING_MAX_TOKENS=8191
########## Local LLM via Ollama ###############################################
#LLM_API_KEY="ollama"
#LLM_MODEL="llama3.1:8b"
#LLM_PROVIDER="ollama"
#LLM_ENDPOINT="http://localhost:11434/v1"
#EMBEDDING_PROVIDER="ollama"
#EMBEDDING_MODEL="avr/sfr-embedding-mistral:latest"
#EMBEDDING_ENDPOINT="http://localhost:11434/api/embeddings"
#EMBEDDING_DIMENSIONS=4096
#HUGGINGFACE_TOKENIZER="Salesforce/SFR-Embedding-Mistral"
########## OpenRouter (also free) #########################################################
#LLM_API_KEY="<<go-get-one-yourself>>"
#LLM_PROVIDER="custom"
#LLM_MODEL="openrouter/google/gemini-2.0-flash-lite-preview-02-05:free"
#LLM_ENDPOINT="https://openrouter.ai/api/v1"
########## DeepInfra ##########################################################
#LLM_API_KEY="<<>>"
#LLM_PROVIDER="custom"
#LLM_MODEL="deepinfra/meta-llama/Meta-Llama-3-8B-Instruct"
#LLM_ENDPOINT="https://api.deepinfra.com/v1/openai"
#EMBEDDING_PROVIDER="openai"
#EMBEDDING_API_KEY="<<>>"
#EMBEDDING_MODEL="deepinfra/BAAI/bge-base-en-v1.5"
#EMBEDDING_ENDPOINT=""
#EMBEDDING_API_VERSION=""
#EMBEDDING_DIMENSIONS=3072
#EMBEDDING_MAX_TOKENS=8191
########## Release Test ###############################################
#LLM_API_KEY="..."
#OPENAI_API_KEY="..."
#MIGRATION_DB_PATH="~/Downloads/"
#MIGRATION_DB_NAME="Chinook_Sqlite.sqlite"
#MIGRATION_DB_PROVIDER="sqlite"
#GRAPH_DATABASE_URL="bolt://54.246.89.112:7687"
#GRAPH_DATABASE_USERNAME="neo4j"
#GRAPH_DATABASE_PASSWORD="pleaseletmein"

View file

@ -248,3 +248,32 @@ jobs:
EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
run: poetry run python ./cognee/tests/test_parallel_databases.py run: poetry run python ./cognee/tests/test_parallel_databases.py
test-permissions:
name: Test permissions with different situations in Cognee
runs-on: ubuntu-22.04
steps:
- name: Check out repository
uses: actions/checkout@v4
- name: Cognee Setup
uses: ./.github/actions/cognee_setup
with:
python-version: '3.11.x'
- name: Install specific graph db dependency
run: |
poetry install
- name: Run parallel databases test
env:
ENV: 'dev'
LLM_MODEL: ${{ secrets.LLM_MODEL }}
LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
run: poetry run python ./cognee/tests/test_permissions.py

View file

@ -102,7 +102,7 @@ handlers =
qualname = sqlalchemy.engine qualname = sqlalchemy.engine
[logger_alembic] [logger_alembic]
level = INFO level = WARN
handlers = handlers =
qualname = alembic qualname = alembic

View file

@ -33,9 +33,6 @@ COPY ./cognee-mcp/pyproject.toml ./cognee-mcp/uv.lock ./cognee-mcp/entrypoint.sh
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
uv sync --frozen --no-install-project --no-dev --no-editable uv sync --frozen --no-install-project --no-dev --no-editable
# Copy .env file first if it exists (for environment variables)
COPY .env* /app/
# Copy Alembic configuration # Copy Alembic configuration
COPY alembic.ini /app/alembic.ini COPY alembic.ini /app/alembic.ini
COPY alembic/ /app/alembic COPY alembic/ /app/alembic

View file

@ -1,22 +1,24 @@
[project] [project]
name = "cognee-mcp" name = "cognee-mcp"
version = "0.3.0" version = "0.4.0"
description = "A MCP server project" description = "Cognee MCP server"
readme = "README.md" readme = "README.md"
requires-python = ">=3.10" requires-python = ">=3.10"
dependencies = [ dependencies = [
# For local cognee repo usage remove comment below and add absolute path to cognee. Then run `uv sync --reinstall` in the mcp folder on local cognee changes. # For local cognee repo usage remove comment below and add absolute path to cognee. Then run `uv sync --reinstall` in the mcp folder on local cognee changes.
#"cognee[postgres,codegraph,gemini,huggingface,docs,neo4j] @ file:/Users/<username>/Desktop/cognee", #"cognee[postgres,codegraph,gemini,huggingface,docs,neo4j] @ file:/Users/<username>/Desktop/cognee",
"cognee[postgres,codegraph,gemini,huggingface,docs,neo4j,kuzu]==0.2.0.dev0", "cognee[postgres,codegraph,gemini,huggingface,docs,neo4j]==0.2.0",
"fastmcp>=1.0", "fastmcp>=1.0",
"mcp==1.5.0", "mcp==1.5.0",
"uv>=0.6.3", "uv>=0.6.3",
] ]
[[project.authors]] authors = [
name = "Boris Arzentar" { name = "Boris Arzentar", email = "boris@topoteretes.com" },
email = "boris@topoteretes.com" { name = "Igor Ilic", email = "igor@topoteretes.com" },
{ name = "Laszlo Hajdu", email = "laszlo@topoteretes.com" },
]
[build-system] [build-system]
requires = [ "hatchling", ] requires = [ "hatchling", ]

View file

@ -18,6 +18,7 @@ from cognee.modules.search.types import SearchType
from cognee.shared.data_models import KnowledgeGraph from cognee.shared.data_models import KnowledgeGraph
from cognee.modules.storage.utils import JSONEncoder from cognee.modules.storage.utils import JSONEncoder
try: try:
from codingagents.coding_rule_associations import ( from codingagents.coding_rule_associations import (
add_rule_associations, add_rule_associations,

117
cognee-mcp/uv.lock generated
View file

@ -8,6 +8,29 @@ resolution-markers = [
"python_full_version < '3.11'", "python_full_version < '3.11'",
] ]
[[package]]
name = "aiobotocore"
version = "2.23.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "aiohttp" },
{ name = "aioitertools" },
{ name = "botocore" },
{ name = "jmespath" },
{ name = "multidict" },
{ name = "python-dateutil" },
{ name = "wrapt" },
]
sdist = { url = "https://files.pythonhosted.org/packages/9d/25/4b06ea1214ddf020a28df27dc7136ac9dfaf87929d51e6f6044dd350ed67/aiobotocore-2.23.0.tar.gz", hash = "sha256:0333931365a6c7053aee292fe6ef50c74690c4ae06bb019afdf706cb6f2f5e32", size = 115825, upload-time = "2025-06-12T23:46:38.055Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/ea/43/ccf9b29669cdb09fd4bfc0a8effeb2973b22a0f3c3be4142d0b485975d11/aiobotocore-2.23.0-py3-none-any.whl", hash = "sha256:8202cebbf147804a083a02bc282fbfda873bfdd0065fd34b64784acb7757b66e", size = 84161, upload-time = "2025-06-12T23:46:36.305Z" },
]
[package.optional-dependencies]
boto3 = [
{ name = "boto3" },
]
[[package]] [[package]]
name = "aiofiles" name = "aiofiles"
version = "24.1.0" version = "24.1.0"
@ -112,6 +135,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/65/31/e252246332a12abf17f66c8f8360730a5a3a1dd354ca48ccfb90bbb122db/aiohttp-3.12.4-cp313-cp313-win_amd64.whl", hash = "sha256:4c78018c4e8118efac767d5d91c3565919c7e021762c4644198ec5b8d426a071", size = 439411, upload-time = "2025-05-29T01:36:16.365Z" }, { url = "https://files.pythonhosted.org/packages/65/31/e252246332a12abf17f66c8f8360730a5a3a1dd354ca48ccfb90bbb122db/aiohttp-3.12.4-cp313-cp313-win_amd64.whl", hash = "sha256:4c78018c4e8118efac767d5d91c3565919c7e021762c4644198ec5b8d426a071", size = 439411, upload-time = "2025-05-29T01:36:16.365Z" },
] ]
[[package]]
name = "aioitertools"
version = "0.12.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/06/de/38491a84ab323b47c7f86e94d2830e748780525f7a10c8600b67ead7e9ea/aioitertools-0.12.0.tar.gz", hash = "sha256:c2a9055b4fbb7705f561b9d86053e8af5d10cc845d22c32008c43490b2d8dd6b", size = 19369, upload-time = "2024-09-02T03:33:40.349Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/85/13/58b70a580de00893223d61de8fea167877a3aed97d4a5e1405c9159ef925/aioitertools-0.12.0-py3-none-any.whl", hash = "sha256:fc1f5fac3d737354de8831cbba3eb04f79dd649d8f3afb4c5b114925e662a796", size = 24345, upload-time = "2024-09-02T03:34:59.454Z" },
]
[[package]] [[package]]
name = "aiosignal" name = "aiosignal"
version = "1.3.2" version = "1.3.2"
@ -349,6 +381,34 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/50/cd/30110dc0ffcf3b131156077b90e9f60ed75711223f306da4db08eff8403b/beautifulsoup4-4.13.4-py3-none-any.whl", hash = "sha256:9bbbb14bfde9d79f38b8cd5f8c7c85f4b8f2523190ebed90e950a8dea4cb1c4b", size = 187285, upload-time = "2025-04-15T17:05:12.221Z" }, { url = "https://files.pythonhosted.org/packages/50/cd/30110dc0ffcf3b131156077b90e9f60ed75711223f306da4db08eff8403b/beautifulsoup4-4.13.4-py3-none-any.whl", hash = "sha256:9bbbb14bfde9d79f38b8cd5f8c7c85f4b8f2523190ebed90e950a8dea4cb1c4b", size = 187285, upload-time = "2025-04-15T17:05:12.221Z" },
] ]
[[package]]
name = "boto3"
version = "1.38.27"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "botocore" },
{ name = "jmespath" },
{ name = "s3transfer" },
]
sdist = { url = "https://files.pythonhosted.org/packages/e7/96/fc74d8521d2369dd8c412438401ff12e1350a1cd3eab5c758ed3dd5e5f82/boto3-1.38.27.tar.gz", hash = "sha256:94bd7fdd92d5701b362d4df100d21e28f8307a67ff56b6a8b0398119cf22f859", size = 111875, upload-time = "2025-05-30T19:32:41.352Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/43/8b/b2361188bd1e293eede1bc165e2461d390394f71ec0c8c21211c8dabf62c/boto3-1.38.27-py3-none-any.whl", hash = "sha256:95f5fe688795303a8a15e8b7e7f255cadab35eae459d00cc281a4fd77252ea80", size = 139938, upload-time = "2025-05-30T19:32:38.006Z" },
]
[[package]]
name = "botocore"
version = "1.38.27"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "jmespath" },
{ name = "python-dateutil" },
{ name = "urllib3" },
]
sdist = { url = "https://files.pythonhosted.org/packages/36/5e/67899214ad57f7f26af5bd776ac5eb583dc4ecf5c1e52e2cbfdc200e487a/botocore-1.38.27.tar.gz", hash = "sha256:9788f7efe974328a38cbade64cc0b1e67d27944b899f88cb786ae362973133b6", size = 13919963, upload-time = "2025-05-30T19:32:29.657Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/7e/83/a753562020b69fa90cebc39e8af2c753b24dcdc74bee8355ee3f6cefdf34/botocore-1.38.27-py3-none-any.whl", hash = "sha256:a785d5e9a5eda88ad6ab9ed8b87d1f2ac409d0226bba6ff801c55359e94d91a8", size = 13580545, upload-time = "2025-05-30T19:32:26.712Z" },
]
[[package]] [[package]]
name = "cachetools" name = "cachetools"
version = "5.5.2" version = "5.5.2"
@ -517,7 +577,7 @@ wheels = [
[[package]] [[package]]
name = "cognee" name = "cognee"
version = "0.2.0.dev0" version = "0.2.0"
source = { registry = "https://pypi.org/simple" } source = { registry = "https://pypi.org/simple" }
dependencies = [ dependencies = [
{ name = "aiofiles" }, { name = "aiofiles" },
@ -531,6 +591,7 @@ dependencies = [
{ name = "graphistry" }, { name = "graphistry" },
{ name = "instructor" }, { name = "instructor" },
{ name = "jinja2" }, { name = "jinja2" },
{ name = "kuzu" },
{ name = "lancedb" }, { name = "lancedb" },
{ name = "langfuse" }, { name = "langfuse" },
{ name = "limits" }, { name = "limits" },
@ -551,6 +612,7 @@ dependencies = [
{ name = "python-dotenv" }, { name = "python-dotenv" },
{ name = "python-multipart" }, { name = "python-multipart" },
{ name = "rdflib" }, { name = "rdflib" },
{ name = "s3fs", extra = ["boto3"] },
{ name = "scikit-learn" }, { name = "scikit-learn" },
{ name = "sentry-sdk", extra = ["fastapi"] }, { name = "sentry-sdk", extra = ["fastapi"] },
{ name = "sqlalchemy" }, { name = "sqlalchemy" },
@ -558,9 +620,9 @@ dependencies = [
{ name = "tiktoken" }, { name = "tiktoken" },
{ name = "typing-extensions" }, { name = "typing-extensions" },
] ]
sdist = { url = "https://files.pythonhosted.org/packages/e2/fb/e6b61ff0d959815385cb24f07ac811b2849900c3d96e2a69adfe0f4860b9/cognee-0.2.0.dev0.tar.gz", hash = "sha256:a3e5290615b06bd0ff03026afd8a093ad693830c6e7f5ffae7f9791d18c22d9b", size = 15446498, upload-time = "2025-06-18T20:22:20.281Z" } sdist = { url = "https://files.pythonhosted.org/packages/f7/6a/71ecadc9cfb90a512e3857ad2f02d4c47e69611aabb6bb5b7b20ac13e4ac/cognee-0.2.0.tar.gz", hash = "sha256:3ae302040dfe36dffcfb991452249c121f3f82787b99db0205a82a278112ac71", size = 15453724, upload-time = "2025-06-30T14:21:04.301Z" }
wheels = [ wheels = [
{ url = "https://files.pythonhosted.org/packages/9c/c6/47aeabde012bec5b79ab87087bcc53940aca9df2f553f78f5b9a7995f632/cognee-0.2.0.dev0-py3-none-any.whl", hash = "sha256:f8a0479cb0eaf8f49777e57860eec8159d6c8c440fbdb6f6245a128734599e5f", size = 956675, upload-time = "2025-06-18T20:22:05.315Z" }, { url = "https://files.pythonhosted.org/packages/13/fe/f44e5217b40b49be8b238dcbe0dce3d45d3ae18284218d0b43039cf8235b/cognee-0.2.0-py3-none-any.whl", hash = "sha256:e11d76c8a56dc75045095f14f8efbd1a52357818688724c7e6ab147fbb1a447e", size = 963410, upload-time = "2025-06-30T14:20:50.165Z" },
] ]
[package.optional-dependencies] [package.optional-dependencies]
@ -579,9 +641,6 @@ gemini = [
huggingface = [ huggingface = [
{ name = "transformers" }, { name = "transformers" },
] ]
kuzu = [
{ name = "kuzu" },
]
neo4j = [ neo4j = [
{ name = "neo4j" }, { name = "neo4j" },
] ]
@ -593,10 +652,10 @@ postgres = [
[[package]] [[package]]
name = "cognee-mcp" name = "cognee-mcp"
version = "0.3.0" version = "0.4.0"
source = { editable = "." } source = { editable = "." }
dependencies = [ dependencies = [
{ name = "cognee", extra = ["codegraph", "docs", "gemini", "huggingface", "kuzu", "neo4j", "postgres"] }, { name = "cognee", extra = ["codegraph", "docs", "gemini", "huggingface", "neo4j", "postgres"] },
{ name = "fastmcp" }, { name = "fastmcp" },
{ name = "mcp" }, { name = "mcp" },
{ name = "uv" }, { name = "uv" },
@ -609,7 +668,7 @@ dev = [
[package.metadata] [package.metadata]
requires-dist = [ requires-dist = [
{ name = "cognee", extras = ["postgres", "codegraph", "gemini", "huggingface", "docs", "neo4j", "kuzu"], specifier = ">=0.2.0.dev0" }, { name = "cognee", extras = ["postgres", "codegraph", "gemini", "huggingface", "docs", "neo4j"], specifier = "==0.2.0" },
{ name = "fastmcp", specifier = ">=1.0" }, { name = "fastmcp", specifier = ">=1.0" },
{ name = "mcp", specifier = "==1.5.0" }, { name = "mcp", specifier = "==1.5.0" },
{ name = "uv", specifier = ">=0.6.3" }, { name = "uv", specifier = ">=0.6.3" },
@ -1760,6 +1819,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/91/61/c80ef80ed8a0a21158e289ef70dac01e351d929a1c30cb0f49be60772547/jiter-0.8.2-cp313-cp313t-win_amd64.whl", hash = "sha256:3ac9f578c46f22405ff7f8b1f5848fb753cc4b8377fbec8470a7dc3997ca7566", size = 202374, upload-time = "2024-12-09T18:10:26.958Z" }, { url = "https://files.pythonhosted.org/packages/91/61/c80ef80ed8a0a21158e289ef70dac01e351d929a1c30cb0f49be60772547/jiter-0.8.2-cp313-cp313t-win_amd64.whl", hash = "sha256:3ac9f578c46f22405ff7f8b1f5848fb753cc4b8377fbec8470a7dc3997ca7566", size = 202374, upload-time = "2024-12-09T18:10:26.958Z" },
] ]
[[package]]
name = "jmespath"
version = "1.0.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/00/2a/e867e8531cf3e36b41201936b7fa7ba7b5702dbef42922193f05c8976cd6/jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe", size = 25843, upload-time = "2022-06-17T18:00:12.224Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/31/b4/b9b800c45527aadd64d5b442f9b932b00648617eb5d63d2c7a6587b7cafc/jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980", size = 20256, upload-time = "2022-06-17T18:00:10.251Z" },
]
[[package]] [[package]]
name = "joblib" name = "joblib"
version = "1.5.1" version = "1.5.1"
@ -3999,6 +4067,37 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/64/8d/0133e4eb4beed9e425d9a98ed6e081a55d195481b7632472be1af08d2f6b/rsa-4.9.1-py3-none-any.whl", hash = "sha256:68635866661c6836b8d39430f97a996acbd61bfa49406748ea243539fe239762", size = 34696, upload-time = "2025-04-16T09:51:17.142Z" }, { url = "https://files.pythonhosted.org/packages/64/8d/0133e4eb4beed9e425d9a98ed6e081a55d195481b7632472be1af08d2f6b/rsa-4.9.1-py3-none-any.whl", hash = "sha256:68635866661c6836b8d39430f97a996acbd61bfa49406748ea243539fe239762", size = 34696, upload-time = "2025-04-16T09:51:17.142Z" },
] ]
[[package]]
name = "s3fs"
version = "2025.3.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "aiobotocore" },
{ name = "aiohttp" },
{ name = "fsspec" },
]
sdist = { url = "https://files.pythonhosted.org/packages/72/df/559dc6d796c38f1b8a09a5f6dcf62a467a84f3c87a837ee07c59f60a26ad/s3fs-2025.3.2.tar.gz", hash = "sha256:6798f896ec76dd3bfd8beb89f0bb7c5263cb2760e038bae0978505cd172a307c", size = 77280, upload-time = "2025-03-31T15:35:18.881Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/66/e1/4db0388df5655de92ce5f9b60d2bef220a58dde130e0453e5433c579986e/s3fs-2025.3.2-py3-none-any.whl", hash = "sha256:81eae3f37b4b04bcc08845d7bcc607c6ca45878813ef7e6a28d77b2688417130", size = 30485, upload-time = "2025-03-31T15:35:17.384Z" },
]
[package.optional-dependencies]
boto3 = [
{ name = "aiobotocore", extra = ["boto3"] },
]
[[package]]
name = "s3transfer"
version = "0.13.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "botocore" },
]
sdist = { url = "https://files.pythonhosted.org/packages/ed/5d/9dcc100abc6711e8247af5aa561fc07c4a046f72f659c3adea9a449e191a/s3transfer-0.13.0.tar.gz", hash = "sha256:f5e6db74eb7776a37208001113ea7aa97695368242b364d73e91c981ac522177", size = 150232, upload-time = "2025-05-22T19:24:50.245Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/18/17/22bf8155aa0ea2305eefa3a6402e040df7ebe512d1310165eda1e233c3f8/s3transfer-0.13.0-py3-none-any.whl", hash = "sha256:0148ef34d6dd964d0d8cf4311b2b21c474693e57c2e069ec708ce043d2b527be", size = 85152, upload-time = "2025-05-22T19:24:48.703Z" },
]
[[package]] [[package]]
name = "safetensors" name = "safetensors"
version = "0.5.3" version = "0.5.3"

View file

@ -16,6 +16,128 @@ async def add(
graph_db_config: dict = None, graph_db_config: dict = None,
dataset_id: UUID = None, dataset_id: UUID = None,
): ):
"""
Add data to Cognee for knowledge graph processing.
This is the first step in the Cognee workflow - it ingests raw data and prepares it
for processing. The function accepts various data formats including text, files, and
binary streams, then stores them in a specified dataset for further processing.
Prerequisites:
- **LLM_API_KEY**: Must be set in environment variables for content processing
- **Database Setup**: Relational and vector databases must be configured
- **User Authentication**: Uses default user if none provided (created automatically)
Supported Input Types:
- **Text strings**: Direct text content (str) - any string not starting with "/" or "file://"
- **File paths**: Local file paths as strings in these formats:
* Absolute paths: "/path/to/document.pdf"
* File URLs: "file:///path/to/document.pdf" or "file://relative/path.txt"
* S3 paths: "s3://bucket-name/path/to/file.pdf"
- **Binary file objects**: File handles/streams (BinaryIO)
- **Lists**: Multiple files or text strings in a single call
Supported File Formats:
- Text files (.txt, .md, .csv)
- PDFs (.pdf)
- Images (.png, .jpg, .jpeg) - extracted via OCR/vision models
- Audio files (.mp3, .wav) - transcribed to text
- Code files (.py, .js, .ts, etc.) - parsed for structure and content
- Office documents (.docx, .pptx)
Workflow:
1. **Data Resolution**: Resolves file paths and validates accessibility
2. **Content Extraction**: Extracts text content from various file formats
3. **Dataset Storage**: Stores processed content in the specified dataset
4. **Metadata Tracking**: Records file metadata, timestamps, and user permissions
5. **Permission Assignment**: Grants user read/write/delete/share permissions on dataset
Args:
data: The data to ingest. Can be:
- Single text string: "Your text content here"
- Absolute file path: "/path/to/document.pdf"
- File URL: "file:///absolute/path/to/document.pdf" or "file://relative/path.txt"
- S3 path: "s3://my-bucket/documents/file.pdf"
- List of mixed types: ["text content", "/path/file.pdf", "file://doc.txt", file_handle]
- Binary file object: open("file.txt", "rb")
dataset_name: Name of the dataset to store data in. Defaults to "main_dataset".
Create separate datasets to organize different knowledge domains.
user: User object for authentication and permissions. Uses default user if None.
Default user: "default_user@example.com" (created automatically on first use).
Users can only access datasets they have permissions for.
node_set: Optional list of node identifiers for graph organization and access control.
Used for grouping related data points in the knowledge graph.
vector_db_config: Optional configuration for vector database (for custom setups).
graph_db_config: Optional configuration for graph database (for custom setups).
dataset_id: Optional specific dataset UUID to use instead of dataset_name.
Returns:
PipelineRunInfo: Information about the ingestion pipeline execution including:
- Pipeline run ID for tracking
- Dataset ID where data was stored
- Processing status and any errors
- Execution timestamps and metadata
Next Steps:
After successfully adding data, call `cognify()` to process the ingested content:
```python
import cognee
# Step 1: Add your data (text content or file path)
await cognee.add("Your document content") # Raw text
# OR
await cognee.add("/path/to/your/file.pdf") # File path
# Step 2: Process into knowledge graph
await cognee.cognify()
# Step 3: Search and query
results = await cognee.search("What insights can you find?")
```
Example Usage:
```python
# Add a single text document
await cognee.add("Natural language processing is a field of AI...")
# Add multiple files with different path formats
await cognee.add([
"/absolute/path/to/research_paper.pdf", # Absolute path
"file://relative/path/to/dataset.csv", # Relative file URL
"file:///absolute/path/to/report.docx", # Absolute file URL
"s3://my-bucket/documents/data.json", # S3 path
"Additional context text" # Raw text content
])
# Add to a specific dataset
await cognee.add(
data="Project documentation content",
dataset_name="project_docs"
)
# Add a single file
await cognee.add("/home/user/documents/analysis.pdf")
```
Environment Variables:
Required:
- LLM_API_KEY: API key for your LLM provider (OpenAI, Anthropic, etc.)
Optional:
- LLM_PROVIDER: "openai" (default), "anthropic", "gemini", "ollama"
- LLM_MODEL: Model name (default: "gpt-4o-mini")
- DEFAULT_USER_EMAIL: Custom default user email
- DEFAULT_USER_PASSWORD: Custom default user password
- VECTOR_DB_PROVIDER: "lancedb" (default), "chromadb", "qdrant", "weaviate"
- GRAPH_DATABASE_PROVIDER: "kuzu" (default), "neo4j", "networkx"
Raises:
FileNotFoundError: If specified file paths don't exist
PermissionError: If user lacks access to files or dataset
UnsupportedFileTypeError: If file format cannot be processed
InvalidValueError: If LLM_API_KEY is not set or invalid
"""
tasks = [ tasks = [
Task(resolve_data_directories), Task(resolve_data_directories),
Task(ingest_data, dataset_name, user, node_set, dataset_id), Task(ingest_data, dataset_name, user, node_set, dataset_id),

View file

@ -31,6 +31,7 @@ def get_add_router() -> APIRouter:
raise ValueError("Either datasetId or datasetName must be provided.") raise ValueError("Either datasetId or datasetName must be provided.")
try: try:
# TODO: Add check if HTTP Requests are enabled before allowing requests and git clone
if isinstance(data, str) and data.startswith("http"): if isinstance(data, str) and data.startswith("http"):
if "github" in data: if "github" in data:
# Perform git clone if the URL is from GitHub # Perform git clone if the URL is from GitHub

View file

@ -1,6 +1,7 @@
import asyncio import asyncio
from pydantic import BaseModel from pydantic import BaseModel
from typing import Union, Optional from typing import Union, Optional
from uuid import UUID
from cognee.shared.logging_utils import get_logger from cognee.shared.logging_utils import get_logger
from cognee.shared.data_models import KnowledgeGraph from cognee.shared.data_models import KnowledgeGraph
@ -29,7 +30,7 @@ update_status_lock = asyncio.Lock()
async def cognify( async def cognify(
datasets: Union[str, list[str]] = None, datasets: Union[str, list[str], list[UUID]] = None,
user: User = None, user: User = None,
graph_model: BaseModel = KnowledgeGraph, graph_model: BaseModel = KnowledgeGraph,
chunker=TextChunker, chunker=TextChunker,
@ -39,6 +40,151 @@ async def cognify(
graph_db_config: dict = None, graph_db_config: dict = None,
run_in_background: bool = False, run_in_background: bool = False,
): ):
"""
Transform ingested data into a structured knowledge graph.
This is the core processing step in Cognee that converts raw text and documents
into an intelligent knowledge graph. It analyzes content, extracts entities and
relationships, and creates semantic connections for enhanced search and reasoning.
Prerequisites:
- **LLM_API_KEY**: Must be configured (required for entity extraction and graph generation)
- **Data Added**: Must have data previously added via `cognee.add()`
- **Vector Database**: Must be accessible for embeddings storage
- **Graph Database**: Must be accessible for relationship storage
Input Requirements:
- **Datasets**: Must contain data previously added via `cognee.add()`
- **Content Types**: Works with any text-extractable content including:
* Natural language documents
* Structured data (CSV, JSON)
* Code repositories
* Academic papers and technical documentation
* Mixed multimedia content (with text extraction)
Processing Pipeline:
1. **Document Classification**: Identifies document types and structures
2. **Permission Validation**: Ensures user has processing rights
3. **Text Chunking**: Breaks content into semantically meaningful segments
4. **Entity Extraction**: Identifies key concepts, people, places, organizations
5. **Relationship Detection**: Discovers connections between entities
6. **Graph Construction**: Builds semantic knowledge graph with embeddings
7. **Content Summarization**: Creates hierarchical summaries for navigation
Graph Model Customization:
The `graph_model` parameter allows custom knowledge structures:
- **Default**: General-purpose KnowledgeGraph for any domain
- **Custom Models**: Domain-specific schemas (e.g., scientific papers, code analysis)
- **Ontology Integration**: Use `ontology_file_path` for predefined vocabularies
Args:
datasets: Dataset name(s) or dataset UUID(s) to process. Processes all available data if None.
- Single dataset: "my_dataset"
- Multiple datasets: ["docs", "research", "reports"]
- None: Process all datasets for the user
user: User context for authentication and data access. Uses default if None.
graph_model: Pydantic model defining the knowledge graph structure.
Defaults to KnowledgeGraph for general-purpose processing.
chunker: Text chunking strategy (TextChunker, LangchainChunker).
- TextChunker: Paragraph-based chunking (default, most reliable)
- LangchainChunker: Recursive character splitting with overlap
Determines how documents are segmented for processing.
chunk_size: Maximum tokens per chunk. Auto-calculated based on LLM if None.
Formula: min(embedding_max_tokens, llm_max_tokens // 2)
Default limits: ~512-8192 tokens depending on models.
Smaller chunks = more granular but potentially fragmented knowledge.
ontology_file_path: Path to RDF/OWL ontology file for domain-specific entity types.
Useful for specialized fields like medical or legal documents.
vector_db_config: Custom vector database configuration for embeddings storage.
graph_db_config: Custom graph database configuration for relationship storage.
run_in_background: If True, starts processing asynchronously and returns immediately.
If False, waits for completion before returning.
Background mode recommended for large datasets (>100MB).
Use pipeline_run_id from return value to monitor progress.
Returns:
Union[dict, list[PipelineRunInfo]]:
- **Blocking mode**: Dictionary mapping dataset_id -> PipelineRunInfo with:
* Processing status (completed/failed/in_progress)
* Extracted entity and relationship counts
* Processing duration and resource usage
* Error details if any failures occurred
- **Background mode**: List of PipelineRunInfo objects for tracking progress
* Use pipeline_run_id to monitor status
* Check completion via pipeline monitoring APIs
Next Steps:
After successful cognify processing, use search functions to query the knowledge:
```python
import cognee
from cognee import SearchType
# Process your data into knowledge graph
await cognee.cognify()
# Query for insights using different search types:
# 1. Natural language completion with graph context
insights = await cognee.search(
"What are the main themes?",
query_type=SearchType.GRAPH_COMPLETION
)
# 2. Get entity relationships and connections
relationships = await cognee.search(
"connections between concepts",
query_type=SearchType.INSIGHTS
)
# 3. Find relevant document chunks
chunks = await cognee.search(
"specific topic",
query_type=SearchType.CHUNKS
)
```
Advanced Usage:
```python
# Custom domain model for scientific papers
class ScientificPaper(DataPoint):
title: str
authors: List[str]
methodology: str
findings: List[str]
await cognee.cognify(
datasets=["research_papers"],
graph_model=ScientificPaper,
ontology_file_path="scientific_ontology.owl"
)
# Background processing for large datasets
run_info = await cognee.cognify(
datasets=["large_corpus"],
run_in_background=True
)
# Check status later with run_info.pipeline_run_id
```
Environment Variables:
Required:
- LLM_API_KEY: API key for your LLM provider
Optional (same as add function):
- LLM_PROVIDER, LLM_MODEL, VECTOR_DB_PROVIDER, GRAPH_DATABASE_PROVIDER
- LLM_RATE_LIMIT_ENABLED: Enable rate limiting (default: False)
- LLM_RATE_LIMIT_REQUESTS: Max requests per interval (default: 60)
Raises:
DatasetNotFoundError: If specified datasets don't exist
PermissionError: If user lacks processing rights
InvalidValueError: If LLM_API_KEY is not set
OntologyParsingError: If ontology file is malformed
ValueError: If chunks exceed max token limits (reduce chunk_size)
DatabaseNotCreatedError: If databases are not properly initialized
"""
tasks = await get_default_tasks(user, graph_model, chunker, chunk_size, ontology_file_path) tasks = await get_default_tasks(user, graph_model, chunker, chunk_size, ontology_file_path)
if run_in_background: if run_in_background:

View file

@ -5,13 +5,16 @@ from sqlalchemy import select
from sqlalchemy.sql import delete as sql_delete from sqlalchemy.sql import delete as sql_delete
from cognee.modules.data.models import Data, DatasetData, Dataset from cognee.modules.data.models import Data, DatasetData, Dataset
from cognee.infrastructure.databases.graph import get_graph_engine from cognee.infrastructure.databases.graph import get_graph_engine
from io import StringIO, BytesIO from io import BytesIO
import hashlib import hashlib
import asyncio
from uuid import UUID from uuid import UUID
from cognee.modules.users.models import User
from cognee.infrastructure.databases.vector import get_vector_engine from cognee.infrastructure.databases.vector import get_vector_engine
from cognee.infrastructure.engine import DataPoint from cognee.infrastructure.engine import DataPoint
from cognee.modules.graph.utils.convert_node_to_data_point import get_all_subclasses from cognee.modules.graph.utils.convert_node_to_data_point import get_all_subclasses
from cognee.modules.users.methods import get_default_user
from cognee.modules.data.methods import get_authorized_existing_datasets
from cognee.context_global_variables import set_database_global_context_variables
from .exceptions import DocumentNotFoundError, DatasetNotFoundError, DocumentSubgraphNotFoundError from .exceptions import DocumentNotFoundError, DatasetNotFoundError, DocumentSubgraphNotFoundError
from cognee.shared.logging_utils import get_logger from cognee.shared.logging_utils import get_logger
@ -26,7 +29,9 @@ def get_text_content_hash(text: str) -> str:
async def delete( async def delete(
data: Union[BinaryIO, List[BinaryIO], str, List[str]], data: Union[BinaryIO, List[BinaryIO], str, List[str]],
dataset_name: str = "main_dataset", dataset_name: str = "main_dataset",
dataset_id: UUID = None,
mode: str = "soft", mode: str = "soft",
user: User = None,
): ):
"""Delete a document and all its related nodes from both relational and graph databases. """Delete a document and all its related nodes from both relational and graph databases.
@ -34,15 +39,27 @@ async def delete(
data: The data to delete (file, URL, or text) data: The data to delete (file, URL, or text)
dataset_name: Name of the dataset to delete from dataset_name: Name of the dataset to delete from
mode: "soft" (default) or "hard" - hard mode also deletes degree-one entity nodes mode: "soft" (default) or "hard" - hard mode also deletes degree-one entity nodes
user: User doing the operation, if none default user will be used.
""" """
if user is None:
user = await get_default_user()
# Verify user has permission to work with given dataset. If dataset_id is given use it, if not use dataset_name
dataset = await get_authorized_existing_datasets(
[dataset_id] if dataset_id else [dataset_name], "delete", user
)
# Will only be used if ENABLE_BACKEND_ACCESS_CONTROL is set to True
await set_database_global_context_variables(dataset[0].id, dataset[0].owner_id)
# Handle different input types # Handle different input types
if isinstance(data, str): if isinstance(data, str):
if data.startswith("file://"): # It's a file path if data.startswith("file://") or data.startswith("/"): # It's a file path
with open(data.replace("file://", ""), mode="rb") as file: with open(data.replace("file://", ""), mode="rb") as file:
classified_data = classify(file) classified_data = classify(file)
content_hash = classified_data.get_metadata()["content_hash"] content_hash = classified_data.get_metadata()["content_hash"]
return await delete_single_document(content_hash, dataset_name, mode) return await delete_single_document(content_hash, dataset[0].id, mode)
elif data.startswith("http"): # It's a URL elif data.startswith("http"): # It's a URL
import requests import requests
@ -51,26 +68,26 @@ async def delete(
file_data = BytesIO(response.content) file_data = BytesIO(response.content)
classified_data = classify(file_data) classified_data = classify(file_data)
content_hash = classified_data.get_metadata()["content_hash"] content_hash = classified_data.get_metadata()["content_hash"]
return await delete_single_document(content_hash, dataset_name, mode) return await delete_single_document(content_hash, dataset[0].id, mode)
else: # It's a text string else: # It's a text string
content_hash = get_text_content_hash(data) content_hash = get_text_content_hash(data)
classified_data = classify(data) classified_data = classify(data)
return await delete_single_document(content_hash, dataset_name, mode) return await delete_single_document(content_hash, dataset[0].id, mode)
elif isinstance(data, list): elif isinstance(data, list):
# Handle list of inputs sequentially # Handle list of inputs sequentially
results = [] results = []
for item in data: for item in data:
result = await delete(item, dataset_name, mode) result = await delete(item, dataset_name, dataset[0].id, mode, user=user)
results.append(result) results.append(result)
return {"status": "success", "message": "Multiple documents deleted", "results": results} return {"status": "success", "message": "Multiple documents deleted", "results": results}
else: # It's already a BinaryIO else: # It's already a BinaryIO
data.seek(0) # Ensure we're at the start of the file data.seek(0) # Ensure we're at the start of the file
classified_data = classify(data) classified_data = classify(data)
content_hash = classified_data.get_metadata()["content_hash"] content_hash = classified_data.get_metadata()["content_hash"]
return await delete_single_document(content_hash, dataset_name, mode) return await delete_single_document(content_hash, dataset[0].id, mode)
async def delete_single_document(content_hash: str, dataset_name: str, mode: str = "soft"): async def delete_single_document(content_hash: str, dataset_id: UUID = None, mode: str = "soft"):
"""Delete a single document by its content hash.""" """Delete a single document by its content hash."""
# Delete from graph database # Delete from graph database
@ -157,11 +174,11 @@ async def delete_single_document(content_hash: str, dataset_name: str, mode: str
# Get the dataset # Get the dataset
dataset = ( dataset = (
await session.execute(select(Dataset).filter(Dataset.name == dataset_name)) await session.execute(select(Dataset).filter(Dataset.id == dataset_id))
).scalar_one_or_none() ).scalar_one_or_none()
if dataset is None: if dataset is None:
raise DatasetNotFoundError(f"Dataset not found: {dataset_name}") raise DatasetNotFoundError(f"Dataset not found: {dataset_id}")
# Delete from dataset_data table # Delete from dataset_data table
dataset_delete_stmt = sql_delete(DatasetData).where( dataset_delete_stmt = sql_delete(DatasetData).where(
@ -186,7 +203,7 @@ async def delete_single_document(content_hash: str, dataset_name: str, mode: str
"message": "Document deleted from both graph and relational databases", "message": "Document deleted from both graph and relational databases",
"graph_deletions": deletion_result["deleted_counts"], "graph_deletions": deletion_result["deleted_counts"],
"content_hash": content_hash, "content_hash": content_hash,
"dataset": dataset_name, "dataset": dataset_id,
"deleted_node_ids": [ "deleted_node_ids": [
str(node_id) for node_id in deleted_node_ids str(node_id) for node_id in deleted_node_ids
], # Convert back to strings for response ], # Convert back to strings for response

View file

@ -1,7 +1,8 @@
from fastapi import Form, UploadFile, Depends from fastapi import Form, UploadFile, Depends
from fastapi.responses import JSONResponse from fastapi.responses import JSONResponse
from fastapi import APIRouter from fastapi import APIRouter
from typing import List, Optional from typing import List
from uuid import UUID
import subprocess import subprocess
from cognee.shared.logging_utils import get_logger from cognee.shared.logging_utils import get_logger
import requests import requests
@ -18,6 +19,7 @@ def get_delete_router() -> APIRouter:
async def delete( async def delete(
data: List[UploadFile], data: List[UploadFile],
dataset_name: str = Form("main_dataset"), dataset_name: str = Form("main_dataset"),
dataset_id: UUID = None,
mode: str = Form("soft"), mode: str = Form("soft"),
user: User = Depends(get_authenticated_user), user: User = Depends(get_authenticated_user),
): ):
@ -35,6 +37,7 @@ def get_delete_router() -> APIRouter:
# Handle each file in the list # Handle each file in the list
results = [] results = []
for file in data: for file in data:
# TODO: Add check if HTTP Requests are enabled before allowing requests and git clone
if file.filename.startswith("http"): if file.filename.startswith("http"):
if "github" in file.filename: if "github" in file.filename:
# For GitHub repos, we need to get the content hash of each file # For GitHub repos, we need to get the content hash of each file
@ -54,12 +57,22 @@ def get_delete_router() -> APIRouter:
response.raise_for_status() response.raise_for_status()
file_data = response.content file_data = response.content
result = await cognee_delete( result = await cognee_delete(
file_data, dataset_name=dataset_name, mode=mode file_data,
dataset_name=dataset_name,
dataset_id=dataset_id,
mode=mode,
user=user,
) )
results.append(result) results.append(result)
else: else:
# Handle uploaded file by accessing its file attribute # Handle uploaded file by accessing its file attribute
result = await cognee_delete(file.file, dataset_name=dataset_name, mode=mode) result = await cognee_delete(
file.file,
dataset_name=dataset_name,
dataset_id=dataset_id,
mode=mode,
user=user,
)
results.append(result) results.append(result)
if len(results) == 1: if len(results) == 1:

View file

@ -20,6 +20,142 @@ async def search(
node_type: Optional[Type] = None, node_type: Optional[Type] = None,
node_name: Optional[List[str]] = None, node_name: Optional[List[str]] = None,
) -> list: ) -> list:
"""
Search and query the knowledge graph for insights, information, and connections.
This is the final step in the Cognee workflow that retrieves information from the
processed knowledge graph. It supports multiple search modes optimized for different
use cases - from simple fact retrieval to complex reasoning and code analysis.
Search Prerequisites:
- **LLM_API_KEY**: Required for GRAPH_COMPLETION and RAG_COMPLETION search types
- **Data Added**: Must have data previously added via `cognee.add()`
- **Knowledge Graph Built**: Must have processed data via `cognee.cognify()`
- **Dataset Permissions**: User must have 'read' permission on target datasets
- **Vector Database**: Must be accessible for semantic search functionality
Search Types & Use Cases:
**GRAPH_COMPLETION** (Default - Recommended):
Natural language Q&A using full graph context and LLM reasoning.
Best for: Complex questions, analysis, summaries, insights.
Returns: Conversational AI responses with graph-backed context.
**RAG_COMPLETION**:
Traditional RAG using document chunks without graph structure.
Best for: Direct document retrieval, specific fact-finding.
Returns: LLM responses based on relevant text chunks.
**INSIGHTS**:
Structured entity relationships and semantic connections.
Best for: Understanding concept relationships, knowledge mapping.
Returns: Formatted relationship data and entity connections.
**CHUNKS**:
Raw text segments that match the query semantically.
Best for: Finding specific passages, citations, exact content.
Returns: Ranked list of relevant text chunks with metadata.
**SUMMARIES**:
Pre-generated hierarchical summaries of content.
Best for: Quick overviews, document abstracts, topic summaries.
Returns: Multi-level summaries from detailed to high-level.
**CODE**:
Code-specific search with syntax and semantic understanding.
Best for: Finding functions, classes, implementation patterns.
Returns: Structured code information with context and relationships.
**CYPHER**:
Direct graph database queries using Cypher syntax.
Best for: Advanced users, specific graph traversals, debugging.
Returns: Raw graph query results.
Args:
query_text: Your question or search query in natural language.
Examples:
- "What are the main themes in this research?"
- "How do these concepts relate to each other?"
- "Find information about machine learning algorithms"
- "What functions handle user authentication?"
query_type: SearchType enum specifying the search mode.
Defaults to GRAPH_COMPLETION for conversational AI responses.
user: User context for data access permissions. Uses default if None.
datasets: Dataset name(s) to search within. Searches all accessible if None.
- Single dataset: "research_papers"
- Multiple datasets: ["docs", "reports", "analysis"]
- None: Search across all user datasets
dataset_ids: Alternative to datasets - use specific UUID identifiers.
system_prompt_path: Custom system prompt file for LLM-based search types.
Defaults to "answer_simple_question.txt".
top_k: Maximum number of results to return (1-N).
Higher values provide more comprehensive but potentially noisy results.
node_type: Filter results to specific entity types (for advanced filtering).
node_name: Filter results to specific named entities (for targeted search).
Returns:
list: Search results in format determined by query_type:
**GRAPH_COMPLETION/RAG_COMPLETION**:
[List of conversational AI response strings]
**INSIGHTS**:
[List of formatted relationship descriptions and entity connections]
**CHUNKS**:
[List of relevant text passages with source metadata]
**SUMMARIES**:
[List of hierarchical summaries from general to specific]
**CODE**:
[List of structured code information with context]
Performance & Optimization:
- **GRAPH_COMPLETION**: Slower but most intelligent, uses LLM + graph context
- **RAG_COMPLETION**: Medium speed, uses LLM + document chunks (no graph traversal)
- **INSIGHTS**: Fast, returns structured relationships without LLM processing
- **CHUNKS**: Fastest, pure vector similarity search without LLM
- **SUMMARIES**: Fast, returns pre-computed summaries
- **CODE**: Medium speed, specialized for code understanding
- **top_k**: Start with 10, increase for comprehensive analysis (max 100)
- **datasets**: Specify datasets to improve speed and relevance
Next Steps After Search:
- Use results for further analysis or application integration
- Combine different search types for comprehensive understanding
- Export insights for reporting or downstream processing
- Iterate with refined queries based on initial results
Environment Variables:
Required for LLM-based search types (GRAPH_COMPLETION, RAG_COMPLETION):
- LLM_API_KEY: API key for your LLM provider
Optional:
- LLM_PROVIDER, LLM_MODEL: Configure LLM for search responses
- VECTOR_DB_PROVIDER: Must match what was used during cognify
- GRAPH_DATABASE_PROVIDER: Must match what was used during cognify
Raises:
DatasetNotFoundError: If specified datasets don't exist or aren't accessible
PermissionDeniedError: If user lacks read access to requested datasets
NoDataError: If no relevant data found for the search query
InvalidValueError: If LLM_API_KEY is not set (for LLM-based search types)
ValueError: If query_text is empty or search parameters are invalid
CollectionNotFoundError: If vector collection not found (data not processed)
"""
# We use lists from now on for datasets # We use lists from now on for datasets
if isinstance(datasets, UUID) or isinstance(datasets, str): if isinstance(datasets, UUID) or isinstance(datasets, str):
datasets = [datasets] datasets = [datasets]

View file

@ -176,43 +176,6 @@ class DataPoint(BaseModel):
""" """
return self.model_validate_json(json_str) return self.model_validate_json(json_str)
# Pickle Serialization
def to_pickle(self) -> bytes:
"""
Serialize the DataPoint instance to a byte format for pickling.
This method uses the built-in Python pickle module to convert the instance into a byte
stream for persistence or transmission.
Returns:
--------
- bytes: The pickled byte representation of the DataPoint instance.
"""
return pickle.dumps(self.dict())
@classmethod
def from_pickle(self, pickled_data: bytes):
"""
Deserialize a DataPoint instance from a pickled byte stream.
The method converts the byte stream back into a DataPoint instance by loading the data
and validating it through the model's constructor.
Parameters:
-----------
- pickled_data (bytes): The bytes representation of a pickled DataPoint instance to
be deserialized.
Returns:
--------
A new DataPoint instance created from the pickled data.
"""
data = pickle.loads(pickled_data)
return self(**data)
def to_dict(self, **kwargs) -> Dict[str, Any]: def to_dict(self, **kwargs) -> Dict[str, Any]:
""" """
Convert the DataPoint instance to a dictionary representation. Convert the DataPoint instance to a dictionary representation.

View file

@ -1,4 +1,5 @@
import litellm import litellm
import logging
from pydantic import BaseModel from pydantic import BaseModel
from typing import Type, Optional from typing import Type, Optional
from litellm import acompletion, JSONSchemaValidationError from litellm import acompletion, JSONSchemaValidationError

View file

@ -1,5 +1,6 @@
"""Adapter for Generic API LLM provider API""" """Adapter for Generic API LLM provider API"""
import logging
from typing import Type from typing import Type
from pydantic import BaseModel from pydantic import BaseModel
@ -7,6 +8,7 @@ import instructor
from cognee.infrastructure.llm.llm_interface import LLMInterface from cognee.infrastructure.llm.llm_interface import LLMInterface
from cognee.infrastructure.llm.config import get_llm_config from cognee.infrastructure.llm.config import get_llm_config
from cognee.infrastructure.llm.rate_limiter import rate_limit_async, sleep_and_retry_async from cognee.infrastructure.llm.rate_limiter import rate_limit_async, sleep_and_retry_async
from cognee.shared.logging_utils import get_logger
import litellm import litellm

View file

@ -1,8 +1,7 @@
from cognee.shared.logging_utils import get_logger
import litellm import litellm
from cognee.infrastructure.databases.vector import get_vector_engine
from cognee.infrastructure.llm.get_llm_client import get_llm_client from cognee.infrastructure.llm.get_llm_client import get_llm_client
from cognee.shared.logging_utils import get_logger
logger = get_logger() logger = get_logger()
@ -22,6 +21,9 @@ def get_max_chunk_tokens():
the smaller value of the embedding engine's max tokens and half of the LLM's the smaller value of the embedding engine's max tokens and half of the LLM's
maximum tokens. maximum tokens.
""" """
# NOTE: Import must be done in function to avoid circular import issue
from cognee.infrastructure.databases.vector import get_vector_engine
# Calculate max chunk size based on the following formula # Calculate max chunk size based on the following formula
embedding_engine = get_vector_engine().embedding_engine embedding_engine = get_vector_engine().embedding_engine
llm_client = get_llm_client() llm_client = get_llm_client()
@ -93,6 +95,9 @@ async def test_embedding_connection():
the exception if the connection to the embedding handler cannot be established. the exception if the connection to the embedding handler cannot be established.
""" """
try: try:
# NOTE: Vector engine import must be done in function to avoid circular import issue
from cognee.infrastructure.databases.vector import get_vector_engine
await get_vector_engine().embedding_engine.embed_text("test") await get_vector_engine().embedding_engine.embed_text("test")
except Exception as e: except Exception as e:
logger.error(e) logger.error(e)

View file

@ -2,6 +2,10 @@ from pypdf import PdfReader
from cognee.modules.chunking.Chunker import Chunker from cognee.modules.chunking.Chunker import Chunker
from .open_data_file import open_data_file from .open_data_file import open_data_file
from .Document import Document from .Document import Document
from cognee.shared.logging_utils import get_logger
from cognee.modules.data.processing.document_types.exceptions.exceptions import PyPdfInternalError
logger = get_logger("PDFDocument")
class PdfDocument(Document): class PdfDocument(Document):
@ -9,12 +13,19 @@ class PdfDocument(Document):
def read(self, chunker_cls: Chunker, max_chunk_size: int): def read(self, chunker_cls: Chunker, max_chunk_size: int):
with open_data_file(self.raw_data_location, mode="rb") as stream: with open_data_file(self.raw_data_location, mode="rb") as stream:
file = PdfReader(stream) logger.info(f"Reading PDF:{self.raw_data_location}")
try:
file = PdfReader(stream, strict=False)
except Exception:
raise PyPdfInternalError()
def get_text(): def get_text():
for page in file.pages: try:
page_text = page.extract_text() for page in file.pages:
yield page_text page_text = page.extract_text()
yield page_text
except Exception:
raise PyPdfInternalError()
chunker = chunker_cls(self, get_text=get_text, max_chunk_size=max_chunk_size) chunker = chunker_cls(self, get_text=get_text, max_chunk_size=max_chunk_size)

View file

@ -0,0 +1,7 @@
"""
Custom exceptions for Cognee document type processing.
This module defines the set of exceptions raised by the different document type handlers.
"""
from .exceptions import PyPdfInternalError

View file

@ -0,0 +1,14 @@
from cognee.exceptions import CogneeApiError
from fastapi import status
class PyPdfInternalError(CogneeApiError):
    """Raised when pypdf cannot open or read a PDF (damaged or unprocessable file)."""

    def __init__(
        self,
        message: str = "Error during PyPdf processing. Pdf is damaged or cannot be processed.",
        name: str = "PyPdfInternalError",
        status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
    ):
        # NOTE(review): the previous default, status.WS_1011_INTERNAL_ERROR, is a
        # WebSocket close code (1011). This exception is surfaced through HTTP
        # responses, so the HTTP 500 constant is the correct default here.
        super().__init__(message, name, status_code)

View file

@ -1,4 +1,6 @@
from typing import IO, Optional from typing import IO, Optional
from urllib.parse import urlparse
import os
from cognee.api.v1.add.config import get_s3_config from cognee.api.v1.add.config import get_s3_config
@ -24,8 +26,16 @@ def open_data_file(
else: else:
return fs.open(file_path, mode=mode, encoding=encoding, **kwargs) return fs.open(file_path, mode=mode, encoding=encoding, **kwargs)
elif file_path.startswith("file://"): elif file_path.startswith("file://"):
# Handle local file URLs by stripping the file:// prefix # Handle local file URLs by properly parsing the URI
file_path = file_path.replace("file://", "", 1) parsed_url = urlparse(file_path)
return open(file_path, mode=mode, encoding=encoding, **kwargs) # On Windows, urlparse handles drive letters correctly
# Convert the path component to a proper file path
if os.name == "nt": # Windows
# Remove leading slash from Windows paths like /C:/Users/...
local_path = parsed_url.path.lstrip("/")
else: # Unix-like systems
local_path = parsed_url.path
return open(local_path, mode=mode, encoding=encoding, **kwargs)
else: else:
return open(file_path, mode=mode, encoding=encoding, **kwargs) return open(file_path, mode=mode, encoding=encoding, **kwargs)

View file

@ -121,7 +121,7 @@ async def run_pipeline(
check_dataset_name(dataset.name) check_dataset_name(dataset.name)
# Will only be used if ENABLE_BACKEND_ACCESS_CONTROL is set to True # Will only be used if ENABLE_BACKEND_ACCESS_CONTROL is set to True
await set_database_global_context_variables(dataset.name, user.id) await set_database_global_context_variables(dataset.id, dataset.owner_id)
# Ugly hack, but no easier way to do this. # Ugly hack, but no easier way to do this.
if pipeline_name == "add_pipeline": if pipeline_name == "add_pipeline":

View file

@ -57,7 +57,7 @@ async def search(
""" """
# Use search function filtered by permissions if access control is enabled # Use search function filtered by permissions if access control is enabled
if os.getenv("ENABLE_BACKEND_ACCESS_CONTROL", "false").lower() == "true": if os.getenv("ENABLE_BACKEND_ACCESS_CONTROL", "false").lower() == "true":
return await permissions_search( return await authorized_search(
query_text, query_type, user, dataset_ids, system_prompt_path, top_k query_text, query_type, user, dataset_ids, system_prompt_path, top_k
) )
@ -143,7 +143,7 @@ async def specific_search(
return results return results
async def permissions_search( async def authorized_search(
query_text: str, query_text: str,
query_type: SearchType, query_type: SearchType,
user: User = None, user: User = None,
@ -190,7 +190,11 @@ async def specific_search_by_context(
search_results = await specific_search( search_results = await specific_search(
query_type, query_text, user, system_prompt_path=system_prompt_path, top_k=top_k query_type, query_text, user, system_prompt_path=system_prompt_path, top_k=top_k
) )
return {dataset.name: search_results} return {
"search_result": search_results,
"dataset_id": dataset.id,
"dataset_name": dataset.name,
}
# Search every dataset async based on query and appropriate database configuration # Search every dataset async based on query and appropriate database configuration
tasks = [] tasks = []

View file

@ -11,6 +11,23 @@ import importlib.metadata
from cognee import __version__ as cognee_version from cognee import __version__ as cognee_version
from typing import Protocol from typing import Protocol
# Configure external library logging
def configure_external_library_logging():
    """Quiet down noisy external libraries.

    Currently only LiteLLM is handled: its internal verbose flag is
    switched off and its standard-library logger is raised to CRITICAL so
    routine errors do not flood the output. When LiteLLM is not installed
    there is nothing to configure and the function simply returns.
    """
    try:
        import litellm
    except ImportError:
        # LiteLLM is optional; nothing to silence if it is absent.
        return

    # Disable LiteLLM's own verbose mode.
    litellm.set_verbose = False
    # Let only CRITICAL records from LiteLLM's logger through.
    logging.getLogger("litellm").setLevel(logging.CRITICAL)
# Export common log levels # Export common log levels
DEBUG = logging.DEBUG DEBUG = logging.DEBUG
INFO = logging.INFO INFO = logging.INFO
@ -148,6 +165,44 @@ def get_logger(name=None, level=None) -> LoggerInterface:
return logger return logger
def log_database_configuration(logger):
    """Log the current database configuration for all three database types.

    Emits one INFO line per database (relational, vector, graph) with the
    provider name, plus provider-specific host/path details. Any failure to
    read a config is downgraded to a single WARNING so logging setup never
    breaks application startup.

    Args:
        logger: A configured logger exposing ``info`` and ``warning``.
    """
    # NOTE: Has to be imported at runtime to avoid circular import
    from cognee.infrastructure.databases.relational.config import get_relational_config
    from cognee.infrastructure.databases.vector.config import get_vectordb_config
    from cognee.infrastructure.databases.graph.config import get_graph_config

    try:
        # Log relational database configuration
        relational_config = get_relational_config()
        logger.info(f"Relational database: {relational_config.db_provider}")
        if relational_config.db_provider == "postgres":
            logger.info(f"Postgres host: {relational_config.db_host}:{relational_config.db_port}")
            logger.info(f"Postgres database: {relational_config.db_name}")
        elif relational_config.db_provider == "sqlite":
            logger.info(f"SQLite path: {relational_config.db_path}")
            logger.info(f"SQLite database: {relational_config.db_name}")

        # Log vector database configuration
        vector_config = get_vectordb_config()
        logger.info(f"Vector database: {vector_config.vector_db_provider}")
        if vector_config.vector_db_provider == "lancedb":
            # LanceDB is file-based, so the URL is a local path.
            logger.info(f"Vector database path: {vector_config.vector_db_url}")
        else:
            logger.info(f"Vector database URL: {vector_config.vector_db_url}")

        # Log graph database configuration
        graph_config = get_graph_config()
        logger.info(f"Graph database: {graph_config.graph_database_provider}")
        if graph_config.graph_database_provider == "kuzu":
            # Kuzu is file-based, so log the file path rather than a URL.
            logger.info(f"Graph database path: {graph_config.graph_file_path}")
        else:
            logger.info(f"Graph database URL: {graph_config.graph_database_url}")
    except Exception as e:
        # Best-effort logging only: never let config introspection fail startup.
        logger.warning(f"Could not retrieve database configuration: {str(e)}")
def cleanup_old_logs(logs_dir, max_files): def cleanup_old_logs(logs_dir, max_files):
""" """
Removes old log files, keeping only the most recent ones. Removes old log files, keeping only the most recent ones.
@ -193,6 +248,9 @@ def setup_logging(log_level=None, name=None):
log_level = log_level if log_level else log_levels[os.getenv("LOG_LEVEL", "INFO")] log_level = log_level if log_level else log_levels[os.getenv("LOG_LEVEL", "INFO")]
# Configure external library logging early to suppress verbose output
configure_external_library_logging()
def exception_handler(logger, method_name, event_dict): def exception_handler(logger, method_name, event_dict):
"""Custom processor to handle uncaught exceptions.""" """Custom processor to handle uncaught exceptions."""
# Check if there's an exc_info that needs to be processed # Check if there's an exc_info that needs to be processed
@ -339,6 +397,9 @@ def setup_logging(log_level=None, name=None):
logger.info("Want to learn more? Visit the Cognee documentation: https://docs.cognee.ai") logger.info("Want to learn more? Visit the Cognee documentation: https://docs.cognee.ai")
# Log database configuration
log_database_configuration(logger)
# Return the configured logger # Return the configured logger
return logger return logger

View file

@ -31,7 +31,7 @@ def chunk_by_paragraph(
current_chunk = "" current_chunk = ""
chunk_index = 0 chunk_index = 0
paragraph_ids = [] paragraph_ids = []
last_cut_type = None last_cut_type = "default"
current_chunk_size = 0 current_chunk_size = 0
for paragraph_id, sentence, sentence_size, end_type in chunk_by_sentence( for paragraph_id, sentence, sentence_size, end_type in chunk_by_sentence(
@ -77,6 +77,9 @@ def chunk_by_paragraph(
current_chunk_size = 0 current_chunk_size = 0
chunk_index += 1 chunk_index += 1
if not end_type:
end_type = "default"
last_cut_type = end_type last_cut_type = end_type
# Yield any remaining text # Yield any remaining text

View file

@ -1,5 +1,6 @@
from typing import AsyncGenerator from typing import AsyncGenerator
from cognee.shared.logging_utils import get_logger
from cognee.modules.data.processing.document_types.Document import Document from cognee.modules.data.processing.document_types.Document import Document
from sqlalchemy import select from sqlalchemy import select
from cognee.modules.data.models import Data from cognee.modules.data.models import Data
@ -7,6 +8,7 @@ from cognee.infrastructure.databases.relational import get_relational_engine
from uuid import UUID from uuid import UUID
from cognee.modules.chunking.TextChunker import TextChunker from cognee.modules.chunking.TextChunker import TextChunker
from cognee.modules.chunking.Chunker import Chunker from cognee.modules.chunking.Chunker import Chunker
from cognee.modules.data.processing.document_types.exceptions.exceptions import PyPdfInternalError
async def update_document_token_count(document_id: UUID, token_count: int) -> None: async def update_document_token_count(document_id: UUID, token_count: int) -> None:
@ -38,10 +40,13 @@ async def extract_chunks_from_documents(
""" """
for document in documents: for document in documents:
document_token_count = 0 document_token_count = 0
for document_chunk in document.read(max_chunk_size=max_chunk_size, chunker_cls=chunker): try:
document_token_count += document_chunk.chunk_size for document_chunk in document.read(max_chunk_size=max_chunk_size, chunker_cls=chunker):
document_chunk.belongs_to_set = document.belongs_to_set document_token_count += document_chunk.chunk_size
yield document_chunk document_chunk.belongs_to_set = document.belongs_to_set
yield document_chunk
await update_document_token_count(document.id, document_token_count) await update_document_token_count(document.id, document_token_count)
except PyPdfInternalError:
pass
# todo rita # todo rita

View file

@ -180,10 +180,12 @@ async def ingest_data(
await session.commit() await session.commit()
await give_permission_on_dataset(user, dataset.id, "read") # Only give permission if dataset owner is same as user (to avoid giving delete and share permission to non owner users)
await give_permission_on_dataset(user, dataset.id, "write") if dataset.owner_id == user.id:
await give_permission_on_dataset(user, dataset.id, "delete") await give_permission_on_dataset(user, dataset.id, "read")
await give_permission_on_dataset(user, dataset.id, "share") await give_permission_on_dataset(user, dataset.id, "write")
await give_permission_on_dataset(user, dataset.id, "delete")
await give_permission_on_dataset(user, dataset.id, "share")
return file_paths return file_paths

View file

@ -20,6 +20,7 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any], datase
file_path = data_item file_path = data_item
# data is a file path # data is a file path
elif data_item.startswith("file://") or data_item.startswith("/"): elif data_item.startswith("file://") or data_item.startswith("/"):
# TODO: Add check if ACCEPT_LOCAL_FILE_PATH is enabled, if it's not raise an error
file_path = data_item.replace("file://", "") file_path = data_item.replace("file://", "")
# data is text # data is text
else: else:

View file

@ -0,0 +1,203 @@
import os
import cognee
import pathlib
from cognee.modules.users.exceptions import PermissionDeniedError
from cognee.shared.logging_utils import get_logger
from cognee.modules.search.types import SearchType
from cognee.modules.users.methods import get_default_user, create_user
from cognee.modules.users.permissions.methods import authorized_give_permission_on_datasets
logger = get_logger()
async def main():
    """End-to-end test of Cognee's dataset-level permission enforcement.

    Scenario:
      1. The default user and a second test user each ingest and cognify
         their own dataset (NLP and QUANTUM respectively).
      2. Each user's search must surface results only from datasets they
         can read.
      3. add / cognify / share / delete against another user's dataset must
         raise ``PermissionDeniedError`` until the owner explicitly grants
         the corresponding permission, after which each operation succeeds.

    Raises:
        AssertionError: If any permission check does not behave as expected.
    """
    # Enable permissions feature before any Cognee call is made.
    os.environ["ENABLE_BACKEND_ACCESS_CONTROL"] = "True"

    # Clean up test directories before starting
    data_directory_path = str(
        pathlib.Path(
            os.path.join(pathlib.Path(__file__).parent, ".data_storage/test_permissions")
        ).resolve()
    )
    cognee_directory_path = str(
        pathlib.Path(
            os.path.join(pathlib.Path(__file__).parent, ".cognee_system/test_permissions")
        ).resolve()
    )
    cognee.config.data_root_directory(data_directory_path)
    cognee.config.system_root_directory(cognee_directory_path)

    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    explanation_file_path = os.path.join(
        pathlib.Path(__file__).parent, "test_data/Natural_language_processing.txt"
    )

    # Add document for default user
    await cognee.add([explanation_file_path], dataset_name="NLP")

    default_user = await get_default_user()

    text = """A quantum computer is a computer that takes advantage of quantum mechanical phenomena.
    At small scales, physical matter exhibits properties of both particles and waves, and quantum computing leverages this behavior, specifically quantum superposition and entanglement, using specialized hardware that supports the preparation and manipulation of quantum states.
    Classical physics cannot explain the operation of these quantum devices, and a scalable quantum computer could perform some calculations exponentially faster (with respect to input size scaling) than any modern "classical" computer. In particular, a large-scale quantum computer could break widely used encryption schemes and aid physicists in performing physical simulations; however, the current state of the technology is largely experimental and impractical, with several obstacles to useful applications. Moreover, scalable quantum computers do not hold promise for many practical tasks, and for many important tasks quantum speedups are proven impossible.
    The basic unit of information in quantum computing is the qubit, similar to the bit in traditional digital electronics. Unlike a classical bit, a qubit can exist in a superposition of its two "basis" states. When measuring a qubit, the result is a probabilistic output of a classical bit, therefore making quantum computers nondeterministic in general. If a quantum computer manipulates the qubit in a particular way, wave interference effects can amplify the desired measurement results. The design of quantum algorithms involves creating procedures that allow a quantum computer to perform calculations efficiently and quickly.
    Physically engineering high-quality qubits has proven challenging. If a physical qubit is not sufficiently isolated from its environment, it suffers from quantum decoherence, introducing noise into calculations. Paradoxically, perfectly isolating qubits is also undesirable because quantum computations typically need to initialize qubits, perform controlled qubit interactions, and measure the resulting quantum states. Each of those operations introduces errors and suffers from noise, and such inaccuracies accumulate.
    In principle, a non-quantum (classical) computer can solve the same computational problems as a quantum computer, given enough time. Quantum advantage comes in the form of time complexity rather than computability, and quantum complexity theory shows that some quantum algorithms for carefully selected tasks require exponentially fewer computational steps than the best known non-quantum algorithms. Such tasks can in theory be solved on a large-scale quantum computer whereas classical computers would not finish computations in any reasonable amount of time. However, quantum speedup is not universal or even typical across computational tasks, since basic tasks such as sorting are proven to not allow any asymptotic quantum speedup. Claims of quantum supremacy have drawn significant attention to the discipline, but are demonstrated on contrived tasks, while near-term practical use cases remain limited.
    """

    # Add document for test user
    test_user = await create_user("user@example.com", "example")
    await cognee.add([text], dataset_name="QUANTUM", user=test_user)

    await cognee.cognify(["NLP"], user=default_user)
    await cognee.cognify(["QUANTUM"], user=test_user)

    # Check if default_user can only see information from the NLP dataset
    search_results = await cognee.search(
        query_type=SearchType.GRAPH_COMPLETION,
        query_text="What is in the document?",
        user=default_user,
    )
    assert len(search_results) == 1, "The search results list length is not one."
    print("\n\nExtracted sentences are:\n")
    for result in search_results:
        print(f"{result}\n")
    assert search_results[0]["dataset_name"] == "NLP", (
        f"Dict must contain dataset name 'NLP': {search_results[0]}"
    )

    # Check if test_user can only see information from the QUANTUM dataset
    search_results = await cognee.search(
        query_type=SearchType.GRAPH_COMPLETION,
        query_text="What is in the document?",
        user=test_user,
    )
    assert len(search_results) == 1, "The search results list length is not one."
    print("\n\nExtracted sentences are:\n")
    for result in search_results:
        print(f"{result}\n")
    assert search_results[0]["dataset_name"] == "QUANTUM", (
        f"Dict must contain dataset name 'QUANTUM': {search_results[0]}"
    )

    # Try to add document with default_user to test_user's dataset (test write permission enforcement)
    test_user_dataset_id = search_results[0]["dataset_id"]
    add_error = False
    try:
        await cognee.add(
            [explanation_file_path],
            dataset_name="QUANTUM",
            dataset_id=test_user_dataset_id,
            user=default_user,
        )
    except PermissionDeniedError:
        add_error = True
    assert add_error, "PermissionDeniedError was not raised during add as expected"

    # Try to cognify with default_user the test_user's dataset (test write permission enforcement)
    cognify_error = False
    try:
        await cognee.cognify(datasets=[test_user_dataset_id], user=default_user)
    except PermissionDeniedError:
        cognify_error = True
    assert cognify_error, "PermissionDeniedError was not raised during cognify as expected"

    # Try to add permission for a dataset default_user does not have share permission for
    give_permission_error = False
    try:
        await authorized_give_permission_on_datasets(
            default_user.id,
            [test_user_dataset_id],
            "write",
            default_user.id,
        )
    except PermissionDeniedError:
        give_permission_error = True
    assert give_permission_error, (
        "PermissionDeniedError was not raised during assignment of permission as expected"
    )

    # Actually give permission to default_user to write on test_user's dataset
    await authorized_give_permission_on_datasets(
        default_user.id,
        [test_user_dataset_id],
        "write",
        test_user.id,
    )

    # Add new data to test_user's dataset from default_user
    await cognee.add(
        [explanation_file_path],
        dataset_name="QUANTUM",
        dataset_id=test_user_dataset_id,
        user=default_user,
    )
    await cognee.cognify(datasets=[test_user_dataset_id], user=default_user)

    # Actually give permission to default_user to read on test_user's dataset
    await authorized_give_permission_on_datasets(
        default_user.id,
        [test_user_dataset_id],
        "read",
        test_user.id,
    )

    # Check if default_user can see from test_user's datasets now
    search_results = await cognee.search(
        query_type=SearchType.GRAPH_COMPLETION,
        query_text="What is in the document?",
        user=default_user,
        dataset_ids=[test_user_dataset_id],
    )
    assert len(search_results) == 1, "The search results list length is not one."
    print("\n\nExtracted sentences are:\n")
    for result in search_results:
        print(f"{result}\n")
    assert search_results[0]["dataset_name"] == "QUANTUM", (
        f"Dict must contain dataset name 'QUANTUM': {search_results[0]}"
    )

    # Check that default_user now sees information from both datasets
    search_results = await cognee.search(
        query_type=SearchType.GRAPH_COMPLETION,
        query_text="What is in the document?",
        user=default_user,
    )
    assert len(search_results) == 2, "The search results list length is not two."
    print("\n\nExtracted sentences are:\n")
    for result in search_results:
        print(f"{result}\n")

    # Try deleting data from test_user dataset with default_user without delete permission
    delete_error = False
    try:
        await cognee.delete([text], dataset_id=test_user_dataset_id, user=default_user)
    except PermissionDeniedError:
        delete_error = True
    assert delete_error, "PermissionDeniedError was not raised during delete operation as expected"

    # Try deleting data from test_user dataset with test_user
    await cognee.delete([text], dataset_id=test_user_dataset_id, user=test_user)

    # Actually give permission to default_user to delete data for test_user's dataset
    await authorized_give_permission_on_datasets(
        default_user.id,
        [test_user_dataset_id],
        "delete",
        test_user.id,
    )

    # Try deleting data from test_user dataset with default_user after getting delete permission
    await cognee.delete([explanation_file_path], dataset_id=test_user_dataset_id, user=default_user)


if __name__ == "__main__":
    import asyncio

    asyncio.run(main())

View file

@ -1,6 +1,7 @@
import os import os
import tempfile import tempfile
import pytest import pytest
from pathlib import Path
from cognee.modules.data.processing.document_types.open_data_file import open_data_file from cognee.modules.data.processing.document_types.open_data_file import open_data_file
@ -29,7 +30,8 @@ class TestOpenDataFile:
temp_file_path = f.name temp_file_path = f.name
try: try:
file_url = f"file://{temp_file_path}" # Use pathlib.Path.as_uri() for proper cross-platform file URL creation
file_url = Path(temp_file_path).as_uri()
with open_data_file(file_url, mode="r") as f: with open_data_file(file_url, mode="r") as f:
content = f.read() content = f.read()
assert content == test_content assert content == test_content
@ -44,7 +46,8 @@ class TestOpenDataFile:
temp_file_path = f.name temp_file_path = f.name
try: try:
file_url = f"file://{temp_file_path}" # Use pathlib.Path.as_uri() for proper cross-platform file URL creation
file_url = Path(temp_file_path).as_uri()
with open_data_file(file_url, mode="rb") as f: with open_data_file(file_url, mode="rb") as f:
content = f.read() content = f.read()
assert content == test_content.encode() assert content == test_content.encode()
@ -61,7 +64,8 @@ class TestOpenDataFile:
temp_file_path = f.name temp_file_path = f.name
try: try:
file_url = f"file://{temp_file_path}" # Use pathlib.Path.as_uri() for proper cross-platform file URL creation
file_url = Path(temp_file_path).as_uri()
with open_data_file(file_url, mode="r", encoding="utf-8") as f: with open_data_file(file_url, mode="r", encoding="utf-8") as f:
content = f.read() content = f.read()
assert content == test_content assert content == test_content
@ -84,7 +88,9 @@ class TestOpenDataFile:
try: try:
# Even if someone accidentally adds multiple file:// prefixes # Even if someone accidentally adds multiple file:// prefixes
file_url = f"file://file://{temp_file_path}" # Use proper file URL creation first
proper_file_url = Path(temp_file_path).as_uri()
file_url = f"file://{proper_file_url}"
with open_data_file(file_url, mode="r") as f: with open_data_file(file_url, mode="r") as f:
content = f.read() content = f.read()
# This should work because we only replace the first occurrence # This should work because we only replace the first occurrence

View file

@ -17,7 +17,7 @@ def get_cognee_version() -> str:
.strip("'\"\n ") .strip("'\"\n ")
) )
# Mark the version as a local Cognee library by appending “-dev” # Mark the version as a local Cognee library by appending “-dev”
return f"{version}-dev" return f"{version}-local"
try: try:
return importlib.metadata.version("cognee") return importlib.metadata.version("cognee")
except importlib.metadata.PackageNotFoundError: except importlib.metadata.PackageNotFoundError: