From a8ff50ceae262868cb303707f5396abe07cfed38 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Fri, 17 Oct 2025 18:09:01 +0200 Subject: [PATCH 01/85] feat: Initial multi-tenancy commit --- cognee/modules/data/methods/create_dataset.py | 5 +-- cognee/modules/data/models/Dataset.py | 1 + cognee/modules/users/methods/create_user.py | 35 +++++-------------- .../modules/users/methods/get_default_user.py | 7 +--- cognee/modules/users/methods/get_user.py | 2 +- .../users/methods/get_user_by_email.py | 2 +- cognee/modules/users/models/Tenant.py | 13 ++++--- cognee/modules/users/models/User.py | 11 +++--- cognee/modules/users/models/UserTenant.py | 12 +++++++ cognee/modules/users/models/__init__.py | 1 + .../get_all_user_permission_datasets.py | 20 +++++------ .../tenants/methods/add_user_to_tenant.py | 25 +++++++++---- .../users/tenants/methods/create_tenant.py | 16 ++++++--- examples/python/permissions_example.py | 4 ++- 14 files changed, 82 insertions(+), 72 deletions(-) create mode 100644 cognee/modules/users/models/UserTenant.py diff --git a/cognee/modules/data/methods/create_dataset.py b/cognee/modules/data/methods/create_dataset.py index c080de0e8..280c9e105 100644 --- a/cognee/modules/data/methods/create_dataset.py +++ b/cognee/modules/data/methods/create_dataset.py @@ -22,8 +22,9 @@ async def create_dataset(dataset_name: str, user: User, session: AsyncSession) - if dataset is None: # Dataset id should be generated based on dataset_name and owner_id/user so multiple users can use the same dataset_name dataset_id = await get_unique_dataset_id(dataset_name=dataset_name, user=user) - dataset = Dataset(id=dataset_id, name=dataset_name, data=[]) - dataset.owner_id = owner_id + dataset = Dataset( + id=dataset_id, name=dataset_name, data=[], owner_id=owner_id, tenant_id=user.tenant_id + ) session.add(dataset) diff --git a/cognee/modules/data/models/Dataset.py b/cognee/modules/data/models/Dataset.py index 797401d5a..00ed4da96 100644 --- a/cognee/modules/data/models/Dataset.py +++ b/cognee/modules/data/models/Dataset.py @@ -18,6 +18,7 @@ class Dataset(Base): updated_at = Column(DateTime(timezone=True), onupdate=lambda: datetime.now(timezone.utc)) owner_id = Column(UUID, index=True) + tenant_id = Column(UUID, index=True, nullable=True) acls = relationship("ACL", back_populates="dataset", cascade="all, delete-orphan") diff --git a/cognee/modules/users/methods/create_user.py b/cognee/modules/users/methods/create_user.py index 1b303bd36..953c70cd6 100644 --- a/cognee/modules/users/methods/create_user.py +++ b/cognee/modules/users/methods/create_user.py @@ -18,7 +18,6 @@ from typing import Optional async def create_user( email: str, password: str, - tenant_id: Optional[str] = None, is_superuser: bool = False, is_active: bool = True, is_verified: bool = False, @@ -30,33 +29,15 @@ async def create_user( async with relational_engine.get_async_session() as session: async with get_user_db_context(session) as user_db: async with get_user_manager_context(user_db) as user_manager: - if tenant_id: - # Check if the tenant already exists - result = await session.execute(select(Tenant).where(Tenant.id == tenant_id)) - tenant = result.scalars().first() - if not tenant: - raise TenantNotFoundError - - user = await user_manager.create( - UserCreate( - email=email, - password=password, - tenant_id=tenant.id, - is_superuser=is_superuser, - is_active=is_active, - is_verified=is_verified, - ) - ) - else: - user = await user_manager.create( - UserCreate( - email=email, - password=password, - is_superuser=is_superuser, - is_active=is_active, - is_verified=is_verified, - ) + user = await user_manager.create( + UserCreate( + email=email, + password=password, + is_superuser=is_superuser, + is_active=is_active, + is_verified=is_verified, ) + ) if auto_login: await session.refresh(user) diff --git a/cognee/modules/users/methods/get_default_user.py b/cognee/modules/users/methods/get_default_user.py index 48073a884..773545f8e 100644 --- a/cognee/modules/users/methods/get_default_user.py +++ b/cognee/modules/users/methods/get_default_user.py @@ -27,12 +27,7 @@ async def get_default_user() -> SimpleNamespace: if user is None: return await create_default_user() - # We return a SimpleNamespace to have the same user type as our SaaS - # SimpleNamespace is just a dictionary which can be accessed through attributes - auth_data = SimpleNamespace( - id=user.id, email=user.email, tenant_id=user.tenant_id, roles=[] - ) - return auth_data + return user except Exception as error: if "principals" in str(error.args): raise DatabaseNotCreatedError() from error diff --git a/cognee/modules/users/methods/get_user.py b/cognee/modules/users/methods/get_user.py index 2678a5a01..a1c87aab7 100644 --- a/cognee/modules/users/methods/get_user.py +++ b/cognee/modules/users/methods/get_user.py @@ -14,7 +14,7 @@ async def get_user(user_id: UUID): user = ( await session.execute( select(User) - .options(selectinload(User.roles), selectinload(User.tenant)) + .options(selectinload(User.roles), selectinload(User.tenants)) .where(User.id == user_id) ) ).scalar() diff --git a/cognee/modules/users/methods/get_user_by_email.py b/cognee/modules/users/methods/get_user_by_email.py index c4bd5b48e..6df989251 100644 --- a/cognee/modules/users/methods/get_user_by_email.py +++ b/cognee/modules/users/methods/get_user_by_email.py @@ -13,7 +13,7 @@ async def get_user_by_email(user_email: str): user = ( await session.execute( select(User) - .options(joinedload(User.roles), joinedload(User.tenant)) + .options(joinedload(User.roles), joinedload(User.tenants)) .where(User.email == user_email) ) ).scalar() diff --git a/cognee/modules/users/models/Tenant.py b/cognee/modules/users/models/Tenant.py index 95023a6ee..b8fa158c5 100644 --- a/cognee/modules/users/models/Tenant.py +++ b/cognee/modules/users/models/Tenant.py @@ -1,7 +1,7 @@ -from sqlalchemy.orm import relationship +from sqlalchemy.orm import relationship, Mapped from sqlalchemy import Column, String, ForeignKey, UUID from .Principal import Principal -from .User import User +from .UserTenant import UserTenant from .Role import Role @@ -13,14 +13,13 @@ class Tenant(Principal): owner_id = Column(UUID, index=True) - # One-to-Many relationship with User; specify the join via User.tenant_id - users = relationship( + users: Mapped[list["User"]] = relationship( # noqa: F821 "User", - back_populates="tenant", - foreign_keys=lambda: [User.tenant_id], + secondary=UserTenant.__tablename__, + back_populates="tenants", ) - # One-to-Many relationship with Role (if needed; similar fix) + # One-to-Many relationship with Role roles = relationship( "Role", back_populates="tenant", diff --git a/cognee/modules/users/models/User.py b/cognee/modules/users/models/User.py index 8972a5932..a98abd3bc 100644 --- a/cognee/modules/users/models/User.py +++ b/cognee/modules/users/models/User.py @@ -6,8 +6,10 @@ from sqlalchemy import ForeignKey, Column, UUID from sqlalchemy.orm import relationship, Mapped from .Principal import Principal +from .UserTenant import UserTenant from .UserRole import UserRole from .Role import Role +from .Tenant import Tenant class User(SQLAlchemyBaseUserTableUUID, Principal): @@ -15,7 +17,7 @@ class User(SQLAlchemyBaseUserTableUUID, Principal): id = Column(UUID, ForeignKey("principals.id", ondelete="CASCADE"), primary_key=True) - # Foreign key to Tenant (Many-to-One relationship) + # Foreign key to current Tenant (Many-to-One relationship) tenant_id = Column(UUID, ForeignKey("tenants.id")) # Many-to-Many Relationship with Roles @@ -25,11 +27,11 @@ class User(SQLAlchemyBaseUserTableUUID, Principal): back_populates="users", ) - # Relationship to Tenant - tenant = relationship( + # Many-to-Many Relationship with Tenants user is a part of + tenants: Mapped[list["Tenant"]] = relationship( "Tenant", + secondary=UserTenant.__tablename__, back_populates="users", - foreign_keys=[tenant_id], ) # ACL Relationship (One-to-Many) @@ -46,7 +48,6 @@ class UserRead(schemas.BaseUser[uuid_UUID]): class UserCreate(schemas.BaseUserCreate): - tenant_id: Optional[uuid_UUID] = None is_verified: bool = True diff --git a/cognee/modules/users/models/UserTenant.py b/cognee/modules/users/models/UserTenant.py new file mode 100644 index 000000000..bfb852aa5 --- /dev/null +++ b/cognee/modules/users/models/UserTenant.py @@ -0,0 +1,12 @@ +from datetime import datetime, timezone +from sqlalchemy import Column, ForeignKey, DateTime, UUID +from cognee.infrastructure.databases.relational import Base + + +class UserTenant(Base): + __tablename__ = "user_tenants" + + created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc)) + + user_id = Column(UUID, ForeignKey("users.id"), primary_key=True) + tenant_id = Column(UUID, ForeignKey("tenants.id"), primary_key=True) diff --git a/cognee/modules/users/models/__init__.py b/cognee/modules/users/models/__init__.py index ba2f40e49..5114cc45a 100644 --- a/cognee/modules/users/models/__init__.py +++ b/cognee/modules/users/models/__init__.py @@ -1,6 +1,7 @@ from .User import User from .Role import Role from .UserRole import UserRole +from .UserTenant import UserTenant from .DatasetDatabase import DatasetDatabase from .RoleDefaultPermissions import RoleDefaultPermissions from .UserDefaultPermissions import UserDefaultPermissions diff --git a/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py b/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py index 1185dd7ad..a4f538259 100644 --- a/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py +++ b/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py @@ -1,11 +1,8 @@ -from types import SimpleNamespace - from cognee.shared.logging_utils import get_logger from ...models.User import User from cognee.modules.data.models.Dataset import Dataset from cognee.modules.users.permissions.methods import get_principal_datasets -from cognee.modules.users.permissions.methods import get_role, get_tenant logger = get_logger() @@ -25,17 +22,15 @@ async def get_all_user_permission_datasets(user: User, permission_type: str) -> # Get all datasets User has explicit access to datasets.extend(await get_principal_datasets(user, permission_type)) - if user.tenant_id: - # Get all datasets all tenants have access to - tenant = await get_tenant(user.tenant_id) + # Get all tenants user is a part of + tenants = await user.awaitable_attrs.tenants + + for tenant in tenants: + # Get all datasets all tenant members have access to datasets.extend(await get_principal_datasets(tenant, permission_type)) - # Get all datasets Users roles have access to - if isinstance(user, SimpleNamespace): - # If simple namespace use roles defined in user - roles = user.roles - else: - roles = await user.awaitable_attrs.roles + # Get all datasets accessible by roles user is a part of + roles = await user.awaitable_attrs.roles for role in roles: datasets.extend(await get_principal_datasets(role, permission_type)) @@ -45,4 +40,5 @@ async def get_all_user_permission_datasets(user: User, permission_type: str) -> # If the dataset id key already exists, leave the dictionary unchanged. unique.setdefault(dataset.id, dataset) + # TODO: Add filtering out of datasets that aren't currently selected tenant of user return list(unique.values()) diff --git a/cognee/modules/users/tenants/methods/add_user_to_tenant.py b/cognee/modules/users/tenants/methods/add_user_to_tenant.py index 1374067a7..b9f5898d0 100644 --- a/cognee/modules/users/tenants/methods/add_user_to_tenant.py +++ b/cognee/modules/users/tenants/methods/add_user_to_tenant.py @@ -1,8 +1,11 @@ +from typing import Optional from uuid import UUID from sqlalchemy.exc import IntegrityError +from sqlalchemy import insert from cognee.infrastructure.databases.exceptions import EntityAlreadyExistsError from cognee.infrastructure.databases.relational import get_relational_engine +from cognee.modules.users.models.UserTenant import UserTenant from cognee.modules.users.methods import get_user from cognee.modules.users.permissions.methods import get_tenant from cognee.modules.users.exceptions import ( @@ -12,14 +15,19 @@ from cognee.modules.users.exceptions import ( ) -async def add_user_to_tenant(user_id: UUID, tenant_id: UUID, owner_id: UUID): +async def add_user_to_tenant( + user_id: UUID, tenant_id: UUID, owner_id: UUID, set_active_tenant: Optional[bool] = True +): """ Add a user with the given id to the tenant with the given id. This can only be successful if the request owner with the given id is the tenant owner. + + If set_active_tenant is true it will automatically set the users active tenant to provided tenant. Args: user_id: Id of the user. tenant_id: Id of the tenant. owner_id: Id of the request owner. + set_active_tenant: If set_active_tenant is true it will automatically set the users active tenant to provided tenant. Returns: None @@ -41,12 +49,17 @@ async def add_user_to_tenant(user_id: UUID, tenant_id: UUID, owner_id: UUID): ) try: - if user.tenant_id is None: + try: + # Add association directly to the association table + create_user_tenant_statement = insert(UserTenant).values( + user_id=user_id, tenant_id=tenant_id + ) + await session.execute(create_user_tenant_statement) + except IntegrityError: + raise EntityAlreadyExistsError(message="User is already part of group.") + + if set_active_tenant: user.tenant_id = tenant_id - elif user.tenant_id == tenant_id: - return - else: - raise IntegrityError await session.merge(user) await session.commit() diff --git a/cognee/modules/users/tenants/methods/create_tenant.py b/cognee/modules/users/tenants/methods/create_tenant.py index bfd23e08f..665e3cc18 100644 --- a/cognee/modules/users/tenants/methods/create_tenant.py +++ b/cognee/modules/users/tenants/methods/create_tenant.py @@ -1,6 +1,8 @@ from uuid import UUID +from sqlalchemy import insert from sqlalchemy.exc import IntegrityError +from cognee.modules.users.models.UserTenant import UserTenant from cognee.infrastructure.databases.exceptions import EntityAlreadyExistsError from cognee.infrastructure.databases.relational import get_relational_engine from cognee.modules.users.models import Tenant @@ -22,16 +24,22 @@ async def create_tenant(tenant_name: str, user_id: UUID) -> UUID: async with db_engine.get_async_session() as session: try: user = await get_user(user_id) - if user.tenant_id: - raise EntityAlreadyExistsError( - message="User already has a tenant. New tenant cannot be created." - ) tenant = Tenant(name=tenant_name, owner_id=user_id) session.add(tenant) await session.flush() user.tenant_id = tenant.id + + try: + # Add association directly to the association table + create_user_tenant_statement = insert(UserTenant).values( + user_id=user_id, tenant_id=tenant.id + ) + await session.execute(create_user_tenant_statement) + except IntegrityError: + raise EntityAlreadyExistsError(message="User is already part of group.") + await session.merge(user) await session.commit() return tenant.id diff --git a/examples/python/permissions_example.py b/examples/python/permissions_example.py index 4f51b660f..7c140845c 100644 --- a/examples/python/permissions_example.py +++ b/examples/python/permissions_example.py @@ -150,7 +150,9 @@ async def main(): # To add a user to a role he must be part of the same tenant/organization print("\nOperation started as user_2 to add user_3 to CogneeLab tenant/organization") - await add_user_to_tenant(user_id=user_3.id, tenant_id=tenant_id, owner_id=user_2.id) + await add_user_to_tenant( + user_id=user_3.id, tenant_id=tenant_id, owner_id=user_2.id, set_active_tenant=True + ) print( "\nOperation started by user_2, as tenant owner, to add user_3 to Researcher role inside the tenant/organization" From 0c4e3e1f5295746db287eff5101d60c4cf89c1df Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Sun, 19 Oct 2025 20:13:22 +0200 Subject: [PATCH 02/85] fix: Load tenants to default user --- cognee/modules/users/methods/get_default_user.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cognee/modules/users/methods/get_default_user.py b/cognee/modules/users/methods/get_default_user.py index 773545f8e..a48bd8928 100644 --- a/cognee/modules/users/methods/get_default_user.py +++ b/cognee/modules/users/methods/get_default_user.py @@ -18,7 +18,9 @@ async def get_default_user() -> SimpleNamespace: try: async with db_engine.get_async_session() as session: query = ( - select(User).options(selectinload(User.roles)).where(User.email == default_email) + select(User) + .options(selectinload(User.roles), selectinload(User.tenants)) + .where(User.email == default_email) ) result = await session.execute(query) From 12785e31ea327135a4e1968c90f1cb1e5891fad3 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Sun, 19 Oct 2025 21:11:14 +0200 Subject: [PATCH 03/85] fix: Resolve issue with adding user to tenants --- .../tenants/methods/add_user_to_tenant.py | 28 ++++++++----------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/cognee/modules/users/tenants/methods/add_user_to_tenant.py b/cognee/modules/users/tenants/methods/add_user_to_tenant.py index b9f5898d0..dabab6b6b 100644 --- a/cognee/modules/users/tenants/methods/add_user_to_tenant.py +++ b/cognee/modules/users/tenants/methods/add_user_to_tenant.py @@ -48,22 +48,18 @@ async def add_user_to_tenant( message="Only tenant owner can add other users to organization." ) - try: - try: - # Add association directly to the association table - create_user_tenant_statement = insert(UserTenant).values( - user_id=user_id, tenant_id=tenant_id - ) - await session.execute(create_user_tenant_statement) - except IntegrityError: - raise EntityAlreadyExistsError(message="User is already part of group.") - - if set_active_tenant: - user.tenant_id = tenant_id - + if set_active_tenant: + user.tenant_id = tenant_id await session.merge(user) await session.commit() - except IntegrityError: - raise EntityAlreadyExistsError( - message="User is already part of a tenant. Only one tenant can be assigned to user." + + try: + # Add association directly to the association table + create_user_tenant_statement = insert(UserTenant).values( + user_id=user_id, tenant_id=tenant_id ) + await session.execute(create_user_tenant_statement) + await session.commit() + + except IntegrityError: + raise EntityAlreadyExistsError(message="User is already part of group.") From 13f0423a55720debc1d04fbe9f005855c008ae53 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Sun, 19 Oct 2025 21:35:50 +0200 Subject: [PATCH 04/85] refactor: Add better TODO message --- .../permissions/methods/get_all_user_permission_datasets.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py b/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py index a4f538259..ff0f52d27 100644 --- a/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py +++ b/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py @@ -40,5 +40,6 @@ async def get_all_user_permission_datasets(user: User, permission_type: str) -> # If the dataset id key already exists, leave the dictionary unchanged. unique.setdefault(dataset.id, dataset) - # TODO: Add filtering out of datasets that aren't currently selected tenant of user + # TODO: Add filtering out of datasets that aren't currently selected tenant of user (currently selected tenant is the tenant_id value in the User model) + # TODO: Add endpoint/method to select current Tenant for a user (This UUID value should be stored in tenant_id of User model) return list(unique.values()) From d6bb95e3798984bee76a0e5cd92308097f153649 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Sun, 19 Oct 2025 21:57:39 +0200 Subject: [PATCH 05/85] fix: load tenants and roles when creating user --- cognee/modules/users/methods/create_user.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cognee/modules/users/methods/create_user.py b/cognee/modules/users/methods/create_user.py index 953c70cd6..ef325fb6f 100644 --- a/cognee/modules/users/methods/create_user.py +++ b/cognee/modules/users/methods/create_user.py @@ -42,6 +42,10 @@ async def create_user( if auto_login: await session.refresh(user) + # Update tenants and roles information for User object + _ = await user.awaitable_attrs.tenants + _ = await user.awaitable_attrs.roles + return user except UserAlreadyExists as error: print(f"User {email} already exists") From 4f874deace3b55072bf97c35b45e158d03c5d844 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Sun, 19 Oct 2025 23:50:17 +0200 Subject: [PATCH 06/85] feat: Add tenant select method/endpoint for users --- .../routers/get_permissions_router.py | 32 ++++++++++++ .../get_all_user_permission_datasets.py | 1 - .../modules/users/tenants/methods/__init__.py | 1 + .../users/tenants/methods/create_tenant.py | 7 +-- .../users/tenants/methods/select_tenant.py | 50 +++++++++++++++++++ 5 files changed, 87 insertions(+), 4 deletions(-) create mode 100644 cognee/modules/users/tenants/methods/select_tenant.py diff --git a/cognee/api/v1/permissions/routers/get_permissions_router.py b/cognee/api/v1/permissions/routers/get_permissions_router.py index 637293268..7959415da 100644 --- a/cognee/api/v1/permissions/routers/get_permissions_router.py +++ b/cognee/api/v1/permissions/routers/get_permissions_router.py @@ -220,4 +220,36 @@ def get_permissions_router() -> APIRouter: status_code=200, content={"message": "Tenant created.", "tenant_id": str(tenant_id)} ) + @permissions_router.post("/tenants/{tenant_id}") + async def select_tenant(tenant_id: UUID, user: User = Depends(get_authenticated_user)): + """ + Select current tenant. + + This endpoint selects a tenant with the specified UUID. Tenants are used + to organize users and resources in multi-tenant environments, providing + isolation and access control between different groups or organizations. + + ## Request Parameters + - **tenant_id** (UUID): UUID of the tenant to create + + ## Response + Returns a success message indicating the tenant was created. + """ + send_telemetry( + "Permissions API Endpoint Invoked", + user.id, + additional_properties={ + "endpoint": f"POST /v1/permissions/tenants/{str(tenant_id)}", + "tenant_id": tenant_id, + }, + ) + + from cognee.modules.users.tenants.methods import select_tenant as select_tenant_method + + await select_tenant_method(user_id=user.id, tenant_id=tenant_id) + + return JSONResponse( + status_code=200, content={"message": "Tenant selected.", "tenant_id": str(tenant_id)} + ) + return permissions_router diff --git a/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py b/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py index ff0f52d27..e5dbb0e4b 100644 --- a/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py +++ b/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py @@ -41,5 +41,4 @@ async def get_all_user_permission_datasets(user: User, permission_type: str) -> unique.setdefault(dataset.id, dataset) # TODO: Add filtering out of datasets that aren't currently selected tenant of user (currently selected tenant is the tenant_id value in the User model) - # TODO: Add endpoint/method to select current Tenant for a user (This UUID value should be stored in tenant_id of User model) return list(unique.values()) diff --git a/cognee/modules/users/tenants/methods/__init__.py b/cognee/modules/users/tenants/methods/__init__.py index 9a052e9c6..39e2b31bb 100644 --- a/cognee/modules/users/tenants/methods/__init__.py +++ b/cognee/modules/users/tenants/methods/__init__.py @@ -1,2 +1,3 @@ from .create_tenant import create_tenant from .add_user_to_tenant import add_user_to_tenant +from .select_tenant import select_tenant diff --git a/cognee/modules/users/tenants/methods/create_tenant.py b/cognee/modules/users/tenants/methods/create_tenant.py index 665e3cc18..60e10db5c 100644 --- a/cognee/modules/users/tenants/methods/create_tenant.py +++ b/cognee/modules/users/tenants/methods/create_tenant.py @@ -30,6 +30,8 @@ async def create_tenant(tenant_name: str, user_id: UUID) -> UUID: await session.flush() user.tenant_id = tenant.id + await session.merge(user) + await session.commit() try: # Add association directly to the association table @@ -37,11 +39,10 @@ async def create_tenant(tenant_name: str, user_id: UUID) -> UUID: user_id=user_id, tenant_id=tenant.id ) await session.execute(create_user_tenant_statement) + await session.commit() except IntegrityError: - raise EntityAlreadyExistsError(message="User is already part of group.") + raise EntityAlreadyExistsError(message="User is already part of tenant.") - await session.merge(user) - await session.commit() return tenant.id except IntegrityError as e: raise EntityAlreadyExistsError(message="Tenant already exists.") from e diff --git a/cognee/modules/users/tenants/methods/select_tenant.py b/cognee/modules/users/tenants/methods/select_tenant.py new file mode 100644 index 000000000..709e46bf2 --- /dev/null +++ b/cognee/modules/users/tenants/methods/select_tenant.py @@ -0,0 +1,50 @@ +from uuid import UUID + +import sqlalchemy.exc +from sqlalchemy import select + +from cognee.infrastructure.databases.relational import get_relational_engine +from cognee.modules.users.models.UserTenant import UserTenant +from cognee.modules.users.methods import get_user +from cognee.modules.users.permissions.methods import get_tenant +from cognee.modules.users.exceptions import UserNotFoundError, TenantNotFoundError + + +async def select_tenant(user_id: UUID, tenant_id: UUID): + """ + Set the users active tenant to provided tenant. + Args: + user_id: Id of the user. + tenant_id: Id of the tenant. + + Returns: + None + + """ + db_engine = get_relational_engine() + async with db_engine.get_async_session() as session: + user = await get_user(user_id) + tenant = await get_tenant(tenant_id) + + if not user: + raise UserNotFoundError + elif not tenant: + raise TenantNotFoundError + + # Check if User is part of Tenant + result = await session.execute( + select(UserTenant) + .where(UserTenant.user_id == user_id) + .where(UserTenant.tenant_id == tenant_id) + ) + + try: + result = result.scalar_one() + except sqlalchemy.exc.NoResultFound as e: + raise TenantNotFoundError("User Tenant relationship not found.") from e + + if result: + # If user is part of tenant update current tenant of user + user.tenant_id = tenant_id + await session.merge(user) + await session.commit() From 6934692e1b7646a493f47ec6a189e041db6cb14a Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Mon, 20 Oct 2025 15:07:13 +0200 Subject: [PATCH 07/85] refactor: Enable selection of default single user tenant --- .../routers/get_permissions_router.py | 24 ++++++++++++------- .../users/tenants/methods/select_tenant.py | 13 +++++++++- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/cognee/api/v1/permissions/routers/get_permissions_router.py b/cognee/api/v1/permissions/routers/get_permissions_router.py index 7959415da..eeea9b653 100644 --- a/cognee/api/v1/permissions/routers/get_permissions_router.py +++ b/cognee/api/v1/permissions/routers/get_permissions_router.py @@ -1,14 +1,19 @@ from uuid import UUID -from typing import List +from typing import List, Union from fastapi import APIRouter, Depends from fastapi.responses import JSONResponse from cognee.modules.users.models import User +from cognee.api.DTO import InDTO from cognee.modules.users.methods import get_authenticated_user from cognee.shared.utils import send_telemetry +class SelectTenantDTO(InDTO): + tenant_id: UUID | None = None + + def get_permissions_router() -> APIRouter: permissions_router = APIRouter() @@ -220,8 +225,8 @@ def get_permissions_router() -> APIRouter: status_code=200, content={"message": "Tenant created.", "tenant_id": str(tenant_id)} ) - @permissions_router.post("/tenants/{tenant_id}") - async def select_tenant(tenant_id: UUID, user: User = Depends(get_authenticated_user)): + @permissions_router.post("/tenants/select") + async def select_tenant(payload: SelectTenantDTO, user: User = Depends(get_authenticated_user)): """ Select current tenant. @@ -229,8 +234,10 @@ def get_permissions_router() -> APIRouter: to organize users and resources in multi-tenant environments, providing isolation and access control between different groups or organizations. + Sending a null/None value as tenant_id selects his default single user tenant + ## Request Parameters - - **tenant_id** (UUID): UUID of the tenant to create + - **tenant_id** (Union[UUID, None]): UUID of the tenant to select, If null/None is provided use the default single user tenant ## Response Returns a success message indicating the tenant was created. @@ -239,17 +246,18 @@ def get_permissions_router() -> APIRouter: "Permissions API Endpoint Invoked", user.id, additional_properties={ - "endpoint": f"POST /v1/permissions/tenants/{str(tenant_id)}", - "tenant_id": tenant_id, + "endpoint": f"POST /v1/permissions/tenants/{str(payload.tenant_id)}", + "tenant_id": str(payload.tenant_id), }, ) from cognee.modules.users.tenants.methods import select_tenant as select_tenant_method - await select_tenant_method(user_id=user.id, tenant_id=tenant_id) + await select_tenant_method(user_id=user.id, tenant_id=payload.tenant_id) return JSONResponse( - status_code=200, content={"message": "Tenant selected.", "tenant_id": str(tenant_id)} + status_code=200, + content={"message": "Tenant selected.", "tenant_id": str(payload.tenant_id)}, ) return permissions_router diff --git a/cognee/modules/users/tenants/methods/select_tenant.py b/cognee/modules/users/tenants/methods/select_tenant.py index 709e46bf2..732b24858 100644 --- a/cognee/modules/users/tenants/methods/select_tenant.py +++ b/cognee/modules/users/tenants/methods/select_tenant.py @@ -1,4 +1,5 @@ from uuid import UUID +from typing import Union import sqlalchemy.exc from sqlalchemy import select @@ -10,9 +11,11 @@ from cognee.modules.users.permissions.methods import get_tenant from cognee.modules.users.exceptions import UserNotFoundError, TenantNotFoundError -async def select_tenant(user_id: UUID, tenant_id: UUID): +async def select_tenant(user_id: UUID, tenant_id: Union[UUID, None]): """ Set the users active tenant to provided tenant. + + If None tenant_id is provided set current Tenant to the default single user-tenant Args: user_id: Id of the user. tenant_id: Id of the tenant. @@ -24,6 +27,14 @@ async def select_tenant(user_id: UUID, tenant_id: UUID): db_engine = get_relational_engine() async with db_engine.get_async_session() as session: user = await get_user(user_id) + + if tenant_id is None: + # If no tenant_id is provided set current Tenant to the single user-tenant + user.tenant_id = None + await session.merge(user) + await session.commit() + return + tenant = await get_tenant(tenant_id) if not user: From 742866b4c9f1d4aa53ab60fb54b79474fbfea0d2 Mon Sep 17 00:00:00 2001 From: EricXiao Date: Wed, 22 Oct 2025 16:56:46 +0800 Subject: [PATCH 08/85] feat: csv ingestion loader & chunk Signed-off-by: EricXiao --- cognee/cli/commands/cognify_command.py | 9 +- cognee/cli/config.py | 2 +- .../files/utils/guess_file_type.py | 43 +++++ .../files/utils/is_csv_content.py | 181 ++++++++++++++++++ cognee/infrastructure/loaders/LoaderEngine.py | 1 + .../infrastructure/loaders/core/__init__.py | 3 +- .../infrastructure/loaders/core/csv_loader.py | 93 +++++++++ .../loaders/core/text_loader.py | 3 +- .../loaders/supported_loaders.py | 3 +- cognee/modules/chunking/CsvChunker.py | 35 ++++ .../processing/document_types/CsvDocument.py | 33 ++++ .../processing/document_types/__init__.py | 1 + cognee/tasks/chunks/__init__.py | 1 + cognee/tasks/chunks/chunk_by_row.py | 94 +++++++++ cognee/tasks/documents/classify_documents.py | 2 + .../integration/documents/CsvDocument_test.py | 70 +++++++ .../tests/test_data/example_with_header.csv | 3 + .../processing/chunks/chunk_by_row_test.py | 52 +++++ 18 files changed, 623 insertions(+), 6 deletions(-) create mode 100644 cognee/infrastructure/files/utils/is_csv_content.py create mode 100644 cognee/infrastructure/loaders/core/csv_loader.py create mode 100644 cognee/modules/chunking/CsvChunker.py create mode 100644 cognee/modules/data/processing/document_types/CsvDocument.py create mode 100644 cognee/tasks/chunks/chunk_by_row.py create mode 100644 cognee/tests/integration/documents/CsvDocument_test.py create mode 100644 cognee/tests/test_data/example_with_header.csv create mode 100644 cognee/tests/unit/processing/chunks/chunk_by_row_test.py diff --git a/cognee/cli/commands/cognify_command.py b/cognee/cli/commands/cognify_command.py index 16eaf0454..b89c1f70e 100644 --- a/cognee/cli/commands/cognify_command.py +++ b/cognee/cli/commands/cognify_command.py @@ -22,7 +22,7 @@ relationships, and creates semantic connections for enhanced search and reasonin Processing Pipeline: 1. **Document Classification**: Identifies document types and structures -2. **Permission Validation**: Ensures user has processing rights +2. **Permission Validation**: Ensures user has processing rights 3. **Text Chunking**: Breaks content into semantically meaningful segments 4. **Entity Extraction**: Identifies key concepts, people, places, organizations 5. **Relationship Detection**: Discovers connections between entities @@ -97,6 +97,13 @@ After successful cognify processing, use `cognee search` to query the knowledge chunker_class = LangchainChunker except ImportError: fmt.warning("LangchainChunker not available, using TextChunker") + elif args.chunker == "CsvChunker": + try: + from cognee.modules.chunking.CsvChunker import CsvChunker + + chunker_class = CsvChunker + except ImportError: + fmt.warning("CsvChunker not available, using TextChunker") result = await cognee.cognify( datasets=datasets, diff --git a/cognee/cli/config.py b/cognee/cli/config.py index d016608c1..082adbaec 100644 --- a/cognee/cli/config.py +++ b/cognee/cli/config.py @@ -26,7 +26,7 @@ SEARCH_TYPE_CHOICES = [ ] # Chunker choices -CHUNKER_CHOICES = ["TextChunker", "LangchainChunker"] +CHUNKER_CHOICES = ["TextChunker", "LangchainChunker", "CsvChunker"] # Output format choices OUTPUT_FORMAT_CHOICES = ["json", "pretty", "simple"] diff --git a/cognee/infrastructure/files/utils/guess_file_type.py b/cognee/infrastructure/files/utils/guess_file_type.py index dcdd68cad..10f59a400 100644 --- a/cognee/infrastructure/files/utils/guess_file_type.py +++ b/cognee/infrastructure/files/utils/guess_file_type.py @@ -1,6 +1,8 @@ from typing import BinaryIO import filetype + from .is_text_content import is_text_content +from .is_csv_content import is_csv_content class FileTypeException(Exception): @@ -134,3 +136,44 @@ def guess_file_type(file: BinaryIO) -> filetype.Type: raise FileTypeException(f"Unknown file detected: {file.name}.") return file_type + + +class CsvFileType(filetype.Type): + """ + Match CSV file types based on MIME type and extension. + + Public methods: + - match + + Instance variables: + - MIME: The MIME type of the CSV. + - EXTENSION: The file extension of the CSV. + """ + + MIME = "text/csv" + EXTENSION = "csv" + + def __init__(self): + super().__init__(mime=self.MIME, extension=self.EXTENSION) + + def match(self, buf): + """ + Determine if the given buffer contains csv content. + + Parameters: + ----------- + + - buf: The buffer to check for csv content. + + Returns: + -------- + + Returns True if the buffer is identified as csv content, otherwise False. + """ + + return is_csv_content(buf) + + +csv_file_type = CsvFileType() + +filetype.add_type(csv_file_type) diff --git a/cognee/infrastructure/files/utils/is_csv_content.py b/cognee/infrastructure/files/utils/is_csv_content.py new file mode 100644 index 000000000..07b7ea69b --- /dev/null +++ b/cognee/infrastructure/files/utils/is_csv_content.py @@ -0,0 +1,181 @@ +import csv +from collections import Counter + + +def is_csv_content(content): + """ + Heuristically determine whether a bytes-like object is CSV text. + + Strategy (fail-fast and cheap to expensive): + 1) Decode: Try a small ordered list of common encodings with strict errors. + 2) Line sampling: require >= 2 non-empty lines; sample up to 50 lines. + 3) Delimiter detection: + - Prefer csv.Sniffer() with common delimiters. + - Fallback to a lightweight consistency heuristic. + 4) Lightweight parse check: + - Parse a few lines with the delimiter. + - Ensure at least 2 valid rows and relatively stable column counts. + + Returns: + bool: True if the buffer looks like CSV; False otherwise. + """ + try: + encoding_list = [ + "utf-8", + "utf-8-sig", + "utf-32-le", + "utf-32-be", + "utf-16-le", + "utf-16-be", + "gb18030", + "shift_jis", + "cp949", + "cp1252", + "iso-8859-1", + ] + + # Try to decode strictly—if decoding fails for all encodings, it's not text/CSV. + text = None + for enc in encoding_list: + try: + text = content.decode(enc, errors="strict") + break + except UnicodeDecodeError: + continue + if text is None: + return False + + # Reject empty/whitespace-only payloads. + stripped = text.strip() + if not stripped: + return False + + # Split into logical lines and drop empty ones. Require at least two lines. + lines = [ln for ln in text.splitlines() if ln.strip()] + if len(lines) < 2: + return False + + # Take a small sample to keep sniffing cheap and predictable. + sample_lines = lines[:50] + + # Detect delimiter using csv.Sniffer first; if that fails, use our heuristic. + delimiter = _sniff_delimiter(sample_lines) or _heuristic_delimiter(sample_lines) + if not delimiter: + return False + + # Finally, do a lightweight parse sanity check with the chosen delimiter. + return _lightweight_parse_check(sample_lines, delimiter) + except Exception: + return False + + +def _sniff_delimiter(lines): + """ + Try Python's built-in csv.Sniffer on a sample. + + Args: + lines (list[str]): Sample lines (already decoded). + + Returns: + str | None: The detected delimiter if sniffing succeeds; otherwise None. + """ + # Join up to 50 lines to form the sample string Sniffer will inspect. + sample = "\n".join(lines[:50]) + try: + dialect = csv.Sniffer().sniff(sample, delimiters=",\t;|") + return dialect.delimiter + except Exception: + # Sniffer is known to be brittle on small/dirty samples—silently fallback. + return None + + +def _heuristic_delimiter(lines): + """ + Fallback delimiter detection based on count consistency per line. + + Heuristic: + - For each candidate delimiter, count occurrences per line. + - Keep only lines with count > 0 (line must contain the delimiter). + - Require at least half of lines to contain the delimiter (min 2). + - Compute the mode (most common count). If the proportion of lines that + exhibit the modal count is >= 80%, accept that delimiter. + + Args: + lines (list[str]): Sample lines. + + Returns: + str | None: Best delimiter if one meets the consistency threshold; else None. + """ + candidates = [",", "\t", ";", "|"] + best = None + best_score = 0.0 + + for d in candidates: + # Count how many times the delimiter appears in each line. + counts = [ln.count(d) for ln in lines] + # Consider only lines that actually contain the delimiter at least once. + nonzero = [c for c in counts if c > 0] + + # Require that more than half of lines (and at least 2) contain the delimiter. + if len(nonzero) < max(2, int(0.5 * len(lines))): + continue + + # Find the modal count and its frequency. + cnt = Counter(nonzero) + pairs = cnt.most_common(1) + if not pairs: + continue + + mode, mode_freq = pairs[0] + # Consistency ratio: lines with the modal count / total lines in the sample. + consistency = mode_freq / len(lines) + # Accept if consistent enough and better than any previous candidate. + if mode >= 1 and consistency >= 0.80 and consistency > best_score: + best = d + best_score = consistency + + return best + + +def _lightweight_parse_check(lines, delimiter): + """ + Parse a few lines with csv.reader and check structural stability. + + Heuristic: + - Parse up to 5 lines with the given delimiter. + - Count column widths per parsed row. + - Require at least 2 non-empty rows. + - Allow at most 1 row whose width deviates by >2 columns from the first row. + + Args: + lines (list[str]): Sample lines (decoded). + delimiter (str): Delimiter chosen by sniffing/heuristics. + + Returns: + bool: True if parsing looks stable; False otherwise. + """ + try: + # csv.reader accepts any iterable of strings; feeding the first 10 lines is fine. + reader = csv.reader(lines[:10], delimiter=delimiter) + widths = [] + valid_rows = 0 + for row in reader: + if not row: + continue + + widths.append(len(row)) + valid_rows += 1 + + # Need at least two meaningful rows to make a judgment. + if valid_rows < 2: + return False + + if widths: + first = widths[0] + # Count rows whose width deviates significantly (>2) from the first row. + unstable = sum(1 for w in widths if abs(w - first) > 2) + # Permit at most 1 unstable row among the parsed sample. + return unstable <= 1 + return False + except Exception: + return False diff --git a/cognee/infrastructure/loaders/LoaderEngine.py b/cognee/infrastructure/loaders/LoaderEngine.py index 6b62f7641..37e63c9fc 100644 --- a/cognee/infrastructure/loaders/LoaderEngine.py +++ b/cognee/infrastructure/loaders/LoaderEngine.py @@ -30,6 +30,7 @@ class LoaderEngine: "pypdf_loader", "image_loader", "audio_loader", + "csv_loader", "unstructured_loader", "advanced_pdf_loader", ] diff --git a/cognee/infrastructure/loaders/core/__init__.py b/cognee/infrastructure/loaders/core/__init__.py index 8a2df80f9..09819fbd2 100644 --- a/cognee/infrastructure/loaders/core/__init__.py +++ b/cognee/infrastructure/loaders/core/__init__.py @@ -3,5 +3,6 @@ from .text_loader import TextLoader from .audio_loader import AudioLoader from .image_loader import ImageLoader +from .csv_loader import CsvLoader -__all__ = ["TextLoader", "AudioLoader", "ImageLoader"] +__all__ = ["TextLoader", "AudioLoader", "ImageLoader", "CsvLoader"] diff --git a/cognee/infrastructure/loaders/core/csv_loader.py b/cognee/infrastructure/loaders/core/csv_loader.py new file mode 100644 index 000000000..a314a7a24 --- /dev/null +++ b/cognee/infrastructure/loaders/core/csv_loader.py @@ -0,0 +1,93 @@ +import os +from typing import List +import csv +from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface +from cognee.infrastructure.files.storage import get_file_storage, get_storage_config +from cognee.infrastructure.files.utils.get_file_metadata import get_file_metadata + + +class CsvLoader(LoaderInterface): + """ + Core CSV file loader that handles basic CSV file formats. + """ + + @property + def supported_extensions(self) -> List[str]: + """Supported text file extensions.""" + return [ + "csv", + ] + + @property + def supported_mime_types(self) -> List[str]: + """Supported MIME types for text content.""" + return [ + "text/csv", + ] + + @property + def loader_name(self) -> str: + """Unique identifier for this loader.""" + return "csv_loader" + + def can_handle(self, extension: str, mime_type: str) -> bool: + """ + Check if this loader can handle the given file. + + Args: + extension: File extension + mime_type: Optional MIME type + + Returns: + True if file can be handled, False otherwise + """ + if extension in self.supported_extensions and mime_type in self.supported_mime_types: + return True + + return False + + async def load(self, file_path: str, encoding: str = "utf-8", **kwargs): + """ + Load and process the csv file. + + Args: + file_path: Path to the file to load + encoding: Text encoding to use (default: utf-8) + **kwargs: Additional configuration (unused) + + Returns: + LoaderResult containing the file content and metadata + + Raises: + FileNotFoundError: If file doesn't exist + UnicodeDecodeError: If file cannot be decoded with specified encoding + OSError: If file cannot be read + """ + if not os.path.exists(file_path): + raise FileNotFoundError(f"File not found: {file_path}") + + with open(file_path, "rb") as f: + file_metadata = await get_file_metadata(f) + # Name ingested file of current loader based on original file content hash + storage_file_name = "text_" + file_metadata["content_hash"] + ".txt" + + row_texts = [] + row_index = 1 + + with open(file_path, "r", encoding=encoding, newline="") as file: + reader = csv.DictReader(file) + for row in reader: + pairs = [f"{str(k)}: {str(v)}" for k, v in row.items()] + row_text = ", ".join(pairs) + row_texts.append(f"Row {row_index}:\n{row_text}\n") + row_index += 1 + + content = "\n".join(row_texts) + + storage_config = get_storage_config() + data_root_directory = storage_config["data_root_directory"] + storage = get_file_storage(data_root_directory) + + full_file_path = await storage.store(storage_file_name, content) + + return full_file_path diff --git a/cognee/infrastructure/loaders/core/text_loader.py b/cognee/infrastructure/loaders/core/text_loader.py index a6f94be9b..e478edb22 100644 --- a/cognee/infrastructure/loaders/core/text_loader.py +++ b/cognee/infrastructure/loaders/core/text_loader.py @@ -16,7 +16,7 @@ class TextLoader(LoaderInterface): @property def supported_extensions(self) -> List[str]: """Supported text file extensions.""" - return ["txt", "md", "csv", "json", "xml", "yaml", "yml", "log"] + return ["txt", "md", "json", "xml", "yaml", "yml", "log"] @property def supported_mime_types(self) -> List[str]: @@ -24,7 +24,6 @@ class TextLoader(LoaderInterface): return [ "text/plain", "text/markdown", - "text/csv", "application/json", "text/xml", "application/xml", diff --git a/cognee/infrastructure/loaders/supported_loaders.py b/cognee/infrastructure/loaders/supported_loaders.py index d103babe3..b506df5f3 100644 --- a/cognee/infrastructure/loaders/supported_loaders.py +++ b/cognee/infrastructure/loaders/supported_loaders.py @@ -1,5 +1,5 @@ from cognee.infrastructure.loaders.external import PyPdfLoader -from cognee.infrastructure.loaders.core import TextLoader, AudioLoader, ImageLoader +from cognee.infrastructure.loaders.core import TextLoader, AudioLoader, ImageLoader, CsvLoader # Registry for loader implementations supported_loaders = { @@ -7,6 +7,7 @@ supported_loaders = { TextLoader.loader_name: TextLoader, ImageLoader.loader_name: ImageLoader, AudioLoader.loader_name: AudioLoader, + CsvLoader.loader_name: CsvLoader, } # Try adding optional loaders diff --git a/cognee/modules/chunking/CsvChunker.py b/cognee/modules/chunking/CsvChunker.py new file mode 100644 index 000000000..4ba4a969e --- /dev/null +++ b/cognee/modules/chunking/CsvChunker.py @@ -0,0 +1,35 @@ +from cognee.shared.logging_utils import get_logger + + +from cognee.tasks.chunks import chunk_by_row +from cognee.modules.chunking.Chunker import Chunker +from .models.DocumentChunk import DocumentChunk + +logger = get_logger() + + +class CsvChunker(Chunker): + async def read(self): + async for content_text in self.get_text(): + if content_text is None: + continue + + for chunk_data in chunk_by_row(content_text, self.max_chunk_size): + if chunk_data["chunk_size"] <= self.max_chunk_size: + yield DocumentChunk( + id=chunk_data["chunk_id"], + text=chunk_data["text"], + chunk_size=chunk_data["chunk_size"], + is_part_of=self.document, + chunk_index=self.chunk_index, + cut_type=chunk_data["cut_type"], + contains=[], + metadata={ + "index_fields": ["text"], + }, + ) + self.chunk_index += 1 + else: + raise ValueError( + f"Chunk size is larger than the maximum chunk size {self.max_chunk_size}" + ) diff --git a/cognee/modules/data/processing/document_types/CsvDocument.py b/cognee/modules/data/processing/document_types/CsvDocument.py new file mode 100644 index 000000000..3381275bd --- /dev/null +++ b/cognee/modules/data/processing/document_types/CsvDocument.py @@ -0,0 +1,33 @@ +import io +import csv +from typing import Type + +from cognee.modules.chunking.Chunker import Chunker +from cognee.infrastructure.files.utils.open_data_file import open_data_file +from .Document import Document + + +class CsvDocument(Document): + type: str = "csv" + mime_type: str = "text/csv" + + async def read(self, chunker_cls: Type[Chunker], max_chunk_size: int): + async def get_text(): + async with open_data_file( + self.raw_data_location, mode="r", encoding="utf-8", newline="" + ) as file: + content = file.read() + file_like_obj = io.StringIO(content) + reader = csv.DictReader(file_like_obj) + + for row in reader: + pairs = [f"{str(k)}: {str(v)}" for k, v in row.items()] + row_text = ", ".join(pairs) + if not row_text.strip(): + break + yield row_text + + chunker = chunker_cls(self, max_chunk_size=max_chunk_size, get_text=get_text) + + async for chunk in chunker.read(): + yield chunk diff --git a/cognee/modules/data/processing/document_types/__init__.py b/cognee/modules/data/processing/document_types/__init__.py index 2e862f4ba..133dd53f8 100644 --- a/cognee/modules/data/processing/document_types/__init__.py +++ b/cognee/modules/data/processing/document_types/__init__.py @@ -4,3 +4,4 @@ from .TextDocument import TextDocument from .ImageDocument import ImageDocument from .AudioDocument import AudioDocument from .UnstructuredDocument import UnstructuredDocument +from .CsvDocument import CsvDocument diff --git a/cognee/tasks/chunks/__init__.py b/cognee/tasks/chunks/__init__.py index 22ce96be8..37d4de73e 100644 --- a/cognee/tasks/chunks/__init__.py +++ b/cognee/tasks/chunks/__init__.py @@ -1,4 +1,5 @@ from .chunk_by_word import chunk_by_word from .chunk_by_sentence import chunk_by_sentence from .chunk_by_paragraph import chunk_by_paragraph +from .chunk_by_row import chunk_by_row from .remove_disconnected_chunks import remove_disconnected_chunks diff --git a/cognee/tasks/chunks/chunk_by_row.py b/cognee/tasks/chunks/chunk_by_row.py new file mode 100644 index 000000000..8daf13689 --- /dev/null +++ b/cognee/tasks/chunks/chunk_by_row.py @@ -0,0 +1,94 @@ +from typing import Any, Dict, Iterator +from uuid import NAMESPACE_OID, uuid5 + +from cognee.infrastructure.databases.vector.embeddings import get_embedding_engine + + +def _get_pair_size(pair_text: str) -> int: + """ + Calculate the size of a given text in terms of tokens. + + If an embedding engine's tokenizer is available, count the tokens for the provided word. + If the tokenizer is not available, assume the word counts as one token. + + Parameters: + ----------- + + - pair_text (str): The key:value pair text for which the token size is to be calculated. + + Returns: + -------- + + - int: The number of tokens representing the text, typically an integer, depending + on the tokenizer's output. + """ + embedding_engine = get_embedding_engine() + if embedding_engine.tokenizer: + return embedding_engine.tokenizer.count_tokens(pair_text) + else: + return 3 + + +def chunk_by_row( + data: str, + max_chunk_size, +) -> Iterator[Dict[str, Any]]: + """ + Chunk the input text by row while enabling exact text reconstruction. + + This function divides the given text data into smaller chunks on a line-by-line basis, + ensuring that the size of each chunk is less than or equal to the specified maximum + chunk size. It guarantees that when the generated chunks are concatenated, they + reproduce the original text accurately. The tokenization process is handled by + adapters compatible with the vector engine's embedding model. + + Parameters: + ----------- + + - data (str): The input text to be chunked. + - max_chunk_size: The maximum allowed size for each chunk, in terms of tokens or + words. + """ + current_chunk_list = [] + chunk_index = 0 + current_chunk_size = 0 + + lines = data.split("\n\n") + for line in lines: + pairs_text = line.split(", ") + + for pair_text in pairs_text: + pair_size = _get_pair_size(pair_text) + if current_chunk_size > 0 and (current_chunk_size + pair_size > max_chunk_size): + # Yield current cut chunk + current_chunk = ", ".join(current_chunk_list) + chunk_dict = { + "text": current_chunk, + "chunk_size": current_chunk_size, + "chunk_id": uuid5(NAMESPACE_OID, current_chunk), + "chunk_index": chunk_index, + "cut_type": "row_cut", + } + + yield chunk_dict + + # Start new chunk with current pair text + current_chunk_list = [] + current_chunk_size = 0 + chunk_index += 1 + + current_chunk_list.append(pair_text) + current_chunk_size += pair_size + + # Yield row chunk + current_chunk = ", ".join(current_chunk_list) + if current_chunk: + chunk_dict = { + "text": current_chunk, + "chunk_size": current_chunk_size, + "chunk_id": uuid5(NAMESPACE_OID, current_chunk), + "chunk_index": chunk_index, + "cut_type": "row_end", + } + + yield chunk_dict diff --git a/cognee/tasks/documents/classify_documents.py b/cognee/tasks/documents/classify_documents.py index 9fa512906..e4f13ebd1 100644 --- a/cognee/tasks/documents/classify_documents.py +++ b/cognee/tasks/documents/classify_documents.py @@ -7,6 +7,7 @@ from cognee.modules.data.processing.document_types import ( ImageDocument, TextDocument, UnstructuredDocument, + CsvDocument, ) from cognee.modules.engine.models.node_set import NodeSet from cognee.modules.engine.utils.generate_node_id import generate_node_id @@ -15,6 +16,7 @@ from cognee.tasks.documents.exceptions import WrongDataDocumentInputError EXTENSION_TO_DOCUMENT_CLASS = { "pdf": PdfDocument, # Text documents "txt": TextDocument, + "csv": CsvDocument, "docx": UnstructuredDocument, "doc": UnstructuredDocument, "odt": UnstructuredDocument, diff --git a/cognee/tests/integration/documents/CsvDocument_test.py b/cognee/tests/integration/documents/CsvDocument_test.py new file mode 100644 index 000000000..421bb81bd --- /dev/null +++ b/cognee/tests/integration/documents/CsvDocument_test.py @@ -0,0 +1,70 @@ +import os +import sys +import uuid +import pytest +import pathlib +from unittest.mock import patch + +from cognee.modules.chunking.CsvChunker import CsvChunker +from cognee.modules.data.processing.document_types.CsvDocument import CsvDocument +from cognee.tests.integration.documents.AudioDocument_test import mock_get_embedding_engine +from cognee.tests.integration.documents.async_gen_zip import async_gen_zip + +chunk_by_row_module = sys.modules.get("cognee.tasks.chunks.chunk_by_row") + + +GROUND_TRUTH = { + "chunk_size_10": [ + {"token_count": 9, "len_text": 26, "cut_type": "row_cut", "chunk_index": 0}, + {"token_count": 6, "len_text": 29, "cut_type": "row_end", "chunk_index": 1}, + {"token_count": 9, "len_text": 25, "cut_type": "row_cut", "chunk_index": 2}, + {"token_count": 6, "len_text": 30, "cut_type": "row_end", "chunk_index": 3}, + ], + "chunk_size_128": [ + {"token_count": 15, "len_text": 57, "cut_type": "row_end", "chunk_index": 0}, + {"token_count": 15, "len_text": 57, "cut_type": "row_end", "chunk_index": 1}, + ], +} + + +@pytest.mark.parametrize( + "input_file,chunk_size", + [("example_with_header.csv", 10), ("example_with_header.csv", 128)], +) +@patch.object(chunk_by_row_module, "get_embedding_engine", side_effect=mock_get_embedding_engine) +@pytest.mark.asyncio +async def test_CsvDocument(mock_engine, input_file, chunk_size): + # Define file paths of test data + csv_file_path = os.path.join( + pathlib.Path(__file__).parent.parent.parent, + "test_data", + input_file, + ) + + # Define test documents + csv_document = CsvDocument( + id=uuid.uuid4(), + name="example_with_header.csv", + raw_data_location=csv_file_path, + external_metadata="", + mime_type="text/csv", + ) + + # TEST CSV + ground_truth_key = f"chunk_size_{chunk_size}" + async for ground_truth, row_data in async_gen_zip( + GROUND_TRUTH[ground_truth_key], + csv_document.read(chunker_cls=CsvChunker, max_chunk_size=chunk_size), + ): + assert ground_truth["token_count"] == row_data.chunk_size, ( + f'{ground_truth["token_count"] = } != {row_data.chunk_size = }' + ) + assert ground_truth["len_text"] == len(row_data.text), ( + f'{ground_truth["len_text"] = } != {len(row_data.text) = }' + ) + assert ground_truth["cut_type"] == row_data.cut_type, ( + f'{ground_truth["cut_type"] = } != {row_data.cut_type = }' + ) + assert ground_truth["chunk_index"] == row_data.chunk_index, ( + f'{ground_truth["chunk_index"] = } != {row_data.chunk_index = }' + ) diff --git a/cognee/tests/test_data/example_with_header.csv b/cognee/tests/test_data/example_with_header.csv new file mode 100644 index 000000000..dc900e5ef --- /dev/null +++ b/cognee/tests/test_data/example_with_header.csv @@ -0,0 +1,3 @@ +id,name,age,city,country +1,Eric,30,Beijing,China +2,Joe,35,Berlin,Germany diff --git a/cognee/tests/unit/processing/chunks/chunk_by_row_test.py b/cognee/tests/unit/processing/chunks/chunk_by_row_test.py new file mode 100644 index 000000000..7d6a73a06 --- /dev/null +++ b/cognee/tests/unit/processing/chunks/chunk_by_row_test.py @@ -0,0 +1,52 @@ +from itertools import product + +import numpy as np +import pytest + +from cognee.infrastructure.databases.vector.embeddings import get_embedding_engine +from cognee.tasks.chunks import chunk_by_row + +INPUT_TEXTS = "name: John, age: 30, city: New York, country: USA" +max_chunk_size_vals = [8, 32] + + +@pytest.mark.parametrize( + "input_text,max_chunk_size", + list(product([INPUT_TEXTS], max_chunk_size_vals)), +) +def test_chunk_by_row_isomorphism(input_text, max_chunk_size): + chunks = chunk_by_row(input_text, max_chunk_size) + reconstructed_text = ", ".join([chunk["text"] for chunk in chunks]) + assert reconstructed_text == input_text, ( + f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }" + ) + + +@pytest.mark.parametrize( + "input_text,max_chunk_size", + list(product([INPUT_TEXTS], max_chunk_size_vals)), +) +def test_row_chunk_length(input_text, max_chunk_size): + chunks = list(chunk_by_row(data=input_text, max_chunk_size=max_chunk_size)) + embedding_engine = get_embedding_engine() + + chunk_lengths = np.array( + [embedding_engine.tokenizer.count_tokens(chunk["text"]) for chunk in chunks] + ) + + larger_chunks = chunk_lengths[chunk_lengths > max_chunk_size] + assert np.all(chunk_lengths <= max_chunk_size), ( + f"{max_chunk_size = }: {larger_chunks} are too large" + ) + + +@pytest.mark.parametrize( + "input_text,max_chunk_size", + list(product([INPUT_TEXTS], max_chunk_size_vals)), +) +def test_chunk_by_row_chunk_numbering(input_text, max_chunk_size): + chunks = chunk_by_row(data=input_text, max_chunk_size=max_chunk_size) + chunk_indices = np.array([chunk["chunk_index"] for chunk in chunks]) + assert np.all(chunk_indices == np.arange(len(chunk_indices))), ( + f"{chunk_indices = } are not monotonically increasing" + ) From 8566516ceca89a0e85db6aa5ba967f5d8070b2c7 Mon Sep 17 00:00:00 2001 From: EricXiao Date: Wed, 22 Oct 2025 16:59:07 +0800 Subject: [PATCH 09/85] chore: Remove local test code Signed-off-by: EricXiao --- .../loaders/external/advanced_pdf_loader.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/cognee/infrastructure/loaders/external/advanced_pdf_loader.py b/cognee/infrastructure/loaders/external/advanced_pdf_loader.py index 6d1412b77..4b3ba296a 100644 --- a/cognee/infrastructure/loaders/external/advanced_pdf_loader.py +++ b/cognee/infrastructure/loaders/external/advanced_pdf_loader.py @@ -227,12 +227,3 @@ class AdvancedPdfLoader(LoaderInterface): if value is None: return "" return str(value).replace("\xa0", " ").strip() - - -if __name__ == "__main__": - loader = AdvancedPdfLoader() - asyncio.run( - loader.load( - "/Users/xiaotao/work/cognee/cognee/infrastructure/loaders/external/attention_is_all_you_need.pdf" - ) - ) From a4a9e762465ecaf0dcdb9b0132db7951d11b437c Mon Sep 17 00:00:00 2001 From: Fahad Shoaib Date: Sun, 2 Nov 2025 17:05:03 +0500 Subject: [PATCH 10/85] feat: add ontology endpoint in REST API - Add POST /api/v1/ontologies endpoint for file upload - Add GET /api/v1/ontologies endpoint for listing ontologies - Implement OntologyService for file management and metadata - Integrate ontology_key parameter in cognify endpoint - Update RDFLibOntologyResolver to support file-like objects - Add essential test suite for ontology endpoints --- cognee/api/client.py | 3 + .../v1/cognify/routers/get_cognify_router.py | 31 +++++- cognee/api/v1/ontologies/__init__.py | 4 + cognee/api/v1/ontologies/ontologies.py | 101 ++++++++++++++++++ cognee/api/v1/ontologies/routers/__init__.py | 0 .../ontologies/routers/get_ontology_router.py | 89 +++++++++++++++ .../rdf_xml/RDFLibOntologyResolver.py | 69 +++++++----- cognee/tests/test_ontology_endpoint.py | 89 +++++++++++++++ 8 files changed, 356 insertions(+), 30 deletions(-) create mode 100644 cognee/api/v1/ontologies/__init__.py create mode 100644 cognee/api/v1/ontologies/ontologies.py create mode 100644 cognee/api/v1/ontologies/routers/__init__.py create mode 100644 cognee/api/v1/ontologies/routers/get_ontology_router.py create mode 100644 cognee/tests/test_ontology_endpoint.py diff --git a/cognee/api/client.py b/cognee/api/client.py index 6766c12de..89e9eb2f5 100644 --- a/cognee/api/client.py +++ b/cognee/api/client.py @@ -23,6 +23,7 @@ from cognee.api.v1.settings.routers import get_settings_router from cognee.api.v1.datasets.routers import get_datasets_router from cognee.api.v1.cognify.routers import get_code_pipeline_router, get_cognify_router from cognee.api.v1.search.routers import get_search_router +from cognee.api.v1.ontologies.routers.get_ontology_router import get_ontology_router from cognee.api.v1.memify.routers import get_memify_router from cognee.api.v1.add.routers import get_add_router from cognee.api.v1.delete.routers import get_delete_router @@ -258,6 +259,8 @@ app.include_router( app.include_router(get_datasets_router(), prefix="/api/v1/datasets", tags=["datasets"]) +app.include_router(get_ontology_router(), prefix="/api/v1/ontologies", tags=["ontologies"]) + app.include_router(get_settings_router(), prefix="/api/v1/settings", tags=["settings"]) app.include_router(get_visualize_router(), prefix="/api/v1/visualize", tags=["visualize"]) diff --git a/cognee/api/v1/cognify/routers/get_cognify_router.py b/cognee/api/v1/cognify/routers/get_cognify_router.py index 231bbcd11..246cc6c56 100644 --- a/cognee/api/v1/cognify/routers/get_cognify_router.py +++ b/cognee/api/v1/cognify/routers/get_cognify_router.py @@ -41,6 +41,10 @@ class CognifyPayloadDTO(InDTO): custom_prompt: Optional[str] = Field( default="", description="Custom prompt for entity extraction and graph generation" ) + ontology_key: Optional[str] = Field( + default=None, + description="Reference to previously uploaded ontology" + ) def get_cognify_router() -> APIRouter: @@ -68,6 +72,7 @@ def get_cognify_router() -> APIRouter: - **dataset_ids** (Optional[List[UUID]]): List of existing dataset UUIDs to process. UUIDs allow processing of datasets not owned by the user (if permitted). - **run_in_background** (Optional[bool]): Whether to execute processing asynchronously. Defaults to False (blocking). - **custom_prompt** (Optional[str]): Custom prompt for entity extraction and graph generation. If provided, this prompt will be used instead of the default prompts for knowledge graph extraction. + - **ontology_key** (Optional[str]): Reference to a previously uploaded ontology file to use for knowledge graph construction. ## Response - **Blocking execution**: Complete pipeline run information with entity counts, processing duration, and success/failure status @@ -82,7 +87,8 @@ def get_cognify_router() -> APIRouter: { "datasets": ["research_papers", "documentation"], "run_in_background": false, - "custom_prompt": "Extract entities focusing on technical concepts and their relationships. Identify key technologies, methodologies, and their interconnections." + "custom_prompt": "Extract entities focusing on technical concepts and their relationships. Identify key technologies, methodologies, and their interconnections.", + "ontology_key": "medical_ontology_v1" } ``` @@ -108,13 +114,36 @@ def get_cognify_router() -> APIRouter: ) from cognee.api.v1.cognify import cognify as cognee_cognify + from cognee.api.v1.ontologies.ontologies import OntologyService try: datasets = payload.dataset_ids if payload.dataset_ids else payload.datasets + config_to_use = None + + if payload.ontology_key: + ontology_service = OntologyService() + try: + ontology_content = ontology_service.get_ontology_content(payload.ontology_key, user) + + from cognee.modules.ontology.ontology_config import Config + from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver + from io import StringIO + + ontology_stream = StringIO(ontology_content) + config_to_use: Config = { + "ontology_config": { + "ontology_resolver": RDFLibOntologyResolver(ontology_file=ontology_stream) + } + } + except ValueError as e: + return JSONResponse( + status_code=400, content={"error": f"Ontology error: {str(e)}"} + ) cognify_run = await cognee_cognify( datasets, user, + config=config_to_use, run_in_background=payload.run_in_background, custom_prompt=payload.custom_prompt, ) diff --git a/cognee/api/v1/ontologies/__init__.py b/cognee/api/v1/ontologies/__init__.py new file mode 100644 index 000000000..c25064edc --- /dev/null +++ b/cognee/api/v1/ontologies/__init__.py @@ -0,0 +1,4 @@ +from .ontologies import OntologyService +from .routers.get_ontology_router import get_ontology_router + +__all__ = ["OntologyService", "get_ontology_router"] \ No newline at end of file diff --git a/cognee/api/v1/ontologies/ontologies.py b/cognee/api/v1/ontologies/ontologies.py new file mode 100644 index 000000000..fb7f3cd9a --- /dev/null +++ b/cognee/api/v1/ontologies/ontologies.py @@ -0,0 +1,101 @@ +import os +import json +import tempfile +from pathlib import Path +from datetime import datetime, timezone +from typing import Optional +from dataclasses import dataclass + +@dataclass +class OntologyMetadata: + ontology_key: str + filename: str + size_bytes: int + uploaded_at: str + description: Optional[str] = None + +class OntologyService: + def __init__(self): + pass + + @property + def base_dir(self) -> Path: + return Path(tempfile.gettempdir()) / "ontologies" + + def _get_user_dir(self, user_id: str) -> Path: + user_dir = self.base_dir / str(user_id) + user_dir.mkdir(parents=True, exist_ok=True) + return user_dir + + def _get_metadata_path(self, user_dir: Path) -> Path: + return user_dir / "metadata.json" + + def _load_metadata(self, user_dir: Path) -> dict: + metadata_path = self._get_metadata_path(user_dir) + if metadata_path.exists(): + with open(metadata_path, 'r') as f: + return json.load(f) + return {} + + def _save_metadata(self, user_dir: Path, metadata: dict): + metadata_path = self._get_metadata_path(user_dir) + with open(metadata_path, 'w') as f: + json.dump(metadata, f, indent=2) + + async def upload_ontology(self, ontology_key: str, file, user, description: Optional[str] = None) -> OntologyMetadata: + # Validate file format + if not file.filename.lower().endswith('.owl'): + raise ValueError("File must be in .owl format") + + user_dir = self._get_user_dir(str(user.id)) + metadata = self._load_metadata(user_dir) + + # Check for duplicate key + if ontology_key in metadata: + raise ValueError(f"Ontology key '{ontology_key}' already exists") + + # Read file content + content = await file.read() + if len(content) > 10 * 1024 * 1024: # 10MB limit + raise ValueError("File size exceeds 10MB limit") + + # Save file + file_path = user_dir / f"{ontology_key}.owl" + with open(file_path, 'wb') as f: + f.write(content) + + # Update metadata + ontology_metadata = { + "filename": file.filename, + "size_bytes": len(content), + "uploaded_at": datetime.now(timezone.utc).isoformat(), + "description": description + } + metadata[ontology_key] = ontology_metadata + self._save_metadata(user_dir, metadata) + + return OntologyMetadata( + ontology_key=ontology_key, + filename=file.filename, + size_bytes=len(content), + uploaded_at=ontology_metadata["uploaded_at"], + description=description + ) + + def get_ontology_content(self, ontology_key: str, user) -> str: + user_dir = self._get_user_dir(str(user.id)) + metadata = self._load_metadata(user_dir) + + if ontology_key not in metadata: + raise ValueError(f"Ontology key '{ontology_key}' not found") + + file_path = user_dir / f"{ontology_key}.owl" + if not file_path.exists(): + raise ValueError(f"Ontology file for key '{ontology_key}' not found") + + with open(file_path, 'r', encoding='utf-8') as f: + return f.read() + + def list_ontologies(self, user) -> dict: + user_dir = self._get_user_dir(str(user.id)) + return self._load_metadata(user_dir) \ No newline at end of file diff --git a/cognee/api/v1/ontologies/routers/__init__.py b/cognee/api/v1/ontologies/routers/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/cognee/api/v1/ontologies/routers/get_ontology_router.py b/cognee/api/v1/ontologies/routers/get_ontology_router.py new file mode 100644 index 000000000..c171fa7bb --- /dev/null +++ b/cognee/api/v1/ontologies/routers/get_ontology_router.py @@ -0,0 +1,89 @@ +from fastapi import APIRouter, File, Form, UploadFile, Depends, HTTPException +from fastapi.responses import JSONResponse +from typing import Optional + +from cognee.modules.users.models import User +from cognee.modules.users.methods import get_authenticated_user +from cognee.shared.utils import send_telemetry +from cognee import __version__ as cognee_version +from ..ontologies import OntologyService + +def get_ontology_router() -> APIRouter: + router = APIRouter() + ontology_service = OntologyService() + + @router.post("", response_model=dict) + async def upload_ontology( + ontology_key: str = Form(...), + ontology_file: UploadFile = File(...), + description: Optional[str] = Form(None), + user: User = Depends(get_authenticated_user) + ): + """ + Upload an ontology file with a named key for later use in cognify operations. + + ## Request Parameters + - **ontology_key** (str): User-defined identifier for the ontology + - **ontology_file** (UploadFile): OWL format ontology file + - **description** (Optional[str]): Optional description of the ontology + + ## Response + Returns metadata about the uploaded ontology including key, filename, size, and upload timestamp. + + ## Error Codes + - **400 Bad Request**: Invalid file format, duplicate key, file size exceeded + - **500 Internal Server Error**: File system or processing errors + """ + send_telemetry( + "Ontology Upload API Endpoint Invoked", + user.id, + additional_properties={ + "endpoint": "POST /api/v1/ontologies", + "cognee_version": cognee_version, + }, + ) + + try: + result = await ontology_service.upload_ontology( + ontology_key, ontology_file, user, description + ) + return { + "ontology_key": result.ontology_key, + "filename": result.filename, + "size_bytes": result.size_bytes, + "uploaded_at": result.uploaded_at + } + except ValueError as e: + return JSONResponse(status_code=400, content={"error": str(e)}) + except Exception as e: + return JSONResponse(status_code=500, content={"error": str(e)}) + + @router.get("", response_model=dict) + async def list_ontologies( + user: User = Depends(get_authenticated_user) + ): + """ + List all uploaded ontologies for the authenticated user. + + ## Response + Returns a dictionary mapping ontology keys to their metadata including filename, size, and upload timestamp. + + ## Error Codes + - **500 Internal Server Error**: File system or processing errors + """ + send_telemetry( + "Ontology List API Endpoint Invoked", + user.id, + additional_properties={ + "endpoint": "GET /api/v1/ontologies", + "cognee_version": cognee_version, + }, + ) + + try: + metadata = ontology_service.list_ontologies(user) + return metadata + except Exception as e: + return JSONResponse(status_code=500, content={"error": str(e)}) + + return router \ No newline at end of file diff --git a/cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py b/cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py index 45e32936a..4acc8861b 100644 --- a/cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py +++ b/cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py @@ -2,7 +2,7 @@ import os import difflib from cognee.shared.logging_utils import get_logger from collections import deque -from typing import List, Tuple, Dict, Optional, Any, Union +from typing import List, Tuple, Dict, Optional, Any, Union, IO from rdflib import Graph, URIRef, RDF, RDFS, OWL from cognee.modules.ontology.exceptions import ( @@ -26,44 +26,55 @@ class RDFLibOntologyResolver(BaseOntologyResolver): def __init__( self, - ontology_file: Optional[Union[str, List[str]]] = None, + ontology_file: Optional[Union[str, List[str], IO]] = None, matching_strategy: Optional[MatchingStrategy] = None, ) -> None: super().__init__(matching_strategy) self.ontology_file = ontology_file try: - files_to_load = [] + self.graph = None if ontology_file is not None: - if isinstance(ontology_file, str): - files_to_load = [ontology_file] - elif isinstance(ontology_file, list): - files_to_load = ontology_file + if hasattr(ontology_file, "read"): + self.graph = Graph() + content = ontology_file.read() + self.graph.parse(data=content, format="xml") + logger.info("Ontology loaded successfully from file object") else: - raise ValueError( - f"ontology_file must be a string, list of strings, or None. Got: {type(ontology_file)}" - ) - - if files_to_load: - self.graph = Graph() - loaded_files = [] - for file_path in files_to_load: - if os.path.exists(file_path): - self.graph.parse(file_path) - loaded_files.append(file_path) - logger.info("Ontology loaded successfully from file: %s", file_path) + files_to_load = [] + if isinstance(ontology_file, str): + files_to_load = [ontology_file] + elif isinstance(ontology_file, list): + files_to_load = ontology_file else: - logger.warning( - "Ontology file '%s' not found. Skipping this file.", - file_path, + raise ValueError( + f"ontology_file must be a string, list of strings, file-like object, or None. Got: {type(ontology_file)}" ) - if not loaded_files: - logger.info( - "No valid ontology files found. No owl ontology will be attached to the graph." - ) - self.graph = None - else: - logger.info("Total ontology files loaded: %d", len(loaded_files)) + if files_to_load: + self.graph = Graph() + loaded_files = [] + for file_path in files_to_load: + if os.path.exists(file_path): + self.graph.parse(file_path) + loaded_files.append(file_path) + logger.info("Ontology loaded successfully from file: %s", file_path) + else: + logger.warning( + "Ontology file '%s' not found. Skipping this file.", + file_path, + ) + + if not loaded_files: + logger.info( + "No valid ontology files found. No owl ontology will be attached to the graph." + ) + self.graph = None + else: + logger.info("Total ontology files loaded: %d", len(loaded_files)) + else: + logger.info( + "No ontology file provided. No owl ontology will be attached to the graph." + ) else: logger.info( "No ontology file provided. No owl ontology will be attached to the graph." diff --git a/cognee/tests/test_ontology_endpoint.py b/cognee/tests/test_ontology_endpoint.py new file mode 100644 index 000000000..4849f8649 --- /dev/null +++ b/cognee/tests/test_ontology_endpoint.py @@ -0,0 +1,89 @@ +import pytest +import uuid +from fastapi.testclient import TestClient +from unittest.mock import patch, Mock, AsyncMock +from types import SimpleNamespace +import importlib +from cognee.api.client import app + +gau_mod = importlib.import_module("cognee.modules.users.methods.get_authenticated_user") + +@pytest.fixture +def client(): + return TestClient(app) + +@pytest.fixture +def mock_user(): + user = Mock() + user.id = "test-user-123" + return user + +@pytest.fixture +def mock_default_user(): + """Mock default user for testing.""" + return SimpleNamespace( + id=uuid.uuid4(), + email="default@example.com", + is_active=True, + tenant_id=uuid.uuid4() + ) + +@patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) +def test_upload_ontology_success(mock_get_default_user, client, mock_default_user): + """Test successful ontology upload""" + mock_get_default_user.return_value = mock_default_user + ontology_content = b"" + unique_key = f"test_ontology_{uuid.uuid4().hex[:8]}" + + response = client.post( + "/api/v1/ontologies", + files={"ontology_file": ("test.owl", ontology_content)}, + data={"ontology_key": unique_key, "description": "Test"} + ) + + assert response.status_code == 200 + data = response.json() + assert data["ontology_key"] == unique_key + assert "uploaded_at" in data + +@patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) +def test_upload_ontology_invalid_file(mock_get_default_user, client, mock_default_user): + """Test 400 response for non-.owl files""" + mock_get_default_user.return_value = mock_default_user + unique_key = f"test_ontology_{uuid.uuid4().hex[:8]}" + response = client.post( + "/api/v1/ontologies", + files={"ontology_file": ("test.txt", b"not xml")}, + data={"ontology_key": unique_key} + ) + assert response.status_code == 400 + +@patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) +def test_upload_ontology_missing_data(mock_get_default_user, client, mock_default_user): + """Test 400 response for missing file or key""" + mock_get_default_user.return_value = mock_default_user + # Missing file + response = client.post("/api/v1/ontologies", data={"ontology_key": "test"}) + assert response.status_code == 400 + + # Missing key + response = client.post("/api/v1/ontologies", files={"ontology_file": ("test.owl", b"xml")}) + assert response.status_code == 400 + +@patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) +def test_upload_ontology_unauthorized(mock_get_default_user, client, mock_default_user): + """Test behavior when default user is provided (no explicit authentication)""" + unique_key = f"test_ontology_{uuid.uuid4().hex[:8]}" + mock_get_default_user.return_value = mock_default_user + response = client.post( + "/api/v1/ontologies", + files={"ontology_file": ("test.owl", b"")}, + data={"ontology_key": unique_key} + ) + + # The current system provides a default user when no explicit authentication is given + # This test verifies the system works with conditional authentication + assert response.status_code == 200 + data = response.json() + assert data["ontology_key"] == unique_key + assert "uploaded_at" in data \ No newline at end of file From e3b707a0c242fb7268d56d2b485990f120fe0462 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Tue, 4 Nov 2025 12:20:17 +0100 Subject: [PATCH 11/85] refactor: Change variable names, add setting of current tenant to be optional for tenant creation --- .../users/tenants/methods/add_user_to_tenant.py | 8 ++++---- .../modules/users/tenants/methods/create_tenant.py | 13 +++++++++---- .../modules/users/tenants/methods/select_tenant.py | 2 +- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/cognee/modules/users/tenants/methods/add_user_to_tenant.py b/cognee/modules/users/tenants/methods/add_user_to_tenant.py index dabab6b6b..edadfe66b 100644 --- a/cognee/modules/users/tenants/methods/add_user_to_tenant.py +++ b/cognee/modules/users/tenants/methods/add_user_to_tenant.py @@ -16,18 +16,18 @@ from cognee.modules.users.exceptions import ( async def add_user_to_tenant( - user_id: UUID, tenant_id: UUID, owner_id: UUID, set_active_tenant: Optional[bool] = True + user_id: UUID, tenant_id: UUID, owner_id: UUID, set_as_active_tenant: Optional[bool] = True ): """ Add a user with the given id to the tenant with the given id. This can only be successful if the request owner with the given id is the tenant owner. - If set_active_tenant is true it will automatically set the users active tenant to provided tenant. + If set_as_active_tenant is true it will automatically set the users active tenant to provided tenant. Args: user_id: Id of the user. tenant_id: Id of the tenant. owner_id: Id of the request owner. - set_active_tenant: If set_active_tenant is true it will automatically set the users active tenant to provided tenant. + set_as_active_tenant: If set_as_active_tenant is true it will automatically set the users active tenant to provided tenant. Returns: None @@ -48,7 +48,7 @@ async def add_user_to_tenant( message="Only tenant owner can add other users to organization." ) - if set_active_tenant: + if set_as_active_tenant: user.tenant_id = tenant_id await session.merge(user) await session.commit() diff --git a/cognee/modules/users/tenants/methods/create_tenant.py b/cognee/modules/users/tenants/methods/create_tenant.py index 60e10db5c..32baa05fd 100644 --- a/cognee/modules/users/tenants/methods/create_tenant.py +++ b/cognee/modules/users/tenants/methods/create_tenant.py @@ -1,6 +1,7 @@ from uuid import UUID from sqlalchemy import insert from sqlalchemy.exc import IntegrityError +from typing import Optional from cognee.modules.users.models.UserTenant import UserTenant from cognee.infrastructure.databases.exceptions import EntityAlreadyExistsError @@ -9,13 +10,16 @@ from cognee.modules.users.models import Tenant from cognee.modules.users.methods import get_user -async def create_tenant(tenant_name: str, user_id: UUID) -> UUID: +async def create_tenant( + tenant_name: str, user_id: UUID, set_as_active_tenant: Optional[bool] = True +) -> UUID: """ Create a new tenant with the given name, for the user with the given id. This user is the owner of the tenant. Args: tenant_name: Name of the new tenant. user_id: Id of the user. + set_as_active_tenant: If true, set the newly created tenant as the active tenant for the user. Returns: None @@ -29,9 +33,10 @@ async def create_tenant(tenant_name: str, user_id: UUID) -> UUID: session.add(tenant) await session.flush() - user.tenant_id = tenant.id - await session.merge(user) - await session.commit() + if set_as_active_tenant: + user.tenant_id = tenant.id + await session.merge(user) + await session.commit() try: # Add association directly to the association table diff --git a/cognee/modules/users/tenants/methods/select_tenant.py b/cognee/modules/users/tenants/methods/select_tenant.py index 732b24858..6e72fea2f 100644 --- a/cognee/modules/users/tenants/methods/select_tenant.py +++ b/cognee/modules/users/tenants/methods/select_tenant.py @@ -52,7 +52,7 @@ async def select_tenant(user_id: UUID, tenant_id: Union[UUID, None]): try: result = result.scalar_one() except sqlalchemy.exc.NoResultFound as e: - raise TenantNotFoundError("User Tenant relationship not found.") from e + raise TenantNotFoundError("User is not part of the tenant.") from e if result: # If user is part of tenant update current tenant of user From b0f85c9e990f8dd20e6fce8dcd6f29c4050050e8 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Tue, 4 Nov 2025 13:01:10 +0100 Subject: [PATCH 12/85] feat: add legacy and modern data_id calculating --- cognee/modules/data/methods/__init__.py | 1 + .../data/methods/get_unique_data_id.py | 71 +++++++++++++++++++ cognee/modules/ingestion/identify.py | 10 +-- 3 files changed, 78 insertions(+), 4 deletions(-) create mode 100644 cognee/modules/data/methods/get_unique_data_id.py diff --git a/cognee/modules/data/methods/__init__.py b/cognee/modules/data/methods/__init__.py index 83913085c..7936a9afd 100644 --- a/cognee/modules/data/methods/__init__.py +++ b/cognee/modules/data/methods/__init__.py @@ -10,6 +10,7 @@ from .get_authorized_dataset import get_authorized_dataset from .get_authorized_dataset_by_name import get_authorized_dataset_by_name from .get_data import get_data from .get_unique_dataset_id import get_unique_dataset_id +from .get_unique_data_id import get_unique_data_id from .get_authorized_existing_datasets import get_authorized_existing_datasets from .get_dataset_ids import get_dataset_ids diff --git a/cognee/modules/data/methods/get_unique_data_id.py b/cognee/modules/data/methods/get_unique_data_id.py new file mode 100644 index 000000000..3fc184ce4 --- /dev/null +++ b/cognee/modules/data/methods/get_unique_data_id.py @@ -0,0 +1,71 @@ +from uuid import uuid5, NAMESPACE_OID, UUID +from typing import Optional +from sqlalchemy import select + +from cognee.modules.data.models.Data import Data +from cognee.infrastructure.databases.relational import get_relational_engine +from cognee.modules.users.models import User + + +async def get_unique_data_id(data_identifier: str, user: User, tenant_id: Optional[UUID]) -> UUID: + """ + Function returns a unique UUID for data based on data identifier, user id and tenant id. + If data with legacy ID exists, return that ID to maintain compatibility. + + Args: + data_identifier: A way to uniquely identify data (e.g. file hash, data name, etc.) + user: User object adding the data + tenant_id: UUID of the tenant for which data is being added + + Returns: + UUID: Unique identifier for the data + """ + + def _get_deprecated_unique_data_id(data_identifier: str, user: User) -> UUID: + """ + Deprecated function, returns a unique UUID for data based on data identifier and user id. + Needed to support legacy data without tenant information. + Args: + data_identifier: A way to uniquely identify data (e.g. file hash, data name, etc.) + user: User object adding the data + + Returns: + UUID: Unique identifier for the data + """ + # return UUID hash of file contents + owner id + tenant_id + return uuid5(NAMESPACE_OID, f"{data_identifier}{str(user.id)}") + + def _get_modern_unique_data_id(data_identifier: str, user: User, tenant_id: UUID) -> UUID: + """ + Function returns a unique UUID for data based on data identifier, user id and tenant id. + Args: + data_identifier: A way to uniquely identify data (e.g. file hash, data name, etc.) + user: User object adding the data + tenant_id: UUID of the tenant for which data is being added + + Returns: + UUID: Unique identifier for the data + """ + # return UUID hash of file contents + owner id + tenant_id + return uuid5(NAMESPACE_OID, f"{data_identifier}{str(user.id)}{str(tenant_id)}") + + # Get all possible data_id values + data_id = { + "modern_data_id": _get_modern_unique_data_id( + data_identifier=data_identifier, user=user, tenant_id=tenant_id + ), + "legacy_data_id": _get_deprecated_unique_data_id( + data_identifier=data_identifier, user=user + ), + } + + # Check if data item with legacy_data_id exists, if so use that one, else use modern_data_id + db_engine = get_relational_engine() + async with db_engine.get_async_session() as session: + legacy_data_point = ( + await session.execute(select(Data).filter(Data.id == data_id["legacy_data_id"])) + ).scalar_one_or_none() + + if not legacy_data_point: + return data_id["modern_data_id"] + return data_id["legacy_data_id"] diff --git a/cognee/modules/ingestion/identify.py b/cognee/modules/ingestion/identify.py index 977ff3f0b..5a0fe379e 100644 --- a/cognee/modules/ingestion/identify.py +++ b/cognee/modules/ingestion/identify.py @@ -1,11 +1,13 @@ -from uuid import uuid5, NAMESPACE_OID +from uuid import UUID from .data_types import IngestionData from cognee.modules.users.models import User +from cognee.modules.data.methods import get_unique_data_id -def identify(data: IngestionData, user: User) -> str: +async def identify(data: IngestionData, user: User) -> UUID: data_content_hash: str = data.get_identifier() - # return UUID hash of file contents + owner id - return uuid5(NAMESPACE_OID, f"{data_content_hash}{user.id}") + return await get_unique_data_id( + data_identifier=data_content_hash, user=user, tenant_id=user.tenant_id + ) From ff388179fb38cd82a59b3a45ea3f343b16c56c86 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Tue, 4 Nov 2025 13:11:57 +0100 Subject: [PATCH 13/85] feat: Add dataset_id calculation that handles legacy dataset_id --- .../data/methods/get_unique_data_id.py | 11 ++- .../data/methods/get_unique_dataset_id.py | 70 +++++++++++++++++-- cognee/modules/ingestion/identify.py | 4 +- 3 files changed, 71 insertions(+), 14 deletions(-) diff --git a/cognee/modules/data/methods/get_unique_data_id.py b/cognee/modules/data/methods/get_unique_data_id.py index 3fc184ce4..877b5930c 100644 --- a/cognee/modules/data/methods/get_unique_data_id.py +++ b/cognee/modules/data/methods/get_unique_data_id.py @@ -1,5 +1,4 @@ from uuid import uuid5, NAMESPACE_OID, UUID -from typing import Optional from sqlalchemy import select from cognee.modules.data.models.Data import Data @@ -7,7 +6,7 @@ from cognee.infrastructure.databases.relational import get_relational_engine from cognee.modules.users.models import User -async def get_unique_data_id(data_identifier: str, user: User, tenant_id: Optional[UUID]) -> UUID: +async def get_unique_data_id(data_identifier: str, user: User) -> UUID: """ Function returns a unique UUID for data based on data identifier, user id and tenant id. If data with legacy ID exists, return that ID to maintain compatibility. @@ -35,7 +34,7 @@ async def get_unique_data_id(data_identifier: str, user: User, tenant_id: Option # return UUID hash of file contents + owner id + tenant_id return uuid5(NAMESPACE_OID, f"{data_identifier}{str(user.id)}") - def _get_modern_unique_data_id(data_identifier: str, user: User, tenant_id: UUID) -> UUID: + def _get_modern_unique_data_id(data_identifier: str, user: User) -> UUID: """ Function returns a unique UUID for data based on data identifier, user id and tenant id. Args: @@ -47,13 +46,11 @@ async def get_unique_data_id(data_identifier: str, user: User, tenant_id: Option UUID: Unique identifier for the data """ # return UUID hash of file contents + owner id + tenant_id - return uuid5(NAMESPACE_OID, f"{data_identifier}{str(user.id)}{str(tenant_id)}") + return uuid5(NAMESPACE_OID, f"{data_identifier}{str(user.id)}{str(user.tenant_id)}") # Get all possible data_id values data_id = { - "modern_data_id": _get_modern_unique_data_id( - data_identifier=data_identifier, user=user, tenant_id=tenant_id - ), + "modern_data_id": _get_modern_unique_data_id(data_identifier=data_identifier, user=user), "legacy_data_id": _get_deprecated_unique_data_id( data_identifier=data_identifier, user=user ), diff --git a/cognee/modules/data/methods/get_unique_dataset_id.py b/cognee/modules/data/methods/get_unique_dataset_id.py index 2caf5fb55..274f24d1a 100644 --- a/cognee/modules/data/methods/get_unique_dataset_id.py +++ b/cognee/modules/data/methods/get_unique_dataset_id.py @@ -1,9 +1,71 @@ from uuid import UUID, uuid5, NAMESPACE_OID -from cognee.modules.users.models import User from typing import Union +from sqlalchemy import select + +from cognee.modules.data.models.Dataset import Dataset +from cognee.modules.users.models import User +from cognee.infrastructure.databases.relational import get_relational_engine async def get_unique_dataset_id(dataset_name: Union[str, UUID], user: User) -> UUID: - if isinstance(dataset_name, UUID): - return dataset_name - return uuid5(NAMESPACE_OID, f"{dataset_name}{str(user.id)}") + """ + Function returns a unique UUID for dataset based on dataset name, user id and tenant id. + If dataset with legacy ID exists, return that ID to maintain compatibility. + + Args: + dataset_name: string representing the dataset name + user: User object adding the dataset + tenant_id: UUID of the tenant for which dataset is being added + + Returns: + UUID: Unique identifier for the dataset + """ + + def _get_legacy_unique_dataset_id(dataset_name: Union[str, UUID], user: User) -> UUID: + """ + Legacy function, returns a unique UUID for dataset based on dataset name and user id. + Needed to support legacy datasets without tenant information. + Args: + dataset_name: string representing the dataset name + user: Current User object adding the dataset + + Returns: + UUID: Unique identifier for the dataset + """ + if isinstance(dataset_name, UUID): + return dataset_name + return uuid5(NAMESPACE_OID, f"{dataset_name}{str(user.id)}") + + def _get_modern_unique_dataset_id(dataset_name: Union[str, UUID], user: User) -> UUID: + """ + Returns a unique UUID for dataset based on dataset name, user id and tenant_id. + Args: + dataset_name: string representing the dataset name + user: Current User object adding the dataset + tenant_id: UUID of the tenant for which dataset is being added + + Returns: + UUID: Unique identifier for the dataset + """ + if isinstance(dataset_name, UUID): + return dataset_name + return uuid5(NAMESPACE_OID, f"{dataset_name}{str(user.id)}{str(user.tenant_id)}") + + # Get all possible dataset_id values + dataset_id = { + "modern_dataset_id": _get_modern_unique_dataset_id(dataset_name=dataset_name, user=user), + "legacy_dataset_id": _get_legacy_unique_dataset_id(dataset_name=dataset_name, user=user), + } + + # Check if dataset with legacy_dataset_id exists, if so use that one, else use modern_dataset_id + db_engine = get_relational_engine() + async with db_engine.get_async_session() as session: + legacy_dataset = ( + await session.execute( + select(Dataset).filter(Dataset.id == dataset_id["legacy_data_id"]) + ) + ).scalar_one_or_none() + + if not legacy_dataset: + return dataset_id["modern_dataset_id"] + return dataset_id["legacy_dataset_id"] diff --git a/cognee/modules/ingestion/identify.py b/cognee/modules/ingestion/identify.py index 5a0fe379e..640fce4a2 100644 --- a/cognee/modules/ingestion/identify.py +++ b/cognee/modules/ingestion/identify.py @@ -8,6 +8,4 @@ from cognee.modules.data.methods import get_unique_data_id async def identify(data: IngestionData, user: User) -> UUID: data_content_hash: str = data.get_identifier() - return await get_unique_data_id( - data_identifier=data_content_hash, user=user, tenant_id=user.tenant_id - ) + return await get_unique_data_id(data_identifier=data_content_hash, user=user) From ac257dca1db4123cf97abacf32e1ecd85dab9afd Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Tue, 4 Nov 2025 13:13:42 +0100 Subject: [PATCH 14/85] refactor: Account for async change for identify function --- cognee/modules/pipelines/operations/run_tasks_data_item.py | 2 +- cognee/tasks/ingestion/ingest_data.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cognee/modules/pipelines/operations/run_tasks_data_item.py b/cognee/modules/pipelines/operations/run_tasks_data_item.py index 152e72d7f..2cc449df6 100644 --- a/cognee/modules/pipelines/operations/run_tasks_data_item.py +++ b/cognee/modules/pipelines/operations/run_tasks_data_item.py @@ -69,7 +69,7 @@ async def run_tasks_data_item_incremental( async with open_data_file(file_path) as file: classified_data = ingestion.classify(file) # data_id is the hash of file contents + owner id to avoid duplicate data - data_id = ingestion.identify(classified_data, user) + data_id = await ingestion.identify(classified_data, user) else: # If data was already processed by Cognee get data id data_id = data_item.id diff --git a/cognee/tasks/ingestion/ingest_data.py b/cognee/tasks/ingestion/ingest_data.py index 0572d0f1e..5987f38d5 100644 --- a/cognee/tasks/ingestion/ingest_data.py +++ b/cognee/tasks/ingestion/ingest_data.py @@ -99,7 +99,7 @@ async def ingest_data( # data_id is the hash of original file contents + owner id to avoid duplicate data - data_id = ingestion.identify(classified_data, user) + data_id = await ingestion.identify(classified_data, user) original_file_metadata = classified_data.get_metadata() # Find metadata from Cognee data storage text file From ea675f29d65dcf354d8999106ff3b8db3a8149f2 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Tue, 4 Nov 2025 13:15:49 +0100 Subject: [PATCH 15/85] fix: Resolve typo in accessing dictionary for dataset_id --- cognee/modules/data/methods/get_unique_dataset_id.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cognee/modules/data/methods/get_unique_dataset_id.py b/cognee/modules/data/methods/get_unique_dataset_id.py index 274f24d1a..2b765ec78 100644 --- a/cognee/modules/data/methods/get_unique_dataset_id.py +++ b/cognee/modules/data/methods/get_unique_dataset_id.py @@ -62,7 +62,7 @@ async def get_unique_dataset_id(dataset_name: Union[str, UUID], user: User) -> U async with db_engine.get_async_session() as session: legacy_dataset = ( await session.execute( - select(Dataset).filter(Dataset.id == dataset_id["legacy_data_id"]) + select(Dataset).filter(Dataset.id == dataset_id["legacy_dataset_id"]) ) ).scalar_one_or_none() From 9d771acc2427592f40caf0e9727c8e8151c5af64 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Tue, 4 Nov 2025 13:35:50 +0100 Subject: [PATCH 16/85] refactor: filter out search results --- .../methods/get_all_user_permission_datasets.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py b/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py index e5dbb0e4b..a8cb96fbb 100644 --- a/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py +++ b/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py @@ -26,13 +26,16 @@ async def get_all_user_permission_datasets(user: User, permission_type: str) -> tenants = await user.awaitable_attrs.tenants for tenant in tenants: - # Get all datasets all tenant members have access to - datasets.extend(await get_principal_datasets(tenant, permission_type)) + # If tenant is the user's selected tenant add datasets that users roles in the tenant and the tenant itself + # have access for + if tenant.id == user.tenant_id: + # Get all datasets all tenant members have access to + datasets.extend(await get_principal_datasets(tenant, permission_type)) - # Get all datasets accessible by roles user is a part of - roles = await user.awaitable_attrs.roles - for role in roles: - datasets.extend(await get_principal_datasets(role, permission_type)) + # Get all datasets accessible by roles user is a part of + roles = await user.awaitable_attrs.roles + for role in roles: + datasets.extend(await get_principal_datasets(role, permission_type)) # Deduplicate datasets with same ID unique = {} From f4117c42e9c1bd0630a333bb789faba8686ba5b0 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Tue, 4 Nov 2025 16:43:41 +0100 Subject: [PATCH 17/85] fix: Resolve issue with entity extraction test --- cognee/tests/tasks/entity_extraction/entity_extraction_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cognee/tests/tasks/entity_extraction/entity_extraction_test.py b/cognee/tests/tasks/entity_extraction/entity_extraction_test.py index 39e883e09..41a9254ca 100644 --- a/cognee/tests/tasks/entity_extraction/entity_extraction_test.py +++ b/cognee/tests/tasks/entity_extraction/entity_extraction_test.py @@ -55,7 +55,7 @@ async def main(): classified_data = ingestion.classify(file) # data_id is the hash of original file contents + owner id to avoid duplicate data - data_id = ingestion.identify(classified_data, await get_default_user()) + data_id = await ingestion.identify(classified_data, await get_default_user()) await cognee.add(file_path) From cd32b492a469c9bfac14d4b3f20ed99a727a9460 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Tue, 4 Nov 2025 17:56:01 +0100 Subject: [PATCH 18/85] refactor: Add filtering of non current tenant results when authorizing dataset --- .../get_all_user_permission_datasets.py | 25 ++++++++++--------- .../users/roles/methods/add_user_to_role.py | 4 ++- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py b/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py index a8cb96fbb..ee1de3c72 100644 --- a/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py +++ b/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py @@ -24,18 +24,14 @@ async def get_all_user_permission_datasets(user: User, permission_type: str) -> # Get all tenants user is a part of tenants = await user.awaitable_attrs.tenants - for tenant in tenants: - # If tenant is the user's selected tenant add datasets that users roles in the tenant and the tenant itself - # have access for - if tenant.id == user.tenant_id: - # Get all datasets all tenant members have access to - datasets.extend(await get_principal_datasets(tenant, permission_type)) + # Get all datasets all tenant members have access to + datasets.extend(await get_principal_datasets(tenant, permission_type)) - # Get all datasets accessible by roles user is a part of - roles = await user.awaitable_attrs.roles - for role in roles: - datasets.extend(await get_principal_datasets(role, permission_type)) + # Get all datasets accessible by roles user is a part of + roles = await user.awaitable_attrs.roles + for role in roles: + datasets.extend(await get_principal_datasets(role, permission_type)) # Deduplicate datasets with same ID unique = {} @@ -43,5 +39,10 @@ async def get_all_user_permission_datasets(user: User, permission_type: str) -> # If the dataset id key already exists, leave the dictionary unchanged. unique.setdefault(dataset.id, dataset) - # TODO: Add filtering out of datasets that aren't currently selected tenant of user (currently selected tenant is the tenant_id value in the User model) - return list(unique.values()) + # Filter out dataset that aren't part of the current user's tenant + filtered_datasets = [] + for dataset in list(unique.values()): + if dataset.tenant_id == user.tenant_id: + filtered_datasets.append(dataset) + + return filtered_datasets diff --git a/cognee/modules/users/roles/methods/add_user_to_role.py b/cognee/modules/users/roles/methods/add_user_to_role.py index de5e47775..d764ac900 100644 --- a/cognee/modules/users/roles/methods/add_user_to_role.py +++ b/cognee/modules/users/roles/methods/add_user_to_role.py @@ -42,11 +42,13 @@ async def add_user_to_role(user_id: UUID, role_id: UUID, owner_id: UUID): .first() ) + user_tenants = await user.awaitable_attrs.tenants + if not user: raise UserNotFoundError elif not role: raise RoleNotFoundError - elif user.tenant_id != role.tenant_id: + elif role.tenant_id not in [tenant.id for tenant in user_tenants]: # TESTME raise TenantNotFoundError( message="User tenant does not match role tenant. User cannot be added to role." ) From fb102f29a8fbbfa941641208f41a55e1eb370fb5 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Tue, 4 Nov 2025 19:03:56 +0100 Subject: [PATCH 19/85] chore: Add alembic migration for multi-tenant system --- .../c946955da633_multi_tenant_support.py | 113 ++++++++++++++++++ 1 file changed, 113 insertions(+) create mode 100644 alembic/versions/c946955da633_multi_tenant_support.py diff --git a/alembic/versions/c946955da633_multi_tenant_support.py b/alembic/versions/c946955da633_multi_tenant_support.py new file mode 100644 index 000000000..2ad230974 --- /dev/null +++ b/alembic/versions/c946955da633_multi_tenant_support.py @@ -0,0 +1,113 @@ +"""Multi Tenant Support + +Revision ID: c946955da633 +Revises: 211ab850ef3d +Create Date: 2025-11-04 18:11:09.325158 + +""" + +from typing import Sequence, Union +from datetime import datetime, timezone +from uuid import uuid4 + +from alembic import op +import sqlalchemy as sa + +# revision identifiers, used by Alembic. +revision: str = "c946955da633" +down_revision: Union[str, None] = "211ab850ef3d" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def _define_user_table() -> sa.Table: + table = sa.Table( + "users", + sa.MetaData(), + sa.Column( + "id", + sa.UUID, + sa.ForeignKey("principals.id", ondelete="CASCADE"), + primary_key=True, + nullable=False, + ), + sa.Column("tenant_id", sa.UUID, sa.ForeignKey("tenants.id"), index=True, nullable=True), + ) + return table + + +def _define_dataset_table() -> sa.Table: + # Note: We can't use any Cognee model info to gather data (as it can change) in database so we must use our own table + # definition or load what is in the database + table = sa.Table( + "datasets", + sa.MetaData(), + sa.Column("id", sa.UUID, primary_key=True, default=uuid4), + sa.Column("name", sa.Text), + sa.Column( + "created_at", + sa.DateTime(timezone=True), + default=lambda: datetime.now(timezone.utc), + ), + sa.Column( + "updated_at", + sa.DateTime(timezone=True), + onupdate=lambda: datetime.now(timezone.utc), + ), + sa.Column("owner_id", sa.UUID(), sa.ForeignKey("principals.id"), index=True), + sa.Column("tenant_id", sa.UUID(), sa.ForeignKey("tenants.id"), index=True, nullable=True), + ) + + return table + + +def _get_column(inspector, table, name, schema=None): + for col in inspector.get_columns(table, schema=schema): + if col["name"] == name: + return col + return None + + +def upgrade() -> None: + conn = op.get_bind() + insp = sa.inspect(conn) + + dataset = _define_dataset_table() + user = _define_user_table() + + tenant_id_column = _get_column(insp, "datasets", "tenant_id") + if not tenant_id_column: + op.add_column("datasets", sa.Column("tenant_id", sa.UUID(), nullable=True)) + + # Build correlated subquery: select users.tenant_id for each dataset.owner_id + tenant_id_from_dataset_owner = ( + sa.select(user.c.tenant_id).where(user.c.id == dataset.c.owner_id).scalar_subquery() + ) + + # Update statement; restrict to rows where tenant_id is currently NULL + # update_stmt = ( + # sa.update(dataset) + # .values(tenant_id=subq) + # ) + + user = _define_user_table() + if op.get_context().dialect.name == "sqlite": + # If column doesn't exist create new original_extension column and update from values of extension column + with op.batch_alter_table("datasets") as batch_op: + batch_op.execute( + dataset.update().values( + tenant_id=tenant_id_from_dataset_owner, + ) + ) + else: + conn = op.get_bind() + conn.execute(dataset.update().values(tenant_id=tenant_id_from_dataset_owner)) + + op.create_index(op.f("ix_datasets_tenant_id"), "datasets", ["tenant_id"], unique=False) + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + + op.drop_column("datasets", "tenant_id") + # ### end Alembic commands ### From db2a32dd171a7db53487bec4c29474d1f36d1aa2 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Tue, 4 Nov 2025 19:17:02 +0100 Subject: [PATCH 20/85] test: Resolve issue permission example --- alembic/versions/c946955da633_multi_tenant_support.py | 9 +-------- examples/python/permissions_example.py | 2 +- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/alembic/versions/c946955da633_multi_tenant_support.py b/alembic/versions/c946955da633_multi_tenant_support.py index 2ad230974..09781c85c 100644 --- a/alembic/versions/c946955da633_multi_tenant_support.py +++ b/alembic/versions/c946955da633_multi_tenant_support.py @@ -79,18 +79,11 @@ def upgrade() -> None: if not tenant_id_column: op.add_column("datasets", sa.Column("tenant_id", sa.UUID(), nullable=True)) - # Build correlated subquery: select users.tenant_id for each dataset.owner_id + # Build subquery, select users.tenant_id for each dataset.owner_id tenant_id_from_dataset_owner = ( sa.select(user.c.tenant_id).where(user.c.id == dataset.c.owner_id).scalar_subquery() ) - # Update statement; restrict to rows where tenant_id is currently NULL - # update_stmt = ( - # sa.update(dataset) - # .values(tenant_id=subq) - # ) - - user = _define_user_table() if op.get_context().dialect.name == "sqlite": # If column doesn't exist create new original_extension column and update from values of extension column with op.batch_alter_table("datasets") as batch_op: diff --git a/examples/python/permissions_example.py b/examples/python/permissions_example.py index 7c140845c..5d1195a11 100644 --- a/examples/python/permissions_example.py +++ b/examples/python/permissions_example.py @@ -151,7 +151,7 @@ async def main(): # To add a user to a role he must be part of the same tenant/organization print("\nOperation started as user_2 to add user_3 to CogneeLab tenant/organization") await add_user_to_tenant( - user_id=user_3.id, tenant_id=tenant_id, owner_id=user_2.id, set_active_tenant=True + user_id=user_3.id, tenant_id=tenant_id, owner_id=user_2.id, set_as_active_tenant=True ) print( From f002d3bf0ef24e8db113625971bfb98e6473e6b7 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Tue, 4 Nov 2025 20:24:16 +0100 Subject: [PATCH 21/85] refactor: Update permissions example --- .../tenants/methods/add_user_to_tenant.py | 2 +- .../users/tenants/methods/select_tenant.py | 6 ++- examples/python/permissions_example.py | 45 ++++++++++++++++--- 3 files changed, 44 insertions(+), 9 deletions(-) diff --git a/cognee/modules/users/tenants/methods/add_user_to_tenant.py b/cognee/modules/users/tenants/methods/add_user_to_tenant.py index edadfe66b..eecc49f6f 100644 --- a/cognee/modules/users/tenants/methods/add_user_to_tenant.py +++ b/cognee/modules/users/tenants/methods/add_user_to_tenant.py @@ -16,7 +16,7 @@ from cognee.modules.users.exceptions import ( async def add_user_to_tenant( - user_id: UUID, tenant_id: UUID, owner_id: UUID, set_as_active_tenant: Optional[bool] = True + user_id: UUID, tenant_id: UUID, owner_id: UUID, set_as_active_tenant: Optional[bool] = False ): """ Add a user with the given id to the tenant with the given id. diff --git a/cognee/modules/users/tenants/methods/select_tenant.py b/cognee/modules/users/tenants/methods/select_tenant.py index 6e72fea2f..b444e9b1e 100644 --- a/cognee/modules/users/tenants/methods/select_tenant.py +++ b/cognee/modules/users/tenants/methods/select_tenant.py @@ -7,11 +7,12 @@ from sqlalchemy import select from cognee.infrastructure.databases.relational import get_relational_engine from cognee.modules.users.models.UserTenant import UserTenant from cognee.modules.users.methods import get_user +from cognee.modules.users.models.User import User from cognee.modules.users.permissions.methods import get_tenant from cognee.modules.users.exceptions import UserNotFoundError, TenantNotFoundError -async def select_tenant(user_id: UUID, tenant_id: Union[UUID, None]): +async def select_tenant(user_id: UUID, tenant_id: Union[UUID, None]) -> User: """ Set the users active tenant to provided tenant. @@ -33,7 +34,7 @@ async def select_tenant(user_id: UUID, tenant_id: Union[UUID, None]): user.tenant_id = None await session.merge(user) await session.commit() - return + return user tenant = await get_tenant(tenant_id) @@ -59,3 +60,4 @@ async def select_tenant(user_id: UUID, tenant_id: Union[UUID, None]): user.tenant_id = tenant_id await session.merge(user) await session.commit() + return user diff --git a/examples/python/permissions_example.py b/examples/python/permissions_example.py index 5d1195a11..fdbde00f0 100644 --- a/examples/python/permissions_example.py +++ b/examples/python/permissions_example.py @@ -3,6 +3,7 @@ import cognee import pathlib from cognee.modules.users.exceptions import PermissionDeniedError +from cognee.modules.users.tenants.methods import select_tenant from cognee.shared.logging_utils import get_logger from cognee.modules.search.types import SearchType from cognee.modules.users.methods import create_user @@ -116,6 +117,7 @@ async def main(): print( "\nOperation started as user_2 to give read permission to user_1 for the dataset owned by user_2" ) + await authorized_give_permission_on_datasets( user_1.id, [quantum_dataset_id], @@ -142,6 +144,9 @@ async def main(): print("User 2 is creating CogneeLab tenant/organization") tenant_id = await create_tenant("CogneeLab", user_2.id) + print("User 2 is selecting CogneeLab tenant/organization as active tenant") + await select_tenant(user_id=user_2.id, tenant_id=tenant_id) + print("\nUser 2 is creating Researcher role") role_id = await create_role(role_name="Researcher", owner_id=user_2.id) @@ -150,27 +155,55 @@ async def main(): # To add a user to a role he must be part of the same tenant/organization print("\nOperation started as user_2 to add user_3 to CogneeLab tenant/organization") - await add_user_to_tenant( - user_id=user_3.id, tenant_id=tenant_id, owner_id=user_2.id, set_as_active_tenant=True - ) + await add_user_to_tenant(user_id=user_3.id, tenant_id=tenant_id, owner_id=user_2.id) print( "\nOperation started by user_2, as tenant owner, to add user_3 to Researcher role inside the tenant/organization" ) await add_user_to_role(user_id=user_3.id, role_id=role_id, owner_id=user_2.id) + print("\nOperation as user_3 to select CogneeLab tenant/organization as active tenant") + await select_tenant(user_id=user_3.id, tenant_id=tenant_id) + print( - "\nOperation started as user_2 to give read permission to Researcher role for the dataset owned by user_2" + "\nOperation started as user_2, with CogneeLab as its active tenant, to give read permission to Researcher role for the dataset QUANTUM owned by user_2" + ) + # Even though the dataset owner is user_2, the dataset doesn't belong to the tenant/organization CogneeLab. + # So we can't assign permissions to it when we're acting in the CogneeLab tenant. + try: + await authorized_give_permission_on_datasets( + role_id, + [quantum_dataset_id], + "read", + user_2.id, + ) + except PermissionDeniedError: + print( + "User 2 could not give permission to the role as the QUANTUM dataset is not part of the CogneeLab tenant" + ) + + print( + "We will now create a new QUANTUM dataset in the CogneeLab tenant so that permissions can be assigned to the Researcher role inside the tenant/organization" + ) + # Re-create the QUANTUM dataset in the CogneeLab tenant. The old QUANTUM dataset is still owned by user_2 personally + # and can still be accessed by selecting the personal tenant for user 2. + await cognee.add([text], dataset_name="QUANTUM", user=user_2) + quantum_cognify_result = await cognee.cognify(["QUANTUM"], user=user_2) + + # The recreated Quantum dataset will now have a different dataset_id as it's a new dataset in a different organization + quantum_dataset_id_cognee_lab_tenant = extract_dataset_id_from_cognify(quantum_cognify_result) + print( + "\nOperation started as user_2, with CogneeLab as its active tenant, to give read permission to Researcher role for the dataset QUANTUM owned by the CogneeLab tenant" ) await authorized_give_permission_on_datasets( role_id, - [quantum_dataset_id], + [quantum_dataset_id_cognee_lab_tenant], "read", user_2.id, ) # Now user_3 can read from QUANTUM dataset as part of the Researcher role after proper permissions have been assigned by the QUANTUM dataset owner, user_2. - print("\nSearch result as user_3 on the dataset owned by user_2:") + print("\nSearch result as user_3 on the QUANTUM dataset owned by the CogneeLab organization:") search_results = await cognee.search( query_type=SearchType.GRAPH_COMPLETION, query_text="What is in the document?", From 7782f246d30f159e23c4ae46afa7896936a8a677 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Tue, 4 Nov 2025 20:54:00 +0100 Subject: [PATCH 22/85] refactor: Update permissions example to work with new changes --- .../routers/get_permissions_router.py | 2 +- .../users/tenants/methods/select_tenant.py | 9 +++----- examples/python/permissions_example.py | 22 ++++++++++--------- 3 files changed, 16 insertions(+), 17 deletions(-) diff --git a/cognee/api/v1/permissions/routers/get_permissions_router.py b/cognee/api/v1/permissions/routers/get_permissions_router.py index 20d35e748..db2c72705 100644 --- a/cognee/api/v1/permissions/routers/get_permissions_router.py +++ b/cognee/api/v1/permissions/routers/get_permissions_router.py @@ -259,7 +259,7 @@ def get_permissions_router() -> APIRouter: from cognee.modules.users.tenants.methods import select_tenant as select_tenant_method - await select_tenant_method(user_id=user.id, tenant_id=payload.tenant_id) + await select_tenant_method(user=user, tenant_id=payload.tenant_id) return JSONResponse( status_code=200, diff --git a/cognee/modules/users/tenants/methods/select_tenant.py b/cognee/modules/users/tenants/methods/select_tenant.py index b444e9b1e..cb291d5f2 100644 --- a/cognee/modules/users/tenants/methods/select_tenant.py +++ b/cognee/modules/users/tenants/methods/select_tenant.py @@ -6,19 +6,18 @@ from sqlalchemy import select from cognee.infrastructure.databases.relational import get_relational_engine from cognee.modules.users.models.UserTenant import UserTenant -from cognee.modules.users.methods import get_user from cognee.modules.users.models.User import User from cognee.modules.users.permissions.methods import get_tenant from cognee.modules.users.exceptions import UserNotFoundError, TenantNotFoundError -async def select_tenant(user_id: UUID, tenant_id: Union[UUID, None]) -> User: +async def select_tenant(user: User, tenant_id: Union[UUID, None]) -> User: """ Set the users active tenant to provided tenant. If None tenant_id is provided set current Tenant to the default single user-tenant Args: - user_id: Id of the user. + user: User object. tenant_id: Id of the tenant. Returns: @@ -27,8 +26,6 @@ async def select_tenant(user_id: UUID, tenant_id: Union[UUID, None]) -> User: """ db_engine = get_relational_engine() async with db_engine.get_async_session() as session: - user = await get_user(user_id) - if tenant_id is None: # If no tenant_id is provided set current Tenant to the single user-tenant user.tenant_id = None @@ -46,7 +43,7 @@ async def select_tenant(user_id: UUID, tenant_id: Union[UUID, None]) -> User: # Check if User is part of Tenant result = await session.execute( select(UserTenant) - .where(UserTenant.user_id == user_id) + .where(UserTenant.user_id == user.id) .where(UserTenant.tenant_id == tenant_id) ) diff --git a/examples/python/permissions_example.py b/examples/python/permissions_example.py index fdbde00f0..4bbd30bea 100644 --- a/examples/python/permissions_example.py +++ b/examples/python/permissions_example.py @@ -145,7 +145,7 @@ async def main(): tenant_id = await create_tenant("CogneeLab", user_2.id) print("User 2 is selecting CogneeLab tenant/organization as active tenant") - await select_tenant(user_id=user_2.id, tenant_id=tenant_id) + await select_tenant(user=user_2, tenant_id=tenant_id) print("\nUser 2 is creating Researcher role") role_id = await create_role(role_name="Researcher", owner_id=user_2.id) @@ -163,7 +163,7 @@ async def main(): await add_user_to_role(user_id=user_3.id, role_id=role_id, owner_id=user_2.id) print("\nOperation as user_3 to select CogneeLab tenant/organization as active tenant") - await select_tenant(user_id=user_3.id, tenant_id=tenant_id) + await select_tenant(user=user_3, tenant_id=tenant_id) print( "\nOperation started as user_2, with CogneeLab as its active tenant, to give read permission to Researcher role for the dataset QUANTUM owned by user_2" @@ -183,21 +183,23 @@ async def main(): ) print( - "We will now create a new QUANTUM dataset in the CogneeLab tenant so that permissions can be assigned to the Researcher role inside the tenant/organization" + "We will now create a new QUANTUM dataset with the QUANTUM_COGNEE_LAB name in the CogneeLab tenant so that permissions can be assigned to the Researcher role inside the tenant/organization" ) - # Re-create the QUANTUM dataset in the CogneeLab tenant. The old QUANTUM dataset is still owned by user_2 personally + # We can re-create the QUANTUM dataset in the CogneeLab tenant. The old QUANTUM dataset is still owned by user_2 personally # and can still be accessed by selecting the personal tenant for user 2. - await cognee.add([text], dataset_name="QUANTUM", user=user_2) - quantum_cognify_result = await cognee.cognify(["QUANTUM"], user=user_2) + await cognee.add([text], dataset_name="QUANTUM_COGNEE_LAB", user=user_2) + quantum_cognee_lab_cognify_result = await cognee.cognify(["QUANTUM_COGNEE_LAB"], user=user_2) # The recreated Quantum dataset will now have a different dataset_id as it's a new dataset in a different organization - quantum_dataset_id_cognee_lab_tenant = extract_dataset_id_from_cognify(quantum_cognify_result) + quantum_cognee_lab_dataset_id = extract_dataset_id_from_cognify( + quantum_cognee_lab_cognify_result + ) print( "\nOperation started as user_2, with CogneeLab as its active tenant, to give read permission to Researcher role for the dataset QUANTUM owned by the CogneeLab tenant" ) await authorized_give_permission_on_datasets( role_id, - [quantum_dataset_id_cognee_lab_tenant], + [quantum_cognee_lab_dataset_id], "read", user_2.id, ) @@ -207,8 +209,8 @@ async def main(): search_results = await cognee.search( query_type=SearchType.GRAPH_COMPLETION, query_text="What is in the document?", - user=user_1, - dataset_ids=[quantum_dataset_id], + user=user_3, + dataset_ids=[quantum_cognee_lab_dataset_id], ) for result in search_results: print(f"{result}\n") From c2aaec2a827fbdd8f91747989753b4f62a41fa38 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Tue, 4 Nov 2025 23:34:51 +0100 Subject: [PATCH 23/85] refactor: Resolve issue with permissions example --- .../api/v1/permissions/routers/get_permissions_router.py | 2 +- cognee/modules/users/tenants/methods/select_tenant.py | 6 ++++-- examples/python/permissions_example.py | 8 ++++++-- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/cognee/api/v1/permissions/routers/get_permissions_router.py b/cognee/api/v1/permissions/routers/get_permissions_router.py index db2c72705..20d35e748 100644 --- a/cognee/api/v1/permissions/routers/get_permissions_router.py +++ b/cognee/api/v1/permissions/routers/get_permissions_router.py @@ -259,7 +259,7 @@ def get_permissions_router() -> APIRouter: from cognee.modules.users.tenants.methods import select_tenant as select_tenant_method - await select_tenant_method(user=user, tenant_id=payload.tenant_id) + await select_tenant_method(user_id=user.id, tenant_id=payload.tenant_id) return JSONResponse( status_code=200, diff --git a/cognee/modules/users/tenants/methods/select_tenant.py b/cognee/modules/users/tenants/methods/select_tenant.py index cb291d5f2..83c11dc91 100644 --- a/cognee/modules/users/tenants/methods/select_tenant.py +++ b/cognee/modules/users/tenants/methods/select_tenant.py @@ -5,19 +5,20 @@ import sqlalchemy.exc from sqlalchemy import select from cognee.infrastructure.databases.relational import get_relational_engine +from cognee.modules.users.methods.get_user import get_user from cognee.modules.users.models.UserTenant import UserTenant from cognee.modules.users.models.User import User from cognee.modules.users.permissions.methods import get_tenant from cognee.modules.users.exceptions import UserNotFoundError, TenantNotFoundError -async def select_tenant(user: User, tenant_id: Union[UUID, None]) -> User: +async def select_tenant(user_id: UUID, tenant_id: Union[UUID, None]) -> User: """ Set the users active tenant to provided tenant. If None tenant_id is provided set current Tenant to the default single user-tenant Args: - user: User object. + user_id: UUID of the user. tenant_id: Id of the tenant. Returns: @@ -26,6 +27,7 @@ async def select_tenant(user: User, tenant_id: Union[UUID, None]) -> User: """ db_engine = get_relational_engine() async with db_engine.get_async_session() as session: + user = await get_user(user_id) if tenant_id is None: # If no tenant_id is provided set current Tenant to the single user-tenant user.tenant_id = None diff --git a/examples/python/permissions_example.py b/examples/python/permissions_example.py index 4bbd30bea..c0b104023 100644 --- a/examples/python/permissions_example.py +++ b/examples/python/permissions_example.py @@ -145,7 +145,7 @@ async def main(): tenant_id = await create_tenant("CogneeLab", user_2.id) print("User 2 is selecting CogneeLab tenant/organization as active tenant") - await select_tenant(user=user_2, tenant_id=tenant_id) + await select_tenant(user_id=user_2.id, tenant_id=tenant_id) print("\nUser 2 is creating Researcher role") role_id = await create_role(role_name="Researcher", owner_id=user_2.id) @@ -163,7 +163,7 @@ async def main(): await add_user_to_role(user_id=user_3.id, role_id=role_id, owner_id=user_2.id) print("\nOperation as user_3 to select CogneeLab tenant/organization as active tenant") - await select_tenant(user=user_3, tenant_id=tenant_id) + await select_tenant(user_id=user_3.id, tenant_id=tenant_id) print( "\nOperation started as user_2, with CogneeLab as its active tenant, to give read permission to Researcher role for the dataset QUANTUM owned by user_2" @@ -187,6 +187,10 @@ async def main(): ) # We can re-create the QUANTUM dataset in the CogneeLab tenant. The old QUANTUM dataset is still owned by user_2 personally # and can still be accessed by selecting the personal tenant for user 2. + from cognee.modules.users.methods import get_user + + # Note: We need to update user_2 from the database to refresh its tenant context changes + user_2 = await get_user(user_2.id) await cognee.add([text], dataset_name="QUANTUM_COGNEE_LAB", user=user_2) quantum_cognee_lab_cognify_result = await cognee.cognify(["QUANTUM_COGNEE_LAB"], user=user_2) From 1643b13c95ba83b08abb0d1afeec80767049db26 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 5 Nov 2025 12:43:01 +0100 Subject: [PATCH 24/85] chore: add table creation for multi-tenancy to migration --- .../c946955da633_multi_tenant_support.py | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/alembic/versions/c946955da633_multi_tenant_support.py b/alembic/versions/c946955da633_multi_tenant_support.py index 09781c85c..fc45644d0 100644 --- a/alembic/versions/c946955da633_multi_tenant_support.py +++ b/alembic/versions/c946955da633_multi_tenant_support.py @@ -75,6 +75,28 @@ def upgrade() -> None: dataset = _define_dataset_table() user = _define_user_table() + if "user_tenants" not in insp.get_table_names(): + tenant_id_from_user = sa.select(user.c.tenant_id).scalar_subquery() + # Define table with all necessary columns including primary key + user_tenants = op.create_table( + "user_tenants", + sa.Column("user_id", sa.UUID, sa.ForeignKey("users.id"), primary_key=True), + sa.Column("tenant_id", sa.UUID, sa.ForeignKey("tenants.id"), primary_key=True), + sa.Column("created_at", sa.DateTime(), default=lambda: datetime.now(timezone.utc)), + ) + if op.get_context().dialect.name == "sqlite": + # If column doesn't exist create new original_extension column and update from values of extension column + with op.batch_alter_table("user_tenants") as batch_op: + batch_op.execute( + user_tenants.update().values( + tenant_id=tenant_id_from_user, + user_id=user.c.id, + ) + ) + else: + conn = op.get_bind() + conn.execute(dataset.update().values(tenant_id=tenant_id_from_user, user_id=user.c.id)) + tenant_id_column = _get_column(insp, "datasets", "tenant_id") if not tenant_id_column: op.add_column("datasets", sa.Column("tenant_id", sa.UUID(), nullable=True)) From 9fc4199958045cb5ed06cfcaf783baae247760d6 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 5 Nov 2025 13:18:47 +0100 Subject: [PATCH 25/85] fix: Resolve issue with cleaning acl table --- .../ab7e313804ae_permission_system_rework.py | 72 +++++++++++-------- 1 file changed, 43 insertions(+), 29 deletions(-) diff --git a/alembic/versions/ab7e313804ae_permission_system_rework.py b/alembic/versions/ab7e313804ae_permission_system_rework.py index bd69b9b41..d83f946a6 100644 --- a/alembic/versions/ab7e313804ae_permission_system_rework.py +++ b/alembic/versions/ab7e313804ae_permission_system_rework.py @@ -144,44 +144,58 @@ def _create_data_permission(conn, user_id, data_id, permission_name): ) +def _get_column(inspector, table, name, schema=None): + for col in inspector.get_columns(table, schema=schema): + if col["name"] == name: + return col + return None + + def upgrade() -> None: conn = op.get_bind() + insp = sa.inspect(conn) - # Recreate ACLs table with default permissions set to datasets instead of documents - op.drop_table("acls") + dataset_id_column = _get_column(insp, "acls", "dataset_id") + if not dataset_id_column: + # Recreate ACLs table with default permissions set to datasets instead of documents + op.drop_table("acls") - acls_table = op.create_table( - "acls", - sa.Column("id", UUID, primary_key=True, default=uuid4), - sa.Column( - "created_at", sa.DateTime(timezone=True), default=lambda: datetime.now(timezone.utc) - ), - sa.Column( - "updated_at", sa.DateTime(timezone=True), onupdate=lambda: datetime.now(timezone.utc) - ), - sa.Column("principal_id", UUID, sa.ForeignKey("principals.id")), - sa.Column("permission_id", UUID, sa.ForeignKey("permissions.id")), - sa.Column("dataset_id", UUID, sa.ForeignKey("datasets.id", ondelete="CASCADE")), - ) + acls_table = op.create_table( + "acls", + sa.Column("id", UUID, primary_key=True, default=uuid4), + sa.Column( + "created_at", sa.DateTime(timezone=True), default=lambda: datetime.now(timezone.utc) + ), + sa.Column( + "updated_at", + sa.DateTime(timezone=True), + onupdate=lambda: datetime.now(timezone.utc), + ), + sa.Column("principal_id", UUID, sa.ForeignKey("principals.id")), + sa.Column("permission_id", UUID, sa.ForeignKey("permissions.id")), + sa.Column("dataset_id", UUID, sa.ForeignKey("datasets.id", ondelete="CASCADE")), + ) - # Note: We can't use any Cognee model info to gather data (as it can change) in database so we must use our own table - # definition or load what is in the database - dataset_table = _define_dataset_table() - datasets = conn.execute(sa.select(dataset_table)).fetchall() + # Note: We can't use any Cognee model info to gather data (as it can change) in database so we must use our own table + # definition or load what is in the database + dataset_table = _define_dataset_table() + datasets = conn.execute(sa.select(dataset_table)).fetchall() - if not datasets: - return + if not datasets: + return - acl_list = [] + acl_list = [] - for dataset in datasets: - acl_list.append(_create_dataset_permission(conn, dataset.owner_id, dataset.id, "read")) - acl_list.append(_create_dataset_permission(conn, dataset.owner_id, dataset.id, "write")) - acl_list.append(_create_dataset_permission(conn, dataset.owner_id, dataset.id, "share")) - acl_list.append(_create_dataset_permission(conn, dataset.owner_id, dataset.id, "delete")) + for dataset in datasets: + acl_list.append(_create_dataset_permission(conn, dataset.owner_id, dataset.id, "read")) + acl_list.append(_create_dataset_permission(conn, dataset.owner_id, dataset.id, "write")) + acl_list.append(_create_dataset_permission(conn, dataset.owner_id, dataset.id, "share")) + acl_list.append( + _create_dataset_permission(conn, dataset.owner_id, dataset.id, "delete") + ) - if acl_list: - op.bulk_insert(acls_table, acl_list) + if acl_list: + op.bulk_insert(acls_table, acl_list) def downgrade() -> None: From 1b4aa5c67b0412b134e903855f1ad3d12c7eab0e Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 5 Nov 2025 15:15:15 +0100 Subject: [PATCH 26/85] fix: resolve issue with cypher search --- cognee/modules/retrieval/cypher_search_retriever.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cognee/modules/retrieval/cypher_search_retriever.py b/cognee/modules/retrieval/cypher_search_retriever.py index 9978f2536..b8cbce558 100644 --- a/cognee/modules/retrieval/cypher_search_retriever.py +++ b/cognee/modules/retrieval/cypher_search_retriever.py @@ -49,8 +49,9 @@ class CypherSearchRetriever(BaseRetriever): if is_empty: logger.warning("Search attempt on an empty knowledge graph") return [] + from fastapi.encoders import jsonable_encoder - result = await graph_engine.query(query) + result = jsonable_encoder(await graph_engine.query(query)) except Exception as e: logger.error("Failed to execture cypher search retrieval: %s", str(e)) raise CypherSearchError() from e From 1c142734b4f031e466d5a7b111f3ed2e55e338c3 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 5 Nov 2025 15:22:23 +0100 Subject: [PATCH 27/85] refactor: change import to top of file --- cognee/modules/retrieval/cypher_search_retriever.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cognee/modules/retrieval/cypher_search_retriever.py b/cognee/modules/retrieval/cypher_search_retriever.py index b8cbce558..01816f3df 100644 --- a/cognee/modules/retrieval/cypher_search_retriever.py +++ b/cognee/modules/retrieval/cypher_search_retriever.py @@ -1,4 +1,6 @@ from typing import Any, Optional +from fastapi.encoders import jsonable_encoder + from cognee.infrastructure.databases.graph import get_graph_engine from cognee.modules.retrieval.base_retriever import BaseRetriever from cognee.modules.retrieval.utils.completion import generate_completion @@ -49,7 +51,6 @@ class CypherSearchRetriever(BaseRetriever): if is_empty: logger.warning("Search attempt on an empty knowledge graph") return [] - from fastapi.encoders import jsonable_encoder result = jsonable_encoder(await graph_engine.query(query)) except Exception as e: From fa4c50f972e27190fb97d20b2e61726b52bb3f2a Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 5 Nov 2025 16:05:33 +0100 Subject: [PATCH 28/85] fix: Resolve issue with sync migration not working for postgresql --- .../211ab850ef3d_add_sync_operations_table.py | 32 +++++++++++++------ 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/alembic/versions/211ab850ef3d_add_sync_operations_table.py b/alembic/versions/211ab850ef3d_add_sync_operations_table.py index 370aab1a4..9c6e81f12 100644 --- a/alembic/versions/211ab850ef3d_add_sync_operations_table.py +++ b/alembic/versions/211ab850ef3d_add_sync_operations_table.py @@ -10,6 +10,7 @@ from typing import Sequence, Union from alembic import op import sqlalchemy as sa +from sqlalchemy.dialects import postgresql # revision identifiers, used by Alembic. @@ -27,6 +28,27 @@ def upgrade() -> None: inspector = sa.inspect(connection) if "sync_operations" not in inspector.get_table_names(): + if op.get_context().dialect.name == "postgresql": + syncstatus = postgresql.ENUM( + "STARTED", + "IN_PROGRESS", + "COMPLETED", + "FAILED", + "CANCELLED", + name="syncstatus", + create_type=False, + ) + else: + syncstatus = sa.Enum( + "STARTED", + "IN_PROGRESS", + "COMPLETED", + "FAILED", + "CANCELLED", + name="syncstatus", + create_type=False, + ) + # Table doesn't exist, create it normally op.create_table( "sync_operations", @@ -34,15 +56,7 @@ def upgrade() -> None: sa.Column("run_id", sa.Text(), nullable=True), sa.Column( "status", - sa.Enum( - "STARTED", - "IN_PROGRESS", - "COMPLETED", - "FAILED", - "CANCELLED", - name="syncstatus", - create_type=False, - ), + syncstatus, nullable=True, ), sa.Column("progress_percentage", sa.Integer(), nullable=True), From c4807a0c6751e05a4fb04439afa183fa7620c8f5 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 5 Nov 2025 16:14:37 +0100 Subject: [PATCH 29/85] refactor: Use user_tenants table to update --- alembic/versions/c946955da633_multi_tenant_support.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/alembic/versions/c946955da633_multi_tenant_support.py b/alembic/versions/c946955da633_multi_tenant_support.py index fc45644d0..3f7bde5a2 100644 --- a/alembic/versions/c946955da633_multi_tenant_support.py +++ b/alembic/versions/c946955da633_multi_tenant_support.py @@ -95,7 +95,9 @@ def upgrade() -> None: ) else: conn = op.get_bind() - conn.execute(dataset.update().values(tenant_id=tenant_id_from_user, user_id=user.c.id)) + conn.execute( + user_tenants.update().values(tenant_id=tenant_id_from_user, user_id=user.c.id) + ) tenant_id_column = _get_column(insp, "datasets", "tenant_id") if not tenant_id_column: From 9b6cbaf389b172fb86da42ac1c9c8fe544202aae Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 5 Nov 2025 17:24:11 +0100 Subject: [PATCH 30/85] chore: Add multi tenant migration --- .../211ab850ef3d_add_sync_operations_table.py | 6 ++- .../c946955da633_multi_tenant_support.py | 38 +++++++++++-------- 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/alembic/versions/211ab850ef3d_add_sync_operations_table.py b/alembic/versions/211ab850ef3d_add_sync_operations_table.py index 9c6e81f12..976439a32 100644 --- a/alembic/versions/211ab850ef3d_add_sync_operations_table.py +++ b/alembic/versions/211ab850ef3d_add_sync_operations_table.py @@ -36,7 +36,8 @@ def upgrade() -> None: "FAILED", "CANCELLED", name="syncstatus", - create_type=False, + create_type=True, + checkfirst=True, ) else: syncstatus = sa.Enum( @@ -46,7 +47,8 @@ def upgrade() -> None: "FAILED", "CANCELLED", name="syncstatus", - create_type=False, + create_type=True, + checkfirst=True, ) # Table doesn't exist, create it normally diff --git a/alembic/versions/c946955da633_multi_tenant_support.py b/alembic/versions/c946955da633_multi_tenant_support.py index 3f7bde5a2..6d21f8fc7 100644 --- a/alembic/versions/c946955da633_multi_tenant_support.py +++ b/alembic/versions/c946955da633_multi_tenant_support.py @@ -20,6 +20,10 @@ branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None +def _now(): + return datetime.now(timezone.utc) + + def _define_user_table() -> sa.Table: table = sa.Table( "users", @@ -76,27 +80,29 @@ def upgrade() -> None: user = _define_user_table() if "user_tenants" not in insp.get_table_names(): - tenant_id_from_user = sa.select(user.c.tenant_id).scalar_subquery() # Define table with all necessary columns including primary key user_tenants = op.create_table( "user_tenants", sa.Column("user_id", sa.UUID, sa.ForeignKey("users.id"), primary_key=True), sa.Column("tenant_id", sa.UUID, sa.ForeignKey("tenants.id"), primary_key=True), - sa.Column("created_at", sa.DateTime(), default=lambda: datetime.now(timezone.utc)), + sa.Column( + "created_at", sa.DateTime(timezone=True), default=lambda: datetime.now(timezone.utc) + ), ) - if op.get_context().dialect.name == "sqlite": - # If column doesn't exist create new original_extension column and update from values of extension column - with op.batch_alter_table("user_tenants") as batch_op: - batch_op.execute( - user_tenants.update().values( - tenant_id=tenant_id_from_user, - user_id=user.c.id, - ) - ) - else: - conn = op.get_bind() - conn.execute( - user_tenants.update().values(tenant_id=tenant_id_from_user, user_id=user.c.id) + + # Get all users with their tenant_id + user_data = conn.execute( + sa.select(user.c.id, user.c.tenant_id).where(user.c.tenant_id.isnot(None)) + ).fetchall() + + # Insert into user_tenants table + if user_data: + op.bulk_insert( + user_tenants, + [ + {"user_id": user_id, "tenant_id": tenant_id, "created_at": _now()} + for user_id, tenant_id in user_data + ], ) tenant_id_column = _get_column(insp, "datasets", "tenant_id") @@ -125,6 +131,6 @@ def upgrade() -> None: def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### - + op.drop_table("user_tenants") op.drop_column("datasets", "tenant_id") # ### end Alembic commands ### From 1ef5805c5708ae82eed17335e891eddafd794cf4 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 5 Nov 2025 17:50:13 +0100 Subject: [PATCH 31/85] fix: Resolve issue with sync migration --- .../211ab850ef3d_add_sync_operations_table.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/alembic/versions/211ab850ef3d_add_sync_operations_table.py b/alembic/versions/211ab850ef3d_add_sync_operations_table.py index 976439a32..30049b44b 100644 --- a/alembic/versions/211ab850ef3d_add_sync_operations_table.py +++ b/alembic/versions/211ab850ef3d_add_sync_operations_table.py @@ -27,6 +27,12 @@ def upgrade() -> None: connection = op.get_bind() inspector = sa.inspect(connection) + if op.get_context().dialect.name == "postgresql": + syncstatus_enum = postgresql.ENUM( + "STARTED", "IN_PROGRESS", "COMPLETED", "FAILED", "CANCELLED", name="syncstatus" + ) + syncstatus_enum.create(op.get_bind(), checkfirst=True) + if "sync_operations" not in inspector.get_table_names(): if op.get_context().dialect.name == "postgresql": syncstatus = postgresql.ENUM( @@ -36,8 +42,7 @@ def upgrade() -> None: "FAILED", "CANCELLED", name="syncstatus", - create_type=True, - checkfirst=True, + create_type=False, ) else: syncstatus = sa.Enum( @@ -47,8 +52,7 @@ def upgrade() -> None: "FAILED", "CANCELLED", name="syncstatus", - create_type=True, - checkfirst=True, + create_type=False, ) # Table doesn't exist, create it normally From ce64f242b7a5480bbe9763bbb523fafb7b10e9fb Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 5 Nov 2025 18:04:05 +0100 Subject: [PATCH 32/85] refactor: add droping of index as well --- alembic/versions/c946955da633_multi_tenant_support.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/alembic/versions/c946955da633_multi_tenant_support.py b/alembic/versions/c946955da633_multi_tenant_support.py index 6d21f8fc7..ba451fc03 100644 --- a/alembic/versions/c946955da633_multi_tenant_support.py +++ b/alembic/versions/c946955da633_multi_tenant_support.py @@ -126,11 +126,10 @@ def upgrade() -> None: conn = op.get_bind() conn.execute(dataset.update().values(tenant_id=tenant_id_from_dataset_owner)) - op.create_index(op.f("ix_datasets_tenant_id"), "datasets", ["tenant_id"], unique=False) - def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### - op.drop_table("user_tenants") + op.drop_index(op.f("ix_datasets_tenant_id"), table_name="datasets") op.drop_column("datasets", "tenant_id") + op.drop_table("user_tenants") # ### end Alembic commands ### From 79bd2b2576b913528feb92ca6832242133bf9822 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Thu, 6 Nov 2025 09:48:01 +0100 Subject: [PATCH 33/85] chore: fixes ruff formatting --- .../v1/cognify/routers/get_cognify_router.py | 15 ++++++++---- cognee/api/v1/ontologies/__init__.py | 2 +- cognee/api/v1/ontologies/ontologies.py | 22 ++++++++++------- .../ontologies/routers/get_ontology_router.py | 11 ++++----- cognee/tests/test_ontology_endpoint.py | 24 ++++++++++++------- 5 files changed, 44 insertions(+), 30 deletions(-) diff --git a/cognee/api/v1/cognify/routers/get_cognify_router.py b/cognee/api/v1/cognify/routers/get_cognify_router.py index 246cc6c56..252ffe7bf 100644 --- a/cognee/api/v1/cognify/routers/get_cognify_router.py +++ b/cognee/api/v1/cognify/routers/get_cognify_router.py @@ -42,8 +42,7 @@ class CognifyPayloadDTO(InDTO): default="", description="Custom prompt for entity extraction and graph generation" ) ontology_key: Optional[str] = Field( - default=None, - description="Reference to previously uploaded ontology" + default=None, description="Reference to previously uploaded ontology" ) @@ -123,16 +122,22 @@ def get_cognify_router() -> APIRouter: if payload.ontology_key: ontology_service = OntologyService() try: - ontology_content = ontology_service.get_ontology_content(payload.ontology_key, user) + ontology_content = ontology_service.get_ontology_content( + payload.ontology_key, user + ) from cognee.modules.ontology.ontology_config import Config - from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver + from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import ( + RDFLibOntologyResolver, + ) from io import StringIO ontology_stream = StringIO(ontology_content) config_to_use: Config = { "ontology_config": { - "ontology_resolver": RDFLibOntologyResolver(ontology_file=ontology_stream) + "ontology_resolver": RDFLibOntologyResolver( + ontology_file=ontology_stream + ) } } except ValueError as e: diff --git a/cognee/api/v1/ontologies/__init__.py b/cognee/api/v1/ontologies/__init__.py index c25064edc..b90d46c3d 100644 --- a/cognee/api/v1/ontologies/__init__.py +++ b/cognee/api/v1/ontologies/__init__.py @@ -1,4 +1,4 @@ from .ontologies import OntologyService from .routers.get_ontology_router import get_ontology_router -__all__ = ["OntologyService", "get_ontology_router"] \ No newline at end of file +__all__ = ["OntologyService", "get_ontology_router"] diff --git a/cognee/api/v1/ontologies/ontologies.py b/cognee/api/v1/ontologies/ontologies.py index fb7f3cd9a..6bfb7658e 100644 --- a/cognee/api/v1/ontologies/ontologies.py +++ b/cognee/api/v1/ontologies/ontologies.py @@ -6,6 +6,7 @@ from datetime import datetime, timezone from typing import Optional from dataclasses import dataclass + @dataclass class OntologyMetadata: ontology_key: str @@ -14,6 +15,7 @@ class OntologyMetadata: uploaded_at: str description: Optional[str] = None + class OntologyService: def __init__(self): pass @@ -33,18 +35,20 @@ class OntologyService: def _load_metadata(self, user_dir: Path) -> dict: metadata_path = self._get_metadata_path(user_dir) if metadata_path.exists(): - with open(metadata_path, 'r') as f: + with open(metadata_path, "r") as f: return json.load(f) return {} def _save_metadata(self, user_dir: Path, metadata: dict): metadata_path = self._get_metadata_path(user_dir) - with open(metadata_path, 'w') as f: + with open(metadata_path, "w") as f: json.dump(metadata, f, indent=2) - async def upload_ontology(self, ontology_key: str, file, user, description: Optional[str] = None) -> OntologyMetadata: + async def upload_ontology( + self, ontology_key: str, file, user, description: Optional[str] = None + ) -> OntologyMetadata: # Validate file format - if not file.filename.lower().endswith('.owl'): + if not file.filename.lower().endswith(".owl"): raise ValueError("File must be in .owl format") user_dir = self._get_user_dir(str(user.id)) @@ -61,7 +65,7 @@ class OntologyService: # Save file file_path = user_dir / f"{ontology_key}.owl" - with open(file_path, 'wb') as f: + with open(file_path, "wb") as f: f.write(content) # Update metadata @@ -69,7 +73,7 @@ class OntologyService: "filename": file.filename, "size_bytes": len(content), "uploaded_at": datetime.now(timezone.utc).isoformat(), - "description": description + "description": description, } metadata[ontology_key] = ontology_metadata self._save_metadata(user_dir, metadata) @@ -79,7 +83,7 @@ class OntologyService: filename=file.filename, size_bytes=len(content), uploaded_at=ontology_metadata["uploaded_at"], - description=description + description=description, ) def get_ontology_content(self, ontology_key: str, user) -> str: @@ -93,9 +97,9 @@ class OntologyService: if not file_path.exists(): raise ValueError(f"Ontology file for key '{ontology_key}' not found") - with open(file_path, 'r', encoding='utf-8') as f: + with open(file_path, "r", encoding="utf-8") as f: return f.read() def list_ontologies(self, user) -> dict: user_dir = self._get_user_dir(str(user.id)) - return self._load_metadata(user_dir) \ No newline at end of file + return self._load_metadata(user_dir) diff --git a/cognee/api/v1/ontologies/routers/get_ontology_router.py b/cognee/api/v1/ontologies/routers/get_ontology_router.py index c171fa7bb..f5c51ba21 100644 --- a/cognee/api/v1/ontologies/routers/get_ontology_router.py +++ b/cognee/api/v1/ontologies/routers/get_ontology_router.py @@ -8,6 +8,7 @@ from cognee.shared.utils import send_telemetry from cognee import __version__ as cognee_version from ..ontologies import OntologyService + def get_ontology_router() -> APIRouter: router = APIRouter() ontology_service = OntologyService() @@ -17,7 +18,7 @@ def get_ontology_router() -> APIRouter: ontology_key: str = Form(...), ontology_file: UploadFile = File(...), description: Optional[str] = Form(None), - user: User = Depends(get_authenticated_user) + user: User = Depends(get_authenticated_user), ): """ Upload an ontology file with a named key for later use in cognify operations. @@ -51,7 +52,7 @@ def get_ontology_router() -> APIRouter: "ontology_key": result.ontology_key, "filename": result.filename, "size_bytes": result.size_bytes, - "uploaded_at": result.uploaded_at + "uploaded_at": result.uploaded_at, } except ValueError as e: return JSONResponse(status_code=400, content={"error": str(e)}) @@ -59,9 +60,7 @@ def get_ontology_router() -> APIRouter: return JSONResponse(status_code=500, content={"error": str(e)}) @router.get("", response_model=dict) - async def list_ontologies( - user: User = Depends(get_authenticated_user) - ): + async def list_ontologies(user: User = Depends(get_authenticated_user)): """ List all uploaded ontologies for the authenticated user. @@ -86,4 +85,4 @@ def get_ontology_router() -> APIRouter: except Exception as e: return JSONResponse(status_code=500, content={"error": str(e)}) - return router \ No newline at end of file + return router diff --git a/cognee/tests/test_ontology_endpoint.py b/cognee/tests/test_ontology_endpoint.py index 4849f8649..b5cedfafe 100644 --- a/cognee/tests/test_ontology_endpoint.py +++ b/cognee/tests/test_ontology_endpoint.py @@ -8,37 +8,40 @@ from cognee.api.client import app gau_mod = importlib.import_module("cognee.modules.users.methods.get_authenticated_user") + @pytest.fixture def client(): return TestClient(app) + @pytest.fixture def mock_user(): user = Mock() user.id = "test-user-123" return user + @pytest.fixture def mock_default_user(): """Mock default user for testing.""" return SimpleNamespace( - id=uuid.uuid4(), - email="default@example.com", - is_active=True, - tenant_id=uuid.uuid4() + id=uuid.uuid4(), email="default@example.com", is_active=True, tenant_id=uuid.uuid4() ) + @patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) def test_upload_ontology_success(mock_get_default_user, client, mock_default_user): """Test successful ontology upload""" mock_get_default_user.return_value = mock_default_user - ontology_content = b"" + ontology_content = ( + b"" + ) unique_key = f"test_ontology_{uuid.uuid4().hex[:8]}" response = client.post( "/api/v1/ontologies", files={"ontology_file": ("test.owl", ontology_content)}, - data={"ontology_key": unique_key, "description": "Test"} + data={"ontology_key": unique_key, "description": "Test"}, ) assert response.status_code == 200 @@ -46,6 +49,7 @@ def test_upload_ontology_success(mock_get_default_user, client, mock_default_use assert data["ontology_key"] == unique_key assert "uploaded_at" in data + @patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) def test_upload_ontology_invalid_file(mock_get_default_user, client, mock_default_user): """Test 400 response for non-.owl files""" @@ -54,10 +58,11 @@ def test_upload_ontology_invalid_file(mock_get_default_user, client, mock_defaul response = client.post( "/api/v1/ontologies", files={"ontology_file": ("test.txt", b"not xml")}, - data={"ontology_key": unique_key} + data={"ontology_key": unique_key}, ) assert response.status_code == 400 + @patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) def test_upload_ontology_missing_data(mock_get_default_user, client, mock_default_user): """Test 400 response for missing file or key""" @@ -70,6 +75,7 @@ def test_upload_ontology_missing_data(mock_get_default_user, client, mock_defaul response = client.post("/api/v1/ontologies", files={"ontology_file": ("test.owl", b"xml")}) assert response.status_code == 400 + @patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) def test_upload_ontology_unauthorized(mock_get_default_user, client, mock_default_user): """Test behavior when default user is provided (no explicit authentication)""" @@ -78,7 +84,7 @@ def test_upload_ontology_unauthorized(mock_get_default_user, client, mock_defaul response = client.post( "/api/v1/ontologies", files={"ontology_file": ("test.owl", b"")}, - data={"ontology_key": unique_key} + data={"ontology_key": unique_key}, ) # The current system provides a default user when no explicit authentication is given @@ -86,4 +92,4 @@ def test_upload_ontology_unauthorized(mock_get_default_user, client, mock_defaul assert response.status_code == 200 data = response.json() assert data["ontology_key"] == unique_key - assert "uploaded_at" in data \ No newline at end of file + assert "uploaded_at" in data From 5271ee49019f75196adc3f3fe3069efd7bc72e14 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 6 Nov 2025 11:12:12 +0100 Subject: [PATCH 34/85] fix: Resolve issue with empty node set --- cognee/api/v1/add/routers/get_add_router.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cognee/api/v1/add/routers/get_add_router.py b/cognee/api/v1/add/routers/get_add_router.py index b2e7068b0..39dc1a3e6 100644 --- a/cognee/api/v1/add/routers/get_add_router.py +++ b/cognee/api/v1/add/routers/get_add_router.py @@ -82,7 +82,9 @@ def get_add_router() -> APIRouter: datasetName, user=user, dataset_id=datasetId, - node_set=node_set if node_set else None, + node_set=node_set + if node_set != [""] + else None, # Transform default node_set endpoint value to None ) if isinstance(add_run, PipelineRunErrored): From cc60141c8dec285e48a05e8550782cb4708eb6ad Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Thu, 6 Nov 2025 11:46:34 +0100 Subject: [PATCH 35/85] test: change key for wighted edges example --- .github/workflows/weighted_edges_tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/weighted_edges_tests.yml b/.github/workflows/weighted_edges_tests.yml index 874ef6ea4..0425ac797 100644 --- a/.github/workflows/weighted_edges_tests.yml +++ b/.github/workflows/weighted_edges_tests.yml @@ -109,12 +109,12 @@ jobs: LLM_PROVIDER: openai LLM_MODEL: gpt-5-mini LLM_ENDPOINT: https://api.openai.com/v1/ - LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} LLM_API_VERSION: "2024-02-01" EMBEDDING_PROVIDER: openai EMBEDDING_MODEL: text-embedding-3-small EMBEDDING_ENDPOINT: https://api.openai.com/v1/ - EMBEDDING_API_KEY: ${{ secrets.LLM_API_KEY }} + EMBEDDING_API_KEY: ${{ secrets.OPENAI_API_KEY }} EMBEDDING_API_VERSION: "2024-02-01" steps: - name: Check out repository From cdb06d2157d2602bd0e3f9c696d38368e119a8e4 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Thu, 6 Nov 2025 11:50:58 +0100 Subject: [PATCH 36/85] test: add workflow_dispatch for test purposes --- .github/workflows/weighted_edges_tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/weighted_edges_tests.yml b/.github/workflows/weighted_edges_tests.yml index 0425ac797..10778d1a0 100644 --- a/.github/workflows/weighted_edges_tests.yml +++ b/.github/workflows/weighted_edges_tests.yml @@ -17,6 +17,7 @@ on: - 'cognee/tests/unit/interfaces/graph/test_weighted_edges.py' - 'examples/python/weighted_edges_example.py' - '.github/workflows/weighted_edges_tests.yml' + workflow_dispatch: env: RUNTIME__LOG_LEVEL: ERROR From 5dc8d4e8a1206e9ce4fbf4df157ac945b83283c7 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Thu, 6 Nov 2025 11:52:39 +0100 Subject: [PATCH 37/85] test: remove workflow_dispatch --- .github/workflows/weighted_edges_tests.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/weighted_edges_tests.yml b/.github/workflows/weighted_edges_tests.yml index 10778d1a0..0425ac797 100644 --- a/.github/workflows/weighted_edges_tests.yml +++ b/.github/workflows/weighted_edges_tests.yml @@ -17,7 +17,6 @@ on: - 'cognee/tests/unit/interfaces/graph/test_weighted_edges.py' - 'examples/python/weighted_edges_example.py' - '.github/workflows/weighted_edges_tests.yml' - workflow_dispatch: env: RUNTIME__LOG_LEVEL: ERROR From 4567ffcfff6ca850252fcced54c9324698484f9b Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Thu, 6 Nov 2025 12:27:10 +0100 Subject: [PATCH 38/85] test: change endpoint --- .github/workflows/weighted_edges_tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/weighted_edges_tests.yml b/.github/workflows/weighted_edges_tests.yml index 0425ac797..d571d354f 100644 --- a/.github/workflows/weighted_edges_tests.yml +++ b/.github/workflows/weighted_edges_tests.yml @@ -108,12 +108,12 @@ jobs: env: LLM_PROVIDER: openai LLM_MODEL: gpt-5-mini - LLM_ENDPOINT: https://api.openai.com/v1/ + LLM_ENDPOINT: https://api.openai.com/v1 LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} LLM_API_VERSION: "2024-02-01" EMBEDDING_PROVIDER: openai EMBEDDING_MODEL: text-embedding-3-small - EMBEDDING_ENDPOINT: https://api.openai.com/v1/ + EMBEDDING_ENDPOINT: https://api.openai.com/v1/embeddings EMBEDDING_API_KEY: ${{ secrets.OPENAI_API_KEY }} EMBEDDING_API_VERSION: "2024-02-01" steps: From 286584eef0039686ac9ad62f0be0a0a7a9637edc Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Thu, 6 Nov 2025 12:35:37 +0100 Subject: [PATCH 39/85] test: change embedding config --- .github/workflows/weighted_edges_tests.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/weighted_edges_tests.yml b/.github/workflows/weighted_edges_tests.yml index d571d354f..b722c5e7c 100644 --- a/.github/workflows/weighted_edges_tests.yml +++ b/.github/workflows/weighted_edges_tests.yml @@ -111,11 +111,11 @@ jobs: LLM_ENDPOINT: https://api.openai.com/v1 LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} LLM_API_VERSION: "2024-02-01" - EMBEDDING_PROVIDER: openai - EMBEDDING_MODEL: text-embedding-3-small - EMBEDDING_ENDPOINT: https://api.openai.com/v1/embeddings - EMBEDDING_API_KEY: ${{ secrets.OPENAI_API_KEY }} - EMBEDDING_API_VERSION: "2024-02-01" + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} + steps: - name: Check out repository uses: actions/checkout@v4 From a058250c95390df84f1c44419cff8adf3a4e8269 Mon Sep 17 00:00:00 2001 From: Boris Arzentar Date: Thu, 6 Nov 2025 13:03:11 +0100 Subject: [PATCH 40/85] fix: add cognee to the local run environment --- cognee/modules/notebooks/operations/run_in_local_sandbox.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cognee/modules/notebooks/operations/run_in_local_sandbox.py b/cognee/modules/notebooks/operations/run_in_local_sandbox.py index 071deafb7..46499186e 100644 --- a/cognee/modules/notebooks/operations/run_in_local_sandbox.py +++ b/cognee/modules/notebooks/operations/run_in_local_sandbox.py @@ -2,6 +2,8 @@ import io import sys import traceback +import cognee + def wrap_in_async_handler(user_code: str) -> str: return ( @@ -34,6 +36,7 @@ def run_in_local_sandbox(code, environment=None, loop=None): environment["print"] = customPrintFunction environment["running_loop"] = loop + environment["cognee"] = cognee try: exec(code, environment) From 6b81559bb65d8501dc698761ed2fb51ee7c0d839 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Thu, 6 Nov 2025 13:04:01 +0100 Subject: [PATCH 41/85] test: changed endpoints in other tests --- .github/workflows/weighted_edges_tests.yml | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/.github/workflows/weighted_edges_tests.yml b/.github/workflows/weighted_edges_tests.yml index b722c5e7c..0b263cdcf 100644 --- a/.github/workflows/weighted_edges_tests.yml +++ b/.github/workflows/weighted_edges_tests.yml @@ -32,7 +32,7 @@ jobs: env: LLM_PROVIDER: openai LLM_MODEL: gpt-5-mini - LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} steps: - name: Check out repository @@ -67,14 +67,13 @@ jobs: env: LLM_PROVIDER: openai LLM_MODEL: gpt-5-mini - LLM_ENDPOINT: https://api.openai.com/v1/ - LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_ENDPOINT: https://api.openai.com/v1 + LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} LLM_API_VERSION: "2024-02-01" - EMBEDDING_PROVIDER: openai - EMBEDDING_MODEL: text-embedding-3-small - EMBEDDING_ENDPOINT: https://api.openai.com/v1/ - EMBEDDING_API_KEY: ${{ secrets.LLM_API_KEY }} - EMBEDDING_API_VERSION: "2024-02-01" + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} steps: - name: Check out repository uses: actions/checkout@v4 @@ -115,7 +114,7 @@ jobs: EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} - + steps: - name: Check out repository uses: actions/checkout@v4 From 62c599a4994c4e78c33c66925fb879b72c4ff338 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Thu, 6 Nov 2025 13:13:35 +0100 Subject: [PATCH 42/85] test: add dev to CI config for weighted edges tests --- .github/workflows/weighted_edges_tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/weighted_edges_tests.yml b/.github/workflows/weighted_edges_tests.yml index 0b263cdcf..2b4a043bf 100644 --- a/.github/workflows/weighted_edges_tests.yml +++ b/.github/workflows/weighted_edges_tests.yml @@ -2,7 +2,7 @@ name: Weighted Edges Tests on: push: - branches: [ main, weighted_edges ] + branches: [ main, dev, weighted_edges ] paths: - 'cognee/modules/graph/utils/get_graph_from_model.py' - 'cognee/infrastructure/engine/models/Edge.py' @@ -10,7 +10,7 @@ on: - 'examples/python/weighted_edges_example.py' - '.github/workflows/weighted_edges_tests.yml' pull_request: - branches: [ main ] + branches: [ main, dev ] paths: - 'cognee/modules/graph/utils/get_graph_from_model.py' - 'cognee/infrastructure/engine/models/Edge.py' From ac6dd08855e30349b0666e6af48da3e829079948 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 6 Nov 2025 14:35:26 +0100 Subject: [PATCH 43/85] fix: Resolve issue with sqlite index creation --- alembic/versions/c946955da633_multi_tenant_support.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/alembic/versions/c946955da633_multi_tenant_support.py b/alembic/versions/c946955da633_multi_tenant_support.py index ba451fc03..c87500907 100644 --- a/alembic/versions/c946955da633_multi_tenant_support.py +++ b/alembic/versions/c946955da633_multi_tenant_support.py @@ -126,6 +126,8 @@ def upgrade() -> None: conn = op.get_bind() conn.execute(dataset.update().values(tenant_id=tenant_id_from_dataset_owner)) + op.create_index(op.f("ix_datasets_tenant_id"), "datasets", ["tenant_id"]) + def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### From ac751bacf09e26b851b5829d46330a2f7ee7f25e Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 6 Nov 2025 14:51:25 +0100 Subject: [PATCH 44/85] fix: Resolve SQLite migration issue --- .../c946955da633_multi_tenant_support.py | 24 +++++++++++++------ 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/alembic/versions/c946955da633_multi_tenant_support.py b/alembic/versions/c946955da633_multi_tenant_support.py index c87500907..a87989d9b 100644 --- a/alembic/versions/c946955da633_multi_tenant_support.py +++ b/alembic/versions/c946955da633_multi_tenant_support.py @@ -97,13 +97,23 @@ def upgrade() -> None: # Insert into user_tenants table if user_data: - op.bulk_insert( - user_tenants, - [ - {"user_id": user_id, "tenant_id": tenant_id, "created_at": _now()} - for user_id, tenant_id in user_data - ], - ) + if op.get_context().dialect.name == "sqlite": + insert_stmt = user_tenants.insert().values( + [ + {"user_id": user_id, "tenant_id": tenant_id, "created_at": _now()} + for user_id, tenant_id in user_data + ] + ) + conn.execute(insert_stmt) + conn.commit() + else: + op.bulk_insert( + user_tenants, + [ + {"user_id": user_id, "tenant_id": tenant_id, "created_at": _now()} + for user_id, tenant_id in user_data + ], + ) tenant_id_column = _get_column(insp, "datasets", "tenant_id") if not tenant_id_column: From d5caba144c603c59b2960e6c6c88028553db0b57 Mon Sep 17 00:00:00 2001 From: Pavel Zorin Date: Thu, 6 Nov 2025 15:57:52 +0100 Subject: [PATCH 45/85] CI: Initial Release test workflow --- .github/workflows/e2e_tests.yml | 41 ---------------- .github/workflows/load_tests.yml | 76 ++++++++++++++++++++++++++++++ .github/workflows/release_test.yml | 14 ++++++ 3 files changed, 90 insertions(+), 41 deletions(-) create mode 100644 .github/workflows/load_tests.yml create mode 100644 .github/workflows/release_test.yml diff --git a/.github/workflows/e2e_tests.yml b/.github/workflows/e2e_tests.yml index 0596f22d3..70a4b56e6 100644 --- a/.github/workflows/e2e_tests.yml +++ b/.github/workflows/e2e_tests.yml @@ -447,44 +447,3 @@ jobs: DB_USERNAME: cognee DB_PASSWORD: cognee run: uv run python ./cognee/tests/test_conversation_history.py - - test-load: - name: Test Load - runs-on: ubuntu-22.04 - steps: - - name: Check out repository - uses: actions/checkout@v4 - - - name: Cognee Setup - uses: ./.github/actions/cognee_setup - with: - python-version: '3.11.x' - extra-dependencies: "aws" - - - name: Set File Descriptor Limit - run: sudo prlimit --pid $$ --nofile=4096:4096 - - - name: Verify File Descriptor Limit - run: ulimit -n - - - name: Dependencies already installed - run: echo "Dependencies already installed in setup" - - - name: Run Load Test - env: - ENV: 'dev' - ENABLE_BACKEND_ACCESS_CONTROL: True - LLM_MODEL: ${{ secrets.LLM_MODEL }} - LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} - LLM_API_KEY: ${{ secrets.LLM_API_KEY }} - LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} - EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} - EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} - EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} - EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} - STORAGE_BACKEND: s3 - AWS_REGION: eu-west-1 - AWS_ENDPOINT_URL: https://s3-eu-west-1.amazonaws.com - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_S3_DEV_USER_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_S3_DEV_USER_SECRET_KEY }} - run: uv run python ./cognee/tests/test_load.py \ No newline at end of file diff --git a/.github/workflows/load_tests.yml b/.github/workflows/load_tests.yml new file mode 100644 index 000000000..ff11bb88b --- /dev/null +++ b/.github/workflows/load_tests.yml @@ -0,0 +1,76 @@ +name: Load tests + +permissions: + contents: read + +on: + workflow_dispatch: + workflow_call: + secrets: + LLM_MODEL: + required: true + LLM_ENDPOINT: + required: true + LLM_API_KEY: + required: true + LLM_API_VERSION: + required: true + EMBEDDING_MODEL: + required: true + EMBEDDING_ENDPOINT: + required: true + EMBEDDING_API_KEY: + required: true + EMBEDDING_API_VERSION: + required: true + OPENAI_API_KEY: + required: true + AWS_ACCESS_KEY_ID: + required: true + AWS_SECRET_ACCESS_KEY: + required: true + +jobs: + test-load: + name: Test Load + runs-on: ubuntu-22.04 + timeout-minutes: 60 + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Cognee Setup + uses: ./.github/actions/cognee_setup + with: + python-version: '3.11.x' + extra-dependencies: "aws" + + - name: Set File Descriptor Limit + run: sudo prlimit --pid $$ --nofile=4096:4096 + + - name: Verify File Descriptor Limit + run: ulimit -n + + - name: Dependencies already installed + run: echo "Dependencies already installed in setup" + + - name: Run Load Test + env: + ENV: 'dev' + ENABLE_BACKEND_ACCESS_CONTROL: True + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} + STORAGE_BACKEND: s3 + AWS_REGION: eu-west-1 + AWS_ENDPOINT_URL: https://s3-eu-west-1.amazonaws.com + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_S3_DEV_USER_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_S3_DEV_USER_SECRET_KEY }} + run: uv run python ./cognee/tests/test_load.py + + diff --git a/.github/workflows/release_test.yml b/.github/workflows/release_test.yml new file mode 100644 index 000000000..23679bef4 --- /dev/null +++ b/.github/workflows/release_test.yml @@ -0,0 +1,14 @@ +# Long-running, heavy and resource-consuming tests for release validation +name: Release Test Workflow + +permissions: + contents: read + +on: + workflow_dispatch: + +jobs: + load-tests: + name: Load Tests + uses: ./.github/workflows/load_tests.yml + secrets: inherit \ No newline at end of file From ef3a3826698d89cfdf5bed62b6ea9f93576122d8 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 6 Nov 2025 16:23:54 +0100 Subject: [PATCH 46/85] refactor: use batch insert for SQLite as well --- .../c946955da633_multi_tenant_support.py | 26 ++++++------------- 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/alembic/versions/c946955da633_multi_tenant_support.py b/alembic/versions/c946955da633_multi_tenant_support.py index a87989d9b..d8fccdfbf 100644 --- a/alembic/versions/c946955da633_multi_tenant_support.py +++ b/alembic/versions/c946955da633_multi_tenant_support.py @@ -97,23 +97,13 @@ def upgrade() -> None: # Insert into user_tenants table if user_data: - if op.get_context().dialect.name == "sqlite": - insert_stmt = user_tenants.insert().values( - [ - {"user_id": user_id, "tenant_id": tenant_id, "created_at": _now()} - for user_id, tenant_id in user_data - ] - ) - conn.execute(insert_stmt) - conn.commit() - else: - op.bulk_insert( - user_tenants, - [ - {"user_id": user_id, "tenant_id": tenant_id, "created_at": _now()} - for user_id, tenant_id in user_data - ], - ) + op.bulk_insert( + user_tenants, + [ + {"user_id": user_id, "tenant_id": tenant_id, "created_at": _now()} + for user_id, tenant_id in user_data + ], + ) tenant_id_column = _get_column(insp, "datasets", "tenant_id") if not tenant_id_column: @@ -141,7 +131,7 @@ def upgrade() -> None: def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### + op.drop_table("user_tenants") op.drop_index(op.f("ix_datasets_tenant_id"), table_name="datasets") op.drop_column("datasets", "tenant_id") - op.drop_table("user_tenants") # ### end Alembic commands ### From c146de3a4d2f5327f4cffd347a8a782e39906da0 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 6 Nov 2025 16:41:00 +0100 Subject: [PATCH 47/85] fix: Remove creation of database and db tables from env.py --- alembic/env.py | 5 ----- alembic/versions/c946955da633_multi_tenant_support.py | 4 ++++ 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/alembic/env.py b/alembic/env.py index 1cbef65f7..8ca09968d 100644 --- a/alembic/env.py +++ b/alembic/env.py @@ -87,11 +87,6 @@ db_engine = get_relational_engine() print("Using database:", db_engine.db_uri) -if "sqlite" in db_engine.db_uri: - from cognee.infrastructure.utils.run_sync import run_sync - - run_sync(db_engine.create_database()) - config.set_section_option( config.config_ini_section, "SQLALCHEMY_DATABASE_URI", diff --git a/alembic/versions/c946955da633_multi_tenant_support.py b/alembic/versions/c946955da633_multi_tenant_support.py index d8fccdfbf..7806fdde8 100644 --- a/alembic/versions/c946955da633_multi_tenant_support.py +++ b/alembic/versions/c946955da633_multi_tenant_support.py @@ -79,6 +79,10 @@ def upgrade() -> None: dataset = _define_dataset_table() user = _define_user_table() + print(insp.get_table_names()) + + print(_get_column(insp, "user_tenants", "tenant_id")) + if "user_tenants" not in insp.get_table_names(): # Define table with all necessary columns including primary key user_tenants = op.create_table( From 49fbad9ec0cee49048c97ee6e80c78963a656042 Mon Sep 17 00:00:00 2001 From: Pavel Zorin Date: Thu, 6 Nov 2025 16:44:43 +0100 Subject: [PATCH 48/85] CI: Run release workflow on PR to main --- .github/workflows/release_test.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/release_test.yml b/.github/workflows/release_test.yml index 23679bef4..6ac3ca515 100644 --- a/.github/workflows/release_test.yml +++ b/.github/workflows/release_test.yml @@ -6,6 +6,9 @@ permissions: on: workflow_dispatch: + pull_request: + branches: + - main jobs: load-tests: From efb46c99f9d0d5ac426540b95aadd0a1bfd3e5de Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 6 Nov 2025 16:47:42 +0100 Subject: [PATCH 49/85] fix: resolve issue with sqlite migration --- alembic/versions/c946955da633_multi_tenant_support.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/alembic/versions/c946955da633_multi_tenant_support.py b/alembic/versions/c946955da633_multi_tenant_support.py index 7806fdde8..d8fccdfbf 100644 --- a/alembic/versions/c946955da633_multi_tenant_support.py +++ b/alembic/versions/c946955da633_multi_tenant_support.py @@ -79,10 +79,6 @@ def upgrade() -> None: dataset = _define_dataset_table() user = _define_user_table() - print(insp.get_table_names()) - - print(_get_column(insp, "user_tenants", "tenant_id")) - if "user_tenants" not in insp.get_table_names(): # Define table with all necessary columns including primary key user_tenants = op.create_table( From 0d68175167af5da9b69e4024f434cbf0bd64b2ae Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 6 Nov 2025 16:53:22 +0100 Subject: [PATCH 50/85] fix: remove database creation from migrations --- alembic/versions/8057ae7329c2_initial_migration.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/alembic/versions/8057ae7329c2_initial_migration.py b/alembic/versions/8057ae7329c2_initial_migration.py index aa0ecd4b8..42e9904a8 100644 --- a/alembic/versions/8057ae7329c2_initial_migration.py +++ b/alembic/versions/8057ae7329c2_initial_migration.py @@ -18,11 +18,8 @@ depends_on: Union[str, Sequence[str], None] = None def upgrade() -> None: - db_engine = get_relational_engine() - # we might want to delete this - await_only(db_engine.create_database()) + pass def downgrade() -> None: - db_engine = get_relational_engine() - await_only(db_engine.delete_database()) + pass From bcc59cf9a0b6f5f765269a7ea2725fbd27e971f5 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 6 Nov 2025 16:57:59 +0100 Subject: [PATCH 51/85] fix: Remove default user creation --- alembic/versions/482cd6517ce4_add_default_user.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/alembic/versions/482cd6517ce4_add_default_user.py b/alembic/versions/482cd6517ce4_add_default_user.py index d85f0f146..fafa111f9 100644 --- a/alembic/versions/482cd6517ce4_add_default_user.py +++ b/alembic/versions/482cd6517ce4_add_default_user.py @@ -23,11 +23,8 @@ depends_on: Union[str, Sequence[str], None] = "8057ae7329c2" def upgrade() -> None: - try: - await_only(create_default_user()) - except UserAlreadyExists: - pass # It's fine if the default user already exists + pass # It's fine if the default user already exists def downgrade() -> None: - await_only(delete_user("default_user@example.com")) + pass From 61e1c2903f5f7372e5281a3c7126cc2bcb71bde5 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 6 Nov 2025 17:00:46 +0100 Subject: [PATCH 52/85] fix: Remove issue with default user creation --- alembic/versions/482cd6517ce4_add_default_user.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alembic/versions/482cd6517ce4_add_default_user.py b/alembic/versions/482cd6517ce4_add_default_user.py index fafa111f9..c8a3dc5d5 100644 --- a/alembic/versions/482cd6517ce4_add_default_user.py +++ b/alembic/versions/482cd6517ce4_add_default_user.py @@ -23,7 +23,7 @@ depends_on: Union[str, Sequence[str], None] = "8057ae7329c2" def upgrade() -> None: - pass # It's fine if the default user already exists + pass def downgrade() -> None: From 7dec6bfdedf30149113a25aa66bf6c21980b605b Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 6 Nov 2025 18:10:04 +0100 Subject: [PATCH 53/85] refactor: Add migrations as part of python package --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 5f0aef1d8..8af35113c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -167,7 +167,6 @@ exclude = [ "/dist", "/.data", "/.github", - "/alembic", "/deployment", "/cognee-mcp", "/cognee-frontend", From 96c8bba5807e13cf376802da28817788ae4d6dbd Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 6 Nov 2025 19:12:09 +0100 Subject: [PATCH 54/85] refactor: Add db creation as step in MCP creation --- cognee-mcp/src/server.py | 4 ++++ cognee/modules/data/models/Dataset.py | 1 + 2 files changed, 5 insertions(+) diff --git a/cognee-mcp/src/server.py b/cognee-mcp/src/server.py index ce6dad88a..7c708638c 100755 --- a/cognee-mcp/src/server.py +++ b/cognee-mcp/src/server.py @@ -1096,6 +1096,10 @@ async def main(): # Skip migrations when in API mode (the API server handles its own database) if not args.no_migration and not args.api_url: + from cognee.modules.engine.operations.setup import setup + + await setup() + # Run Alembic migrations from the main cognee directory where alembic.ini is located logger.info("Running database migrations...") migration_result = subprocess.run( diff --git a/cognee/modules/data/models/Dataset.py b/cognee/modules/data/models/Dataset.py index 00ed4da96..fba065253 100644 --- a/cognee/modules/data/models/Dataset.py +++ b/cognee/modules/data/models/Dataset.py @@ -37,5 +37,6 @@ class Dataset(Base): "createdAt": self.created_at.isoformat(), "updatedAt": self.updated_at.isoformat() if self.updated_at else None, "ownerId": str(self.owner_id), + "tenantId": str(self.tenant_id), "data": [data.to_json() for data in self.data], } From 7d5d19347a4ccd44eb3bbbe23c821c4c600b7989 Mon Sep 17 00:00:00 2001 From: Pavel Zorin Date: Fri, 7 Nov 2025 11:11:05 +0100 Subject: [PATCH 55/85] CI: removed unnecessary ulimit --- .github/workflows/load_tests.yml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/.github/workflows/load_tests.yml b/.github/workflows/load_tests.yml index ff11bb88b..f5b64d8ce 100644 --- a/.github/workflows/load_tests.yml +++ b/.github/workflows/load_tests.yml @@ -45,15 +45,9 @@ jobs: python-version: '3.11.x' extra-dependencies: "aws" - - name: Set File Descriptor Limit - run: sudo prlimit --pid $$ --nofile=4096:4096 - - name: Verify File Descriptor Limit run: ulimit -n - - name: Dependencies already installed - run: echo "Dependencies already installed in setup" - - name: Run Load Test env: ENV: 'dev' From 59f758d5c227b04f91e8086915fde078be3089db Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Fri, 7 Nov 2025 15:50:49 +0100 Subject: [PATCH 56/85] feat: Add test for multi tenancy, add ability to share name for dataset across tenants for one user --- .github/workflows/e2e_tests.yml | 29 ++- cognee/modules/data/methods/create_dataset.py | 1 + .../modules/data/methods/get_dataset_ids.py | 6 +- cognee/modules/search/methods/search.py | 2 + cognee/tests/test_multi_tenancy.py | 165 ++++++++++++++++++ 5 files changed, 200 insertions(+), 3 deletions(-) create mode 100644 cognee/tests/test_multi_tenancy.py diff --git a/.github/workflows/e2e_tests.yml b/.github/workflows/e2e_tests.yml index 0596f22d3..715487372 100644 --- a/.github/workflows/e2e_tests.yml +++ b/.github/workflows/e2e_tests.yml @@ -226,7 +226,7 @@ jobs: - name: Dependencies already installed run: echo "Dependencies already installed in setup" - - name: Run parallel databases test + - name: Run permissions test env: ENV: 'dev' LLM_MODEL: ${{ secrets.LLM_MODEL }} @@ -239,6 +239,31 @@ jobs: EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} run: uv run python ./cognee/tests/test_permissions.py + test-multi-tenancy: + name: Test multi tenancy with different situations in Cognee + runs-on: ubuntu-22.04 + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Cognee Setup + uses: ./.github/actions/cognee_setup + with: + python-version: '3.11.x' + + - name: Run multi tenancy test + env: + ENV: 'dev' + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} + run: uv run python ./cognee/tests/test_multi_tenancy.py + test-graph-edges: name: Test graph edge ingestion runs-on: ubuntu-22.04 @@ -487,4 +512,4 @@ jobs: AWS_ENDPOINT_URL: https://s3-eu-west-1.amazonaws.com AWS_ACCESS_KEY_ID: ${{ secrets.AWS_S3_DEV_USER_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_S3_DEV_USER_SECRET_KEY }} - run: uv run python ./cognee/tests/test_load.py \ No newline at end of file + run: uv run python ./cognee/tests/test_load.py diff --git a/cognee/modules/data/methods/create_dataset.py b/cognee/modules/data/methods/create_dataset.py index 280c9e105..7e28a8255 100644 --- a/cognee/modules/data/methods/create_dataset.py +++ b/cognee/modules/data/methods/create_dataset.py @@ -16,6 +16,7 @@ async def create_dataset(dataset_name: str, user: User, session: AsyncSession) - .options(joinedload(Dataset.data)) .filter(Dataset.name == dataset_name) .filter(Dataset.owner_id == owner_id) + .filter(Dataset.tenant_id == user.tenant_id) ) ).first() diff --git a/cognee/modules/data/methods/get_dataset_ids.py b/cognee/modules/data/methods/get_dataset_ids.py index d4402ff36..a61e85310 100644 --- a/cognee/modules/data/methods/get_dataset_ids.py +++ b/cognee/modules/data/methods/get_dataset_ids.py @@ -27,7 +27,11 @@ async def get_dataset_ids(datasets: Union[list[str], list[UUID]], user): # Get all user owned dataset objects (If a user wants to write to a dataset he is not the owner of it must be provided through UUID.) user_datasets = await get_datasets(user.id) # Filter out non name mentioned datasets - dataset_ids = [dataset.id for dataset in user_datasets if dataset.name in datasets] + dataset_ids = [dataset for dataset in user_datasets if dataset.name in datasets] + # Filter out non current tenant datasets + dataset_ids = [ + dataset.id for dataset in dataset_ids if dataset.tenant_id == user.tenant_id + ] else: raise DatasetTypeError( f"One or more of the provided dataset types is not handled: f{datasets}" diff --git a/cognee/modules/search/methods/search.py b/cognee/modules/search/methods/search.py index 5e465b239..b4278424b 100644 --- a/cognee/modules/search/methods/search.py +++ b/cognee/modules/search/methods/search.py @@ -172,6 +172,7 @@ async def search( "search_result": [context] if context else None, "dataset_id": datasets[0].id, "dataset_name": datasets[0].name, + "dataset_tenant_id": datasets[0].tenant_id, "graphs": graphs, } ) @@ -181,6 +182,7 @@ async def search( "search_result": [result] if result else None, "dataset_id": datasets[0].id, "dataset_name": datasets[0].name, + "dataset_tenant_id": datasets[0].tenant_id, "graphs": graphs, } ) diff --git a/cognee/tests/test_multi_tenancy.py b/cognee/tests/test_multi_tenancy.py new file mode 100644 index 000000000..7cdcda8d8 --- /dev/null +++ b/cognee/tests/test_multi_tenancy.py @@ -0,0 +1,165 @@ +import cognee +import pytest + +from cognee.modules.users.exceptions import PermissionDeniedError +from cognee.modules.users.tenants.methods import select_tenant +from cognee.modules.users.methods import get_user +from cognee.shared.logging_utils import get_logger +from cognee.modules.search.types import SearchType +from cognee.modules.users.methods import create_user +from cognee.modules.users.permissions.methods import authorized_give_permission_on_datasets +from cognee.modules.users.roles.methods import add_user_to_role +from cognee.modules.users.roles.methods import create_role +from cognee.modules.users.tenants.methods import create_tenant +from cognee.modules.users.tenants.methods import add_user_to_tenant +from cognee.modules.engine.operations.setup import setup +from cognee.shared.logging_utils import setup_logging, CRITICAL + +logger = get_logger() + + +async def main(): + # Create a clean slate for cognee -- reset data and system state + print("Resetting cognee data...") + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + print("Data reset complete.\n") + + # Set up the necessary databases and tables for user management. + await setup() + + # Add document for user_1, add it under dataset name AI + text = """A quantum computer is a computer that takes advantage of quantum mechanical phenomena. + At small scales, physical matter exhibits properties of both particles and waves, and quantum computing leverages + this behavior, specifically quantum superposition and entanglement, using specialized hardware that supports the + preparation and manipulation of quantum state""" + + print("Creating user_1: user_1@example.com") + user_1 = await create_user("user_1@example.com", "example") + await cognee.add([text], dataset_name="AI", user=user_1) + + print("\nCreating user_2: user_2@example.com") + user_2 = await create_user("user_2@example.com", "example") + + # Run cognify for both datasets as the appropriate user/owner + print("\nCreating different datasets for user_1 (AI dataset) and user_2 (QUANTUM dataset)") + ai_cognify_result = await cognee.cognify(["AI"], user=user_1) + + # Extract dataset_ids from cognify results + def extract_dataset_id_from_cognify(cognify_result): + """Extract dataset_id from cognify output dictionary""" + for dataset_id, pipeline_result in cognify_result.items(): + return dataset_id # Return the first dataset_id + return None + + # Get dataset IDs from cognify results + # Note: When we want to work with datasets from other users (search, add, cognify and etc.) we must supply dataset + # information through dataset_id using dataset name only looks for datasets owned by current user + ai_dataset_id = extract_dataset_id_from_cognify(ai_cognify_result) + + # We can see here that user_1 can read his own dataset (AI dataset) + search_results = await cognee.search( + query_type=SearchType.GRAPH_COMPLETION, + query_text="What is in the document?", + user=user_1, + datasets=[ai_dataset_id], + ) + + # Verify that user_2 cannot access user_1's dataset without permission + with pytest.raises(PermissionDeniedError): + search_results = await cognee.search( + query_type=SearchType.GRAPH_COMPLETION, + query_text="What is in the document?", + user=user_2, + datasets=[ai_dataset_id], + ) + + # Create new tenant and role, add user_2 to tenant and role + tenant_id = await create_tenant("CogneeLab", user_1.id) + await select_tenant(user_id=user_1.id, tenant_id=tenant_id) + role_id = await create_role(role_name="Researcher", owner_id=user_1.id) + await add_user_to_tenant( + user_id=user_2.id, tenant_id=tenant_id, owner_id=user_1.id, set_as_active_tenant=True + ) + await add_user_to_role(user_id=user_2.id, role_id=role_id, owner_id=user_1.id) + + # Assert that user_1 cannot give permissions on his dataset to role before switching to the correct tenant + # AI dataset was made with default tenant and not CogneeLab tenant + with pytest.raises(PermissionDeniedError): + await authorized_give_permission_on_datasets( + role_id, + [ai_dataset_id], + "read", + user_1.id, + ) + + # We need to refresh the user object with changes made when switching tenants + user_1 = await get_user(user_1.id) + await cognee.add([text], dataset_name="AI_COGNEE_LAB", user=user_1) + ai_cognee_lab_cognify_result = await cognee.cognify(["AI_COGNEE_LAB"], user=user_1) + + ai_cognee_lab_dataset_id = extract_dataset_id_from_cognify(ai_cognee_lab_cognify_result) + + await authorized_give_permission_on_datasets( + role_id, + [ai_cognee_lab_dataset_id], + "read", + user_1.id, + ) + + search_results = await cognee.search( + query_type=SearchType.GRAPH_COMPLETION, + query_text="What is in the document?", + user=user_2, + dataset_ids=[ai_cognee_lab_dataset_id], + ) + for result in search_results: + print(f"{result}\n") + + # Let's test changing tenants + tenant_id = await create_tenant("CogneeLab2", user_1.id) + await select_tenant(user_id=user_1.id, tenant_id=tenant_id) + + user_1 = await get_user(user_1.id) + await cognee.add([text], dataset_name="AI_COGNEE_LAB", user=user_1) + await cognee.cognify(["AI_COGNEE_LAB"], user=user_1) + + search_results = await cognee.search( + query_type=SearchType.GRAPH_COMPLETION, + query_text="What is in the document?", + user=user_1, + ) + + # Assert only AI_COGNEE_LAB dataset from CogneeLab2 tenant is visible as the currently selected tenant + assert len(search_results) == 1, ( + f"Search results must only contain one dataset from current tenant: {search_results}" + ) + assert search_results[0]["dataset_name"] == "AI_COGNEE_LAB", ( + f"Dict must contain dataset name 'AI_COGNEE_LAB': {search_results[0]}" + ) + assert search_results[0]["dataset_tenant_id"] == user_1.tenant_id, ( + f"Dataset tenant_id must be same as user_1 tenant_id: {search_results[0]}" + ) + + # Switch back to no tenant (default tenant) + await select_tenant(user_id=user_1.id, tenant_id=None) + # Refresh user_1 object + user_1 = await get_user(user_1.id) + search_results = await cognee.search( + query_type=SearchType.GRAPH_COMPLETION, + query_text="What is in the document?", + user=user_1, + ) + assert len(search_results) == 1, ( + f"Search results must only contain one dataset from default tenant: {search_results}" + ) + assert search_results[0]["dataset_name"] == "AI", ( + f"Dict must contain dataset name 'AI': {search_results[0]}" + ) + + +if __name__ == "__main__": + import asyncio + + logger = setup_logging(log_level=CRITICAL) + asyncio.run(main()) From d6e2bd132b85d9e038475ec4adf87140f69e53ce Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Fri, 7 Nov 2025 16:37:37 +0100 Subject: [PATCH 57/85] refactor: Remove testme comment --- cognee/modules/users/roles/methods/add_user_to_role.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cognee/modules/users/roles/methods/add_user_to_role.py b/cognee/modules/users/roles/methods/add_user_to_role.py index d764ac900..23bb947f0 100644 --- a/cognee/modules/users/roles/methods/add_user_to_role.py +++ b/cognee/modules/users/roles/methods/add_user_to_role.py @@ -48,7 +48,7 @@ async def add_user_to_role(user_id: UUID, role_id: UUID, owner_id: UUID): raise UserNotFoundError elif not role: raise RoleNotFoundError - elif role.tenant_id not in [tenant.id for tenant in user_tenants]: # TESTME + elif role.tenant_id not in [tenant.id for tenant in user_tenants]: raise TenantNotFoundError( message="User tenant does not match role tenant. User cannot be added to role." ) From 234b13a8c9a62ecd7e85c5d250c37b13bd8dace3 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Fri, 7 Nov 2025 17:39:42 +0100 Subject: [PATCH 58/85] feat:add emptiness check to neptune adapter --- .../hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cognee/infrastructure/databases/hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py b/cognee/infrastructure/databases/hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py index 5357f3d7c..1e16642b5 100644 --- a/cognee/infrastructure/databases/hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py +++ b/cognee/infrastructure/databases/hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py @@ -416,6 +416,15 @@ class NeptuneAnalyticsAdapter(NeptuneGraphDB, VectorDBInterface): self._client.query(f"MATCH (n :{self._VECTOR_NODE_LABEL}) DETACH DELETE n") pass + async def is_empty(self) -> bool: + query = """ + MATCH (n) + RETURN true + LIMIT 1; + """ + query_result = await self._client.query(query) + return len(query_result) == 0 + @staticmethod def _get_scored_result( item: dict, with_vector: bool = False, with_score: bool = False From 3710eec94ff547fb9fb80f3b3b35223098269ffc Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Mon, 10 Nov 2025 16:23:34 +0100 Subject: [PATCH 59/85] refactor: update docstring message --- cognee/api/v1/permissions/routers/get_permissions_router.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cognee/api/v1/permissions/routers/get_permissions_router.py b/cognee/api/v1/permissions/routers/get_permissions_router.py index 20d35e748..63de97eaa 100644 --- a/cognee/api/v1/permissions/routers/get_permissions_router.py +++ b/cognee/api/v1/permissions/routers/get_permissions_router.py @@ -246,7 +246,7 @@ def get_permissions_router() -> APIRouter: - **tenant_id** (Union[UUID, None]): UUID of the tenant to select, If null/None is provided use the default single user tenant ## Response - Returns a success message indicating the tenant was created. + Returns a success message along with selected tenant id. """ send_telemetry( "Permissions API Endpoint Invoked", From b5f94c889d00e4043f9f7373b449d4dd165e2391 Mon Sep 17 00:00:00 2001 From: Igor Ilic <30923996+dexters1@users.noreply.github.com> Date: Tue, 11 Nov 2025 12:51:09 +0100 Subject: [PATCH 60/85] Update cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py Co-authored-by: Boris --- .../permissions/methods/get_all_user_permission_datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py b/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py index ee1de3c72..5eed992db 100644 --- a/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py +++ b/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py @@ -39,7 +39,7 @@ async def get_all_user_permission_datasets(user: User, permission_type: str) -> # If the dataset id key already exists, leave the dictionary unchanged. unique.setdefault(dataset.id, dataset) - # Filter out dataset that aren't part of the current user's tenant + # Filter out dataset that aren't part of the selected user's tenant filtered_datasets = [] for dataset in list(unique.values()): if dataset.tenant_id == user.tenant_id: From 8e8aecb76ff66a48e4b5fe18a7ffaac48434f89a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrej=20Mili=C4=87evi=C4=87?= <85933103+siillee@users.noreply.github.com> Date: Tue, 11 Nov 2025 17:03:48 +0100 Subject: [PATCH 61/85] feat: enable multi user for falkor (#1689) ## Description Added multi-user support for Falkor. Adding support for the rest of the graph dbs should be a bit easier after this first one, especially since Falkor is hybrid. There are a few things code quality wise that might need changing, I am open to suggestions. ## Type of Change - [ ] Bug fix (non-breaking change that fixes an issue) - [x] New feature (non-breaking change that adds functionality) - [ ] Breaking change (fix or feature that would cause existing functionality to change) - [ ] Documentation update - [ ] Code refactoring - [ ] Performance improvement - [ ] Other (please specify): ## Screenshots/Videos (if applicable) ## Pre-submission Checklist - [x] **I have tested my changes thoroughly before submitting this PR** - [x] **This PR contains minimal changes necessary to address the issue/feature** - [ ] My code follows the project's coding standards and style guidelines - [ ] I have added tests that prove my fix is effective or that my feature works - [ ] I have added necessary documentation (if applicable) - [ ] All new and existing tests pass - [ ] I have searched existing PRs to ensure this change hasn't been submitted already - [ ] I have linked any relevant issues in the description - [ ] My commits have clear and descriptive messages ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin. --------- Co-authored-by: Andrej Milicevic Co-authored-by: Igor Ilic <30923996+dexters1@users.noreply.github.com> Co-authored-by: Igor Ilic --- ..._expand_dataset_database_for_multi_user.py | 98 +++++++++++++++++++ cognee/context_global_variables.py | 25 ++--- .../infrastructure/databases/graph/config.py | 4 + .../databases/graph/get_graph_engine.py | 2 + .../utils/get_or_create_dataset_database.py | 40 +++++++- .../infrastructure/databases/vector/config.py | 3 + .../databases/vector/create_vector_engine.py | 4 + .../modules/users/models/DatasetDatabase.py | 9 ++ cognee/tests/test_parallel_databases.py | 2 + 9 files changed, 172 insertions(+), 15 deletions(-) create mode 100644 alembic/versions/76625596c5c3_expand_dataset_database_for_multi_user.py diff --git a/alembic/versions/76625596c5c3_expand_dataset_database_for_multi_user.py b/alembic/versions/76625596c5c3_expand_dataset_database_for_multi_user.py new file mode 100644 index 000000000..7e13898ae --- /dev/null +++ b/alembic/versions/76625596c5c3_expand_dataset_database_for_multi_user.py @@ -0,0 +1,98 @@ +"""Expand dataset database for multi user + +Revision ID: 76625596c5c3 +Revises: 211ab850ef3d +Create Date: 2025-10-30 12:55:20.239562 + +""" + +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = "76625596c5c3" +down_revision: Union[str, None] = "c946955da633" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def _get_column(inspector, table, name, schema=None): + for col in inspector.get_columns(table, schema=schema): + if col["name"] == name: + return col + return None + + +def upgrade() -> None: + conn = op.get_bind() + insp = sa.inspect(conn) + + vector_database_provider_column = _get_column( + insp, "dataset_database", "vector_database_provider" + ) + if not vector_database_provider_column: + op.add_column( + "dataset_database", + sa.Column( + "vector_database_provider", + sa.String(), + unique=False, + nullable=False, + server_default="lancedb", + ), + ) + + graph_database_provider_column = _get_column( + insp, "dataset_database", "graph_database_provider" + ) + if not graph_database_provider_column: + op.add_column( + "dataset_database", + sa.Column( + "graph_database_provider", + sa.String(), + unique=False, + nullable=False, + server_default="kuzu", + ), + ) + + vector_database_url_column = _get_column(insp, "dataset_database", "vector_database_url") + if not vector_database_url_column: + op.add_column( + "dataset_database", + sa.Column("vector_database_url", sa.String(), unique=False, nullable=True), + ) + + graph_database_url_column = _get_column(insp, "dataset_database", "graph_database_url") + if not graph_database_url_column: + op.add_column( + "dataset_database", + sa.Column("graph_database_url", sa.String(), unique=False, nullable=True), + ) + + vector_database_key_column = _get_column(insp, "dataset_database", "vector_database_key") + if not vector_database_key_column: + op.add_column( + "dataset_database", + sa.Column("vector_database_key", sa.String(), unique=False, nullable=True), + ) + + graph_database_key_column = _get_column(insp, "dataset_database", "graph_database_key") + if not graph_database_key_column: + op.add_column( + "dataset_database", + sa.Column("graph_database_key", sa.String(), unique=False, nullable=True), + ) + + +def downgrade() -> None: + op.drop_column("dataset_database", "vector_database_provider") + op.drop_column("dataset_database", "graph_database_provider") + op.drop_column("dataset_database", "vector_database_url") + op.drop_column("dataset_database", "graph_database_url") + op.drop_column("dataset_database", "vector_database_key") + op.drop_column("dataset_database", "graph_database_key") diff --git a/cognee/context_global_variables.py b/cognee/context_global_variables.py index f17c9187a..62e06fc64 100644 --- a/cognee/context_global_variables.py +++ b/cognee/context_global_variables.py @@ -16,8 +16,8 @@ vector_db_config = ContextVar("vector_db_config", default=None) graph_db_config = ContextVar("graph_db_config", default=None) session_user = ContextVar("session_user", default=None) -vector_dbs_with_multi_user_support = ["lancedb"] -graph_dbs_with_multi_user_support = ["kuzu"] +VECTOR_DBS_WITH_MULTI_USER_SUPPORT = ["lancedb", "falkor"] +GRAPH_DBS_WITH_MULTI_USER_SUPPORT = ["kuzu", "falkor"] async def set_session_user_context_variable(user): @@ -28,8 +28,8 @@ def multi_user_support_possible(): graph_db_config = get_graph_context_config() vector_db_config = get_vectordb_context_config() return ( - graph_db_config["graph_database_provider"] in graph_dbs_with_multi_user_support - and vector_db_config["vector_db_provider"] in vector_dbs_with_multi_user_support + graph_db_config["graph_database_provider"] in GRAPH_DBS_WITH_MULTI_USER_SUPPORT + and vector_db_config["vector_db_provider"] in VECTOR_DBS_WITH_MULTI_USER_SUPPORT ) @@ -69,8 +69,6 @@ async def set_database_global_context_variables(dataset: Union[str, UUID], user_ """ - base_config = get_base_config() - if not backend_access_control_enabled(): return @@ -79,6 +77,7 @@ async def set_database_global_context_variables(dataset: Union[str, UUID], user_ # To ensure permissions are enforced properly all datasets will have their own databases dataset_database = await get_or_create_dataset_database(dataset, user) + base_config = get_base_config() data_root_directory = os.path.join( base_config.data_root_directory, str(user.tenant_id or user.id) ) @@ -88,15 +87,17 @@ async def set_database_global_context_variables(dataset: Union[str, UUID], user_ # Set vector and graph database configuration based on dataset database information vector_config = { - "vector_db_url": os.path.join( - databases_directory_path, dataset_database.vector_database_name - ), - "vector_db_key": "", - "vector_db_provider": "lancedb", + "vector_db_provider": dataset_database.vector_database_provider, + "vector_db_url": dataset_database.vector_database_url, + "vector_db_key": dataset_database.vector_database_key, + "vector_db_name": dataset_database.vector_database_name, } graph_config = { - "graph_database_provider": "kuzu", + "graph_database_provider": dataset_database.graph_database_provider, + "graph_database_url": dataset_database.graph_database_url, + "graph_database_name": dataset_database.graph_database_name, + "graph_database_key": dataset_database.graph_database_key, "graph_file_path": os.path.join( databases_directory_path, dataset_database.graph_database_name ), diff --git a/cognee/infrastructure/databases/graph/config.py b/cognee/infrastructure/databases/graph/config.py index b7907313c..23687b359 100644 --- a/cognee/infrastructure/databases/graph/config.py +++ b/cognee/infrastructure/databases/graph/config.py @@ -26,6 +26,7 @@ class GraphConfig(BaseSettings): - graph_database_username - graph_database_password - graph_database_port + - graph_database_key - graph_file_path - graph_model - graph_topology @@ -41,6 +42,7 @@ class GraphConfig(BaseSettings): graph_database_username: str = "" graph_database_password: str = "" graph_database_port: int = 123 + graph_database_key: str = "" graph_file_path: str = "" graph_filename: str = "" graph_model: object = KnowledgeGraph @@ -90,6 +92,7 @@ class GraphConfig(BaseSettings): "graph_database_username": self.graph_database_username, "graph_database_password": self.graph_database_password, "graph_database_port": self.graph_database_port, + "graph_database_key": self.graph_database_key, "graph_file_path": self.graph_file_path, "graph_model": self.graph_model, "graph_topology": self.graph_topology, @@ -116,6 +119,7 @@ class GraphConfig(BaseSettings): "graph_database_username": self.graph_database_username, "graph_database_password": self.graph_database_password, "graph_database_port": self.graph_database_port, + "graph_database_key": self.graph_database_key, "graph_file_path": self.graph_file_path, } diff --git a/cognee/infrastructure/databases/graph/get_graph_engine.py b/cognee/infrastructure/databases/graph/get_graph_engine.py index 1ea61d29f..82e3cad6e 100644 --- a/cognee/infrastructure/databases/graph/get_graph_engine.py +++ b/cognee/infrastructure/databases/graph/get_graph_engine.py @@ -33,6 +33,7 @@ def create_graph_engine( graph_database_username="", graph_database_password="", graph_database_port="", + graph_database_key="", ): """ Create a graph engine based on the specified provider type. @@ -69,6 +70,7 @@ def create_graph_engine( graph_database_url=graph_database_url, graph_database_username=graph_database_username, graph_database_password=graph_database_password, + database_name=graph_database_name, ) if graph_database_provider == "neo4j": diff --git a/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py b/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py index 29156025d..3684bb100 100644 --- a/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py +++ b/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py @@ -1,11 +1,15 @@ +import os from uuid import UUID from typing import Union from sqlalchemy import select from sqlalchemy.exc import IntegrityError -from cognee.modules.data.methods import create_dataset +from cognee.base_config import get_base_config +from cognee.modules.data.methods import create_dataset from cognee.infrastructure.databases.relational import get_relational_engine +from cognee.infrastructure.databases.vector import get_vectordb_config +from cognee.infrastructure.databases.graph.config import get_graph_config from cognee.modules.data.methods import get_unique_dataset_id from cognee.modules.users.models import DatasetDatabase from cognee.modules.users.models import User @@ -32,8 +36,32 @@ async def get_or_create_dataset_database( dataset_id = await get_unique_dataset_id(dataset, user) - vector_db_name = f"{dataset_id}.lance.db" - graph_db_name = f"{dataset_id}.pkl" + vector_config = get_vectordb_config() + graph_config = get_graph_config() + + # Note: for hybrid databases both graph and vector DB name have to be the same + if graph_config.graph_database_provider == "kuzu": + graph_db_name = f"{dataset_id}.pkl" + else: + graph_db_name = f"{dataset_id}" + + if vector_config.vector_db_provider == "lancedb": + vector_db_name = f"{dataset_id}.lance.db" + else: + vector_db_name = f"{dataset_id}" + + base_config = get_base_config() + databases_directory_path = os.path.join( + base_config.system_root_directory, "databases", str(user.id) + ) + + # Determine vector database URL + if vector_config.vector_db_provider == "lancedb": + vector_db_url = os.path.join(databases_directory_path, vector_config.vector_db_name) + else: + vector_db_url = vector_config.vector_database_url + + # Determine graph database URL async with db_engine.get_async_session() as session: # Create dataset if it doesn't exist @@ -55,6 +83,12 @@ async def get_or_create_dataset_database( dataset_id=dataset_id, vector_database_name=vector_db_name, graph_database_name=graph_db_name, + vector_database_provider=vector_config.vector_db_provider, + graph_database_provider=graph_config.graph_database_provider, + vector_database_url=vector_db_url, + graph_database_url=graph_config.graph_database_url, + vector_database_key=vector_config.vector_db_key, + graph_database_key=graph_config.graph_database_key, ) try: diff --git a/cognee/infrastructure/databases/vector/config.py b/cognee/infrastructure/databases/vector/config.py index b6d3ae644..7d28f1668 100644 --- a/cognee/infrastructure/databases/vector/config.py +++ b/cognee/infrastructure/databases/vector/config.py @@ -18,12 +18,14 @@ class VectorConfig(BaseSettings): Instance variables: - vector_db_url: The URL of the vector database. - vector_db_port: The port for the vector database. + - vector_db_name: The name of the vector database. - vector_db_key: The key for accessing the vector database. - vector_db_provider: The provider for the vector database. """ vector_db_url: str = "" vector_db_port: int = 1234 + vector_db_name: str = "" vector_db_key: str = "" vector_db_provider: str = "lancedb" @@ -58,6 +60,7 @@ class VectorConfig(BaseSettings): return { "vector_db_url": self.vector_db_url, "vector_db_port": self.vector_db_port, + "vector_db_name": self.vector_db_name, "vector_db_key": self.vector_db_key, "vector_db_provider": self.vector_db_provider, } diff --git a/cognee/infrastructure/databases/vector/create_vector_engine.py b/cognee/infrastructure/databases/vector/create_vector_engine.py index c54d94f6c..b182f084b 100644 --- a/cognee/infrastructure/databases/vector/create_vector_engine.py +++ b/cognee/infrastructure/databases/vector/create_vector_engine.py @@ -1,5 +1,6 @@ from .supported_databases import supported_databases from .embeddings import get_embedding_engine +from cognee.infrastructure.databases.graph.config import get_graph_context_config from functools import lru_cache @@ -8,6 +9,7 @@ from functools import lru_cache def create_vector_engine( vector_db_provider: str, vector_db_url: str, + vector_db_name: str, vector_db_port: str = "", vector_db_key: str = "", ): @@ -27,6 +29,7 @@ def create_vector_engine( - vector_db_url (str): The URL for the vector database instance. - vector_db_port (str): The port for the vector database instance. Required for some providers. + - vector_db_name (str): The name of the vector database instance. - vector_db_key (str): The API key or access token for the vector database instance. - vector_db_provider (str): The name of the vector database provider to use (e.g., 'pgvector'). @@ -45,6 +48,7 @@ def create_vector_engine( url=vector_db_url, api_key=vector_db_key, embedding_engine=embedding_engine, + database_name=vector_db_name, ) if vector_db_provider.lower() == "pgvector": diff --git a/cognee/modules/users/models/DatasetDatabase.py b/cognee/modules/users/models/DatasetDatabase.py index 0d71d8413..25d610ab9 100644 --- a/cognee/modules/users/models/DatasetDatabase.py +++ b/cognee/modules/users/models/DatasetDatabase.py @@ -15,5 +15,14 @@ class DatasetDatabase(Base): vector_database_name = Column(String, unique=True, nullable=False) graph_database_name = Column(String, unique=True, nullable=False) + vector_database_provider = Column(String, unique=False, nullable=False) + graph_database_provider = Column(String, unique=False, nullable=False) + + vector_database_url = Column(String, unique=False, nullable=True) + graph_database_url = Column(String, unique=False, nullable=True) + + vector_database_key = Column(String, unique=False, nullable=True) + graph_database_key = Column(String, unique=False, nullable=True) + created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc)) updated_at = Column(DateTime(timezone=True), onupdate=lambda: datetime.now(timezone.utc)) diff --git a/cognee/tests/test_parallel_databases.py b/cognee/tests/test_parallel_databases.py index 9a590921a..3164206ed 100755 --- a/cognee/tests/test_parallel_databases.py +++ b/cognee/tests/test_parallel_databases.py @@ -33,11 +33,13 @@ async def main(): "vector_db_url": "cognee1.test", "vector_db_key": "", "vector_db_provider": "lancedb", + "vector_db_name": "", } task_2_config = { "vector_db_url": "cognee2.test", "vector_db_key": "", "vector_db_provider": "lancedb", + "vector_db_name": "", } task_1_graph_config = { From cc0e1a83ab71f4bb47d1ef0d6308eca185294c86 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Tue, 11 Nov 2025 19:55:29 +0100 Subject: [PATCH 62/85] refactor: Disable telemetry for all non telemetry tests --- .github/actions/cognee_setup/action.yml | 4 ++++ .github/workflows/basic_tests.yml | 5 +++++ .github/workflows/cli_tests.yml | 3 +++ .github/workflows/db_examples_tests.yml | 6 +++--- .github/workflows/e2e_tests.yml | 2 +- .github/workflows/examples_tests.yml | 11 +++++++++++ 6 files changed, 27 insertions(+), 4 deletions(-) diff --git a/.github/actions/cognee_setup/action.yml b/.github/actions/cognee_setup/action.yml index 4017d524b..bdc0ae690 100644 --- a/.github/actions/cognee_setup/action.yml +++ b/.github/actions/cognee_setup/action.yml @@ -42,3 +42,7 @@ runs: done fi uv sync --extra api --extra docs --extra evals --extra codegraph --extra ollama --extra dev --extra neo4j --extra redis $EXTRA_ARGS + + - name: Add telemetry identifier for telemetry test and in case telemetry is enabled by accident + run: | + echo "test-machine" > .anon_id diff --git a/.github/workflows/basic_tests.yml b/.github/workflows/basic_tests.yml index b7f324310..98ced21dc 100644 --- a/.github/workflows/basic_tests.yml +++ b/.github/workflows/basic_tests.yml @@ -75,6 +75,7 @@ jobs: name: Run Unit Tests runs-on: ubuntu-22.04 env: + ENV: 'dev' LLM_PROVIDER: openai LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} @@ -104,6 +105,7 @@ jobs: name: Run Integration Tests runs-on: ubuntu-22.04 env: + ENV: 'dev' LLM_PROVIDER: openai LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} @@ -132,6 +134,7 @@ jobs: name: Run Simple Examples runs-on: ubuntu-22.04 env: + ENV: 'dev' LLM_PROVIDER: openai LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} @@ -161,6 +164,7 @@ jobs: name: Run Simple Examples BAML runs-on: ubuntu-22.04 env: + ENV: 'dev' STRUCTURED_OUTPUT_FRAMEWORK: "BAML" BAML_LLM_PROVIDER: openai BAML_LLM_MODEL: ${{ secrets.OPENAI_MODEL }} @@ -198,6 +202,7 @@ jobs: name: Run Basic Graph Tests runs-on: ubuntu-22.04 env: + ENV: 'dev' LLM_PROVIDER: openai LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} diff --git a/.github/workflows/cli_tests.yml b/.github/workflows/cli_tests.yml index 958d341ae..d4f8e5ac0 100644 --- a/.github/workflows/cli_tests.yml +++ b/.github/workflows/cli_tests.yml @@ -39,6 +39,7 @@ jobs: name: CLI Unit Tests runs-on: ubuntu-22.04 env: + ENV: 'dev' LLM_PROVIDER: openai LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} @@ -66,6 +67,7 @@ jobs: name: CLI Integration Tests runs-on: ubuntu-22.04 env: + ENV: 'dev' LLM_PROVIDER: openai LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} @@ -93,6 +95,7 @@ jobs: name: CLI Functionality Tests runs-on: ubuntu-22.04 env: + ENV: 'dev' LLM_PROVIDER: openai LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} diff --git a/.github/workflows/db_examples_tests.yml b/.github/workflows/db_examples_tests.yml index 51ac9a82a..c58bc48ef 100644 --- a/.github/workflows/db_examples_tests.yml +++ b/.github/workflows/db_examples_tests.yml @@ -60,7 +60,7 @@ jobs: - name: Run Neo4j Example env: - ENV: dev + ENV: 'dev' LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} LLM_API_KEY: ${{ secrets.LLM_API_KEY }} @@ -95,7 +95,7 @@ jobs: - name: Run Kuzu Example env: - ENV: dev + ENV: 'dev' LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} LLM_API_KEY: ${{ secrets.LLM_API_KEY }} @@ -141,7 +141,7 @@ jobs: - name: Run PGVector Example env: - ENV: dev + ENV: 'dev' LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} LLM_API_KEY: ${{ secrets.LLM_API_KEY }} diff --git a/.github/workflows/e2e_tests.yml b/.github/workflows/e2e_tests.yml index bfa75f693..584225afe 100644 --- a/.github/workflows/e2e_tests.yml +++ b/.github/workflows/e2e_tests.yml @@ -454,7 +454,7 @@ jobs: - name: Run Conversation session tests env: - ENV: dev + ENV: 'dev' LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} LLM_API_KEY: ${{ secrets.LLM_API_KEY }} diff --git a/.github/workflows/examples_tests.yml b/.github/workflows/examples_tests.yml index 36953e259..f7cc278cb 100644 --- a/.github/workflows/examples_tests.yml +++ b/.github/workflows/examples_tests.yml @@ -21,6 +21,7 @@ jobs: - name: Run Multimedia Example env: + ENV: 'dev' LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} run: uv run python ./examples/python/multimedia_example.py @@ -40,6 +41,7 @@ jobs: - name: Run Evaluation Framework Example env: + ENV: 'dev' LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} LLM_API_KEY: ${{ secrets.LLM_API_KEY }} @@ -69,6 +71,7 @@ jobs: - name: Run Descriptive Graph Metrics Example env: + ENV: 'dev' LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} LLM_API_KEY: ${{ secrets.LLM_API_KEY }} @@ -99,6 +102,7 @@ jobs: - name: Run Dynamic Steps Tests env: + ENV: 'dev' OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} @@ -124,6 +128,7 @@ jobs: - name: Run Temporal Example env: + ENV: 'dev' OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} @@ -149,6 +154,7 @@ jobs: - name: Run Ontology Demo Example env: + ENV: 'dev' OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} @@ -174,6 +180,7 @@ jobs: - name: Run Agentic Reasoning Example env: + ENV: 'dev' OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} @@ -199,6 +206,7 @@ jobs: - name: Run Memify Tests env: + ENV: 'dev' OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} @@ -224,6 +232,7 @@ jobs: - name: Run Custom Pipeline Example env: + ENV: 'dev' OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} @@ -249,6 +258,7 @@ jobs: - name: Run Memify Tests env: + ENV: 'dev' OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} @@ -274,6 +284,7 @@ jobs: - name: Run Docling Test env: + ENV: 'dev' OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} From ba693b7ef46b617a5aa069ad7bb7bee00fb04586 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Tue, 11 Nov 2025 19:58:30 +0100 Subject: [PATCH 63/85] chore: add shell to setting of anon_id in gh action --- .github/actions/cognee_setup/action.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/actions/cognee_setup/action.yml b/.github/actions/cognee_setup/action.yml index bdc0ae690..3f5726015 100644 --- a/.github/actions/cognee_setup/action.yml +++ b/.github/actions/cognee_setup/action.yml @@ -44,5 +44,6 @@ runs: uv sync --extra api --extra docs --extra evals --extra codegraph --extra ollama --extra dev --extra neo4j --extra redis $EXTRA_ARGS - name: Add telemetry identifier for telemetry test and in case telemetry is enabled by accident + shell: bash run: | echo "test-machine" > .anon_id From b716c2e3c431ce10679a30150afb554d32557212 Mon Sep 17 00:00:00 2001 From: Pavel Zorin Date: Wed, 12 Nov 2025 13:35:04 +0100 Subject: [PATCH 64/85] Chore: Acceptance Criteria for PRs --- .github/pull_request_template.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 0e6f74188..be9d219c1 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -6,6 +6,14 @@ Please provide a clear, human-generated description of the changes in this PR. DO NOT use AI-generated descriptions. We want to understand your thought process and reasoning. --> +## Acceptance Criteria + + ## Type of Change - [ ] Bug fix (non-breaking change that fixes an issue) From 056424f244fe029cc3acdd0127f70796bac25377 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Wed, 12 Nov 2025 14:34:30 +0000 Subject: [PATCH 65/85] feat: fs-cache (#1645) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Description Implement File-Based Version of the Redis Cache Adapter Description and acceptance criteria: This PR introduces a file-based cache adapter as an alternative to the existing Redis-based adapter. It provides the same core functionality for caching session data and maintaining context across multiple user interactions but stores data locally in files instead of Redis. Because the shared Kùzu lock mechanism relies on Redis, it is not supported in this implementation. If a lock is configured, the adapter will raise an error to prevent misconfiguration. You can test this adapter by enabling caching with the following settings: caching=True cache_backend="fs" When running multiple searches in a session, the system should correctly maintain conversational context. For example: - What is XY? - Are you sure? - What was my first question? In this case, the adapter should preserve previous user–Cognee interactions within the cache file so that follow-up queries remain context-aware. ## Type of Change - [ ] Bug fix (non-breaking change that fixes an issue) - [x] New feature (non-breaking change that adds functionality) - [ ] Breaking change (fix or feature that would cause existing functionality to change) - [ ] Documentation update - [ ] Code refactoring - [ ] Performance improvement - [ ] Other (please specify): ## Screenshots/Videos (if applicable) ## Pre-submission Checklist - [x] **I have tested my changes thoroughly before submitting this PR** - [x] **This PR contains minimal changes necessary to address the issue/feature** - [x] My code follows the project's coding standards and style guidelines - [x] I have added tests that prove my fix is effective or that my feature works - [x] I have added necessary documentation (if applicable) - [x] All new and existing tests pass - [x] I have searched existing PRs to ensure this change hasn't been submitted already - [x] I have linked any relevant issues in the description - [x] My commits have clear and descriptive messages ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin. --------- Co-authored-by: Vasilije <8619304+Vasilije1990@users.noreply.github.com> Co-authored-by: hajdul88 <52442977+hajdul88@users.noreply.github.com> --- .github/workflows/e2e_tests.yml | 63 +++++- .../infrastructure/databases/cache/config.py | 4 +- .../databases/cache/fscache/FsCacheAdapter.py | 151 +++++++++++++ .../databases/cache/get_cache_engine.py | 30 ++- .../databases/exceptions/exceptions.py | 16 ++ cognee/shared/logging_utils.py | 2 + .../databases/cache/test_cache_config.py | 5 + poetry.lock | 206 +++++++++++++++--- pyproject.toml | 7 + uv.lock | 104 +++++++++ 10 files changed, 543 insertions(+), 45 deletions(-) create mode 100644 cognee/infrastructure/databases/cache/fscache/FsCacheAdapter.py diff --git a/.github/workflows/e2e_tests.yml b/.github/workflows/e2e_tests.yml index 584225afe..3dea2548c 100644 --- a/.github/workflows/e2e_tests.yml +++ b/.github/workflows/e2e_tests.yml @@ -333,7 +333,7 @@ jobs: python-version: '3.11.x' extra-dependencies: "postgres redis" - - name: Run Concurrent subprocess access test (Kuzu/Lancedb/Postgres) + - name: Run Concurrent subprocess access test (Kuzu/Lancedb/Postgres/Redis) env: ENV: dev LLM_MODEL: ${{ secrets.LLM_MODEL }} @@ -346,6 +346,7 @@ jobs: EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} GRAPH_DATABASE_PROVIDER: 'kuzu' CACHING: true + CACHE_BACKEND: 'redis' SHARED_KUZU_LOCK: true DB_PROVIDER: 'postgres' DB_NAME: 'cognee_db' @@ -411,8 +412,8 @@ jobs: EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} run: uv run python ./cognee/tests/test_feedback_enrichment.py - run_conversation_sessions_test: - name: Conversation sessions test + run_conversation_sessions_test_redis: + name: Conversation sessions test (Redis) runs-on: ubuntu-latest defaults: run: @@ -452,7 +453,7 @@ jobs: python-version: '3.11.x' extra-dependencies: "postgres redis" - - name: Run Conversation session tests + - name: Run Conversation session tests (Redis) env: ENV: 'dev' LLM_MODEL: ${{ secrets.LLM_MODEL }} @@ -465,6 +466,60 @@ jobs: EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} GRAPH_DATABASE_PROVIDER: 'kuzu' CACHING: true + CACHE_BACKEND: 'redis' + DB_PROVIDER: 'postgres' + DB_NAME: 'cognee_db' + DB_HOST: '127.0.0.1' + DB_PORT: 5432 + DB_USERNAME: cognee + DB_PASSWORD: cognee + run: uv run python ./cognee/tests/test_conversation_history.py + + run_conversation_sessions_test_fs: + name: Conversation sessions test (FS) + runs-on: ubuntu-latest + defaults: + run: + shell: bash + services: + postgres: + image: pgvector/pgvector:pg17 + env: + POSTGRES_USER: cognee + POSTGRES_PASSWORD: cognee + POSTGRES_DB: cognee_db + options: >- + --health-cmd pg_isready + --health-interval 10s + --health-timeout 5s + --health-retries 5 + ports: + - 5432:5432 + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Cognee Setup + uses: ./.github/actions/cognee_setup + with: + python-version: '3.11.x' + extra-dependencies: "postgres" + + - name: Run Conversation session tests (FS) + env: + ENV: dev + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} + GRAPH_DATABASE_PROVIDER: 'kuzu' + CACHING: true + CACHE_BACKEND: 'fs' DB_PROVIDER: 'postgres' DB_NAME: 'cognee_db' DB_HOST: '127.0.0.1' diff --git a/cognee/infrastructure/databases/cache/config.py b/cognee/infrastructure/databases/cache/config.py index 3a28827fe..88ac05885 100644 --- a/cognee/infrastructure/databases/cache/config.py +++ b/cognee/infrastructure/databases/cache/config.py @@ -1,6 +1,6 @@ from pydantic_settings import BaseSettings, SettingsConfigDict from functools import lru_cache -from typing import Optional +from typing import Optional, Literal class CacheConfig(BaseSettings): @@ -15,6 +15,7 @@ class CacheConfig(BaseSettings): - agentic_lock_timeout: Maximum time (in seconds) to wait for the lock release. """ + cache_backend: Literal["redis", "fs"] = "fs" caching: bool = False shared_kuzu_lock: bool = False cache_host: str = "localhost" @@ -28,6 +29,7 @@ class CacheConfig(BaseSettings): def to_dict(self) -> dict: return { + "cache_backend": self.cache_backend, "caching": self.caching, "shared_kuzu_lock": self.shared_kuzu_lock, "cache_host": self.cache_host, diff --git a/cognee/infrastructure/databases/cache/fscache/FsCacheAdapter.py b/cognee/infrastructure/databases/cache/fscache/FsCacheAdapter.py new file mode 100644 index 000000000..497e6afec --- /dev/null +++ b/cognee/infrastructure/databases/cache/fscache/FsCacheAdapter.py @@ -0,0 +1,151 @@ +import asyncio +import json +import os +from datetime import datetime +import time +import threading +import diskcache as dc + +from cognee.infrastructure.databases.cache.cache_db_interface import CacheDBInterface +from cognee.infrastructure.databases.exceptions.exceptions import ( + CacheConnectionError, + SharedKuzuLockRequiresRedisError, +) +from cognee.infrastructure.files.storage.get_storage_config import get_storage_config +from cognee.shared.logging_utils import get_logger + +logger = get_logger("FSCacheAdapter") + + +class FSCacheAdapter(CacheDBInterface): + def __init__(self): + default_key = "sessions_db" + + storage_config = get_storage_config() + data_root_directory = storage_config["data_root_directory"] + cache_directory = os.path.join(data_root_directory, ".cognee_fs_cache", default_key) + os.makedirs(cache_directory, exist_ok=True) + self.cache = dc.Cache(directory=cache_directory) + self.cache.expire() + + logger.debug(f"FSCacheAdapter initialized with cache directory: {cache_directory}") + + def acquire_lock(self): + """Lock acquisition is not available for filesystem cache backend.""" + message = "Shared Kuzu lock requires Redis cache backend." + logger.error(message) + raise SharedKuzuLockRequiresRedisError() + + def release_lock(self): + """Lock release is not available for filesystem cache backend.""" + message = "Shared Kuzu lock requires Redis cache backend." + logger.error(message) + raise SharedKuzuLockRequiresRedisError() + + async def add_qa( + self, + user_id: str, + session_id: str, + question: str, + context: str, + answer: str, + ttl: int | None = 86400, + ): + try: + session_key = f"agent_sessions:{user_id}:{session_id}" + + qa_entry = { + "time": datetime.utcnow().isoformat(), + "question": question, + "context": context, + "answer": answer, + } + + existing_value = self.cache.get(session_key) + if existing_value is not None: + value: list = json.loads(existing_value) + value.append(qa_entry) + else: + value = [qa_entry] + + self.cache.set(session_key, json.dumps(value), expire=ttl) + except Exception as e: + error_msg = f"Unexpected error while adding Q&A to diskcache: {str(e)}" + logger.error(error_msg) + raise CacheConnectionError(error_msg) from e + + async def get_latest_qa(self, user_id: str, session_id: str, last_n: int = 5): + session_key = f"agent_sessions:{user_id}:{session_id}" + value = self.cache.get(session_key) + if value is None: + return None + entries = json.loads(value) + return entries[-last_n:] if len(entries) > last_n else entries + + async def get_all_qas(self, user_id: str, session_id: str): + session_key = f"agent_sessions:{user_id}:{session_id}" + value = self.cache.get(session_key) + if value is None: + return None + return json.loads(value) + + async def close(self): + if self.cache is not None: + self.cache.expire() + self.cache.close() + + +async def main(): + adapter = FSCacheAdapter() + session_id = "demo_session" + user_id = "demo_user_id" + + print("\nAdding sample Q/A pairs...") + await adapter.add_qa( + user_id, + session_id, + "What is Redis?", + "Basic DB context", + "Redis is an in-memory data store.", + ) + await adapter.add_qa( + user_id, + session_id, + "Who created Redis?", + "Historical context", + "Salvatore Sanfilippo (antirez).", + ) + + print("\nLatest QA:") + latest = await adapter.get_latest_qa(user_id, session_id) + print(json.dumps(latest, indent=2)) + + print("\nLast 2 QAs:") + last_two = await adapter.get_latest_qa(user_id, session_id, last_n=2) + print(json.dumps(last_two, indent=2)) + + session_id = "session_expire_demo" + + await adapter.add_qa( + user_id, + session_id, + "What is Redis?", + "Database context", + "Redis is an in-memory data store.", + ) + + await adapter.add_qa( + user_id, + session_id, + "Who created Redis?", + "History context", + "Salvatore Sanfilippo (antirez).", + ) + + print(await adapter.get_all_qas(user_id, session_id)) + + await adapter.close() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/cognee/infrastructure/databases/cache/get_cache_engine.py b/cognee/infrastructure/databases/cache/get_cache_engine.py index c1fa3311c..f70358607 100644 --- a/cognee/infrastructure/databases/cache/get_cache_engine.py +++ b/cognee/infrastructure/databases/cache/get_cache_engine.py @@ -1,9 +1,11 @@ """Factory to get the appropriate cache coordination engine (e.g., Redis).""" from functools import lru_cache +import os from typing import Optional from cognee.infrastructure.databases.cache.config import get_cache_config from cognee.infrastructure.databases.cache.cache_db_interface import CacheDBInterface +from cognee.infrastructure.databases.cache.fscache.FsCacheAdapter import FSCacheAdapter config = get_cache_config() @@ -33,20 +35,28 @@ def create_cache_engine( Returns: -------- - - CacheDBInterface: An instance of the appropriate cache adapter. :TODO: Now we support only Redis. later if we add more here we can split the logic + - CacheDBInterface: An instance of the appropriate cache adapter. """ if config.caching: from cognee.infrastructure.databases.cache.redis.RedisAdapter import RedisAdapter - return RedisAdapter( - host=cache_host, - port=cache_port, - username=cache_username, - password=cache_password, - lock_name=lock_key, - timeout=agentic_lock_expire, - blocking_timeout=agentic_lock_timeout, - ) + if config.cache_backend == "redis": + return RedisAdapter( + host=cache_host, + port=cache_port, + username=cache_username, + password=cache_password, + lock_name=lock_key, + timeout=agentic_lock_expire, + blocking_timeout=agentic_lock_timeout, + ) + elif config.cache_backend == "fs": + return FSCacheAdapter() + else: + raise ValueError( + f"Unsupported cache backend: '{config.cache_backend}'. " + f"Supported backends are: 'redis', 'fs'" + ) else: return None diff --git a/cognee/infrastructure/databases/exceptions/exceptions.py b/cognee/infrastructure/databases/exceptions/exceptions.py index 72b13e3a2..d8dd99c17 100644 --- a/cognee/infrastructure/databases/exceptions/exceptions.py +++ b/cognee/infrastructure/databases/exceptions/exceptions.py @@ -148,3 +148,19 @@ class CacheConnectionError(CogneeConfigurationError): status_code: int = status.HTTP_503_SERVICE_UNAVAILABLE, ): super().__init__(message, name, status_code) + + +class SharedKuzuLockRequiresRedisError(CogneeConfigurationError): + """ + Raised when shared Kuzu locking is requested without configuring the Redis backend. + """ + + def __init__( + self, + message: str = ( + "Shared Kuzu lock requires Redis cache backend. Configure Redis to enable shared Kuzu locking." + ), + name: str = "SharedKuzuLockRequiresRedisError", + status_code: int = status.HTTP_400_BAD_REQUEST, + ): + super().__init__(message, name, status_code) diff --git a/cognee/shared/logging_utils.py b/cognee/shared/logging_utils.py index 0e5120b1d..e8efde72c 100644 --- a/cognee/shared/logging_utils.py +++ b/cognee/shared/logging_utils.py @@ -450,6 +450,8 @@ def setup_logging(log_level=None, name=None): try: msg = self.format(record) stream = self.stream + if hasattr(stream, "closed") and stream.closed: + return stream.write("\n" + msg + self.terminator) self.flush() except Exception: diff --git a/cognee/tests/unit/infrastructure/databases/cache/test_cache_config.py b/cognee/tests/unit/infrastructure/databases/cache/test_cache_config.py index a8d3bda82..837a9955c 100644 --- a/cognee/tests/unit/infrastructure/databases/cache/test_cache_config.py +++ b/cognee/tests/unit/infrastructure/databases/cache/test_cache_config.py @@ -8,6 +8,7 @@ def test_cache_config_defaults(): """Test that CacheConfig has the correct default values.""" config = CacheConfig() + assert config.cache_backend == "fs" assert config.caching is False assert config.shared_kuzu_lock is False assert config.cache_host == "localhost" @@ -19,6 +20,7 @@ def test_cache_config_defaults(): def test_cache_config_custom_values(): """Test that CacheConfig accepts custom values.""" config = CacheConfig( + cache_backend="redis", caching=True, shared_kuzu_lock=True, cache_host="redis.example.com", @@ -27,6 +29,7 @@ def test_cache_config_custom_values(): agentic_lock_timeout=180, ) + assert config.cache_backend == "redis" assert config.caching is True assert config.shared_kuzu_lock is True assert config.cache_host == "redis.example.com" @@ -38,6 +41,7 @@ def test_cache_config_custom_values(): def test_cache_config_to_dict(): """Test the to_dict method returns all configuration values.""" config = CacheConfig( + cache_backend="fs", caching=True, shared_kuzu_lock=True, cache_host="test-host", @@ -49,6 +53,7 @@ def test_cache_config_to_dict(): config_dict = config.to_dict() assert config_dict == { + "cache_backend": "fs", "caching": True, "shared_kuzu_lock": True, "cache_host": "test-host", diff --git a/poetry.lock b/poetry.lock index 08fd42660..67de51633 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.2.1 and should not be changed by hand. [[package]] name = "accelerate" @@ -539,7 +539,7 @@ description = "Timeout context manager for asyncio programs" optional = false python-versions = ">=3.8" groups = ["main"] -markers = "extra == \"redis\" and python_full_version < \"3.11.3\" or python_version == \"3.10\"" +markers = "python_full_version < \"3.11.3\"" files = [ {file = "async_timeout-5.0.1-py3-none-any.whl", hash = "sha256:39e3809566ff85354557ec2398b55e096c8364bacac9405a7a1fa429e77fe76c"}, {file = "async_timeout-5.0.1.tar.gz", hash = "sha256:d9321a7a3d5a6a5e187e824d2fa0793ce379a202935782d555d6e9d2735677d3"}, @@ -1231,12 +1231,12 @@ version = "0.4.6" description = "Cross-platform colored terminal text." optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" -groups = ["main"] -markers = "(platform_system == \"Windows\" or extra == \"llama-index\" or extra == \"dev\" or extra == \"chromadb\" or sys_platform == \"win32\") and (platform_system == \"Windows\" or os_name == \"nt\" or extra == \"llama-index\" or extra == \"dev\" or sys_platform == \"win32\")" +groups = ["main", "dev"] files = [ {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, ] +markers = {main = "(platform_system == \"Windows\" or extra == \"llama-index\" or extra == \"dev\" or extra == \"chromadb\" or sys_platform == \"win32\") and (platform_system == \"Windows\" or os_name == \"nt\" or extra == \"llama-index\" or extra == \"dev\" or sys_platform == \"win32\")", dev = "sys_platform == \"win32\""} [[package]] name = "coloredlogs" @@ -2347,7 +2347,7 @@ version = "1.3.0" description = "Backport of PEP 654 (exception groups)" optional = false python-versions = ">=3.7" -groups = ["main"] +groups = ["main", "dev"] markers = "python_version == \"3.10\"" files = [ {file = "exceptiongroup-1.3.0-py3-none-any.whl", hash = "sha256:4d111e6e0c13d0644cad6ddaa7ed0261a0b36971f6d23e7ec9b4b9097da78a10"}, @@ -2408,6 +2408,32 @@ files = [ [package.dependencies] tzdata = "*" +[[package]] +name = "fakeredis" +version = "2.32.0" +description = "Python implementation of redis API, can be used for testing purposes." +optional = false +python-versions = ">=3.7" +groups = ["main"] +files = [ + {file = "fakeredis-2.32.0-py3-none-any.whl", hash = "sha256:c9da8228de84060cfdb72c3cf4555c18c59ba7a5ae4d273f75e4822d6f01ecf8"}, + {file = "fakeredis-2.32.0.tar.gz", hash = "sha256:63d745b40eb6c8be4899cf2a53187c097ccca3afbca04fdbc5edc8b936cd1d59"}, +] + +[package.dependencies] +lupa = {version = ">=2.1,<3.0", optional = true, markers = "extra == \"lua\""} +redis = {version = ">=4.3", markers = "python_version > \"3.8\""} +sortedcontainers = ">=2,<3" +typing-extensions = {version = ">=4.7,<5.0", markers = "python_version < \"3.11\""} + +[package.extras] +bf = ["pyprobables (>=0.6)"] +cf = ["pyprobables (>=0.6)"] +json = ["jsonpath-ng (>=1.6,<2.0)"] +lua = ["lupa (>=2.1,<3.0)"] +probabilistic = ["pyprobables (>=0.6)"] +valkey = ["valkey (>=6) ; python_version >= \"3.8\""] + [[package]] name = "fastapi" version = "0.117.1" @@ -2543,6 +2569,7 @@ files = [ {file = "fastuuid-0.12.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b9b31dd488d0778c36f8279b306dc92a42f16904cba54acca71e107d65b60b0c"}, {file = "fastuuid-0.12.0-cp313-cp313-manylinux_2_34_x86_64.whl", hash = "sha256:b19361ee649365eefc717ec08005972d3d1eb9ee39908022d98e3bfa9da59e37"}, {file = "fastuuid-0.12.0-cp313-cp313-win_amd64.whl", hash = "sha256:8fc66b11423e6f3e1937385f655bedd67aebe56a3dcec0cb835351cfe7d358c9"}, + {file = "fastuuid-0.12.0-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:2925f67b88d47cb16aa3eb1ab20fdcf21b94d74490e0818c91ea41434b987493"}, {file = "fastuuid-0.12.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7b15c54d300279ab20a9cc0579ada9c9f80d1bc92997fc61fb7bf3103d7cb26b"}, {file = "fastuuid-0.12.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:458f1bc3ebbd76fdb89ad83e6b81ccd3b2a99fa6707cd3650b27606745cfb170"}, {file = "fastuuid-0.12.0-cp38-cp38-manylinux_2_34_x86_64.whl", hash = "sha256:a8f0f83fbba6dc44271a11b22e15838641b8c45612cdf541b4822a5930f6893c"}, @@ -3705,14 +3732,14 @@ type = ["pytest-mypy"] name = "iniconfig" version = "2.1.0" description = "brain-dead simple config-ini parsing" -optional = true +optional = false python-versions = ">=3.8" -groups = ["main"] -markers = "extra == \"deepeval\" or extra == \"dev\"" +groups = ["main", "dev"] files = [ {file = "iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760"}, {file = "iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7"}, ] +markers = {main = "extra == \"deepeval\" or extra == \"dev\""} [[package]] name = "instructor" @@ -4169,6 +4196,8 @@ groups = ["main"] markers = "extra == \"dlt\"" files = [ {file = "jsonpath-ng-1.7.0.tar.gz", hash = "sha256:f6f5f7fd4e5ff79c785f1573b394043b39849fb2bb47bcead935d12b00beab3c"}, + {file = "jsonpath_ng-1.7.0-py2-none-any.whl", hash = "sha256:898c93fc173f0c336784a3fa63d7434297544b7198124a68f9a3ef9597b0ae6e"}, + {file = "jsonpath_ng-1.7.0-py3-none-any.whl", hash = "sha256:f3d7f9e848cba1b6da28c55b1c26ff915dc9e0b1ba7e752a53d6da8d5cbd00b6"}, ] [package.dependencies] @@ -5082,6 +5111,104 @@ win32-setctime = {version = ">=1.0.0", markers = "sys_platform == \"win32\""} [package.extras] dev = ["Sphinx (==8.1.3) ; python_version >= \"3.11\"", "build (==1.2.2) ; python_version >= \"3.11\"", "colorama (==0.4.5) ; python_version < \"3.8\"", "colorama (==0.4.6) ; python_version >= \"3.8\"", "exceptiongroup (==1.1.3) ; python_version >= \"3.7\" and python_version < \"3.11\"", "freezegun (==1.1.0) ; python_version < \"3.8\"", "freezegun (==1.5.0) ; python_version >= \"3.8\"", "mypy (==v0.910) ; python_version < \"3.6\"", "mypy (==v0.971) ; python_version == \"3.6\"", "mypy (==v1.13.0) ; python_version >= \"3.8\"", "mypy (==v1.4.1) ; python_version == \"3.7\"", "myst-parser (==4.0.0) ; python_version >= \"3.11\"", "pre-commit (==4.0.1) ; python_version >= \"3.9\"", "pytest (==6.1.2) ; python_version < \"3.8\"", "pytest (==8.3.2) ; python_version >= \"3.8\"", "pytest-cov (==2.12.1) ; python_version < \"3.8\"", "pytest-cov (==5.0.0) ; python_version == \"3.8\"", "pytest-cov (==6.0.0) ; python_version >= \"3.9\"", "pytest-mypy-plugins (==1.9.3) ; python_version >= \"3.6\" and python_version < \"3.8\"", "pytest-mypy-plugins (==3.1.0) ; python_version >= \"3.8\"", "sphinx-rtd-theme (==3.0.2) ; python_version >= \"3.11\"", "tox (==3.27.1) ; python_version < \"3.8\"", "tox (==4.23.2) ; python_version >= \"3.8\"", "twine (==6.0.1) ; python_version >= \"3.11\""] +[[package]] +name = "lupa" +version = "2.6" +description = "Python wrapper around Lua and LuaJIT" +optional = false +python-versions = "*" +groups = ["main"] +files = [ + {file = "lupa-2.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6b3dabda836317e63c5ad052826e156610f356a04b3003dfa0dbe66b5d54d671"}, + {file = "lupa-2.6-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:8726d1c123bbe9fbb974ce29825e94121824e66003038ff4532c14cc2ed0c51c"}, + {file = "lupa-2.6-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:f4e159e7d814171199b246f9235ca8961f6461ea8c1165ab428afa13c9289a94"}, + {file = "lupa-2.6-cp310-cp310-manylinux2010_i686.manylinux_2_12_i686.manylinux_2_28_i686.whl", hash = "sha256:202160e80dbfddfb79316692a563d843b767e0f6787bbd1c455f9d54052efa6c"}, + {file = "lupa-2.6-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5deede7c5b36ab64f869dae4831720428b67955b0bb186c8349cf6ea121c852b"}, + {file = "lupa-2.6-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:86f04901f920bbf7c0cac56807dc9597e42347123e6f1f3ca920f15f54188ce5"}, + {file = "lupa-2.6-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:6deef8f851d6afb965c84849aa5b8c38856942df54597a811ce0369ced678610"}, + {file = "lupa-2.6-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:21f2b5549681c2a13b1170a26159d30875d367d28f0247b81ca347222c755038"}, + {file = "lupa-2.6-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:66eea57630eab5e6f49fdc5d7811c0a2a41f2011be4ea56a087ea76112011eb7"}, + {file = "lupa-2.6-cp310-cp310-win32.whl", hash = "sha256:60a403de8cab262a4fe813085dd77010effa6e2eb1886db2181df803140533b1"}, + {file = "lupa-2.6-cp310-cp310-win_amd64.whl", hash = "sha256:e4656a39d93dfa947cf3db56dc16c7916cb0cc8024acd3a952071263f675df64"}, + {file = "lupa-2.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6d988c0f9331b9f2a5a55186701a25444ab10a1432a1021ee58011499ecbbdd5"}, + {file = "lupa-2.6-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:ebe1bbf48259382c72a6fe363dea61a0fd6fe19eab95e2ae881e20f3654587bf"}, + {file = "lupa-2.6-cp311-cp311-macosx_11_0_x86_64.whl", hash = "sha256:a8fcee258487cf77cdd41560046843bb38c2e18989cd19671dd1e2596f798306"}, + {file = "lupa-2.6-cp311-cp311-manylinux2010_i686.manylinux_2_12_i686.manylinux_2_28_i686.whl", hash = "sha256:561a8e3be800827884e767a694727ed8482d066e0d6edfcbf423b05e63b05535"}, + {file = "lupa-2.6-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:af880a62d47991cae78b8e9905c008cbfdc4a3a9723a66310c2634fc7644578c"}, + {file = "lupa-2.6-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:80b22923aa4023c86c0097b235615f89d469a0c4eee0489699c494d3367c4c85"}, + {file = "lupa-2.6-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:153d2cc6b643f7efb9cfc0c6bb55ec784d5bac1a3660cfc5b958a7b8f38f4a75"}, + {file = "lupa-2.6-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:3fa8777e16f3ded50b72967dc17e23f5a08e4f1e2c9456aff2ebdb57f5b2869f"}, + {file = "lupa-2.6-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:8dbdcbe818c02a2f56f5ab5ce2de374dab03e84b25266cfbaef237829bc09b3f"}, + {file = "lupa-2.6-cp311-cp311-win32.whl", hash = "sha256:defaf188fde8f7a1e5ce3a5e6d945e533b8b8d547c11e43b96c9b7fe527f56dc"}, + {file = "lupa-2.6-cp311-cp311-win_amd64.whl", hash = "sha256:9505ae600b5c14f3e17e70f87f88d333717f60411faca1ddc6f3e61dce85fa9e"}, + {file = "lupa-2.6-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:47ce718817ef1cc0c40d87c3d5ae56a800d61af00fbc0fad1ca9be12df2f3b56"}, + {file = "lupa-2.6-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:7aba985b15b101495aa4b07112cdc08baa0c545390d560ad5cfde2e9e34f4d58"}, + {file = "lupa-2.6-cp312-cp312-macosx_11_0_x86_64.whl", hash = "sha256:b766f62f95b2739f2248977d29b0722e589dcf4f0ccfa827ccbd29f0148bd2e5"}, + {file = "lupa-2.6-cp312-cp312-manylinux2010_i686.manylinux_2_12_i686.manylinux_2_28_i686.whl", hash = "sha256:00a934c23331f94cb51760097ebfab14b005d55a6b30a2b480e3c53dd2fa290d"}, + {file = "lupa-2.6-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:21de9f38bd475303e34a042b7081aabdf50bd9bafd36ce4faea2f90fd9f15c31"}, + {file = "lupa-2.6-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:cf3bda96d3fc41237e964a69c23647d50d4e28421111360274d4799832c560e9"}, + {file = "lupa-2.6-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:5a76ead245da54801a81053794aa3975f213221f6542d14ec4b859ee2e7e0323"}, + {file = "lupa-2.6-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:8dd0861741caa20886ddbda0a121d8e52fb9b5bb153d82fa9bba796962bf30e8"}, + {file = "lupa-2.6-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:239e63948b0b23023f81d9a19a395e768ed3da6a299f84e7963b8f813f6e3f9c"}, + {file = "lupa-2.6-cp312-cp312-win32.whl", hash = "sha256:325894e1099499e7a6f9c351147661a2011887603c71086d36fe0f964d52d1ce"}, + {file = "lupa-2.6-cp312-cp312-win_amd64.whl", hash = "sha256:c735a1ce8ee60edb0fe71d665f1e6b7c55c6021f1d340eb8c865952c602cd36f"}, + {file = "lupa-2.6-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:663a6e58a0f60e7d212017d6678639ac8df0119bc13c2145029dcba084391310"}, + {file = "lupa-2.6-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:d1f5afda5c20b1f3217a80e9bc1b77037f8a6eb11612fd3ada19065303c8f380"}, + {file = "lupa-2.6-cp313-cp313-macosx_11_0_x86_64.whl", hash = "sha256:26f2b3c085fe76e9119e48c1013c1cccdc1f51585d456858290475aa38e7089e"}, + {file = "lupa-2.6-cp313-cp313-manylinux2010_i686.manylinux_2_12_i686.manylinux_2_28_i686.whl", hash = "sha256:60d2f902c7b96fb8ab98493dcff315e7bb4d0b44dc9dd76eb37de575025d5685"}, + {file = "lupa-2.6-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a02d25dee3a3250967c36590128d9220ae02f2eda166a24279da0b481519cbff"}, + {file = "lupa-2.6-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6eae1ee16b886b8914ff292dbefbf2f48abfbdee94b33a88d1d5475e02423203"}, + {file = "lupa-2.6-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:b0edd5073a4ee74ab36f74fe61450148e6044f3952b8d21248581f3c5d1a58be"}, + {file = "lupa-2.6-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:0c53ee9f22a8a17e7d4266ad48e86f43771951797042dd51d1494aaa4f5f3f0a"}, + {file = "lupa-2.6-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:de7c0f157a9064a400d828789191a96da7f4ce889969a588b87ec80de9b14772"}, + {file = "lupa-2.6-cp313-cp313-win32.whl", hash = "sha256:ee9523941ae0a87b5b703417720c5d78f72d2f5bc23883a2ea80a949a3ed9e75"}, + {file = "lupa-2.6-cp313-cp313-win_amd64.whl", hash = "sha256:b1335a5835b0a25ebdbc75cf0bda195e54d133e4d994877ef025e218c2e59db9"}, + {file = "lupa-2.6-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:dcb6d0a3264873e1653bc188499f48c1fb4b41a779e315eba45256cfe7bc33c1"}, + {file = "lupa-2.6-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:a37e01f2128f8c36106726cb9d360bac087d58c54b4522b033cc5691c584db18"}, + {file = "lupa-2.6-cp314-cp314-macosx_11_0_x86_64.whl", hash = "sha256:458bd7e9ff3c150b245b0fcfbb9bd2593d1152ea7f0a7b91c1d185846da033fe"}, + {file = "lupa-2.6-cp314-cp314-manylinux2010_i686.manylinux_2_12_i686.manylinux_2_28_i686.whl", hash = "sha256:052ee82cac5206a02df77119c325339acbc09f5ce66967f66a2e12a0f3211cad"}, + {file = "lupa-2.6-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:96594eca3c87dd07938009e95e591e43d554c1dbd0385be03c100367141db5a8"}, + {file = "lupa-2.6-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e8faddd9d198688c8884091173a088a8e920ecc96cda2ffed576a23574c4b3f6"}, + {file = "lupa-2.6-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:daebb3a6b58095c917e76ba727ab37b27477fb926957c825205fbda431552134"}, + {file = "lupa-2.6-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:f3154e68972befe0f81564e37d8142b5d5d79931a18309226a04ec92487d4ea3"}, + {file = "lupa-2.6-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:e4dadf77b9fedc0bfa53417cc28dc2278a26d4cbd95c29f8927ad4d8fe0a7ef9"}, + {file = "lupa-2.6-cp314-cp314-win32.whl", hash = "sha256:cb34169c6fa3bab3e8ac58ca21b8a7102f6a94b6a5d08d3636312f3f02fafd8f"}, + {file = "lupa-2.6-cp314-cp314-win_amd64.whl", hash = "sha256:b74f944fe46c421e25d0f8692aef1e842192f6f7f68034201382ac440ef9ea67"}, + {file = "lupa-2.6-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:0e21b716408a21ab65723f8841cf7f2f37a844b7a965eeabb785e27fca4099cf"}, + {file = "lupa-2.6-cp314-cp314t-macosx_11_0_universal2.whl", hash = "sha256:589db872a141bfff828340079bbdf3e9a31f2689f4ca0d88f97d9e8c2eae6142"}, + {file = "lupa-2.6-cp314-cp314t-macosx_11_0_x86_64.whl", hash = "sha256:cd852a91a4a9d4dcbb9a58100f820a75a425703ec3e3f049055f60b8533b7953"}, + {file = "lupa-2.6-cp314-cp314t-manylinux2010_i686.manylinux_2_12_i686.manylinux_2_28_i686.whl", hash = "sha256:0334753be028358922415ca97a64a3048e4ed155413fc4eaf87dd0a7e2752983"}, + {file = "lupa-2.6-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:661d895cd38c87658a34780fac54a690ec036ead743e41b74c3fb81a9e65a6aa"}, + {file = "lupa-2.6-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6aa58454ccc13878cc177c62529a2056be734da16369e451987ff92784994ca7"}, + {file = "lupa-2.6-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:1425017264e470c98022bba8cff5bd46d054a827f5df6b80274f9cc71dafd24f"}, + {file = "lupa-2.6-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:224af0532d216e3105f0a127410f12320f7c5f1aa0300bdf9646b8d9afb0048c"}, + {file = "lupa-2.6-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:9abb98d5a8fd27c8285302e82199f0e56e463066f88f619d6594a450bf269d80"}, + {file = "lupa-2.6-cp314-cp314t-win32.whl", hash = "sha256:1849efeba7a8f6fb8aa2c13790bee988fd242ae404bd459509640eeea3d1e291"}, + {file = "lupa-2.6-cp314-cp314t-win_amd64.whl", hash = "sha256:fc1498d1a4fc028bc521c26d0fad4ca00ed63b952e32fb95949bda76a04bad52"}, + {file = "lupa-2.6-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:9591700991e333b70dd92b48f152eb4731b8b24af671a9f6f721b74d68ed4499"}, + {file = "lupa-2.6-cp38-cp38-macosx_11_0_x86_64.whl", hash = "sha256:ef8dfa7fe08bc3f4591411b8945bbeb15af8512c3e7ad5e9b1e3a9036cdbbce7"}, + {file = "lupa-2.6-cp38-cp38-manylinux2010_i686.manylinux_2_12_i686.manylinux_2_28_i686.whl", hash = "sha256:728c466e91174dad238f8a9c1cbdb8e69ffe559df85f87ee76edac3395300949"}, + {file = "lupa-2.6-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c781170bc7134704ae317a66204d30688b41d3e471e17e659987ea4947e11f20"}, + {file = "lupa-2.6-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:241f4ddab33b9a686fc76667241bebc39a06b74ec40d79ec222f5add9000fe57"}, + {file = "lupa-2.6-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:c17f6b6193ced33cc7ca0b2b08b319a1b3501b014a3a3f9999c01cafc04c40f5"}, + {file = "lupa-2.6-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:fa6c1379e83d4104065c151736250a09f3a99e368423c7a20f9c59b15945e9fc"}, + {file = "lupa-2.6-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:aef1a8bc10c50695e1a33a07dbef803b93eb97fc150fdb19858d704a603a67dd"}, + {file = "lupa-2.6-cp38-cp38-win32.whl", hash = "sha256:10c191bc1d5565e4360d884bea58320975ddb33270cdf9a9f55d1a1efe79aa03"}, + {file = "lupa-2.6-cp38-cp38-win_amd64.whl", hash = "sha256:05681f8ffb41f0c7fbb9ca859cc3a7e4006e9c6350d25358b535c5295c6a9928"}, + {file = "lupa-2.6-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8897dc6c3249786b2cdf2f83324febb436193d4581b6a71dea49f77bf8b19bb0"}, + {file = "lupa-2.6-cp39-cp39-macosx_11_0_universal2.whl", hash = "sha256:4446396ca3830be0c106c70db4b4f622c37b2d447874c07952cafb9c57949a4a"}, + {file = "lupa-2.6-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:5826e687c89995a6eaafeae242071ba16448eec1a9ee8e17ed48551b5d1e21c2"}, + {file = "lupa-2.6-cp39-cp39-manylinux2010_i686.manylinux_2_12_i686.manylinux_2_28_i686.whl", hash = "sha256:5871935cb36d1d22f9c04ac0db75c06751bd95edcfa0d9309f732de908e297a9"}, + {file = "lupa-2.6-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:43eb6e43ea8512d0d65b995d36dd9d77aa02598035e25b84c23a1b58700c9fb2"}, + {file = "lupa-2.6-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:559714053018d9885cc8c36a33c5b7eb9aad30fb6357719cac3ce4dc6b39157e"}, + {file = "lupa-2.6-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:57ac88a00ce59bd9d4ddcd4fca8e02564765725f5068786b011c9d1be3de20c5"}, + {file = "lupa-2.6-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:b683fbd867c2e54c44a686361b75eee7e7a790da55afdbe89f1f23b106de0274"}, + {file = "lupa-2.6-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:d2f656903a2ed2e074bf2b7d300968028dfa327a45b055be8e3b51ef0b82f9bf"}, + {file = "lupa-2.6-cp39-cp39-win32.whl", hash = "sha256:bf28f68ae231b72008523ab5ac23835ba0f76e0e99ec38b59766080a84eb596a"}, + {file = "lupa-2.6-cp39-cp39-win_amd64.whl", hash = "sha256:b4b2e9b3795a9897cf6cfcc58d08210fdc0d13ab47c9a0e13858c68932d8353c"}, + {file = "lupa-2.6.tar.gz", hash = "sha256:9a770a6e89576be3447668d7ced312cd6fd41d3c13c2462c9dc2c2ab570e45d9"}, +] + [[package]] name = "lxml" version = "4.9.4" @@ -7507,7 +7634,7 @@ version = "24.2" description = "Core utilities for Python packages" optional = false python-versions = ">=3.8" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759"}, {file = "packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f"}, @@ -8162,14 +8289,14 @@ kaleido = ["kaleido (>=1.0.0)"] name = "pluggy" version = "1.6.0" description = "plugin and hook calling mechanisms for python" -optional = true +optional = false python-versions = ">=3.9" -groups = ["main"] -markers = "extra == \"deepeval\" or extra == \"dev\" or extra == \"dlt\" or extra == \"docling\"" +groups = ["main", "dev"] files = [ {file = "pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746"}, {file = "pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3"}, ] +markers = {main = "extra == \"deepeval\" or extra == \"dev\" or extra == \"dlt\" or extra == \"docling\""} [package.extras] dev = ["pre-commit", "tox"] @@ -8529,6 +8656,7 @@ files = [ {file = "psycopg2-2.9.10-cp311-cp311-win_amd64.whl", hash = "sha256:0435034157049f6846e95103bd8f5a668788dd913a7c30162ca9503fdf542cb4"}, {file = "psycopg2-2.9.10-cp312-cp312-win32.whl", hash = "sha256:65a63d7ab0e067e2cdb3cf266de39663203d38d6a8ed97f5ca0cb315c73fe067"}, {file = "psycopg2-2.9.10-cp312-cp312-win_amd64.whl", hash = "sha256:4a579d6243da40a7b3182e0430493dbd55950c493d8c68f4eec0b302f6bbf20e"}, + {file = "psycopg2-2.9.10-cp313-cp313-win_amd64.whl", hash = "sha256:91fd603a2155da8d0cfcdbf8ab24a2d54bca72795b90d2a3ed2b6da8d979dee2"}, {file = "psycopg2-2.9.10-cp39-cp39-win32.whl", hash = "sha256:9d5b3b94b79a844a986d029eee38998232451119ad653aea42bb9220a8c5066b"}, {file = "psycopg2-2.9.10-cp39-cp39-win_amd64.whl", hash = "sha256:88138c8dedcbfa96408023ea2b0c369eda40fe5d75002c0964c78f46f11fa442"}, {file = "psycopg2-2.9.10.tar.gz", hash = "sha256:12ec0b40b0273f95296233e8750441339298e6a572f7039da5b260e3c8b60e11"}, @@ -8590,6 +8718,7 @@ files = [ {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:bb89f0a835bcfc1d42ccd5f41f04870c1b936d8507c6df12b7737febc40f0909"}, {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:f0c2d907a1e102526dd2986df638343388b94c33860ff3bbe1384130828714b1"}, {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f8157bed2f51db683f31306aa497311b560f2265998122abe1dce6428bd86567"}, + {file = "psycopg2_binary-2.9.10-cp313-cp313-win_amd64.whl", hash = "sha256:27422aa5f11fbcd9b18da48373eb67081243662f9b46e6fd07c3eb46e4535142"}, {file = "psycopg2_binary-2.9.10-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:eb09aa7f9cecb45027683bb55aebaaf45a0df8bf6de68801a6afdc7947bb09d4"}, {file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b73d6d7f0ccdad7bc43e6d34273f70d587ef62f824d7261c4ae9b8b1b6af90e8"}, {file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ce5ab4bf46a211a8e924d307c1b1fcda82368586a19d0a24f8ae166f5c784864"}, @@ -9569,14 +9698,14 @@ files = [ name = "pytest" version = "7.4.4" description = "pytest: simple powerful testing with Python" -optional = true +optional = false python-versions = ">=3.7" -groups = ["main"] -markers = "extra == \"deepeval\" or extra == \"dev\"" +groups = ["main", "dev"] files = [ {file = "pytest-7.4.4-py3-none-any.whl", hash = "sha256:b090cdf5ed60bf4c45261be03239c2c1c22df034fbffe691abe93cd80cea01d8"}, {file = "pytest-7.4.4.tar.gz", hash = "sha256:2cf0005922c6ace4a3e2ec8b4080eb0d9753fdc93107415332f50ce9e7994280"}, ] +markers = {main = "extra == \"deepeval\" or extra == \"dev\""} [package.dependencies] colorama = {version = "*", markers = "sys_platform == \"win32\""} @@ -9663,6 +9792,21 @@ files = [ packaging = ">=17.1" pytest = ">=6.2" +[[package]] +name = "pytest-timeout" +version = "2.4.0" +description = "pytest plugin to abort hanging tests" +optional = false +python-versions = ">=3.7" +groups = ["dev"] +files = [ + {file = "pytest_timeout-2.4.0-py3-none-any.whl", hash = "sha256:c42667e5cdadb151aeb5b26d114aff6bdf5a907f176a007a30b940d3d865b5c2"}, + {file = "pytest_timeout-2.4.0.tar.gz", hash = "sha256:7e68e90b01f9eff71332b25001f85c75495fc4e3a836701876183c4bcfd0540a"}, +] + +[package.dependencies] +pytest = ">=7.0.0" + [[package]] name = "pytest-xdist" version = "3.8.0" @@ -10245,10 +10389,9 @@ orjson = ["orjson (>=3.9.14,<4)"] name = "redis" version = "5.3.1" description = "Python client for Redis database and key-value store" -optional = true +optional = false python-versions = ">=3.8" groups = ["main"] -markers = "extra == \"redis\"" files = [ {file = "redis-5.3.1-py3-none-any.whl", hash = "sha256:dc1909bd24669cc31b5f67a039700b16ec30571096c5f1f0d9d2324bff31af97"}, {file = "redis-5.3.1.tar.gz", hash = "sha256:ca49577a531ea64039b5a36db3d6cd1a0c7a60c34124d46924a45b956e8cf14c"}, @@ -11478,6 +11621,18 @@ files = [ {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"}, ] +[[package]] +name = "sortedcontainers" +version = "2.4.0" +description = "Sorted Containers -- Sorted List, Sorted Dict, Sorted Set" +optional = false +python-versions = "*" +groups = ["main"] +files = [ + {file = "sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0"}, + {file = "sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88"}, +] + [[package]] name = "soupsieve" version = "2.8" @@ -11501,9 +11656,7 @@ groups = ["main"] files = [ {file = "SQLAlchemy-2.0.43-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:21ba7a08a4253c5825d1db389d4299f64a100ef9800e4624c8bf70d8f136e6ed"}, {file = "SQLAlchemy-2.0.43-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:11b9503fa6f8721bef9b8567730f664c5a5153d25e247aadc69247c4bc605227"}, - {file = "SQLAlchemy-2.0.43-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:07097c0a1886c150ef2adba2ff7437e84d40c0f7dcb44a2c2b9c905ccfc6361c"}, {file = "SQLAlchemy-2.0.43-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:cdeff998cb294896a34e5b2f00e383e7c5c4ef3b4bfa375d9104723f15186443"}, - {file = "SQLAlchemy-2.0.43-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:bcf0724a62a5670e5718957e05c56ec2d6850267ea859f8ad2481838f889b42c"}, {file = "SQLAlchemy-2.0.43-cp37-cp37m-win32.whl", hash = "sha256:c697575d0e2b0a5f0433f679bda22f63873821d991e95a90e9e52aae517b2e32"}, {file = "SQLAlchemy-2.0.43-cp37-cp37m-win_amd64.whl", hash = "sha256:d34c0f6dbefd2e816e8f341d0df7d4763d382e3f452423e752ffd1e213da2512"}, {file = "sqlalchemy-2.0.43-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:70322986c0c699dca241418fcf18e637a4369e0ec50540a2b907b184c8bca069"}, @@ -11538,20 +11691,12 @@ files = [ {file = "sqlalchemy-2.0.43-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:9df7126fd9db49e3a5a3999442cc67e9ee8971f3cb9644250107d7296cb2a164"}, {file = "sqlalchemy-2.0.43-cp313-cp313-win32.whl", hash = "sha256:7f1ac7828857fcedb0361b48b9ac4821469f7694089d15550bbcf9ab22564a1d"}, {file = "sqlalchemy-2.0.43-cp313-cp313-win_amd64.whl", hash = "sha256:971ba928fcde01869361f504fcff3b7143b47d30de188b11c6357c0505824197"}, - {file = "sqlalchemy-2.0.43-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:4e6aeb2e0932f32950cf56a8b4813cb15ff792fc0c9b3752eaf067cfe298496a"}, - {file = "sqlalchemy-2.0.43-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:61f964a05356f4bca4112e6334ed7c208174511bd56e6b8fc86dad4d024d4185"}, {file = "sqlalchemy-2.0.43-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:46293c39252f93ea0910aababa8752ad628bcce3a10d3f260648dd472256983f"}, - {file = "sqlalchemy-2.0.43-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:136063a68644eca9339d02e6693932116f6a8591ac013b0014479a1de664e40a"}, {file = "sqlalchemy-2.0.43-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:6e2bf13d9256398d037fef09fd8bf9b0bf77876e22647d10761d35593b9ac547"}, - {file = "sqlalchemy-2.0.43-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:44337823462291f17f994d64282a71c51d738fc9ef561bf265f1d0fd9116a782"}, {file = "sqlalchemy-2.0.43-cp38-cp38-win32.whl", hash = "sha256:13194276e69bb2af56198fef7909d48fd34820de01d9c92711a5fa45497cc7ed"}, {file = "sqlalchemy-2.0.43-cp38-cp38-win_amd64.whl", hash = "sha256:334f41fa28de9f9be4b78445e68530da3c5fa054c907176460c81494f4ae1f5e"}, - {file = "sqlalchemy-2.0.43-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ceb5c832cc30663aeaf5e39657712f4c4241ad1f638d487ef7216258f6d41fe7"}, - {file = "sqlalchemy-2.0.43-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:11f43c39b4b2ec755573952bbcc58d976779d482f6f832d7f33a8d869ae891bf"}, {file = "sqlalchemy-2.0.43-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:413391b2239db55be14fa4223034d7e13325a1812c8396ecd4f2c08696d5ccad"}, - {file = "sqlalchemy-2.0.43-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c379e37b08c6c527181a397212346be39319fb64323741d23e46abd97a400d34"}, {file = "sqlalchemy-2.0.43-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:03d73ab2a37d9e40dec4984d1813d7878e01dbdc742448d44a7341b7a9f408c7"}, - {file = "sqlalchemy-2.0.43-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:8cee08f15d9e238ede42e9bbc1d6e7158d0ca4f176e4eab21f88ac819ae3bd7b"}, {file = "sqlalchemy-2.0.43-cp39-cp39-win32.whl", hash = "sha256:b3edaec7e8b6dc5cd94523c6df4f294014df67097c8217a89929c99975811414"}, {file = "sqlalchemy-2.0.43-cp39-cp39-win_amd64.whl", hash = "sha256:227119ce0a89e762ecd882dc661e0aa677a690c914e358f0dd8932a2e8b2765b"}, {file = "sqlalchemy-2.0.43-py3-none-any.whl", hash = "sha256:1681c21dd2ccee222c2fe0bef671d1aef7c504087c9c4e800371cfcc8ac966fc"}, @@ -11920,7 +12065,7 @@ version = "2.2.1" description = "A lil' TOML parser" optional = false python-versions = ">=3.8" -groups = ["main"] +groups = ["main", "dev"] markers = "python_version == \"3.10\"" files = [ {file = "tomli-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:678e4fa69e4575eb77d103de3df8a895e1591b48e740211bd1067378c69e8249"}, @@ -12392,11 +12537,12 @@ version = "4.15.0" description = "Backported and Experimental Type Hints for Python 3.9+" optional = false python-versions = ">=3.9" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548"}, {file = "typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466"}, ] +markers = {dev = "python_version == \"3.10\""} [[package]] name = "typing-inspect" @@ -13527,4 +13673,4 @@ scraping = ["APScheduler", "beautifulsoup4", "lxml", "playwright", "protego", "t [metadata] lock-version = "2.1" python-versions = ">=3.10,<3.14" -content-hash = "9490de8c950400c004a87333eda35311109bc1708a98e053bc2f66d883f4f702" +content-hash = "b6ede4c196d086f7159f84142c16d16fcc19bc73fcb9ab274a3b6351e6fcbb7e" diff --git a/pyproject.toml b/pyproject.toml index 8af35113c..13266f83e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,6 +57,8 @@ dependencies = [ "websockets>=15.0.1,<16.0.0", "mistralai>=1.9.10", "tenacity>=9.0.0", + "fakeredis[lua]>=2.32.0", + "diskcache>=5.6.3", ] [project.optional-dependencies] @@ -198,3 +200,8 @@ exclude = [ [tool.ruff.lint] ignore = ["F401"] + +[dependency-groups] +dev = [ + "pytest-timeout>=2.4.0", +] diff --git a/uv.lock b/uv.lock index e2fc1df83..8c35a3366 100644 --- a/uv.lock +++ b/uv.lock @@ -936,6 +936,8 @@ dependencies = [ { name = "aiohttp" }, { name = "aiosqlite" }, { name = "alembic" }, + { name = "diskcache" }, + { name = "fakeredis", extra = ["lua"] }, { name = "fastapi" }, { name = "fastapi-users", extra = ["sqlalchemy"] }, { name = "fastembed" }, @@ -1097,6 +1099,11 @@ scraping = [ { name = "tavily-python" }, ] +[package.dev-dependencies] +dev = [ + { name = "pytest-timeout" }, +] + [package.metadata] requires-dist = [ { name = "aiofiles", specifier = ">=23.2.1,<24.0.0" }, @@ -1114,8 +1121,10 @@ requires-dist = [ { name = "debugpy", marker = "extra == 'debug'", specifier = ">=1.8.9,<2.0.0" }, { name = "deepeval", marker = "extra == 'deepeval'", specifier = ">=3.0.1,<4" }, { name = "deptry", marker = "extra == 'dev'", specifier = ">=0.20.0,<0.21" }, + { name = "diskcache", specifier = ">=5.6.3" }, { name = "dlt", extras = ["sqlalchemy"], marker = "extra == 'dlt'", specifier = ">=1.9.0,<2" }, { name = "docling", marker = "extra == 'docling'", specifier = ">=2.54" }, + { name = "fakeredis", extras = ["lua"], specifier = ">=2.32.0" }, { name = "fastapi", specifier = ">=0.116.2,<1.0.0" }, { name = "fastapi-users", extras = ["sqlalchemy"], specifier = ">=14.0.1,<15.0.0" }, { name = "fastembed", specifier = "<=0.6.0" }, @@ -1203,6 +1212,9 @@ requires-dist = [ ] provides-extras = ["api", "distributed", "scraping", "neo4j", "neptune", "postgres", "postgres-binary", "notebook", "langchain", "llama-index", "huggingface", "ollama", "mistral", "anthropic", "deepeval", "posthog", "groq", "chromadb", "docs", "codegraph", "evals", "graphiti", "aws", "dlt", "baml", "dev", "debug", "redis", "monitoring", "docling"] +[package.metadata.requires-dev] +dev = [{ name = "pytest-timeout", specifier = ">=2.4.0" }] + [[package]] name = "colorama" version = "0.4.6" @@ -2047,6 +2059,25 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a3/46/8f4097b55e43af39e8e71e1f7aec59ff7398bca54d975c30889bc844719d/faker-37.11.0-py3-none-any.whl", hash = "sha256:1508d2da94dfd1e0087b36f386126d84f8583b3de19ac18e392a2831a6676c57", size = 1975525, upload-time = "2025-10-07T14:48:58.29Z" }, ] +[[package]] +name = "fakeredis" +version = "2.32.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "redis" }, + { name = "sortedcontainers" }, + { name = "typing-extensions", marker = "python_full_version < '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/e1/2e/94ca3f2ff35f086d7d3eeb924054e328b2ac851f0a20302d942c8d29726c/fakeredis-2.32.0.tar.gz", hash = "sha256:63d745b40eb6c8be4899cf2a53187c097ccca3afbca04fdbc5edc8b936cd1d59", size = 171097, upload-time = "2025-10-07T10:46:58.876Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0e/1b/84ab7fd197eba5243b6625c78fbcffaa4cf6ac7dda42f95d22165f52187e/fakeredis-2.32.0-py3-none-any.whl", hash = "sha256:c9da8228de84060cfdb72c3cf4555c18c59ba7a5ae4d273f75e4822d6f01ecf8", size = 118422, upload-time = "2025-10-07T10:46:57.643Z" }, +] + +[package.optional-dependencies] +lua = [ + { name = "lupa" }, +] + [[package]] name = "fastapi" version = "0.119.0" @@ -3880,6 +3911,58 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/0c/29/0348de65b8cc732daa3e33e67806420b2ae89bdce2b04af740289c5c6c8c/loguru-0.7.3-py3-none-any.whl", hash = "sha256:31a33c10c8e1e10422bfd431aeb5d351c7cf7fa671e3c4df004162264b28220c", size = 61595, upload-time = "2024-12-06T11:20:54.538Z" }, ] +[[package]] +name = "lupa" +version = "2.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b8/1c/191c3e6ec6502e3dbe25a53e27f69a5daeac3e56de1f73c0138224171ead/lupa-2.6.tar.gz", hash = "sha256:9a770a6e89576be3447668d7ced312cd6fd41d3c13c2462c9dc2c2ab570e45d9", size = 7240282, upload-time = "2025-10-24T07:20:29.738Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a1/15/713cab5d0dfa4858f83b99b3e0329072df33dc14fc3ebbaa017e0f9755c4/lupa-2.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6b3dabda836317e63c5ad052826e156610f356a04b3003dfa0dbe66b5d54d671", size = 954828, upload-time = "2025-10-24T07:17:15.726Z" }, + { url = "https://files.pythonhosted.org/packages/2e/71/704740cbc6e587dd6cc8dabf2f04820ac6a671784e57cc3c29db795476db/lupa-2.6-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:8726d1c123bbe9fbb974ce29825e94121824e66003038ff4532c14cc2ed0c51c", size = 1919259, upload-time = "2025-10-24T07:17:18.586Z" }, + { url = "https://files.pythonhosted.org/packages/eb/18/f248341c423c5d48837e35584c6c3eb4acab7e722b6057d7b3e28e42dae8/lupa-2.6-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:f4e159e7d814171199b246f9235ca8961f6461ea8c1165ab428afa13c9289a94", size = 984998, upload-time = "2025-10-24T07:17:20.428Z" }, + { url = "https://files.pythonhosted.org/packages/44/1e/8a4bd471e018aad76bcb9455d298c2c96d82eced20f2ae8fcec8cd800948/lupa-2.6-cp310-cp310-manylinux2010_i686.manylinux_2_12_i686.manylinux_2_28_i686.whl", hash = "sha256:202160e80dbfddfb79316692a563d843b767e0f6787bbd1c455f9d54052efa6c", size = 1174871, upload-time = "2025-10-24T07:17:22.755Z" }, + { url = "https://files.pythonhosted.org/packages/2a/5c/3a3f23fd6a91b0986eea1ceaf82ad3f9b958fe3515a9981fb9c4eb046c8b/lupa-2.6-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5deede7c5b36ab64f869dae4831720428b67955b0bb186c8349cf6ea121c852b", size = 1057471, upload-time = "2025-10-24T07:17:24.908Z" }, + { url = "https://files.pythonhosted.org/packages/45/ac/01be1fed778fb0c8f46ee8cbe344e4d782f6806fac12717f08af87aa4355/lupa-2.6-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:86f04901f920bbf7c0cac56807dc9597e42347123e6f1f3ca920f15f54188ce5", size = 2100592, upload-time = "2025-10-24T07:17:27.089Z" }, + { url = "https://files.pythonhosted.org/packages/3f/6c/1a05bb873e30830f8574e10cd0b4cdbc72e9dbad2a09e25810b5e3b1f75d/lupa-2.6-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:6deef8f851d6afb965c84849aa5b8c38856942df54597a811ce0369ced678610", size = 1081396, upload-time = "2025-10-24T07:17:29.064Z" }, + { url = "https://files.pythonhosted.org/packages/a2/c2/a19dd80d6dc98b39bbf8135b8198e38aa7ca3360b720eac68d1d7e9286b5/lupa-2.6-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:21f2b5549681c2a13b1170a26159d30875d367d28f0247b81ca347222c755038", size = 1192007, upload-time = "2025-10-24T07:17:31.362Z" }, + { url = "https://files.pythonhosted.org/packages/4f/43/e1b297225c827f55752e46fdbfb021c8982081b0f24490e42776ea69ae3b/lupa-2.6-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:66eea57630eab5e6f49fdc5d7811c0a2a41f2011be4ea56a087ea76112011eb7", size = 2196661, upload-time = "2025-10-24T07:17:33.484Z" }, + { url = "https://files.pythonhosted.org/packages/2e/8f/2272d429a7fa9dc8dbd6e9c5c9073a03af6007eb22a4c78829fec6a34b80/lupa-2.6-cp310-cp310-win32.whl", hash = "sha256:60a403de8cab262a4fe813085dd77010effa6e2eb1886db2181df803140533b1", size = 1412738, upload-time = "2025-10-24T07:17:35.11Z" }, + { url = "https://files.pythonhosted.org/packages/35/2a/1708911271dd49ad87b4b373b5a4b0e0a0516d3d2af7b76355946c7ee171/lupa-2.6-cp310-cp310-win_amd64.whl", hash = "sha256:e4656a39d93dfa947cf3db56dc16c7916cb0cc8024acd3a952071263f675df64", size = 1656898, upload-time = "2025-10-24T07:17:36.949Z" }, + { url = "https://files.pythonhosted.org/packages/ca/29/1f66907c1ebf1881735afa695e646762c674f00738ebf66d795d59fc0665/lupa-2.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6d988c0f9331b9f2a5a55186701a25444ab10a1432a1021ee58011499ecbbdd5", size = 962875, upload-time = "2025-10-24T07:17:39.107Z" }, + { url = "https://files.pythonhosted.org/packages/e6/67/4a748604be360eb9c1c215f6a0da921cd1a2b44b2c5951aae6fb83019d3a/lupa-2.6-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:ebe1bbf48259382c72a6fe363dea61a0fd6fe19eab95e2ae881e20f3654587bf", size = 1935390, upload-time = "2025-10-24T07:17:41.427Z" }, + { url = "https://files.pythonhosted.org/packages/ac/0c/8ef9ee933a350428b7bdb8335a37ef170ab0bb008bbf9ca8f4f4310116b6/lupa-2.6-cp311-cp311-macosx_11_0_x86_64.whl", hash = "sha256:a8fcee258487cf77cdd41560046843bb38c2e18989cd19671dd1e2596f798306", size = 992193, upload-time = "2025-10-24T07:17:43.231Z" }, + { url = "https://files.pythonhosted.org/packages/65/46/e6c7facebdb438db8a65ed247e56908818389c1a5abbf6a36aab14f1057d/lupa-2.6-cp311-cp311-manylinux2010_i686.manylinux_2_12_i686.manylinux_2_28_i686.whl", hash = "sha256:561a8e3be800827884e767a694727ed8482d066e0d6edfcbf423b05e63b05535", size = 1165844, upload-time = "2025-10-24T07:17:45.437Z" }, + { url = "https://files.pythonhosted.org/packages/1c/26/9f1154c6c95f175ccbf96aa96c8f569c87f64f463b32473e839137601a8b/lupa-2.6-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:af880a62d47991cae78b8e9905c008cbfdc4a3a9723a66310c2634fc7644578c", size = 1048069, upload-time = "2025-10-24T07:17:47.181Z" }, + { url = "https://files.pythonhosted.org/packages/68/67/2cc52ab73d6af81612b2ea24c870d3fa398443af8e2875e5befe142398b1/lupa-2.6-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:80b22923aa4023c86c0097b235615f89d469a0c4eee0489699c494d3367c4c85", size = 2079079, upload-time = "2025-10-24T07:17:49.755Z" }, + { url = "https://files.pythonhosted.org/packages/2e/dc/f843f09bbf325f6e5ee61730cf6c3409fc78c010d968c7c78acba3019ca7/lupa-2.6-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:153d2cc6b643f7efb9cfc0c6bb55ec784d5bac1a3660cfc5b958a7b8f38f4a75", size = 1071428, upload-time = "2025-10-24T07:17:51.991Z" }, + { url = "https://files.pythonhosted.org/packages/2e/60/37533a8d85bf004697449acb97ecdacea851acad28f2ad3803662487dd2a/lupa-2.6-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:3fa8777e16f3ded50b72967dc17e23f5a08e4f1e2c9456aff2ebdb57f5b2869f", size = 1181756, upload-time = "2025-10-24T07:17:53.752Z" }, + { url = "https://files.pythonhosted.org/packages/e4/f2/cf29b20dbb4927b6a3d27c339ac5d73e74306ecc28c8e2c900b2794142ba/lupa-2.6-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:8dbdcbe818c02a2f56f5ab5ce2de374dab03e84b25266cfbaef237829bc09b3f", size = 2175687, upload-time = "2025-10-24T07:17:56.228Z" }, + { url = "https://files.pythonhosted.org/packages/94/7c/050e02f80c7131b63db1474bff511e63c545b5a8636a24cbef3fc4da20b6/lupa-2.6-cp311-cp311-win32.whl", hash = "sha256:defaf188fde8f7a1e5ce3a5e6d945e533b8b8d547c11e43b96c9b7fe527f56dc", size = 1412592, upload-time = "2025-10-24T07:17:59.062Z" }, + { url = "https://files.pythonhosted.org/packages/6f/9a/6f2af98aa5d771cea661f66c8eb8f53772ec1ab1dfbce24126cfcd189436/lupa-2.6-cp311-cp311-win_amd64.whl", hash = "sha256:9505ae600b5c14f3e17e70f87f88d333717f60411faca1ddc6f3e61dce85fa9e", size = 1669194, upload-time = "2025-10-24T07:18:01.647Z" }, + { url = "https://files.pythonhosted.org/packages/94/86/ce243390535c39d53ea17ccf0240815e6e457e413e40428a658ea4ee4b8d/lupa-2.6-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:47ce718817ef1cc0c40d87c3d5ae56a800d61af00fbc0fad1ca9be12df2f3b56", size = 951707, upload-time = "2025-10-24T07:18:03.884Z" }, + { url = "https://files.pythonhosted.org/packages/86/85/cedea5e6cbeb54396fdcc55f6b741696f3f036d23cfaf986d50d680446da/lupa-2.6-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:7aba985b15b101495aa4b07112cdc08baa0c545390d560ad5cfde2e9e34f4d58", size = 1916703, upload-time = "2025-10-24T07:18:05.6Z" }, + { url = "https://files.pythonhosted.org/packages/24/be/3d6b5f9a8588c01a4d88129284c726017b2089f3a3fd3ba8bd977292fea0/lupa-2.6-cp312-cp312-macosx_11_0_x86_64.whl", hash = "sha256:b766f62f95b2739f2248977d29b0722e589dcf4f0ccfa827ccbd29f0148bd2e5", size = 985152, upload-time = "2025-10-24T07:18:08.561Z" }, + { url = "https://files.pythonhosted.org/packages/eb/23/9f9a05beee5d5dce9deca4cb07c91c40a90541fc0a8e09db4ee670da550f/lupa-2.6-cp312-cp312-manylinux2010_i686.manylinux_2_12_i686.manylinux_2_28_i686.whl", hash = "sha256:00a934c23331f94cb51760097ebfab14b005d55a6b30a2b480e3c53dd2fa290d", size = 1159599, upload-time = "2025-10-24T07:18:10.346Z" }, + { url = "https://files.pythonhosted.org/packages/40/4e/e7c0583083db9d7f1fd023800a9767d8e4391e8330d56c2373d890ac971b/lupa-2.6-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:21de9f38bd475303e34a042b7081aabdf50bd9bafd36ce4faea2f90fd9f15c31", size = 1038686, upload-time = "2025-10-24T07:18:12.112Z" }, + { url = "https://files.pythonhosted.org/packages/1c/9f/5a4f7d959d4feba5e203ff0c31889e74d1ca3153122be4a46dca7d92bf7c/lupa-2.6-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:cf3bda96d3fc41237e964a69c23647d50d4e28421111360274d4799832c560e9", size = 2071956, upload-time = "2025-10-24T07:18:14.572Z" }, + { url = "https://files.pythonhosted.org/packages/92/34/2f4f13ca65d01169b1720176aedc4af17bc19ee834598c7292db232cb6dc/lupa-2.6-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:5a76ead245da54801a81053794aa3975f213221f6542d14ec4b859ee2e7e0323", size = 1057199, upload-time = "2025-10-24T07:18:16.379Z" }, + { url = "https://files.pythonhosted.org/packages/35/2a/5f7d2eebec6993b0dcd428e0184ad71afb06a45ba13e717f6501bfed1da3/lupa-2.6-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:8dd0861741caa20886ddbda0a121d8e52fb9b5bb153d82fa9bba796962bf30e8", size = 1173693, upload-time = "2025-10-24T07:18:18.153Z" }, + { url = "https://files.pythonhosted.org/packages/e4/29/089b4d2f8e34417349af3904bb40bec40b65c8731f45e3fd8d497ca573e5/lupa-2.6-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:239e63948b0b23023f81d9a19a395e768ed3da6a299f84e7963b8f813f6e3f9c", size = 2164394, upload-time = "2025-10-24T07:18:20.403Z" }, + { url = "https://files.pythonhosted.org/packages/f3/1b/79c17b23c921f81468a111cad843b076a17ef4b684c4a8dff32a7969c3f0/lupa-2.6-cp312-cp312-win32.whl", hash = "sha256:325894e1099499e7a6f9c351147661a2011887603c71086d36fe0f964d52d1ce", size = 1420647, upload-time = "2025-10-24T07:18:23.368Z" }, + { url = "https://files.pythonhosted.org/packages/b8/15/5121e68aad3584e26e1425a5c9a79cd898f8a152292059e128c206ee817c/lupa-2.6-cp312-cp312-win_amd64.whl", hash = "sha256:c735a1ce8ee60edb0fe71d665f1e6b7c55c6021f1d340eb8c865952c602cd36f", size = 1688529, upload-time = "2025-10-24T07:18:25.523Z" }, + { url = "https://files.pythonhosted.org/packages/28/1d/21176b682ca5469001199d8b95fa1737e29957a3d185186e7a8b55345f2e/lupa-2.6-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:663a6e58a0f60e7d212017d6678639ac8df0119bc13c2145029dcba084391310", size = 947232, upload-time = "2025-10-24T07:18:27.878Z" }, + { url = "https://files.pythonhosted.org/packages/ce/4c/d327befb684660ca13cf79cd1f1d604331808f9f1b6fb6bf57832f8edf80/lupa-2.6-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:d1f5afda5c20b1f3217a80e9bc1b77037f8a6eb11612fd3ada19065303c8f380", size = 1908625, upload-time = "2025-10-24T07:18:29.944Z" }, + { url = "https://files.pythonhosted.org/packages/66/8e/ad22b0a19454dfd08662237a84c792d6d420d36b061f239e084f29d1a4f3/lupa-2.6-cp313-cp313-macosx_11_0_x86_64.whl", hash = "sha256:26f2b3c085fe76e9119e48c1013c1cccdc1f51585d456858290475aa38e7089e", size = 981057, upload-time = "2025-10-24T07:18:31.553Z" }, + { url = "https://files.pythonhosted.org/packages/5c/48/74859073ab276bd0566c719f9ca0108b0cfc1956ca0d68678d117d47d155/lupa-2.6-cp313-cp313-manylinux2010_i686.manylinux_2_12_i686.manylinux_2_28_i686.whl", hash = "sha256:60d2f902c7b96fb8ab98493dcff315e7bb4d0b44dc9dd76eb37de575025d5685", size = 1156227, upload-time = "2025-10-24T07:18:33.981Z" }, + { url = "https://files.pythonhosted.org/packages/09/6c/0e9ded061916877253c2266074060eb71ed99fb21d73c8c114a76725bce2/lupa-2.6-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a02d25dee3a3250967c36590128d9220ae02f2eda166a24279da0b481519cbff", size = 1035752, upload-time = "2025-10-24T07:18:36.32Z" }, + { url = "https://files.pythonhosted.org/packages/dd/ef/f8c32e454ef9f3fe909f6c7d57a39f950996c37a3deb7b391fec7903dab7/lupa-2.6-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6eae1ee16b886b8914ff292dbefbf2f48abfbdee94b33a88d1d5475e02423203", size = 2069009, upload-time = "2025-10-24T07:18:38.072Z" }, + { url = "https://files.pythonhosted.org/packages/53/dc/15b80c226a5225815a890ee1c11f07968e0aba7a852df41e8ae6fe285063/lupa-2.6-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:b0edd5073a4ee74ab36f74fe61450148e6044f3952b8d21248581f3c5d1a58be", size = 1056301, upload-time = "2025-10-24T07:18:40.165Z" }, + { url = "https://files.pythonhosted.org/packages/31/14/2086c1425c985acfb30997a67e90c39457122df41324d3c179d6ee2292c6/lupa-2.6-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:0c53ee9f22a8a17e7d4266ad48e86f43771951797042dd51d1494aaa4f5f3f0a", size = 1170673, upload-time = "2025-10-24T07:18:42.426Z" }, + { url = "https://files.pythonhosted.org/packages/10/e5/b216c054cf86576c0191bf9a9f05de6f7e8e07164897d95eea0078dca9b2/lupa-2.6-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:de7c0f157a9064a400d828789191a96da7f4ce889969a588b87ec80de9b14772", size = 2162227, upload-time = "2025-10-24T07:18:46.112Z" }, + { url = "https://files.pythonhosted.org/packages/59/2f/33ecb5bedf4f3bc297ceacb7f016ff951331d352f58e7e791589609ea306/lupa-2.6-cp313-cp313-win32.whl", hash = "sha256:ee9523941ae0a87b5b703417720c5d78f72d2f5bc23883a2ea80a949a3ed9e75", size = 1419558, upload-time = "2025-10-24T07:18:48.371Z" }, + { url = "https://files.pythonhosted.org/packages/f9/b4/55e885834c847ea610e111d87b9ed4768f0afdaeebc00cd46810f25029f6/lupa-2.6-cp313-cp313-win_amd64.whl", hash = "sha256:b1335a5835b0a25ebdbc75cf0bda195e54d133e4d994877ef025e218c2e59db9", size = 1683424, upload-time = "2025-10-24T07:18:50.976Z" }, +] + [[package]] name = "lxml" version = "4.9.4" @@ -6996,6 +7079,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/25/14/e02206388902a828cc26894996dfc68eec50f7583bcddc4b5605d0c18b51/pytest_rerunfailures-12.0-py3-none-any.whl", hash = "sha256:9a1afd04e21b8177faf08a9bbbf44de7a0fe3fc29f8ddbe83b9684bd5f8f92a9", size = 12977, upload-time = "2023-07-05T05:53:43.909Z" }, ] +[[package]] +name = "pytest-timeout" +version = "2.4.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pytest" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ac/82/4c9ecabab13363e72d880f2fb504c5f750433b2b6f16e99f4ec21ada284c/pytest_timeout-2.4.0.tar.gz", hash = "sha256:7e68e90b01f9eff71332b25001f85c75495fc4e3a836701876183c4bcfd0540a", size = 17973, upload-time = "2025-05-05T19:44:34.99Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fa/b6/3127540ecdf1464a00e5a01ee60a1b09175f6913f0644ac748494d9c4b21/pytest_timeout-2.4.0-py3-none-any.whl", hash = "sha256:c42667e5cdadb151aeb5b26d114aff6bdf5a907f176a007a30b940d3d865b5c2", size = 14382, upload-time = "2025-05-05T19:44:33.502Z" }, +] + [[package]] name = "pytest-xdist" version = "3.8.0" @@ -8234,6 +8329,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" }, ] +[[package]] +name = "sortedcontainers" +version = "2.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e8/c4/ba2f8066cceb6f23394729afe52f3bf7adec04bf9ed2c820b39e19299111/sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88", size = 30594, upload-time = "2021-05-16T22:03:42.897Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/32/46/9cb0e58b2deb7f82b84065f37f3bffeb12413f947f9388e4cac22c4621ce/sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0", size = 29575, upload-time = "2021-05-16T22:03:41.177Z" }, +] + [[package]] name = "soupsieve" version = "2.8" From a5bd504daa688efcbf7358ba2fce74a4da359ce4 Mon Sep 17 00:00:00 2001 From: Igor Ilic <30923996+dexters1@users.noreply.github.com> Date: Wed, 12 Nov 2025 21:32:22 +0100 Subject: [PATCH 66/85] Relational DB migration test search (#1752) ## Description Add deterministic Cognee search test after rel DB migration. Test gathers all relevant relationships regarding Customers and their Invoices from relational DB that was migrated and then tries to get the same results with Cognee search. ## Type of Change - [ ] Bug fix (non-breaking change that fixes an issue) - [ ] New feature (non-breaking change that adds functionality) - [ ] Breaking change (fix or feature that would cause existing functionality to change) - [ ] Documentation update - [ ] Code refactoring - [ ] Performance improvement - [ ] Other (please specify): ## Screenshots/Videos (if applicable) ## Pre-submission Checklist - [ ] **I have tested my changes thoroughly before submitting this PR** - [ ] **This PR contains minimal changes necessary to address the issue/feature** - [ ] My code follows the project's coding standards and style guidelines - [ ] I have added tests that prove my fix is effective or that my feature works - [ ] I have added necessary documentation (if applicable) - [ ] All new and existing tests pass - [ ] I have searched existing PRs to ensure this change hasn't been submitted already - [ ] I have linked any relevant issues in the description - [ ] My commits have clear and descriptive messages ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin. --- cognee/tests/test_relational_db_migration.py | 53 +++++++++++++++++++- 1 file changed, 51 insertions(+), 2 deletions(-) diff --git a/cognee/tests/test_relational_db_migration.py b/cognee/tests/test_relational_db_migration.py index 4557e9e2f..ae06e7c5d 100644 --- a/cognee/tests/test_relational_db_migration.py +++ b/cognee/tests/test_relational_db_migration.py @@ -1,6 +1,5 @@ import pathlib import os -from typing import List from cognee.infrastructure.databases.graph import get_graph_engine from cognee.infrastructure.databases.relational import ( get_migration_relational_engine, @@ -10,7 +9,7 @@ from cognee.infrastructure.databases.vector.pgvector import ( create_db_and_tables as create_pgvector_db_and_tables, ) from cognee.tasks.ingestion import migrate_relational_database -from cognee.modules.search.types import SearchResult, SearchType +from cognee.modules.search.types import SearchType import cognee @@ -274,6 +273,55 @@ async def test_schema_only_migration(): print(f"Edge counts: {edge_counts}") +async def test_search_result_quality(): + from cognee.infrastructure.databases.relational import ( + get_migration_relational_engine, + ) + + # Get relational database with original data + migration_engine = get_migration_relational_engine() + from sqlalchemy import text + + async with migration_engine.engine.connect() as conn: + result = await conn.execute( + text(""" + SELECT + c.CustomerId, + c.FirstName, + c.LastName, + GROUP_CONCAT(i.InvoiceId, ',') AS invoice_ids + FROM Customer AS c + LEFT JOIN Invoice AS i ON c.CustomerId = i.CustomerId + GROUP BY c.CustomerId, c.FirstName, c.LastName + """) + ) + + for row in result: + # Get expected invoice IDs from relational DB for each Customer + customer_id = row.CustomerId + invoice_ids = row.invoice_ids.split(",") if row.invoice_ids else [] + print(f"Relational DB Customer {customer_id}: {invoice_ids}") + + # Use Cognee search to get invoice IDs for the same Customer but by providing Customer name + search_results = await cognee.search( + query_type=SearchType.GRAPH_COMPLETION, + query_text=f"List me all the invoices of Customer:{row.FirstName} {row.LastName}.", + top_k=50, + system_prompt="Just return me the invoiceID as a number without any text. This is an example output: ['1', '2', '3']. Where 1, 2, 3 are invoiceIDs of an invoice", + ) + print(f"Cognee search result: {search_results}") + + import ast + + lst = ast.literal_eval(search_results[0]) # converts string -> Python list + # Transfrom both lists to int for comparison, sorting and type consistency + lst = sorted([int(x) for x in lst]) + invoice_ids = sorted([int(x) for x in invoice_ids]) + assert lst == invoice_ids, ( + f"Search results {lst} do not match expected invoice IDs {invoice_ids} for Customer:{customer_id}" + ) + + async def test_migration_sqlite(): database_to_migrate_path = os.path.join(pathlib.Path(__file__).parent, "test_data/") @@ -286,6 +334,7 @@ async def test_migration_sqlite(): ) await relational_db_migration() + await test_search_result_quality() await test_schema_only_migration() From f9cde2f375be2accf6c9bd7fb5f5c681971f692a Mon Sep 17 00:00:00 2001 From: Pavel Zorin Date: Thu, 13 Nov 2025 13:35:07 +0100 Subject: [PATCH 67/85] Fix: Remove cognee script from pyproject.toml --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 13266f83e..2436911e8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -156,7 +156,6 @@ Homepage = "https://www.cognee.ai" Repository = "https://github.com/topoteretes/cognee" [project.scripts] -cognee = "cognee.cli._cognee:main" cognee-cli = "cognee.cli._cognee:main" [build-system] From 3b7d030817cea67f08af121d936a9e31312ae38c Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Thu, 13 Nov 2025 16:06:07 +0100 Subject: [PATCH 68/85] fix: remove duplicate mistral adapter creation --- .../litellm_instructor/llm/get_llm_client.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py index c7dcecc56..bbdfe49e9 100644 --- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py @@ -162,20 +162,5 @@ def get_llm_client(raise_api_key_error: bool = True): endpoint=llm_config.llm_endpoint, ) - elif provider == LLMProvider.MISTRAL: - if llm_config.llm_api_key is None: - raise LLMAPIKeyNotSetError() - - from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.mistral.adapter import ( - MistralAdapter, - ) - - return MistralAdapter( - api_key=llm_config.llm_api_key, - model=llm_config.llm_model, - max_completion_tokens=max_completion_tokens, - endpoint=llm_config.llm_endpoint, - ) - else: raise UnsupportedLLMProviderError(provider) From c6454338f9374c0e871938523eca237d6e5a1d16 Mon Sep 17 00:00:00 2001 From: Pavel Zorin Date: Thu, 13 Nov 2025 17:35:16 +0100 Subject: [PATCH 69/85] Fix: MCP remove cognee.add() preprequisite from the doc --- cognee-mcp/src/server.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cognee-mcp/src/server.py b/cognee-mcp/src/server.py index 7c708638c..4131be988 100755 --- a/cognee-mcp/src/server.py +++ b/cognee-mcp/src/server.py @@ -194,7 +194,6 @@ async def cognify( Prerequisites: - **LLM_API_KEY**: Must be configured (required for entity extraction and graph generation) - - **Data Added**: Must have data previously added via `cognee.add()` - **Vector Database**: Must be accessible for embeddings storage - **Graph Database**: Must be accessible for relationship storage From 2337d36f7b3968cfeff06b00613f7464c8d0ca93 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Thu, 13 Nov 2025 18:25:07 +0100 Subject: [PATCH 70/85] feat: add variable to control instructor mode --- cognee/infrastructure/llm/config.py | 2 ++ .../litellm_instructor/llm/anthropic/adapter.py | 8 +++++++- .../litellm_instructor/llm/gemini/adapter.py | 12 +++++++++++- .../llm/generic_llm_api/adapter.py | 12 +++++++++++- .../litellm_instructor/llm/mistral/adapter.py | 8 +++++++- .../litellm_instructor/llm/ollama/adapter.py | 12 +++++++++++- .../litellm_instructor/llm/openai/adapter.py | 11 +++++++++-- 7 files changed, 58 insertions(+), 7 deletions(-) diff --git a/cognee/infrastructure/llm/config.py b/cognee/infrastructure/llm/config.py index 8fd196eaf..c87054ff6 100644 --- a/cognee/infrastructure/llm/config.py +++ b/cognee/infrastructure/llm/config.py @@ -38,6 +38,7 @@ class LLMConfig(BaseSettings): """ structured_output_framework: str = "instructor" + llm_instructor_mode: Optional[str] = None llm_provider: str = "openai" llm_model: str = "openai/gpt-5-mini" llm_endpoint: str = "" @@ -181,6 +182,7 @@ class LLMConfig(BaseSettings): instance. """ return { + "llm_instructor_mode": self.llm_instructor_mode, "provider": self.llm_provider, "model": self.llm_model, "endpoint": self.llm_endpoint, diff --git a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py index bf19d6e86..6fb78718e 100644 --- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py @@ -28,13 +28,19 @@ class AnthropicAdapter(LLMInterface): name = "Anthropic" model: str + default_instructor_mode = "anthropic_tools" def __init__(self, max_completion_tokens: int, model: str = None): import anthropic + config_instructor_mode = get_llm_config().llm_instructor_mode + instructor_mode = ( + config_instructor_mode if config_instructor_mode else self.default_instructor_mode + ) + self.aclient = instructor.patch( create=anthropic.AsyncAnthropic(api_key=get_llm_config().llm_api_key).messages.create, - mode=instructor.Mode.ANTHROPIC_TOOLS, + mode=instructor.Mode(instructor_mode), ) self.model = model diff --git a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py index 1187e0cad..68dddc7b7 100644 --- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py @@ -41,6 +41,7 @@ class GeminiAdapter(LLMInterface): name: str model: str api_key: str + default_instructor_mode = "json_mode" def __init__( self, @@ -63,7 +64,16 @@ class GeminiAdapter(LLMInterface): self.fallback_api_key = fallback_api_key self.fallback_endpoint = fallback_endpoint - self.aclient = instructor.from_litellm(litellm.acompletion, mode=instructor.Mode.JSON) + from cognee.infrastructure.llm.config import get_llm_config + + config_instructor_mode = get_llm_config().llm_instructor_mode + instructor_mode = ( + config_instructor_mode if config_instructor_mode else self.default_instructor_mode + ) + + self.aclient = instructor.from_litellm( + litellm.acompletion, mode=instructor.Mode(instructor_mode) + ) @retry( stop=stop_after_delay(128), diff --git a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py index 8bbbaa2cc..ea32dced1 100644 --- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py @@ -41,6 +41,7 @@ class GenericAPIAdapter(LLMInterface): name: str model: str api_key: str + default_instructor_mode = "json_mode" def __init__( self, @@ -63,7 +64,16 @@ class GenericAPIAdapter(LLMInterface): self.fallback_api_key = fallback_api_key self.fallback_endpoint = fallback_endpoint - self.aclient = instructor.from_litellm(litellm.acompletion, mode=instructor.Mode.JSON) + from cognee.infrastructure.llm.config import get_llm_config + + config_instructor_mode = get_llm_config().llm_instructor_mode + instructor_mode = ( + config_instructor_mode if config_instructor_mode else self.default_instructor_mode + ) + + self.aclient = instructor.from_litellm( + litellm.acompletion, mode=instructor.Mode(instructor_mode) + ) @retry( stop=stop_after_delay(128), diff --git a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py index 78a3cbff5..bed88ce3c 100644 --- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py +++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py @@ -37,6 +37,7 @@ class MistralAdapter(LLMInterface): model: str api_key: str max_completion_tokens: int + default_instructor_mode = "mistral_tools" def __init__(self, api_key: str, model: str, max_completion_tokens: int, endpoint: str = None): from mistralai import Mistral @@ -44,9 +45,14 @@ class MistralAdapter(LLMInterface): self.model = model self.max_completion_tokens = max_completion_tokens + config_instructor_mode = get_llm_config().llm_instructor_mode + instructor_mode = ( + config_instructor_mode if config_instructor_mode else self.default_instructor_mode + ) + self.aclient = instructor.from_litellm( litellm.acompletion, - mode=instructor.Mode.MISTRAL_TOOLS, + mode=instructor.Mode(instructor_mode), api_key=get_llm_config().llm_api_key, ) diff --git a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py index 9c3d185aa..aa24a7911 100644 --- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py @@ -42,6 +42,8 @@ class OllamaAPIAdapter(LLMInterface): - aclient """ + default_instructor_mode = "json_mode" + def __init__( self, endpoint: str, api_key: str, model: str, name: str, max_completion_tokens: int ): @@ -51,8 +53,16 @@ class OllamaAPIAdapter(LLMInterface): self.endpoint = endpoint self.max_completion_tokens = max_completion_tokens + from cognee.infrastructure.llm.config import get_llm_config + + config_instructor_mode = get_llm_config().llm_instructor_mode + instructor_mode = ( + config_instructor_mode if config_instructor_mode else self.default_instructor_mode + ) + self.aclient = instructor.from_openai( - OpenAI(base_url=self.endpoint, api_key=self.api_key), mode=instructor.Mode.JSON + OpenAI(base_url=self.endpoint, api_key=self.api_key), + mode=instructor.Mode(instructor_mode), ) @retry( diff --git a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py index 305b426b8..69367602d 100644 --- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py @@ -56,6 +56,7 @@ class OpenAIAdapter(LLMInterface): model: str api_key: str api_version: str + default_instructor_mode = "json_schema_mode" MAX_RETRIES = 5 @@ -74,14 +75,20 @@ class OpenAIAdapter(LLMInterface): fallback_api_key: str = None, fallback_endpoint: str = None, ): + from cognee.infrastructure.llm.config import get_llm_config + + config_instructor_mode = get_llm_config().llm_instructor_mode + instructor_mode = ( + config_instructor_mode if config_instructor_mode else self.default_instructor_mode + ) # TODO: With gpt5 series models OpenAI expects JSON_SCHEMA as a mode for structured outputs. # Make sure all new gpt models will work with this mode as well. if "gpt-5" in model: self.aclient = instructor.from_litellm( - litellm.acompletion, mode=instructor.Mode.JSON_SCHEMA + litellm.acompletion, mode=instructor.Mode(instructor_mode) ) self.client = instructor.from_litellm( - litellm.completion, mode=instructor.Mode.JSON_SCHEMA + litellm.completion, mode=instructor.Mode(instructor_mode) ) else: self.aclient = instructor.from_litellm(litellm.acompletion) From 661c194f97df5053f70a52d1638c77c23e0d50e3 Mon Sep 17 00:00:00 2001 From: EricXiao Date: Fri, 14 Nov 2025 15:21:47 +0800 Subject: [PATCH 71/85] fix: Resolve issue with csv suffix classification Signed-off-by: EricXiao --- cognee/infrastructure/files/utils/guess_file_type.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cognee/infrastructure/files/utils/guess_file_type.py b/cognee/infrastructure/files/utils/guess_file_type.py index 78b20c93d..4bc96fe80 100644 --- a/cognee/infrastructure/files/utils/guess_file_type.py +++ b/cognee/infrastructure/files/utils/guess_file_type.py @@ -55,6 +55,10 @@ def guess_file_type(file: BinaryIO, name: Optional[str] = None) -> filetype.Type file_type = Type("text/plain", "txt") return file_type + if ext in [".csv"]: + file_type = Type("text/csv", "csv") + return file_type + file_type = filetype.guess(file) # If file type could not be determined consider it a plain text file as they don't have magic number encoding From 205f5a9e0c6d0fb72cc94fa20ce2ba814ebef0d5 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Fri, 14 Nov 2025 11:05:39 +0100 Subject: [PATCH 72/85] fix: Fix based on PR comments --- .../litellm_instructor/llm/anthropic/adapter.py | 9 +++------ .../litellm_instructor/llm/gemini/adapter.py | 10 +++------- .../llm/generic_llm_api/adapter.py | 10 +++------- .../litellm_instructor/llm/get_llm_client.py | 9 ++++++++- .../litellm_instructor/llm/mistral/adapter.py | 16 ++++++++++------ .../litellm_instructor/llm/ollama/adapter.py | 17 +++++++++-------- .../litellm_instructor/llm/openai/adapter.py | 12 ++++-------- 7 files changed, 40 insertions(+), 43 deletions(-) diff --git a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py index 6fb78718e..dbf0dfbea 100644 --- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py @@ -30,17 +30,14 @@ class AnthropicAdapter(LLMInterface): model: str default_instructor_mode = "anthropic_tools" - def __init__(self, max_completion_tokens: int, model: str = None): + def __init__(self, max_completion_tokens: int, model: str = None, instructor_mode: str = None): import anthropic - config_instructor_mode = get_llm_config().llm_instructor_mode - instructor_mode = ( - config_instructor_mode if config_instructor_mode else self.default_instructor_mode - ) + self.instructor_mode = instructor_mode if instructor_mode else self.default_instructor_mode self.aclient = instructor.patch( create=anthropic.AsyncAnthropic(api_key=get_llm_config().llm_api_key).messages.create, - mode=instructor.Mode(instructor_mode), + mode=instructor.Mode(self.instructor_mode), ) self.model = model diff --git a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py index 68dddc7b7..226f291d7 100644 --- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py @@ -50,6 +50,7 @@ class GeminiAdapter(LLMInterface): model: str, api_version: str, max_completion_tokens: int, + instructor_mode: str = None, fallback_model: str = None, fallback_api_key: str = None, fallback_endpoint: str = None, @@ -64,15 +65,10 @@ class GeminiAdapter(LLMInterface): self.fallback_api_key = fallback_api_key self.fallback_endpoint = fallback_endpoint - from cognee.infrastructure.llm.config import get_llm_config - - config_instructor_mode = get_llm_config().llm_instructor_mode - instructor_mode = ( - config_instructor_mode if config_instructor_mode else self.default_instructor_mode - ) + self.instructor_mode = instructor_mode if instructor_mode else self.default_instructor_mode self.aclient = instructor.from_litellm( - litellm.acompletion, mode=instructor.Mode(instructor_mode) + litellm.acompletion, mode=instructor.Mode(self.instructor_mode) ) @retry( diff --git a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py index ea32dced1..9d7f25fc5 100644 --- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py @@ -50,6 +50,7 @@ class GenericAPIAdapter(LLMInterface): model: str, name: str, max_completion_tokens: int, + instructor_mode: str = None, fallback_model: str = None, fallback_api_key: str = None, fallback_endpoint: str = None, @@ -64,15 +65,10 @@ class GenericAPIAdapter(LLMInterface): self.fallback_api_key = fallback_api_key self.fallback_endpoint = fallback_endpoint - from cognee.infrastructure.llm.config import get_llm_config - - config_instructor_mode = get_llm_config().llm_instructor_mode - instructor_mode = ( - config_instructor_mode if config_instructor_mode else self.default_instructor_mode - ) + self.instructor_mode = instructor_mode if instructor_mode else self.default_instructor_mode self.aclient = instructor.from_litellm( - litellm.acompletion, mode=instructor.Mode(instructor_mode) + litellm.acompletion, mode=instructor.Mode(self.instructor_mode) ) @retry( diff --git a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py index c7dcecc56..537eda1b2 100644 --- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py @@ -81,6 +81,7 @@ def get_llm_client(raise_api_key_error: bool = True): model=llm_config.llm_model, transcription_model=llm_config.transcription_model, max_completion_tokens=max_completion_tokens, + instructor_mode=llm_config.llm_instructor_mode, streaming=llm_config.llm_streaming, fallback_api_key=llm_config.fallback_api_key, fallback_endpoint=llm_config.fallback_endpoint, @@ -101,6 +102,7 @@ def get_llm_client(raise_api_key_error: bool = True): llm_config.llm_model, "Ollama", max_completion_tokens=max_completion_tokens, + instructor_mode=llm_config.llm_instructor_mode, ) elif provider == LLMProvider.ANTHROPIC: @@ -109,7 +111,9 @@ def get_llm_client(raise_api_key_error: bool = True): ) return AnthropicAdapter( - max_completion_tokens=max_completion_tokens, model=llm_config.llm_model + max_completion_tokens=max_completion_tokens, + model=llm_config.llm_model, + instructor_mode=llm_config.llm_instructor_mode, ) elif provider == LLMProvider.CUSTOM: @@ -126,6 +130,7 @@ def get_llm_client(raise_api_key_error: bool = True): llm_config.llm_model, "Custom", max_completion_tokens=max_completion_tokens, + instructor_mode=llm_config.llm_instructor_mode, fallback_api_key=llm_config.fallback_api_key, fallback_endpoint=llm_config.fallback_endpoint, fallback_model=llm_config.fallback_model, @@ -145,6 +150,7 @@ def get_llm_client(raise_api_key_error: bool = True): max_completion_tokens=max_completion_tokens, endpoint=llm_config.llm_endpoint, api_version=llm_config.llm_api_version, + instructor_mode=llm_config.llm_instructor_mode, ) elif provider == LLMProvider.MISTRAL: @@ -160,6 +166,7 @@ def get_llm_client(raise_api_key_error: bool = True): model=llm_config.llm_model, max_completion_tokens=max_completion_tokens, endpoint=llm_config.llm_endpoint, + instructor_mode=llm_config.llm_instructor_mode, ) elif provider == LLMProvider.MISTRAL: diff --git a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py index bed88ce3c..355cdae0b 100644 --- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py +++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py @@ -39,20 +39,24 @@ class MistralAdapter(LLMInterface): max_completion_tokens: int default_instructor_mode = "mistral_tools" - def __init__(self, api_key: str, model: str, max_completion_tokens: int, endpoint: str = None): + def __init__( + self, + api_key: str, + model: str, + max_completion_tokens: int, + endpoint: str = None, + instructor_mode: str = None, + ): from mistralai import Mistral self.model = model self.max_completion_tokens = max_completion_tokens - config_instructor_mode = get_llm_config().llm_instructor_mode - instructor_mode = ( - config_instructor_mode if config_instructor_mode else self.default_instructor_mode - ) + self.instructor_mode = instructor_mode if instructor_mode else self.default_instructor_mode self.aclient = instructor.from_litellm( litellm.acompletion, - mode=instructor.Mode(instructor_mode), + mode=instructor.Mode(self.instructor_mode), api_key=get_llm_config().llm_api_key, ) diff --git a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py index aa24a7911..aabd19867 100644 --- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py @@ -45,7 +45,13 @@ class OllamaAPIAdapter(LLMInterface): default_instructor_mode = "json_mode" def __init__( - self, endpoint: str, api_key: str, model: str, name: str, max_completion_tokens: int + self, + endpoint: str, + api_key: str, + model: str, + name: str, + max_completion_tokens: int, + instructor_mode: str = None, ): self.name = name self.model = model @@ -53,16 +59,11 @@ class OllamaAPIAdapter(LLMInterface): self.endpoint = endpoint self.max_completion_tokens = max_completion_tokens - from cognee.infrastructure.llm.config import get_llm_config - - config_instructor_mode = get_llm_config().llm_instructor_mode - instructor_mode = ( - config_instructor_mode if config_instructor_mode else self.default_instructor_mode - ) + self.instructor_mode = instructor_mode if instructor_mode else self.default_instructor_mode self.aclient = instructor.from_openai( OpenAI(base_url=self.endpoint, api_key=self.api_key), - mode=instructor.Mode(instructor_mode), + mode=instructor.Mode(self.instructor_mode), ) @retry( diff --git a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py index 69367602d..778c8eec7 100644 --- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py @@ -70,25 +70,21 @@ class OpenAIAdapter(LLMInterface): model: str, transcription_model: str, max_completion_tokens: int, + instructor_mode: str = None, streaming: bool = False, fallback_model: str = None, fallback_api_key: str = None, fallback_endpoint: str = None, ): - from cognee.infrastructure.llm.config import get_llm_config - - config_instructor_mode = get_llm_config().llm_instructor_mode - instructor_mode = ( - config_instructor_mode if config_instructor_mode else self.default_instructor_mode - ) + self.instructor_mode = instructor_mode if instructor_mode else self.default_instructor_mode # TODO: With gpt5 series models OpenAI expects JSON_SCHEMA as a mode for structured outputs. # Make sure all new gpt models will work with this mode as well. if "gpt-5" in model: self.aclient = instructor.from_litellm( - litellm.acompletion, mode=instructor.Mode(instructor_mode) + litellm.acompletion, mode=instructor.Mode(self.instructor_mode) ) self.client = instructor.from_litellm( - litellm.completion, mode=instructor.Mode(instructor_mode) + litellm.completion, mode=instructor.Mode(self.instructor_mode) ) else: self.aclient = instructor.from_litellm(litellm.acompletion) From 844b8d635a7646750dc63dd4be13de07f2996940 Mon Sep 17 00:00:00 2001 From: Fahad Shoaib Date: Fri, 14 Nov 2025 22:13:00 +0500 Subject: [PATCH 73/85] feat: enhance ontology handling to support multiple uploads and retrievals --- .../v1/cognify/routers/get_cognify_router.py | 41 +++---- cognee/api/v1/ontologies/ontologies.py | 108 +++++++++++++++--- .../ontologies/routers/get_ontology_router.py | 51 ++++++--- .../rdf_xml/RDFLibOntologyResolver.py | 93 +++++++++------ 4 files changed, 202 insertions(+), 91 deletions(-) diff --git a/cognee/api/v1/cognify/routers/get_cognify_router.py b/cognee/api/v1/cognify/routers/get_cognify_router.py index 252ffe7bf..4f1497e3c 100644 --- a/cognee/api/v1/cognify/routers/get_cognify_router.py +++ b/cognee/api/v1/cognify/routers/get_cognify_router.py @@ -41,8 +41,8 @@ class CognifyPayloadDTO(InDTO): custom_prompt: Optional[str] = Field( default="", description="Custom prompt for entity extraction and graph generation" ) - ontology_key: Optional[str] = Field( - default=None, description="Reference to previously uploaded ontology" + ontology_key: Optional[List[str]] = Field( + default=None, description="Reference to one or more previously uploaded ontologies" ) @@ -71,7 +71,7 @@ def get_cognify_router() -> APIRouter: - **dataset_ids** (Optional[List[UUID]]): List of existing dataset UUIDs to process. UUIDs allow processing of datasets not owned by the user (if permitted). - **run_in_background** (Optional[bool]): Whether to execute processing asynchronously. Defaults to False (blocking). - **custom_prompt** (Optional[str]): Custom prompt for entity extraction and graph generation. If provided, this prompt will be used instead of the default prompts for knowledge graph extraction. - - **ontology_key** (Optional[str]): Reference to a previously uploaded ontology file to use for knowledge graph construction. + - **ontology_key** (Optional[List[str]]): Reference to one or more previously uploaded ontology files to use for knowledge graph construction. ## Response - **Blocking execution**: Complete pipeline run information with entity counts, processing duration, and success/failure status @@ -87,7 +87,7 @@ def get_cognify_router() -> APIRouter: "datasets": ["research_papers", "documentation"], "run_in_background": false, "custom_prompt": "Extract entities focusing on technical concepts and their relationships. Identify key technologies, methodologies, and their interconnections.", - "ontology_key": "medical_ontology_v1" + "ontology_key": ["medical_ontology_v1"] } ``` @@ -121,29 +121,22 @@ def get_cognify_router() -> APIRouter: if payload.ontology_key: ontology_service = OntologyService() - try: - ontology_content = ontology_service.get_ontology_content( - payload.ontology_key, user - ) + ontology_contents = ontology_service.get_ontology_contents( + payload.ontology_key, user + ) - from cognee.modules.ontology.ontology_config import Config - from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import ( - RDFLibOntologyResolver, - ) - from io import StringIO + from cognee.modules.ontology.ontology_config import Config + from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import ( + RDFLibOntologyResolver, + ) + from io import StringIO - ontology_stream = StringIO(ontology_content) - config_to_use: Config = { - "ontology_config": { - "ontology_resolver": RDFLibOntologyResolver( - ontology_file=ontology_stream - ) - } + ontology_streams = [StringIO(content) for content in ontology_contents] + config_to_use: Config = { + "ontology_config": { + "ontology_resolver": RDFLibOntologyResolver(ontology_file=ontology_streams) } - except ValueError as e: - return JSONResponse( - status_code=400, content={"error": f"Ontology error: {str(e)}"} - ) + } cognify_run = await cognee_cognify( datasets, diff --git a/cognee/api/v1/ontologies/ontologies.py b/cognee/api/v1/ontologies/ontologies.py index 6bfb7658e..130b4a862 100644 --- a/cognee/api/v1/ontologies/ontologies.py +++ b/cognee/api/v1/ontologies/ontologies.py @@ -3,7 +3,7 @@ import json import tempfile from pathlib import Path from datetime import datetime, timezone -from typing import Optional +from typing import Optional, List from dataclasses import dataclass @@ -47,28 +47,23 @@ class OntologyService: async def upload_ontology( self, ontology_key: str, file, user, description: Optional[str] = None ) -> OntologyMetadata: - # Validate file format if not file.filename.lower().endswith(".owl"): raise ValueError("File must be in .owl format") user_dir = self._get_user_dir(str(user.id)) metadata = self._load_metadata(user_dir) - # Check for duplicate key if ontology_key in metadata: raise ValueError(f"Ontology key '{ontology_key}' already exists") - # Read file content content = await file.read() - if len(content) > 10 * 1024 * 1024: # 10MB limit + if len(content) > 10 * 1024 * 1024: raise ValueError("File size exceeds 10MB limit") - # Save file file_path = user_dir / f"{ontology_key}.owl" with open(file_path, "wb") as f: f.write(content) - # Update metadata ontology_metadata = { "filename": file.filename, "size_bytes": len(content), @@ -86,19 +81,102 @@ class OntologyService: description=description, ) - def get_ontology_content(self, ontology_key: str, user) -> str: + async def upload_ontologies( + self, ontology_key: List[str], files: List, user, descriptions: Optional[List[str]] = None + ) -> List[OntologyMetadata]: + """ + Upload ontology files with their respective keys. + + Args: + ontology_key: List of unique keys for each ontology + files: List of UploadFile objects (same length as keys) + user: Authenticated user + descriptions: Optional list of descriptions for each file + + Returns: + List of OntologyMetadata objects for uploaded files + + Raises: + ValueError: If keys duplicate, file format invalid, or array lengths don't match + """ + if len(ontology_key) != len(files): + raise ValueError("Number of keys must match number of files") + + if len(set(ontology_key)) != len(ontology_key): + raise ValueError("Duplicate ontology keys not allowed") + + if descriptions and len(descriptions) != len(files): + raise ValueError("Number of descriptions must match number of files") + + results = [] user_dir = self._get_user_dir(str(user.id)) metadata = self._load_metadata(user_dir) - if ontology_key not in metadata: - raise ValueError(f"Ontology key '{ontology_key}' not found") + for i, (key, file) in enumerate(zip(ontology_key, files)): + if key in metadata: + raise ValueError(f"Ontology key '{key}' already exists") - file_path = user_dir / f"{ontology_key}.owl" - if not file_path.exists(): - raise ValueError(f"Ontology file for key '{ontology_key}' not found") + if not file.filename.lower().endswith(".owl"): + raise ValueError(f"File '{file.filename}' must be in .owl format") - with open(file_path, "r", encoding="utf-8") as f: - return f.read() + content = await file.read() + if len(content) > 10 * 1024 * 1024: + raise ValueError(f"File '{file.filename}' exceeds 10MB limit") + + file_path = user_dir / f"{key}.owl" + with open(file_path, "wb") as f: + f.write(content) + + ontology_metadata = { + "filename": file.filename, + "size_bytes": len(content), + "uploaded_at": datetime.now(timezone.utc).isoformat(), + "description": descriptions[i] if descriptions else None, + } + metadata[key] = ontology_metadata + + results.append( + OntologyMetadata( + ontology_key=key, + filename=file.filename, + size_bytes=len(content), + uploaded_at=ontology_metadata["uploaded_at"], + description=descriptions[i] if descriptions else None, + ) + ) + + self._save_metadata(user_dir, metadata) + return results + + def get_ontology_contents(self, ontology_key: List[str], user) -> List[str]: + """ + Retrieve ontology content for one or more keys. + + Args: + ontology_key: List of ontology keys to retrieve (can contain single item) + user: Authenticated user + + Returns: + List of ontology content strings + + Raises: + ValueError: If any ontology key not found + """ + user_dir = self._get_user_dir(str(user.id)) + metadata = self._load_metadata(user_dir) + + contents = [] + for key in ontology_key: + if key not in metadata: + raise ValueError(f"Ontology key '{key}' not found") + + file_path = user_dir / f"{key}.owl" + if not file_path.exists(): + raise ValueError(f"Ontology file for key '{key}' not found") + + with open(file_path, "r", encoding="utf-8") as f: + contents.append(f.read()) + return contents def list_ontologies(self, user) -> dict: user_dir = self._get_user_dir(str(user.id)) diff --git a/cognee/api/v1/ontologies/routers/get_ontology_router.py b/cognee/api/v1/ontologies/routers/get_ontology_router.py index f5c51ba21..ee31c683f 100644 --- a/cognee/api/v1/ontologies/routers/get_ontology_router.py +++ b/cognee/api/v1/ontologies/routers/get_ontology_router.py @@ -1,6 +1,6 @@ from fastapi import APIRouter, File, Form, UploadFile, Depends, HTTPException from fastapi.responses import JSONResponse -from typing import Optional +from typing import Optional, List from cognee.modules.users.models import User from cognee.modules.users.methods import get_authenticated_user @@ -16,23 +16,27 @@ def get_ontology_router() -> APIRouter: @router.post("", response_model=dict) async def upload_ontology( ontology_key: str = Form(...), - ontology_file: UploadFile = File(...), - description: Optional[str] = Form(None), + ontology_file: List[UploadFile] = File(...), + descriptions: Optional[str] = Form(None), user: User = Depends(get_authenticated_user), ): """ - Upload an ontology file with a named key for later use in cognify operations. + Upload ontology files with their respective keys for later use in cognify operations. + + Supports both single and multiple file uploads: + - Single file: ontology_key=["key"], ontology_file=[file] + - Multiple files: ontology_key=["key1", "key2"], ontology_file=[file1, file2] ## Request Parameters - - **ontology_key** (str): User-defined identifier for the ontology - - **ontology_file** (UploadFile): OWL format ontology file - - **description** (Optional[str]): Optional description of the ontology + - **ontology_key** (str): JSON array string of user-defined identifiers for the ontologies + - **ontology_file** (List[UploadFile]): OWL format ontology files + - **descriptions** (Optional[str]): JSON array string of optional descriptions ## Response - Returns metadata about the uploaded ontology including key, filename, size, and upload timestamp. + Returns metadata about uploaded ontologies including keys, filenames, sizes, and upload timestamps. ## Error Codes - - **400 Bad Request**: Invalid file format, duplicate key, file size exceeded + - **400 Bad Request**: Invalid file format, duplicate keys, array length mismatches, file size exceeded - **500 Internal Server Error**: File system or processing errors """ send_telemetry( @@ -45,16 +49,31 @@ def get_ontology_router() -> APIRouter: ) try: - result = await ontology_service.upload_ontology( - ontology_key, ontology_file, user, description + import json + + ontology_keys = json.loads(ontology_key) + description_list = json.loads(descriptions) if descriptions else None + + if not isinstance(ontology_keys, list): + raise ValueError("ontology_key must be a JSON array") + + results = await ontology_service.upload_ontologies( + ontology_keys, ontology_file, user, description_list ) + return { - "ontology_key": result.ontology_key, - "filename": result.filename, - "size_bytes": result.size_bytes, - "uploaded_at": result.uploaded_at, + "uploaded_ontologies": [ + { + "ontology_key": result.ontology_key, + "filename": result.filename, + "size_bytes": result.size_bytes, + "uploaded_at": result.uploaded_at, + "description": result.description, + } + for result in results + ] } - except ValueError as e: + except (json.JSONDecodeError, ValueError) as e: return JSONResponse(status_code=400, content={"error": str(e)}) except Exception as e: return JSONResponse(status_code=500, content={"error": str(e)}) diff --git a/cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py b/cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py index 4acc8861b..34d7a946a 100644 --- a/cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py +++ b/cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py @@ -26,7 +26,7 @@ class RDFLibOntologyResolver(BaseOntologyResolver): def __init__( self, - ontology_file: Optional[Union[str, List[str], IO]] = None, + ontology_file: Optional[Union[str, List[str], IO, List[IO]]] = None, matching_strategy: Optional[MatchingStrategy] = None, ) -> None: super().__init__(matching_strategy) @@ -34,47 +34,68 @@ class RDFLibOntologyResolver(BaseOntologyResolver): try: self.graph = None if ontology_file is not None: + files_to_load = [] + file_objects = [] + if hasattr(ontology_file, "read"): - self.graph = Graph() - content = ontology_file.read() - self.graph.parse(data=content, format="xml") - logger.info("Ontology loaded successfully from file object") - else: - files_to_load = [] - if isinstance(ontology_file, str): - files_to_load = [ontology_file] - elif isinstance(ontology_file, list): + file_objects = [ontology_file] + elif isinstance(ontology_file, str): + files_to_load = [ontology_file] + elif isinstance(ontology_file, list): + if all(hasattr(item, "read") for item in ontology_file): + file_objects = ontology_file + else: files_to_load = ontology_file - else: - raise ValueError( - f"ontology_file must be a string, list of strings, file-like object, or None. Got: {type(ontology_file)}" - ) + else: + raise ValueError( + f"ontology_file must be a string, list of strings, file-like object, list of file-like objects, or None. Got: {type(ontology_file)}" + ) - if files_to_load: - self.graph = Graph() - loaded_files = [] - for file_path in files_to_load: - if os.path.exists(file_path): - self.graph.parse(file_path) - loaded_files.append(file_path) - logger.info("Ontology loaded successfully from file: %s", file_path) - else: - logger.warning( - "Ontology file '%s' not found. Skipping this file.", - file_path, - ) + if file_objects: + self.graph = Graph() + loaded_objects = [] + for file_obj in file_objects: + try: + content = file_obj.read() + self.graph.parse(data=content, format="xml") + loaded_objects.append(file_obj) + logger.info("Ontology loaded successfully from file object") + except Exception as e: + logger.warning("Failed to parse ontology file object: %s", str(e)) - if not loaded_files: - logger.info( - "No valid ontology files found. No owl ontology will be attached to the graph." - ) - self.graph = None - else: - logger.info("Total ontology files loaded: %d", len(loaded_files)) - else: + if not loaded_objects: logger.info( - "No ontology file provided. No owl ontology will be attached to the graph." + "No valid ontology file objects found. No owl ontology will be attached to the graph." ) + self.graph = None + else: + logger.info("Total ontology file objects loaded: %d", len(loaded_objects)) + + elif files_to_load: + self.graph = Graph() + loaded_files = [] + for file_path in files_to_load: + if os.path.exists(file_path): + self.graph.parse(file_path) + loaded_files.append(file_path) + logger.info("Ontology loaded successfully from file: %s", file_path) + else: + logger.warning( + "Ontology file '%s' not found. Skipping this file.", + file_path, + ) + + if not loaded_files: + logger.info( + "No valid ontology files found. No owl ontology will be attached to the graph." + ) + self.graph = None + else: + logger.info("Total ontology files loaded: %d", len(loaded_files)) + else: + logger.info( + "No ontology file provided. No owl ontology will be attached to the graph." + ) else: logger.info( "No ontology file provided. No owl ontology will be attached to the graph." From 01f1c099cc972e2222f1174c515e1baf87fbb9d6 Mon Sep 17 00:00:00 2001 From: Fahad Shoaib Date: Fri, 14 Nov 2025 22:20:54 +0500 Subject: [PATCH 74/85] test: enhance server start test with ontology upload verification - Extend test_cognee_server_start to upload ontology and verify integration - Move test_ontology_endpoint from tests/ to tests/unit/api/ --- cognee/tests/test_cognee_server_start.py | 45 ++++++++++++++++++- .../{ => unit/api}/test_ontology_endpoint.py | 0 2 files changed, 44 insertions(+), 1 deletion(-) rename cognee/tests/{ => unit/api}/test_ontology_endpoint.py (100%) diff --git a/cognee/tests/test_cognee_server_start.py b/cognee/tests/test_cognee_server_start.py index ab68a8ef1..d6aa55a98 100644 --- a/cognee/tests/test_cognee_server_start.py +++ b/cognee/tests/test_cognee_server_start.py @@ -7,6 +7,7 @@ import requests from pathlib import Path import sys import uuid +import json class TestCogneeServerStart(unittest.TestCase): @@ -90,12 +91,31 @@ class TestCogneeServerStart(unittest.TestCase): ) } - payload = {"datasets": [dataset_name]} + ontology_key = f"test_ontology_{uuid.uuid4().hex[:8]}" + payload = {"datasets": [dataset_name], "ontology_key": [ontology_key]} add_response = requests.post(url, headers=headers, data=form_data, files=file, timeout=50) if add_response.status_code not in [200, 201]: add_response.raise_for_status() + ontology_content = b""" + + + + + """ + + ontology_response = requests.post( + "http://127.0.0.1:8000/api/v1/ontologies", + files=[("ontology_file", ("test.owl", ontology_content, "application/xml"))], + data={ + "ontology_key": json.dumps([ontology_key]), + "description": json.dumps(["Test ontology"]), + }, + ) + self.assertEqual(ontology_response.status_code, 200) + # Cognify request url = "http://127.0.0.1:8000/api/v1/cognify" headers = { @@ -107,6 +127,29 @@ class TestCogneeServerStart(unittest.TestCase): if cognify_response.status_code not in [200, 201]: cognify_response.raise_for_status() + datasets_response = requests.get("http://127.0.0.1:8000/api/v1/datasets", headers=headers) + + datasets = datasets_response.json() + dataset_id = None + for dataset in datasets: + if dataset["name"] == dataset_name: + dataset_id = dataset["id"] + break + + graph_response = requests.get( + f"http://127.0.0.1:8000/api/v1/datasets/{dataset_id}/graph", headers=headers + ) + self.assertEqual(graph_response.status_code, 200) + + graph_data = graph_response.json() + ontology_nodes = [ + node for node in graph_data.get("nodes") if node.get("properties").get("ontology_valid") + ] + + self.assertGreater( + len(ontology_nodes), 0, "No ontology nodes found - ontology was not integrated" + ) + # TODO: Add test to verify cognify pipeline is complete before testing search # Search request diff --git a/cognee/tests/test_ontology_endpoint.py b/cognee/tests/unit/api/test_ontology_endpoint.py similarity index 100% rename from cognee/tests/test_ontology_endpoint.py rename to cognee/tests/unit/api/test_ontology_endpoint.py From 1ded09d0f995fa57c9eaa3feafbe64089525a92f Mon Sep 17 00:00:00 2001 From: Fahad Shoaib Date: Sat, 15 Nov 2025 00:06:55 +0500 Subject: [PATCH 75/85] fix: fixed test ontology file content. Added tests to support multiple files and improved validation. --- cognee/tests/test_cognee_server_start.py | 13 +- .../tests/unit/api/test_ontology_endpoint.py | 189 +++++++++++++++++- 2 files changed, 185 insertions(+), 17 deletions(-) diff --git a/cognee/tests/test_cognee_server_start.py b/cognee/tests/test_cognee_server_start.py index d6aa55a98..b266fc7bf 100644 --- a/cognee/tests/test_cognee_server_start.py +++ b/cognee/tests/test_cognee_server_start.py @@ -98,13 +98,12 @@ class TestCogneeServerStart(unittest.TestCase): if add_response.status_code not in [200, 201]: add_response.raise_for_status() - ontology_content = b""" - - - - - """ + ontology_content = b""" + + + + + """ ontology_response = requests.post( "http://127.0.0.1:8000/api/v1/ontologies", diff --git a/cognee/tests/unit/api/test_ontology_endpoint.py b/cognee/tests/unit/api/test_ontology_endpoint.py index b5cedfafe..c04959998 100644 --- a/cognee/tests/unit/api/test_ontology_endpoint.py +++ b/cognee/tests/unit/api/test_ontology_endpoint.py @@ -32,6 +32,8 @@ def mock_default_user(): @patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) def test_upload_ontology_success(mock_get_default_user, client, mock_default_user): """Test successful ontology upload""" + import json + mock_get_default_user.return_value = mock_default_user ontology_content = ( b"" @@ -40,14 +42,14 @@ def test_upload_ontology_success(mock_get_default_user, client, mock_default_use response = client.post( "/api/v1/ontologies", - files={"ontology_file": ("test.owl", ontology_content)}, - data={"ontology_key": unique_key, "description": "Test"}, + files=[("ontology_file", ("test.owl", ontology_content, "application/xml"))], + data={"ontology_key": json.dumps([unique_key]), "description": json.dumps(["Test"])}, ) assert response.status_code == 200 data = response.json() - assert data["ontology_key"] == unique_key - assert "uploaded_at" in data + assert data["uploaded_ontologies"][0]["ontology_key"] == unique_key + assert "uploaded_at" in data["uploaded_ontologies"][0] @patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) @@ -66,30 +68,197 @@ def test_upload_ontology_invalid_file(mock_get_default_user, client, mock_defaul @patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) def test_upload_ontology_missing_data(mock_get_default_user, client, mock_default_user): """Test 400 response for missing file or key""" + import json + mock_get_default_user.return_value = mock_default_user # Missing file - response = client.post("/api/v1/ontologies", data={"ontology_key": "test"}) + response = client.post("/api/v1/ontologies", data={"ontology_key": json.dumps(["test"])}) assert response.status_code == 400 # Missing key - response = client.post("/api/v1/ontologies", files={"ontology_file": ("test.owl", b"xml")}) + response = client.post( + "/api/v1/ontologies", files=[("ontology_file", ("test.owl", b"xml", "application/xml"))] + ) assert response.status_code == 400 @patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) def test_upload_ontology_unauthorized(mock_get_default_user, client, mock_default_user): """Test behavior when default user is provided (no explicit authentication)""" + import json + unique_key = f"test_ontology_{uuid.uuid4().hex[:8]}" mock_get_default_user.return_value = mock_default_user response = client.post( "/api/v1/ontologies", - files={"ontology_file": ("test.owl", b"")}, - data={"ontology_key": unique_key}, + files=[("ontology_file", ("test.owl", b"", "application/xml"))], + data={"ontology_key": json.dumps([unique_key])}, ) # The current system provides a default user when no explicit authentication is given # This test verifies the system works with conditional authentication assert response.status_code == 200 data = response.json() - assert data["ontology_key"] == unique_key - assert "uploaded_at" in data + assert data["uploaded_ontologies"][0]["ontology_key"] == unique_key + assert "uploaded_at" in data["uploaded_ontologies"][0] + + +@patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) +async def test_upload_multiple_ontologies(mock_get_default_user, client, mock_default_user): + """Test uploading multiple ontology files in single request""" + import io + + # Create mock files + file1_content = b"" + file2_content = b"" + + files = [ + ("ontology_file", ("vehicles.owl", io.BytesIO(file1_content), "application/xml")), + ("ontology_file", ("manufacturers.owl", io.BytesIO(file2_content), "application/xml")), + ] + data = { + "ontology_key": '["vehicles", "manufacturers"]', + "descriptions": '["Base vehicles", "Car manufacturers"]', + } + + response = client.post("/api/v1/ontologies", files=files, data=data) + + assert response.status_code == 200 + result = response.json() + assert "uploaded_ontologies" in result + assert len(result["uploaded_ontologies"]) == 2 + assert result["uploaded_ontologies"][0]["ontology_key"] == "vehicles" + assert result["uploaded_ontologies"][1]["ontology_key"] == "manufacturers" + + +@patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) +async def test_upload_endpoint_accepts_arrays(mock_get_default_user, client, mock_default_user): + """Test that upload endpoint accepts array parameters""" + import io + import json + + file_content = b"" + + files = [("ontology_file", ("single.owl", io.BytesIO(file_content), "application/xml"))] + data = { + "ontology_key": json.dumps(["single_key"]), + "descriptions": json.dumps(["Single ontology"]), + } + + response = client.post("/api/v1/ontologies", files=files, data=data) + + assert response.status_code == 200 + result = response.json() + assert result["uploaded_ontologies"][0]["ontology_key"] == "single_key" + + +@patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) +async def test_cognify_with_multiple_ontologies(mock_get_default_user, client, mock_default_user): + """Test cognify endpoint accepts multiple ontology keys""" + payload = { + "datasets": ["test_dataset"], + "ontology_key": ["ontology1", "ontology2"], # Array instead of string + "run_in_background": False, + } + + response = client.post("/api/v1/cognify", json=payload) + + # Should not fail due to ontology_key type + assert response.status_code in [200, 400, 409] # May fail for other reasons, not type + + +@patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) +async def test_complete_multifile_workflow(mock_get_default_user, client, mock_default_user): + """Test complete workflow: upload multiple ontologies → cognify with multiple keys""" + import io + import json + + # Step 1: Upload multiple ontologies + file1_content = b""" + + + """ + + file2_content = b""" + + + """ + + files = [ + ("ontology_file", ("vehicles.owl", io.BytesIO(file1_content), "application/xml")), + ("ontology_file", ("manufacturers.owl", io.BytesIO(file2_content), "application/xml")), + ] + data = { + "ontology_key": json.dumps(["vehicles", "manufacturers"]), + "descriptions": json.dumps(["Vehicle ontology", "Manufacturer ontology"]), + } + + upload_response = client.post("/api/v1/ontologies", files=files, data=data) + assert upload_response.status_code == 200 + + # Step 2: Verify ontologies are listed + list_response = client.get("/api/v1/ontologies") + assert list_response.status_code == 200 + ontologies = list_response.json() + assert "vehicles" in ontologies + assert "manufacturers" in ontologies + + # Step 3: Test cognify with multiple ontologies + cognify_payload = { + "datasets": ["test_dataset"], + "ontology_key": ["vehicles", "manufacturers"], + "run_in_background": False, + } + + cognify_response = client.post("/api/v1/cognify", json=cognify_payload) + # Should not fail due to ontology handling (may fail for dataset reasons) + assert cognify_response.status_code != 400 # Not a validation error + + +@patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) +async def test_multifile_error_handling(mock_get_default_user, client, mock_default_user): + """Test error handling for invalid multifile uploads""" + import io + import json + + # Test mismatched array lengths + file_content = b"" + files = [("ontology_file", ("test.owl", io.BytesIO(file_content), "application/xml"))] + data = { + "ontology_key": json.dumps(["key1", "key2"]), # 2 keys, 1 file + "descriptions": json.dumps(["desc1"]), + } + + response = client.post("/api/v1/ontologies", files=files, data=data) + assert response.status_code == 400 + assert "Number of keys must match number of files" in response.json()["error"] + + # Test duplicate keys + files = [ + ("ontology_file", ("test1.owl", io.BytesIO(file_content), "application/xml")), + ("ontology_file", ("test2.owl", io.BytesIO(file_content), "application/xml")), + ] + data = { + "ontology_key": json.dumps(["duplicate", "duplicate"]), + "descriptions": json.dumps(["desc1", "desc2"]), + } + + response = client.post("/api/v1/ontologies", files=files, data=data) + assert response.status_code == 400 + assert "Duplicate ontology keys not allowed" in response.json()["error"] + + +@patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) +async def test_cognify_missing_ontology_key(mock_get_default_user, client, mock_default_user): + """Test cognify with non-existent ontology key""" + payload = { + "datasets": ["test_dataset"], + "ontology_key": ["nonexistent_key"], + "run_in_background": False, + } + + response = client.post("/api/v1/cognify", json=payload) + assert response.status_code == 409 + assert "Ontology key 'nonexistent_key' not found" in response.json()["error"] From 983bfae4fcc9046fd520f0c24733e491679d54cf Mon Sep 17 00:00:00 2001 From: EricXiao Date: Mon, 17 Nov 2025 14:41:55 +0800 Subject: [PATCH 76/85] chore: remove unnecessary csv file type Signed-off-by: EricXiao --- .../files/utils/is_csv_content.py | 181 ------------------ 1 file changed, 181 deletions(-) delete mode 100644 cognee/infrastructure/files/utils/is_csv_content.py diff --git a/cognee/infrastructure/files/utils/is_csv_content.py b/cognee/infrastructure/files/utils/is_csv_content.py deleted file mode 100644 index 07b7ea69b..000000000 --- a/cognee/infrastructure/files/utils/is_csv_content.py +++ /dev/null @@ -1,181 +0,0 @@ -import csv -from collections import Counter - - -def is_csv_content(content): - """ - Heuristically determine whether a bytes-like object is CSV text. - - Strategy (fail-fast and cheap to expensive): - 1) Decode: Try a small ordered list of common encodings with strict errors. - 2) Line sampling: require >= 2 non-empty lines; sample up to 50 lines. - 3) Delimiter detection: - - Prefer csv.Sniffer() with common delimiters. - - Fallback to a lightweight consistency heuristic. - 4) Lightweight parse check: - - Parse a few lines with the delimiter. - - Ensure at least 2 valid rows and relatively stable column counts. - - Returns: - bool: True if the buffer looks like CSV; False otherwise. - """ - try: - encoding_list = [ - "utf-8", - "utf-8-sig", - "utf-32-le", - "utf-32-be", - "utf-16-le", - "utf-16-be", - "gb18030", - "shift_jis", - "cp949", - "cp1252", - "iso-8859-1", - ] - - # Try to decode strictly—if decoding fails for all encodings, it's not text/CSV. - text = None - for enc in encoding_list: - try: - text = content.decode(enc, errors="strict") - break - except UnicodeDecodeError: - continue - if text is None: - return False - - # Reject empty/whitespace-only payloads. - stripped = text.strip() - if not stripped: - return False - - # Split into logical lines and drop empty ones. Require at least two lines. - lines = [ln for ln in text.splitlines() if ln.strip()] - if len(lines) < 2: - return False - - # Take a small sample to keep sniffing cheap and predictable. - sample_lines = lines[:50] - - # Detect delimiter using csv.Sniffer first; if that fails, use our heuristic. - delimiter = _sniff_delimiter(sample_lines) or _heuristic_delimiter(sample_lines) - if not delimiter: - return False - - # Finally, do a lightweight parse sanity check with the chosen delimiter. - return _lightweight_parse_check(sample_lines, delimiter) - except Exception: - return False - - -def _sniff_delimiter(lines): - """ - Try Python's built-in csv.Sniffer on a sample. - - Args: - lines (list[str]): Sample lines (already decoded). - - Returns: - str | None: The detected delimiter if sniffing succeeds; otherwise None. - """ - # Join up to 50 lines to form the sample string Sniffer will inspect. - sample = "\n".join(lines[:50]) - try: - dialect = csv.Sniffer().sniff(sample, delimiters=",\t;|") - return dialect.delimiter - except Exception: - # Sniffer is known to be brittle on small/dirty samples—silently fallback. - return None - - -def _heuristic_delimiter(lines): - """ - Fallback delimiter detection based on count consistency per line. - - Heuristic: - - For each candidate delimiter, count occurrences per line. - - Keep only lines with count > 0 (line must contain the delimiter). - - Require at least half of lines to contain the delimiter (min 2). - - Compute the mode (most common count). If the proportion of lines that - exhibit the modal count is >= 80%, accept that delimiter. - - Args: - lines (list[str]): Sample lines. - - Returns: - str | None: Best delimiter if one meets the consistency threshold; else None. - """ - candidates = [",", "\t", ";", "|"] - best = None - best_score = 0.0 - - for d in candidates: - # Count how many times the delimiter appears in each line. - counts = [ln.count(d) for ln in lines] - # Consider only lines that actually contain the delimiter at least once. - nonzero = [c for c in counts if c > 0] - - # Require that more than half of lines (and at least 2) contain the delimiter. - if len(nonzero) < max(2, int(0.5 * len(lines))): - continue - - # Find the modal count and its frequency. - cnt = Counter(nonzero) - pairs = cnt.most_common(1) - if not pairs: - continue - - mode, mode_freq = pairs[0] - # Consistency ratio: lines with the modal count / total lines in the sample. - consistency = mode_freq / len(lines) - # Accept if consistent enough and better than any previous candidate. - if mode >= 1 and consistency >= 0.80 and consistency > best_score: - best = d - best_score = consistency - - return best - - -def _lightweight_parse_check(lines, delimiter): - """ - Parse a few lines with csv.reader and check structural stability. - - Heuristic: - - Parse up to 5 lines with the given delimiter. - - Count column widths per parsed row. - - Require at least 2 non-empty rows. - - Allow at most 1 row whose width deviates by >2 columns from the first row. - - Args: - lines (list[str]): Sample lines (decoded). - delimiter (str): Delimiter chosen by sniffing/heuristics. - - Returns: - bool: True if parsing looks stable; False otherwise. - """ - try: - # csv.reader accepts any iterable of strings; feeding the first 10 lines is fine. - reader = csv.reader(lines[:10], delimiter=delimiter) - widths = [] - valid_rows = 0 - for row in reader: - if not row: - continue - - widths.append(len(row)) - valid_rows += 1 - - # Need at least two meaningful rows to make a judgment. - if valid_rows < 2: - return False - - if widths: - first = widths[0] - # Count rows whose width deviates significantly (>2) from the first row. - unstable = sum(1 for w in widths if abs(w - first) > 2) - # Permit at most 1 unstable row among the parsed sample. - return unstable <= 1 - return False - except Exception: - return False From fe55071849c06ecd3fe70602d56bb1f2904538a4 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Wed, 19 Nov 2025 15:33:50 +0100 Subject: [PATCH 77/85] Feature/cog 3407 fixing integration test in ci (#1810) ## Description This PR should fix the web crawler integration test issue in our CI ## Type of Change - [x] Bug fix (non-breaking change that fixes an issue) - [ ] New feature (non-breaking change that adds functionality) - [ ] Breaking change (fix or feature that would cause existing functionality to change) - [ ] Documentation update - [ ] Code refactoring - [ ] Performance improvement - [ ] Other (please specify): ## Screenshots/Videos (if applicable) ## Pre-submission Checklist - [ ] **I have tested my changes thoroughly before submitting this PR** - [ ] **This PR contains minimal changes necessary to address the issue/feature** - [ ] My code follows the project's coding standards and style guidelines - [ ] I have added tests that prove my fix is effective or that my feature works - [ ] I have added necessary documentation (if applicable) - [ ] All new and existing tests pass - [ ] I have searched existing PRs to ensure this change hasn't been submitted already - [ ] I have linked any relevant issues in the description - [ ] My commits have clear and descriptive messages ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin. --- .../test_default_url_crawler.py | 2 +- .../web_url_crawler/test_tavily_crawler.py | 2 +- .../web_url_crawler/test_url_adding_e2e.py | 40 ++++++------------- 3 files changed, 15 insertions(+), 29 deletions(-) diff --git a/cognee/tests/integration/web_url_crawler/test_default_url_crawler.py b/cognee/tests/integration/web_url_crawler/test_default_url_crawler.py index 156cc87a4..f48c1cedc 100644 --- a/cognee/tests/integration/web_url_crawler/test_default_url_crawler.py +++ b/cognee/tests/integration/web_url_crawler/test_default_url_crawler.py @@ -5,7 +5,7 @@ from cognee.tasks.web_scraper import DefaultUrlCrawler @pytest.mark.asyncio async def test_fetch(): crawler = DefaultUrlCrawler() - url = "https://en.wikipedia.org/wiki/Large_language_model" + url = "https://httpbin.org/html" results = await crawler.fetch_urls(url) assert len(results) == 1 assert isinstance(results, dict) diff --git a/cognee/tests/integration/web_url_crawler/test_tavily_crawler.py b/cognee/tests/integration/web_url_crawler/test_tavily_crawler.py index 946ce8378..19ffdc4ea 100644 --- a/cognee/tests/integration/web_url_crawler/test_tavily_crawler.py +++ b/cognee/tests/integration/web_url_crawler/test_tavily_crawler.py @@ -11,7 +11,7 @@ skip_in_ci = pytest.mark.skipif( @skip_in_ci @pytest.mark.asyncio async def test_fetch(): - url = "https://en.wikipedia.org/wiki/Large_language_model" + url = "https://httpbin.org/html" results = await fetch_with_tavily(url) assert isinstance(results, dict) assert len(results) == 1 diff --git a/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py b/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py index d91b075aa..cc8ae24d0 100644 --- a/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py +++ b/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py @@ -14,9 +14,7 @@ async def test_url_saves_as_html_file(): await cognee.prune.prune_system(metadata=True) try: - original_file_path = await save_data_item_to_storage( - "https://en.wikipedia.org/wiki/Large_language_model" - ) + original_file_path = await save_data_item_to_storage("https://httpbin.org/html") file_path = get_data_file_path(original_file_path) assert file_path.endswith(".html") file = Path(file_path) @@ -44,9 +42,7 @@ async def test_saved_html_is_valid(): await cognee.prune.prune_system(metadata=True) try: - original_file_path = await save_data_item_to_storage( - "https://en.wikipedia.org/wiki/Large_language_model" - ) + original_file_path = await save_data_item_to_storage("https://httpbin.org/html") file_path = get_data_file_path(original_file_path) content = Path(file_path).read_text() @@ -72,7 +68,7 @@ async def test_add_url(): await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) - await cognee.add("https://en.wikipedia.org/wiki/Large_language_model") + await cognee.add("https://httpbin.org/html") skip_in_ci = pytest.mark.skipif( @@ -88,7 +84,7 @@ async def test_add_url_with_tavily(): await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) - await cognee.add("https://en.wikipedia.org/wiki/Large_language_model") + await cognee.add("https://httpbin.org/html") @pytest.mark.asyncio @@ -98,7 +94,7 @@ async def test_add_url_without_incremental_loading(): try: await cognee.add( - "https://en.wikipedia.org/wiki/Large_language_model", + "https://httpbin.org/html", incremental_loading=False, ) except Exception as e: @@ -112,7 +108,7 @@ async def test_add_url_with_incremental_loading(): try: await cognee.add( - "https://en.wikipedia.org/wiki/Large_language_model", + "https://httpbin.org/html", incremental_loading=True, ) except Exception as e: @@ -125,7 +121,7 @@ async def test_add_url_can_define_preferred_loader_as_list_of_str(): await cognee.prune.prune_system(metadata=True) await cognee.add( - "https://en.wikipedia.org/wiki/Large_language_model", + "https://httpbin.org/html", preferred_loaders=["beautiful_soup_loader"], ) @@ -144,7 +140,7 @@ async def test_add_url_with_extraction_rules(): try: await cognee.add( - "https://en.wikipedia.org/wiki/Large_language_model", + "https://httpbin.org/html", preferred_loaders={"beautiful_soup_loader": {"extraction_rules": extraction_rules}}, ) except Exception as e: @@ -163,9 +159,7 @@ async def test_loader_is_none_by_default(): } try: - original_file_path = await save_data_item_to_storage( - "https://en.wikipedia.org/wiki/Large_language_model" - ) + original_file_path = await save_data_item_to_storage("https://httpbin.org/html") file_path = get_data_file_path(original_file_path) assert file_path.endswith(".html") file = Path(file_path) @@ -196,9 +190,7 @@ async def test_beautiful_soup_loader_is_selected_loader_if_preferred_loader_prov } try: - original_file_path = await save_data_item_to_storage( - "https://en.wikipedia.org/wiki/Large_language_model" - ) + original_file_path = await save_data_item_to_storage("https://httpbin.org/html") file_path = get_data_file_path(original_file_path) assert file_path.endswith(".html") file = Path(file_path) @@ -225,9 +217,7 @@ async def test_beautiful_soup_loader_works_with_and_without_arguments(): await cognee.prune.prune_system(metadata=True) try: - original_file_path = await save_data_item_to_storage( - "https://en.wikipedia.org/wiki/Large_language_model" - ) + original_file_path = await save_data_item_to_storage("https://httpbin.org/html") file_path = get_data_file_path(original_file_path) assert file_path.endswith(".html") file = Path(file_path) @@ -263,9 +253,7 @@ async def test_beautiful_soup_loader_successfully_loads_file_if_required_args_pr await cognee.prune.prune_system(metadata=True) try: - original_file_path = await save_data_item_to_storage( - "https://en.wikipedia.org/wiki/Large_language_model" - ) + original_file_path = await save_data_item_to_storage("https://httpbin.org/html") file_path = get_data_file_path(original_file_path) assert file_path.endswith(".html") file = Path(file_path) @@ -302,9 +290,7 @@ async def test_beautiful_soup_loads_file_successfully(): } try: - original_file_path = await save_data_item_to_storage( - "https://en.wikipedia.org/wiki/Large_language_model" - ) + original_file_path = await save_data_item_to_storage("https://httpbin.org/html") file_path = get_data_file_path(original_file_path) assert file_path.endswith(".html") original_file = Path(file_path) From 30e3971d44816db50d9e83eee39bf6d69b98a328 Mon Sep 17 00:00:00 2001 From: Fahad Shoaib Date: Thu, 20 Nov 2025 15:36:15 +0500 Subject: [PATCH 78/85] fix: add auth headers to ontology upload request and enhance ontology content --- cognee/tests/test_cognee_server_start.py | 53 +++++++++++++++++++++--- 1 file changed, 47 insertions(+), 6 deletions(-) diff --git a/cognee/tests/test_cognee_server_start.py b/cognee/tests/test_cognee_server_start.py index b266fc7bf..ddffe53a4 100644 --- a/cognee/tests/test_cognee_server_start.py +++ b/cognee/tests/test_cognee_server_start.py @@ -98,15 +98,56 @@ class TestCogneeServerStart(unittest.TestCase): if add_response.status_code not in [200, 201]: add_response.raise_for_status() - ontology_content = b""" - - - - - """ + ontology_content = b""" + + + + + + + + + + + + + + + + A failure caused by physical components. + + + + + An error caused by software logic or configuration. + + + + A human being or individual. + + + + + Programmers + + + + Light Bulb + + + + Hardware Problem + + + """ ontology_response = requests.post( "http://127.0.0.1:8000/api/v1/ontologies", + headers=headers, files=[("ontology_file", ("test.owl", ontology_content, "application/xml"))], data={ "ontology_key": json.dumps([ontology_key]), From 8cfb6c41eeca3b2ad0e34fb4b6043f4b5f6a8c00 Mon Sep 17 00:00:00 2001 From: Fahad Shoaib Date: Thu, 20 Nov 2025 15:54:09 +0500 Subject: [PATCH 79/85] fix: remove async from ontology endpoint test functions --- cognee/tests/unit/api/test_ontology_endpoint.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cognee/tests/unit/api/test_ontology_endpoint.py b/cognee/tests/unit/api/test_ontology_endpoint.py index c04959998..d53c5ab44 100644 --- a/cognee/tests/unit/api/test_ontology_endpoint.py +++ b/cognee/tests/unit/api/test_ontology_endpoint.py @@ -104,7 +104,7 @@ def test_upload_ontology_unauthorized(mock_get_default_user, client, mock_defaul @patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) -async def test_upload_multiple_ontologies(mock_get_default_user, client, mock_default_user): +def test_upload_multiple_ontologies(mock_get_default_user, client, mock_default_user): """Test uploading multiple ontology files in single request""" import io @@ -132,7 +132,7 @@ async def test_upload_multiple_ontologies(mock_get_default_user, client, mock_de @patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) -async def test_upload_endpoint_accepts_arrays(mock_get_default_user, client, mock_default_user): +def test_upload_endpoint_accepts_arrays(mock_get_default_user, client, mock_default_user): """Test that upload endpoint accepts array parameters""" import io import json @@ -153,7 +153,7 @@ async def test_upload_endpoint_accepts_arrays(mock_get_default_user, client, moc @patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) -async def test_cognify_with_multiple_ontologies(mock_get_default_user, client, mock_default_user): +def test_cognify_with_multiple_ontologies(mock_get_default_user, client, mock_default_user): """Test cognify endpoint accepts multiple ontology keys""" payload = { "datasets": ["test_dataset"], @@ -168,7 +168,7 @@ async def test_cognify_with_multiple_ontologies(mock_get_default_user, client, m @patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) -async def test_complete_multifile_workflow(mock_get_default_user, client, mock_default_user): +def test_complete_multifile_workflow(mock_get_default_user, client, mock_default_user): """Test complete workflow: upload multiple ontologies → cognify with multiple keys""" import io import json @@ -218,7 +218,7 @@ async def test_complete_multifile_workflow(mock_get_default_user, client, mock_d @patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) -async def test_multifile_error_handling(mock_get_default_user, client, mock_default_user): +def test_multifile_error_handling(mock_get_default_user, client, mock_default_user): """Test error handling for invalid multifile uploads""" import io import json @@ -251,7 +251,7 @@ async def test_multifile_error_handling(mock_get_default_user, client, mock_defa @patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) -async def test_cognify_missing_ontology_key(mock_get_default_user, client, mock_default_user): +def test_cognify_missing_ontology_key(mock_get_default_user, client, mock_default_user): """Test cognify with non-existent ontology key""" payload = { "datasets": ["test_dataset"], From 4e880eca8422872b26a568b18b9f1339ce362c18 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Thu, 20 Nov 2025 15:47:22 +0100 Subject: [PATCH 80/85] chore: update env template --- .env.template | 1 + 1 file changed, 1 insertion(+) diff --git a/.env.template b/.env.template index ae2cb1338..376233b1f 100644 --- a/.env.template +++ b/.env.template @@ -21,6 +21,7 @@ LLM_PROVIDER="openai" LLM_ENDPOINT="" LLM_API_VERSION="" LLM_MAX_TOKENS="16384" +LLM_INSTRUCTOR_MODE="json_schema_mode" # this mode is used for gpt-5 models EMBEDDING_PROVIDER="openai" EMBEDDING_MODEL="openai/text-embedding-3-large" From 2176ec16b8e440087f96410fed979528e8159ca2 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Thu, 20 Nov 2025 17:03:36 +0100 Subject: [PATCH 81/85] chore: changes url for crawler tests (#1816) Updates crawler test url to avoid blocking and unavailable sites in CI. ## Description ## Type of Change - [ ] Bug fix (non-breaking change that fixes an issue) - [ ] New feature (non-breaking change that adds functionality) - [ ] Breaking change (fix or feature that would cause existing functionality to change) - [ ] Documentation update - [ ] Code refactoring - [ ] Performance improvement - [ ] Other (please specify): ## Screenshots/Videos (if applicable) ## Pre-submission Checklist - [ ] **I have tested my changes thoroughly before submitting this PR** - [ ] **This PR contains minimal changes necessary to address the issue/feature** - [ ] My code follows the project's coding standards and style guidelines - [ ] I have added tests that prove my fix is effective or that my feature works - [ ] I have added necessary documentation (if applicable) - [ ] All new and existing tests pass - [ ] I have searched existing PRs to ensure this change hasn't been submitted already - [ ] I have linked any relevant issues in the description - [ ] My commits have clear and descriptive messages ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin. --- .../test_default_url_crawler.py | 2 +- .../web_url_crawler/test_tavily_crawler.py | 2 +- .../web_url_crawler/test_url_adding_e2e.py | 26 +++++++++---------- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/cognee/tests/integration/web_url_crawler/test_default_url_crawler.py b/cognee/tests/integration/web_url_crawler/test_default_url_crawler.py index f48c1cedc..af2595b14 100644 --- a/cognee/tests/integration/web_url_crawler/test_default_url_crawler.py +++ b/cognee/tests/integration/web_url_crawler/test_default_url_crawler.py @@ -5,7 +5,7 @@ from cognee.tasks.web_scraper import DefaultUrlCrawler @pytest.mark.asyncio async def test_fetch(): crawler = DefaultUrlCrawler() - url = "https://httpbin.org/html" + url = "http://example.com/" results = await crawler.fetch_urls(url) assert len(results) == 1 assert isinstance(results, dict) diff --git a/cognee/tests/integration/web_url_crawler/test_tavily_crawler.py b/cognee/tests/integration/web_url_crawler/test_tavily_crawler.py index 19ffdc4ea..5db9b58ce 100644 --- a/cognee/tests/integration/web_url_crawler/test_tavily_crawler.py +++ b/cognee/tests/integration/web_url_crawler/test_tavily_crawler.py @@ -11,7 +11,7 @@ skip_in_ci = pytest.mark.skipif( @skip_in_ci @pytest.mark.asyncio async def test_fetch(): - url = "https://httpbin.org/html" + url = "http://example.com/" results = await fetch_with_tavily(url) assert isinstance(results, dict) assert len(results) == 1 diff --git a/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py b/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py index cc8ae24d0..200f40a94 100644 --- a/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py +++ b/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py @@ -14,7 +14,7 @@ async def test_url_saves_as_html_file(): await cognee.prune.prune_system(metadata=True) try: - original_file_path = await save_data_item_to_storage("https://httpbin.org/html") + original_file_path = await save_data_item_to_storage("http://example.com/") file_path = get_data_file_path(original_file_path) assert file_path.endswith(".html") file = Path(file_path) @@ -42,7 +42,7 @@ async def test_saved_html_is_valid(): await cognee.prune.prune_system(metadata=True) try: - original_file_path = await save_data_item_to_storage("https://httpbin.org/html") + original_file_path = await save_data_item_to_storage("http://example.com/") file_path = get_data_file_path(original_file_path) content = Path(file_path).read_text() @@ -68,7 +68,7 @@ async def test_add_url(): await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) - await cognee.add("https://httpbin.org/html") + await cognee.add("http://example.com/") skip_in_ci = pytest.mark.skipif( @@ -84,7 +84,7 @@ async def test_add_url_with_tavily(): await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) - await cognee.add("https://httpbin.org/html") + await cognee.add("http://example.com/") @pytest.mark.asyncio @@ -94,7 +94,7 @@ async def test_add_url_without_incremental_loading(): try: await cognee.add( - "https://httpbin.org/html", + "http://example.com/", incremental_loading=False, ) except Exception as e: @@ -108,7 +108,7 @@ async def test_add_url_with_incremental_loading(): try: await cognee.add( - "https://httpbin.org/html", + "http://example.com/", incremental_loading=True, ) except Exception as e: @@ -121,7 +121,7 @@ async def test_add_url_can_define_preferred_loader_as_list_of_str(): await cognee.prune.prune_system(metadata=True) await cognee.add( - "https://httpbin.org/html", + "http://example.com/", preferred_loaders=["beautiful_soup_loader"], ) @@ -140,7 +140,7 @@ async def test_add_url_with_extraction_rules(): try: await cognee.add( - "https://httpbin.org/html", + "http://example.com/", preferred_loaders={"beautiful_soup_loader": {"extraction_rules": extraction_rules}}, ) except Exception as e: @@ -159,7 +159,7 @@ async def test_loader_is_none_by_default(): } try: - original_file_path = await save_data_item_to_storage("https://httpbin.org/html") + original_file_path = await save_data_item_to_storage("http://example.com/") file_path = get_data_file_path(original_file_path) assert file_path.endswith(".html") file = Path(file_path) @@ -190,7 +190,7 @@ async def test_beautiful_soup_loader_is_selected_loader_if_preferred_loader_prov } try: - original_file_path = await save_data_item_to_storage("https://httpbin.org/html") + original_file_path = await save_data_item_to_storage("http://example.com/") file_path = get_data_file_path(original_file_path) assert file_path.endswith(".html") file = Path(file_path) @@ -217,7 +217,7 @@ async def test_beautiful_soup_loader_works_with_and_without_arguments(): await cognee.prune.prune_system(metadata=True) try: - original_file_path = await save_data_item_to_storage("https://httpbin.org/html") + original_file_path = await save_data_item_to_storage("http://example.com/") file_path = get_data_file_path(original_file_path) assert file_path.endswith(".html") file = Path(file_path) @@ -253,7 +253,7 @@ async def test_beautiful_soup_loader_successfully_loads_file_if_required_args_pr await cognee.prune.prune_system(metadata=True) try: - original_file_path = await save_data_item_to_storage("https://httpbin.org/html") + original_file_path = await save_data_item_to_storage("http://example.com/") file_path = get_data_file_path(original_file_path) assert file_path.endswith(".html") file = Path(file_path) @@ -290,7 +290,7 @@ async def test_beautiful_soup_loads_file_successfully(): } try: - original_file_path = await save_data_item_to_storage("https://httpbin.org/html") + original_file_path = await save_data_item_to_storage("http://example.com/") file_path = get_data_file_path(original_file_path) assert file_path.endswith(".html") original_file = Path(file_path) From 204f9c2e4ad6dc706c03deed68adfbc4744ae6df Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Fri, 21 Nov 2025 16:20:19 +0100 Subject: [PATCH 82/85] fix: PR comment changes --- .env.template | 5 ++++- cognee/infrastructure/llm/config.py | 4 ++-- .../litellm_instructor/llm/get_llm_client.py | 12 ++++++------ 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/.env.template b/.env.template index 376233b1f..61853b983 100644 --- a/.env.template +++ b/.env.template @@ -21,7 +21,10 @@ LLM_PROVIDER="openai" LLM_ENDPOINT="" LLM_API_VERSION="" LLM_MAX_TOKENS="16384" -LLM_INSTRUCTOR_MODE="json_schema_mode" # this mode is used for gpt-5 models +# Instructor's modes determine how structured data is requested from and extracted from LLM responses +# You can change this type (i.e. mode) via this env variable +# Each LLM has its own default value, e.g. gpt-5 models have "json_schema_mode" +LLM_INSTRUCTOR_MODE="" EMBEDDING_PROVIDER="openai" EMBEDDING_MODEL="openai/text-embedding-3-large" diff --git a/cognee/infrastructure/llm/config.py b/cognee/infrastructure/llm/config.py index c87054ff6..2e300dc0c 100644 --- a/cognee/infrastructure/llm/config.py +++ b/cognee/infrastructure/llm/config.py @@ -38,7 +38,7 @@ class LLMConfig(BaseSettings): """ structured_output_framework: str = "instructor" - llm_instructor_mode: Optional[str] = None + llm_instructor_mode: str = "" llm_provider: str = "openai" llm_model: str = "openai/gpt-5-mini" llm_endpoint: str = "" @@ -182,7 +182,7 @@ class LLMConfig(BaseSettings): instance. """ return { - "llm_instructor_mode": self.llm_instructor_mode, + "llm_instructor_mode": self.llm_instructor_mode.lower(), "provider": self.llm_provider, "model": self.llm_model, "endpoint": self.llm_endpoint, diff --git a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py index 537eda1b2..6ab3b91ad 100644 --- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py @@ -81,7 +81,7 @@ def get_llm_client(raise_api_key_error: bool = True): model=llm_config.llm_model, transcription_model=llm_config.transcription_model, max_completion_tokens=max_completion_tokens, - instructor_mode=llm_config.llm_instructor_mode, + instructor_mode=llm_config.llm_instructor_mode.lower(), streaming=llm_config.llm_streaming, fallback_api_key=llm_config.fallback_api_key, fallback_endpoint=llm_config.fallback_endpoint, @@ -102,7 +102,7 @@ def get_llm_client(raise_api_key_error: bool = True): llm_config.llm_model, "Ollama", max_completion_tokens=max_completion_tokens, - instructor_mode=llm_config.llm_instructor_mode, + instructor_mode=llm_config.llm_instructor_mode.lower(), ) elif provider == LLMProvider.ANTHROPIC: @@ -113,7 +113,7 @@ def get_llm_client(raise_api_key_error: bool = True): return AnthropicAdapter( max_completion_tokens=max_completion_tokens, model=llm_config.llm_model, - instructor_mode=llm_config.llm_instructor_mode, + instructor_mode=llm_config.llm_instructor_mode.lower(), ) elif provider == LLMProvider.CUSTOM: @@ -130,7 +130,7 @@ def get_llm_client(raise_api_key_error: bool = True): llm_config.llm_model, "Custom", max_completion_tokens=max_completion_tokens, - instructor_mode=llm_config.llm_instructor_mode, + instructor_mode=llm_config.llm_instructor_mode.lower(), fallback_api_key=llm_config.fallback_api_key, fallback_endpoint=llm_config.fallback_endpoint, fallback_model=llm_config.fallback_model, @@ -150,7 +150,7 @@ def get_llm_client(raise_api_key_error: bool = True): max_completion_tokens=max_completion_tokens, endpoint=llm_config.llm_endpoint, api_version=llm_config.llm_api_version, - instructor_mode=llm_config.llm_instructor_mode, + instructor_mode=llm_config.llm_instructor_mode.lower(), ) elif provider == LLMProvider.MISTRAL: @@ -166,7 +166,7 @@ def get_llm_client(raise_api_key_error: bool = True): model=llm_config.llm_model, max_completion_tokens=max_completion_tokens, endpoint=llm_config.llm_endpoint, - instructor_mode=llm_config.llm_instructor_mode, + instructor_mode=llm_config.llm_instructor_mode.lower(), ) elif provider == LLMProvider.MISTRAL: From af8c55e82bd5dcefb526ffc106f3dcbc1a881a24 Mon Sep 17 00:00:00 2001 From: Boris Arzentar Date: Mon, 24 Nov 2025 16:16:47 +0100 Subject: [PATCH 83/85] version: 0.5.0.dev0 --- poetry.lock | 62 +++++++++++++++++++++----------------------------- pyproject.toml | 2 +- uv.lock | 10 +++++++- 3 files changed, 36 insertions(+), 38 deletions(-) diff --git a/poetry.lock b/poetry.lock index 67de51633..0736a7bb7 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.2.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand. [[package]] name = "accelerate" @@ -1231,12 +1231,12 @@ version = "0.4.6" description = "Cross-platform colored terminal text." optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" -groups = ["main", "dev"] +groups = ["main"] +markers = "(platform_system == \"Windows\" or extra == \"llama-index\" or extra == \"dev\" or extra == \"chromadb\" or sys_platform == \"win32\") and (platform_system == \"Windows\" or os_name == \"nt\" or extra == \"llama-index\" or extra == \"dev\" or sys_platform == \"win32\")" files = [ {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, ] -markers = {main = "(platform_system == \"Windows\" or extra == \"llama-index\" or extra == \"dev\" or extra == \"chromadb\" or sys_platform == \"win32\") and (platform_system == \"Windows\" or os_name == \"nt\" or extra == \"llama-index\" or extra == \"dev\" or sys_platform == \"win32\")", dev = "sys_platform == \"win32\""} [[package]] name = "coloredlogs" @@ -2347,7 +2347,7 @@ version = "1.3.0" description = "Backport of PEP 654 (exception groups)" optional = false python-versions = ">=3.7" -groups = ["main", "dev"] +groups = ["main"] markers = "python_version == \"3.10\"" files = [ {file = "exceptiongroup-1.3.0-py3-none-any.whl", hash = "sha256:4d111e6e0c13d0644cad6ddaa7ed0261a0b36971f6d23e7ec9b4b9097da78a10"}, @@ -3732,14 +3732,14 @@ type = ["pytest-mypy"] name = "iniconfig" version = "2.1.0" description = "brain-dead simple config-ini parsing" -optional = false +optional = true python-versions = ">=3.8" -groups = ["main", "dev"] +groups = ["main"] +markers = "extra == \"deepeval\" or extra == \"dev\"" files = [ {file = "iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760"}, {file = "iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7"}, ] -markers = {main = "extra == \"deepeval\" or extra == \"dev\""} [[package]] name = "instructor" @@ -4196,8 +4196,6 @@ groups = ["main"] markers = "extra == \"dlt\"" files = [ {file = "jsonpath-ng-1.7.0.tar.gz", hash = "sha256:f6f5f7fd4e5ff79c785f1573b394043b39849fb2bb47bcead935d12b00beab3c"}, - {file = "jsonpath_ng-1.7.0-py2-none-any.whl", hash = "sha256:898c93fc173f0c336784a3fa63d7434297544b7198124a68f9a3ef9597b0ae6e"}, - {file = "jsonpath_ng-1.7.0-py3-none-any.whl", hash = "sha256:f3d7f9e848cba1b6da28c55b1c26ff915dc9e0b1ba7e752a53d6da8d5cbd00b6"}, ] [package.dependencies] @@ -7634,7 +7632,7 @@ version = "24.2" description = "Core utilities for Python packages" optional = false python-versions = ">=3.8" -groups = ["main", "dev"] +groups = ["main"] files = [ {file = "packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759"}, {file = "packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f"}, @@ -8289,14 +8287,14 @@ kaleido = ["kaleido (>=1.0.0)"] name = "pluggy" version = "1.6.0" description = "plugin and hook calling mechanisms for python" -optional = false +optional = true python-versions = ">=3.9" -groups = ["main", "dev"] +groups = ["main"] +markers = "extra == \"deepeval\" or extra == \"dev\" or extra == \"dlt\" or extra == \"docling\"" files = [ {file = "pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746"}, {file = "pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3"}, ] -markers = {main = "extra == \"deepeval\" or extra == \"dev\" or extra == \"dlt\" or extra == \"docling\""} [package.extras] dev = ["pre-commit", "tox"] @@ -8656,7 +8654,6 @@ files = [ {file = "psycopg2-2.9.10-cp311-cp311-win_amd64.whl", hash = "sha256:0435034157049f6846e95103bd8f5a668788dd913a7c30162ca9503fdf542cb4"}, {file = "psycopg2-2.9.10-cp312-cp312-win32.whl", hash = "sha256:65a63d7ab0e067e2cdb3cf266de39663203d38d6a8ed97f5ca0cb315c73fe067"}, {file = "psycopg2-2.9.10-cp312-cp312-win_amd64.whl", hash = "sha256:4a579d6243da40a7b3182e0430493dbd55950c493d8c68f4eec0b302f6bbf20e"}, - {file = "psycopg2-2.9.10-cp313-cp313-win_amd64.whl", hash = "sha256:91fd603a2155da8d0cfcdbf8ab24a2d54bca72795b90d2a3ed2b6da8d979dee2"}, {file = "psycopg2-2.9.10-cp39-cp39-win32.whl", hash = "sha256:9d5b3b94b79a844a986d029eee38998232451119ad653aea42bb9220a8c5066b"}, {file = "psycopg2-2.9.10-cp39-cp39-win_amd64.whl", hash = "sha256:88138c8dedcbfa96408023ea2b0c369eda40fe5d75002c0964c78f46f11fa442"}, {file = "psycopg2-2.9.10.tar.gz", hash = "sha256:12ec0b40b0273f95296233e8750441339298e6a572f7039da5b260e3c8b60e11"}, @@ -8718,7 +8715,6 @@ files = [ {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:bb89f0a835bcfc1d42ccd5f41f04870c1b936d8507c6df12b7737febc40f0909"}, {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:f0c2d907a1e102526dd2986df638343388b94c33860ff3bbe1384130828714b1"}, {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f8157bed2f51db683f31306aa497311b560f2265998122abe1dce6428bd86567"}, - {file = "psycopg2_binary-2.9.10-cp313-cp313-win_amd64.whl", hash = "sha256:27422aa5f11fbcd9b18da48373eb67081243662f9b46e6fd07c3eb46e4535142"}, {file = "psycopg2_binary-2.9.10-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:eb09aa7f9cecb45027683bb55aebaaf45a0df8bf6de68801a6afdc7947bb09d4"}, {file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b73d6d7f0ccdad7bc43e6d34273f70d587ef62f824d7261c4ae9b8b1b6af90e8"}, {file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ce5ab4bf46a211a8e924d307c1b1fcda82368586a19d0a24f8ae166f5c784864"}, @@ -9698,14 +9694,14 @@ files = [ name = "pytest" version = "7.4.4" description = "pytest: simple powerful testing with Python" -optional = false +optional = true python-versions = ">=3.7" -groups = ["main", "dev"] +groups = ["main"] +markers = "extra == \"deepeval\" or extra == \"dev\"" files = [ {file = "pytest-7.4.4-py3-none-any.whl", hash = "sha256:b090cdf5ed60bf4c45261be03239c2c1c22df034fbffe691abe93cd80cea01d8"}, {file = "pytest-7.4.4.tar.gz", hash = "sha256:2cf0005922c6ace4a3e2ec8b4080eb0d9753fdc93107415332f50ce9e7994280"}, ] -markers = {main = "extra == \"deepeval\" or extra == \"dev\""} [package.dependencies] colorama = {version = "*", markers = "sys_platform == \"win32\""} @@ -9792,21 +9788,6 @@ files = [ packaging = ">=17.1" pytest = ">=6.2" -[[package]] -name = "pytest-timeout" -version = "2.4.0" -description = "pytest plugin to abort hanging tests" -optional = false -python-versions = ">=3.7" -groups = ["dev"] -files = [ - {file = "pytest_timeout-2.4.0-py3-none-any.whl", hash = "sha256:c42667e5cdadb151aeb5b26d114aff6bdf5a907f176a007a30b940d3d865b5c2"}, - {file = "pytest_timeout-2.4.0.tar.gz", hash = "sha256:7e68e90b01f9eff71332b25001f85c75495fc4e3a836701876183c4bcfd0540a"}, -] - -[package.dependencies] -pytest = ">=7.0.0" - [[package]] name = "pytest-xdist" version = "3.8.0" @@ -11656,7 +11637,9 @@ groups = ["main"] files = [ {file = "SQLAlchemy-2.0.43-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:21ba7a08a4253c5825d1db389d4299f64a100ef9800e4624c8bf70d8f136e6ed"}, {file = "SQLAlchemy-2.0.43-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:11b9503fa6f8721bef9b8567730f664c5a5153d25e247aadc69247c4bc605227"}, + {file = "SQLAlchemy-2.0.43-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:07097c0a1886c150ef2adba2ff7437e84d40c0f7dcb44a2c2b9c905ccfc6361c"}, {file = "SQLAlchemy-2.0.43-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:cdeff998cb294896a34e5b2f00e383e7c5c4ef3b4bfa375d9104723f15186443"}, + {file = "SQLAlchemy-2.0.43-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:bcf0724a62a5670e5718957e05c56ec2d6850267ea859f8ad2481838f889b42c"}, {file = "SQLAlchemy-2.0.43-cp37-cp37m-win32.whl", hash = "sha256:c697575d0e2b0a5f0433f679bda22f63873821d991e95a90e9e52aae517b2e32"}, {file = "SQLAlchemy-2.0.43-cp37-cp37m-win_amd64.whl", hash = "sha256:d34c0f6dbefd2e816e8f341d0df7d4763d382e3f452423e752ffd1e213da2512"}, {file = "sqlalchemy-2.0.43-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:70322986c0c699dca241418fcf18e637a4369e0ec50540a2b907b184c8bca069"}, @@ -11691,12 +11674,20 @@ files = [ {file = "sqlalchemy-2.0.43-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:9df7126fd9db49e3a5a3999442cc67e9ee8971f3cb9644250107d7296cb2a164"}, {file = "sqlalchemy-2.0.43-cp313-cp313-win32.whl", hash = "sha256:7f1ac7828857fcedb0361b48b9ac4821469f7694089d15550bbcf9ab22564a1d"}, {file = "sqlalchemy-2.0.43-cp313-cp313-win_amd64.whl", hash = "sha256:971ba928fcde01869361f504fcff3b7143b47d30de188b11c6357c0505824197"}, + {file = "sqlalchemy-2.0.43-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:4e6aeb2e0932f32950cf56a8b4813cb15ff792fc0c9b3752eaf067cfe298496a"}, + {file = "sqlalchemy-2.0.43-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:61f964a05356f4bca4112e6334ed7c208174511bd56e6b8fc86dad4d024d4185"}, {file = "sqlalchemy-2.0.43-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:46293c39252f93ea0910aababa8752ad628bcce3a10d3f260648dd472256983f"}, + {file = "sqlalchemy-2.0.43-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:136063a68644eca9339d02e6693932116f6a8591ac013b0014479a1de664e40a"}, {file = "sqlalchemy-2.0.43-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:6e2bf13d9256398d037fef09fd8bf9b0bf77876e22647d10761d35593b9ac547"}, + {file = "sqlalchemy-2.0.43-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:44337823462291f17f994d64282a71c51d738fc9ef561bf265f1d0fd9116a782"}, {file = "sqlalchemy-2.0.43-cp38-cp38-win32.whl", hash = "sha256:13194276e69bb2af56198fef7909d48fd34820de01d9c92711a5fa45497cc7ed"}, {file = "sqlalchemy-2.0.43-cp38-cp38-win_amd64.whl", hash = "sha256:334f41fa28de9f9be4b78445e68530da3c5fa054c907176460c81494f4ae1f5e"}, + {file = "sqlalchemy-2.0.43-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ceb5c832cc30663aeaf5e39657712f4c4241ad1f638d487ef7216258f6d41fe7"}, + {file = "sqlalchemy-2.0.43-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:11f43c39b4b2ec755573952bbcc58d976779d482f6f832d7f33a8d869ae891bf"}, {file = "sqlalchemy-2.0.43-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:413391b2239db55be14fa4223034d7e13325a1812c8396ecd4f2c08696d5ccad"}, + {file = "sqlalchemy-2.0.43-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c379e37b08c6c527181a397212346be39319fb64323741d23e46abd97a400d34"}, {file = "sqlalchemy-2.0.43-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:03d73ab2a37d9e40dec4984d1813d7878e01dbdc742448d44a7341b7a9f408c7"}, + {file = "sqlalchemy-2.0.43-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:8cee08f15d9e238ede42e9bbc1d6e7158d0ca4f176e4eab21f88ac819ae3bd7b"}, {file = "sqlalchemy-2.0.43-cp39-cp39-win32.whl", hash = "sha256:b3edaec7e8b6dc5cd94523c6df4f294014df67097c8217a89929c99975811414"}, {file = "sqlalchemy-2.0.43-cp39-cp39-win_amd64.whl", hash = "sha256:227119ce0a89e762ecd882dc661e0aa677a690c914e358f0dd8932a2e8b2765b"}, {file = "sqlalchemy-2.0.43-py3-none-any.whl", hash = "sha256:1681c21dd2ccee222c2fe0bef671d1aef7c504087c9c4e800371cfcc8ac966fc"}, @@ -12065,7 +12056,7 @@ version = "2.2.1" description = "A lil' TOML parser" optional = false python-versions = ">=3.8" -groups = ["main", "dev"] +groups = ["main"] markers = "python_version == \"3.10\"" files = [ {file = "tomli-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:678e4fa69e4575eb77d103de3df8a895e1591b48e740211bd1067378c69e8249"}, @@ -12537,12 +12528,11 @@ version = "4.15.0" description = "Backported and Experimental Type Hints for Python 3.9+" optional = false python-versions = ">=3.9" -groups = ["main", "dev"] +groups = ["main"] files = [ {file = "typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548"}, {file = "typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466"}, ] -markers = {dev = "python_version == \"3.10\""} [[package]] name = "typing-inspect" diff --git a/pyproject.toml b/pyproject.toml index 2436911e8..a9b895dfb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "cognee" -version = "0.3.9" +version = "0.5.0.dev0" description = "Cognee - is a library for enriching LLM context with a semantic layer for better understanding and reasoning." authors = [ { name = "Vasilije Markovic" }, diff --git a/uv.lock b/uv.lock index 8c35a3366..cc66c3d7e 100644 --- a/uv.lock +++ b/uv.lock @@ -929,7 +929,7 @@ wheels = [ [[package]] name = "cognee" -version = "0.3.9" +version = "0.5.0.dev0" source = { editable = "." } dependencies = [ { name = "aiofiles" }, @@ -2560,6 +2560,8 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7f/91/ae2eb6b7979e2f9b035a9f612cf70f1bf54aad4e1d125129bef1eae96f19/greenlet-3.2.4-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c2ca18a03a8cfb5b25bc1cbe20f3d9a4c80d8c3b13ba3df49ac3961af0b1018d", size = 584358, upload-time = "2025-08-07T13:18:23.708Z" }, { url = "https://files.pythonhosted.org/packages/f7/85/433de0c9c0252b22b16d413c9407e6cb3b41df7389afc366ca204dbc1393/greenlet-3.2.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:9fe0a28a7b952a21e2c062cd5756d34354117796c6d9215a87f55e38d15402c5", size = 1113550, upload-time = "2025-08-07T13:42:37.467Z" }, { url = "https://files.pythonhosted.org/packages/a1/8d/88f3ebd2bc96bf7747093696f4335a0a8a4c5acfcf1b757717c0d2474ba3/greenlet-3.2.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8854167e06950ca75b898b104b63cc646573aa5fef1353d4508ecdd1ee76254f", size = 1137126, upload-time = "2025-08-07T13:18:20.239Z" }, + { url = "https://files.pythonhosted.org/packages/f1/29/74242b7d72385e29bcc5563fba67dad94943d7cd03552bac320d597f29b2/greenlet-3.2.4-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f47617f698838ba98f4ff4189aef02e7343952df3a615f847bb575c3feb177a7", size = 1544904, upload-time = "2025-11-04T12:42:04.763Z" }, + { url = "https://files.pythonhosted.org/packages/c8/e2/1572b8eeab0f77df5f6729d6ab6b141e4a84ee8eb9bc8c1e7918f94eda6d/greenlet-3.2.4-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:af41be48a4f60429d5cad9d22175217805098a9ef7c40bfef44f7669fb9d74d8", size = 1611228, upload-time = "2025-11-04T12:42:08.423Z" }, { url = "https://files.pythonhosted.org/packages/d6/6f/b60b0291d9623c496638c582297ead61f43c4b72eef5e9c926ef4565ec13/greenlet-3.2.4-cp310-cp310-win_amd64.whl", hash = "sha256:73f49b5368b5359d04e18d15828eecc1806033db5233397748f4ca813ff1056c", size = 298654, upload-time = "2025-08-07T13:50:00.469Z" }, { url = "https://files.pythonhosted.org/packages/a4/de/f28ced0a67749cac23fecb02b694f6473f47686dff6afaa211d186e2ef9c/greenlet-3.2.4-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:96378df1de302bc38e99c3a9aa311967b7dc80ced1dcc6f171e99842987882a2", size = 272305, upload-time = "2025-08-07T13:15:41.288Z" }, { url = "https://files.pythonhosted.org/packages/09/16/2c3792cba130000bf2a31c5272999113f4764fd9d874fb257ff588ac779a/greenlet-3.2.4-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1ee8fae0519a337f2329cb78bd7a8e128ec0f881073d43f023c7b8d4831d5246", size = 632472, upload-time = "2025-08-07T13:42:55.044Z" }, @@ -2569,6 +2571,8 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1f/8e/abdd3f14d735b2929290a018ecf133c901be4874b858dd1c604b9319f064/greenlet-3.2.4-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2523e5246274f54fdadbce8494458a2ebdcdbc7b802318466ac5606d3cded1f8", size = 587684, upload-time = "2025-08-07T13:18:25.164Z" }, { url = "https://files.pythonhosted.org/packages/5d/65/deb2a69c3e5996439b0176f6651e0052542bb6c8f8ec2e3fba97c9768805/greenlet-3.2.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:1987de92fec508535687fb807a5cea1560f6196285a4cde35c100b8cd632cc52", size = 1116647, upload-time = "2025-08-07T13:42:38.655Z" }, { url = "https://files.pythonhosted.org/packages/3f/cc/b07000438a29ac5cfb2194bfc128151d52f333cee74dd7dfe3fb733fc16c/greenlet-3.2.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:55e9c5affaa6775e2c6b67659f3a71684de4c549b3dd9afca3bc773533d284fa", size = 1142073, upload-time = "2025-08-07T13:18:21.737Z" }, + { url = "https://files.pythonhosted.org/packages/67/24/28a5b2fa42d12b3d7e5614145f0bd89714c34c08be6aabe39c14dd52db34/greenlet-3.2.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c9c6de1940a7d828635fbd254d69db79e54619f165ee7ce32fda763a9cb6a58c", size = 1548385, upload-time = "2025-11-04T12:42:11.067Z" }, + { url = "https://files.pythonhosted.org/packages/6a/05/03f2f0bdd0b0ff9a4f7b99333d57b53a7709c27723ec8123056b084e69cd/greenlet-3.2.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:03c5136e7be905045160b1b9fdca93dd6727b180feeafda6818e6496434ed8c5", size = 1613329, upload-time = "2025-11-04T12:42:12.928Z" }, { url = "https://files.pythonhosted.org/packages/d8/0f/30aef242fcab550b0b3520b8e3561156857c94288f0332a79928c31a52cf/greenlet-3.2.4-cp311-cp311-win_amd64.whl", hash = "sha256:9c40adce87eaa9ddb593ccb0fa6a07caf34015a29bf8d344811665b573138db9", size = 299100, upload-time = "2025-08-07T13:44:12.287Z" }, { url = "https://files.pythonhosted.org/packages/44/69/9b804adb5fd0671f367781560eb5eb586c4d495277c93bde4307b9e28068/greenlet-3.2.4-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:3b67ca49f54cede0186854a008109d6ee71f66bd57bb36abd6d0a0267b540cdd", size = 274079, upload-time = "2025-08-07T13:15:45.033Z" }, { url = "https://files.pythonhosted.org/packages/46/e9/d2a80c99f19a153eff70bc451ab78615583b8dac0754cfb942223d2c1a0d/greenlet-3.2.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ddf9164e7a5b08e9d22511526865780a576f19ddd00d62f8a665949327fde8bb", size = 640997, upload-time = "2025-08-07T13:42:56.234Z" }, @@ -2578,6 +2582,8 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/19/0d/6660d55f7373b2ff8152401a83e02084956da23ae58cddbfb0b330978fe9/greenlet-3.2.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3b3812d8d0c9579967815af437d96623f45c0f2ae5f04e366de62a12d83a8fb0", size = 607586, upload-time = "2025-08-07T13:18:28.544Z" }, { url = "https://files.pythonhosted.org/packages/8e/1a/c953fdedd22d81ee4629afbb38d2f9d71e37d23caace44775a3a969147d4/greenlet-3.2.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:abbf57b5a870d30c4675928c37278493044d7c14378350b3aa5d484fa65575f0", size = 1123281, upload-time = "2025-08-07T13:42:39.858Z" }, { url = "https://files.pythonhosted.org/packages/3f/c7/12381b18e21aef2c6bd3a636da1088b888b97b7a0362fac2e4de92405f97/greenlet-3.2.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:20fb936b4652b6e307b8f347665e2c615540d4b42b3b4c8a321d8286da7e520f", size = 1151142, upload-time = "2025-08-07T13:18:22.981Z" }, + { url = "https://files.pythonhosted.org/packages/27/45/80935968b53cfd3f33cf99ea5f08227f2646e044568c9b1555b58ffd61c2/greenlet-3.2.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ee7a6ec486883397d70eec05059353b8e83eca9168b9f3f9a361971e77e0bcd0", size = 1564846, upload-time = "2025-11-04T12:42:15.191Z" }, + { url = "https://files.pythonhosted.org/packages/69/02/b7c30e5e04752cb4db6202a3858b149c0710e5453b71a3b2aec5d78a1aab/greenlet-3.2.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:326d234cbf337c9c3def0676412eb7040a35a768efc92504b947b3e9cfc7543d", size = 1633814, upload-time = "2025-11-04T12:42:17.175Z" }, { url = "https://files.pythonhosted.org/packages/e9/08/b0814846b79399e585f974bbeebf5580fbe59e258ea7be64d9dfb253c84f/greenlet-3.2.4-cp312-cp312-win_amd64.whl", hash = "sha256:a7d4e128405eea3814a12cc2605e0e6aedb4035bf32697f72deca74de4105e02", size = 299899, upload-time = "2025-08-07T13:38:53.448Z" }, { url = "https://files.pythonhosted.org/packages/49/e8/58c7f85958bda41dafea50497cbd59738c5c43dbbea5ee83d651234398f4/greenlet-3.2.4-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:1a921e542453fe531144e91e1feedf12e07351b1cf6c9e8a3325ea600a715a31", size = 272814, upload-time = "2025-08-07T13:15:50.011Z" }, { url = "https://files.pythonhosted.org/packages/62/dd/b9f59862e9e257a16e4e610480cfffd29e3fae018a68c2332090b53aac3d/greenlet-3.2.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd3c8e693bff0fff6ba55f140bf390fa92c994083f838fece0f63be121334945", size = 641073, upload-time = "2025-08-07T13:42:57.23Z" }, @@ -2587,6 +2593,8 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ee/43/3cecdc0349359e1a527cbf2e3e28e5f8f06d3343aaf82ca13437a9aa290f/greenlet-3.2.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:23768528f2911bcd7e475210822ffb5254ed10d71f4028387e5a99b4c6699671", size = 610497, upload-time = "2025-08-07T13:18:31.636Z" }, { url = "https://files.pythonhosted.org/packages/b8/19/06b6cf5d604e2c382a6f31cafafd6f33d5dea706f4db7bdab184bad2b21d/greenlet-3.2.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:00fadb3fedccc447f517ee0d3fd8fe49eae949e1cd0f6a611818f4f6fb7dc83b", size = 1121662, upload-time = "2025-08-07T13:42:41.117Z" }, { url = "https://files.pythonhosted.org/packages/a2/15/0d5e4e1a66fab130d98168fe984c509249c833c1a3c16806b90f253ce7b9/greenlet-3.2.4-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:d25c5091190f2dc0eaa3f950252122edbbadbb682aa7b1ef2f8af0f8c0afefae", size = 1149210, upload-time = "2025-08-07T13:18:24.072Z" }, + { url = "https://files.pythonhosted.org/packages/1c/53/f9c440463b3057485b8594d7a638bed53ba531165ef0ca0e6c364b5cc807/greenlet-3.2.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6e343822feb58ac4d0a1211bd9399de2b3a04963ddeec21530fc426cc121f19b", size = 1564759, upload-time = "2025-11-04T12:42:19.395Z" }, + { url = "https://files.pythonhosted.org/packages/47/e4/3bb4240abdd0a8d23f4f88adec746a3099f0d86bfedb623f063b2e3b4df0/greenlet-3.2.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ca7f6f1f2649b89ce02f6f229d7c19f680a6238af656f61e0115b24857917929", size = 1634288, upload-time = "2025-11-04T12:42:21.174Z" }, { url = "https://files.pythonhosted.org/packages/0b/55/2321e43595e6801e105fcfdee02b34c0f996eb71e6ddffca6b10b7e1d771/greenlet-3.2.4-cp313-cp313-win_amd64.whl", hash = "sha256:554b03b6e73aaabec3745364d6239e9e012d64c68ccd0b8430c64ccc14939a8b", size = 299685, upload-time = "2025-08-07T13:24:38.824Z" }, ] From c2c64a417c4805c5d5aeb59b4e5f9519b729ee85 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Mon, 24 Nov 2025 17:44:51 +0100 Subject: [PATCH 84/85] fix: fixes ontology api endpoint tests + poetry lock(#1824) ## Description This PR fixes the failing CI tests related to the new ontology api endpoint. ## Type of Change - [x] Bug fix (non-breaking change that fixes an issue) - [ ] New feature (non-breaking change that adds functionality) - [ ] Breaking change (fix or feature that would cause existing functionality to change) - [ ] Documentation update - [ ] Code refactoring - [ ] Performance improvement - [ ] Other (please specify): ## Screenshots/Videos (if applicable) ## Pre-submission Checklist - [x] **I have tested my changes thoroughly before submitting this PR** - [x] **This PR contains minimal changes necessary to address the issue/feature** - [x] My code follows the project's coding standards and style guidelines - [x] I have added tests that prove my fix is effective or that my feature works - [x] I have added necessary documentation (if applicable) - [x] All new and existing tests pass - [x] I have searched existing PRs to ensure this change hasn't been submitted already - [x] I have linked any relevant issues in the description - [x] My commits have clear and descriptive messages ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin. --- .../tests/unit/api/test_ontology_endpoint.py | 10 +++- poetry.lock | 52 +++++++++++++------ 2 files changed, 45 insertions(+), 17 deletions(-) diff --git a/cognee/tests/unit/api/test_ontology_endpoint.py b/cognee/tests/unit/api/test_ontology_endpoint.py index d53c5ab44..af3a4d90e 100644 --- a/cognee/tests/unit/api/test_ontology_endpoint.py +++ b/cognee/tests/unit/api/test_ontology_endpoint.py @@ -25,7 +25,10 @@ def mock_user(): def mock_default_user(): """Mock default user for testing.""" return SimpleNamespace( - id=uuid.uuid4(), email="default@example.com", is_active=True, tenant_id=uuid.uuid4() + id=str(uuid.uuid4()), + email="default@example.com", + is_active=True, + tenant_id=str(uuid.uuid4()), ) @@ -108,6 +111,7 @@ def test_upload_multiple_ontologies(mock_get_default_user, client, mock_default_ """Test uploading multiple ontology files in single request""" import io + mock_get_default_user.return_value = mock_default_user # Create mock files file1_content = b"" file2_content = b"" @@ -137,6 +141,7 @@ def test_upload_endpoint_accepts_arrays(mock_get_default_user, client, mock_defa import io import json + mock_get_default_user.return_value = mock_default_user file_content = b"" files = [("ontology_file", ("single.owl", io.BytesIO(file_content), "application/xml"))] @@ -173,6 +178,7 @@ def test_complete_multifile_workflow(mock_get_default_user, client, mock_default import io import json + mock_get_default_user.return_value = mock_default_user # Step 1: Upload multiple ontologies file1_content = b""" =1.0.0)"] name = "pluggy" version = "1.6.0" description = "plugin and hook calling mechanisms for python" -optional = true +optional = false python-versions = ">=3.9" -groups = ["main"] -markers = "extra == \"deepeval\" or extra == \"dev\" or extra == \"dlt\" or extra == \"docling\"" +groups = ["main", "dev"] files = [ {file = "pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746"}, {file = "pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3"}, ] +markers = {main = "extra == \"deepeval\" or extra == \"dev\" or extra == \"dlt\" or extra == \"docling\""} [package.extras] dev = ["pre-commit", "tox"] @@ -8654,6 +8656,7 @@ files = [ {file = "psycopg2-2.9.10-cp311-cp311-win_amd64.whl", hash = "sha256:0435034157049f6846e95103bd8f5a668788dd913a7c30162ca9503fdf542cb4"}, {file = "psycopg2-2.9.10-cp312-cp312-win32.whl", hash = "sha256:65a63d7ab0e067e2cdb3cf266de39663203d38d6a8ed97f5ca0cb315c73fe067"}, {file = "psycopg2-2.9.10-cp312-cp312-win_amd64.whl", hash = "sha256:4a579d6243da40a7b3182e0430493dbd55950c493d8c68f4eec0b302f6bbf20e"}, + {file = "psycopg2-2.9.10-cp313-cp313-win_amd64.whl", hash = "sha256:91fd603a2155da8d0cfcdbf8ab24a2d54bca72795b90d2a3ed2b6da8d979dee2"}, {file = "psycopg2-2.9.10-cp39-cp39-win32.whl", hash = "sha256:9d5b3b94b79a844a986d029eee38998232451119ad653aea42bb9220a8c5066b"}, {file = "psycopg2-2.9.10-cp39-cp39-win_amd64.whl", hash = "sha256:88138c8dedcbfa96408023ea2b0c369eda40fe5d75002c0964c78f46f11fa442"}, {file = "psycopg2-2.9.10.tar.gz", hash = "sha256:12ec0b40b0273f95296233e8750441339298e6a572f7039da5b260e3c8b60e11"}, @@ -8715,6 +8718,7 @@ files = [ {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:bb89f0a835bcfc1d42ccd5f41f04870c1b936d8507c6df12b7737febc40f0909"}, {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:f0c2d907a1e102526dd2986df638343388b94c33860ff3bbe1384130828714b1"}, {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f8157bed2f51db683f31306aa497311b560f2265998122abe1dce6428bd86567"}, + {file = "psycopg2_binary-2.9.10-cp313-cp313-win_amd64.whl", hash = "sha256:27422aa5f11fbcd9b18da48373eb67081243662f9b46e6fd07c3eb46e4535142"}, {file = "psycopg2_binary-2.9.10-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:eb09aa7f9cecb45027683bb55aebaaf45a0df8bf6de68801a6afdc7947bb09d4"}, {file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b73d6d7f0ccdad7bc43e6d34273f70d587ef62f824d7261c4ae9b8b1b6af90e8"}, {file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ce5ab4bf46a211a8e924d307c1b1fcda82368586a19d0a24f8ae166f5c784864"}, @@ -9694,14 +9698,14 @@ files = [ name = "pytest" version = "7.4.4" description = "pytest: simple powerful testing with Python" -optional = true +optional = false python-versions = ">=3.7" -groups = ["main"] -markers = "extra == \"deepeval\" or extra == \"dev\"" +groups = ["main", "dev"] files = [ {file = "pytest-7.4.4-py3-none-any.whl", hash = "sha256:b090cdf5ed60bf4c45261be03239c2c1c22df034fbffe691abe93cd80cea01d8"}, {file = "pytest-7.4.4.tar.gz", hash = "sha256:2cf0005922c6ace4a3e2ec8b4080eb0d9753fdc93107415332f50ce9e7994280"}, ] +markers = {main = "extra == \"deepeval\" or extra == \"dev\""} [package.dependencies] colorama = {version = "*", markers = "sys_platform == \"win32\""} @@ -9788,6 +9792,21 @@ files = [ packaging = ">=17.1" pytest = ">=6.2" +[[package]] +name = "pytest-timeout" +version = "2.4.0" +description = "pytest plugin to abort hanging tests" +optional = false +python-versions = ">=3.7" +groups = ["dev"] +files = [ + {file = "pytest_timeout-2.4.0-py3-none-any.whl", hash = "sha256:c42667e5cdadb151aeb5b26d114aff6bdf5a907f176a007a30b940d3d865b5c2"}, + {file = "pytest_timeout-2.4.0.tar.gz", hash = "sha256:7e68e90b01f9eff71332b25001f85c75495fc4e3a836701876183c4bcfd0540a"}, +] + +[package.dependencies] +pytest = ">=7.0.0" + [[package]] name = "pytest-xdist" version = "3.8.0" @@ -12056,7 +12075,7 @@ version = "2.2.1" description = "A lil' TOML parser" optional = false python-versions = ">=3.8" -groups = ["main"] +groups = ["main", "dev"] markers = "python_version == \"3.10\"" files = [ {file = "tomli-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:678e4fa69e4575eb77d103de3df8a895e1591b48e740211bd1067378c69e8249"}, @@ -12528,11 +12547,12 @@ version = "4.15.0" description = "Backported and Experimental Type Hints for Python 3.9+" optional = false python-versions = ">=3.9" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548"}, {file = "typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466"}, ] +markers = {dev = "python_version == \"3.10\""} [[package]] name = "typing-inspect" From 508165e883aa9dc3232d8021daa94765439aded1 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Wed, 26 Nov 2025 15:18:53 +0100 Subject: [PATCH 85/85] feature: Introduces wide subgraph search in graph completion and improves QA speed (#1736) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR introduces wide vector and graph structure filtering capabilities. With these changes, the graph completion retriever and all retrievers that inherit from it will now filter relevant vector elements and subgraphs based on the query. This improvement significantly increases search speed for large graphs while maintaining—and in some cases slightly improving—accuracy. Changes in This PR: -Introduced new wide_search_top_k parameter: Controls the initial search space size -Added graph adapter level filtering method: Enables relevant subgraph filtering while maintaining backward compatibility. For community or custom graph adapters that don't implement this method, the system gracefully falls back to the original search behavior. -Updated modal dashboard and evaluation framework: Fixed compatibility issues. Added comprehensive unit tests: Introduced unit tests for brute_force_triplet_search (previously untested) and expanded the CogneeGraph test suite. Integration tests: Existing integration tests verify end-to-end search functionality (no changes required). Acceptance Criteria and Testing To verify the new search behavior, run search queries with different wide_search_top_k parameters while logging is enabled: None: Triggers a full graph search (default behavior) 1: Projects a minimal subgraph (demonstrates maximum filtering) Custom values: Test intermediate levels of filtering Internal Testing and results: Performance and accuracy benchmarks are available upon request. The implementation demonstrates measurable improvements in query latency for large graphs without sacrificing result quality. ## Type of Change - [ ] Bug fix (non-breaking change that fixes an issue) - [ ] New feature (non-breaking change that adds functionality) - [ ] Breaking change (fix or feature that would cause existing functionality to change) - [ ] Documentation update - [x] Code refactoring - [x] Performance improvement - [ ] Other (please specify): ## Screenshots/Videos (if applicable) None ## Pre-submission Checklist - [x] **I have tested my changes thoroughly before submitting this PR** - [x] **This PR contains minimal changes necessary to address the issue/feature** - [x] My code follows the project's coding standards and style guidelines - [x] I have added tests that prove my fix is effective or that my feature works - [x] I have added necessary documentation (if applicable) - [x] All new and existing tests pass - [x] I have searched existing PRs to ensure this change hasn't been submitted already - [x] I have linked any relevant issues in the description - [x] My commits have clear and descriptive messages ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin. --------- Co-authored-by: Pavel Zorin --- cognee/api/v1/search/search.py | 4 + cognee/eval_framework/Dockerfile | 29 + .../answer_generation_executor.py | 10 + .../run_question_answering_module.py | 2 +- cognee/eval_framework/eval_config.py | 4 +- cognee/eval_framework/modal_run_eval.py | 46 +- .../databases/graph/graph_db_interface.py | 15 + .../databases/graph/kuzu/adapter.py | 97 +++ .../databases/graph/neo4j_driver/adapter.py | 57 ++ .../modules/graph/cognee_graph/CogneeGraph.py | 110 +++- .../graph/cognee_graph/CogneeGraphElements.py | 11 +- ..._completion_context_extension_retriever.py | 4 + .../graph_completion_cot_retriever.py | 4 + .../retrieval/graph_completion_retriever.py | 10 + .../graph_summary_completion_retriever.py | 4 + .../modules/retrieval/temporal_retriever.py | 4 + .../utils/brute_force_triplet_search.py | 45 +- .../search/methods/get_search_type_tools.py | 30 +- .../methods/no_access_control_search.py | 4 + cognee/modules/search/methods/search.py | 21 + .../graph/cognee_graph_elements_test.py | 4 +- .../unit/modules/graph/cognee_graph_test.py | 455 ++++++++++++++ .../test_brute_force_triplet_search.py | 582 ++++++++++++++++++ 23 files changed, 1482 insertions(+), 70 deletions(-) create mode 100644 cognee/eval_framework/Dockerfile create mode 100644 cognee/tests/unit/modules/retrieval/test_brute_force_triplet_search.py diff --git a/cognee/api/v1/search/search.py b/cognee/api/v1/search/search.py index d4e5fbbe6..354331c57 100644 --- a/cognee/api/v1/search/search.py +++ b/cognee/api/v1/search/search.py @@ -31,6 +31,8 @@ async def search( only_context: bool = False, use_combined_context: bool = False, session_id: Optional[str] = None, + wide_search_top_k: Optional[int] = 100, + triplet_distance_penalty: Optional[float] = 3.5, ) -> Union[List[SearchResult], CombinedSearchResult]: """ Search and query the knowledge graph for insights, information, and connections. @@ -200,6 +202,8 @@ async def search( only_context=only_context, use_combined_context=use_combined_context, session_id=session_id, + wide_search_top_k=wide_search_top_k, + triplet_distance_penalty=triplet_distance_penalty, ) return filtered_search_results diff --git a/cognee/eval_framework/Dockerfile b/cognee/eval_framework/Dockerfile new file mode 100644 index 000000000..e83be3da4 --- /dev/null +++ b/cognee/eval_framework/Dockerfile @@ -0,0 +1,29 @@ +FROM python:3.11-slim + +# Set environment variables +ENV PIP_NO_CACHE_DIR=true +ENV PATH="${PATH}:/root/.poetry/bin" +ENV PYTHONPATH=/app +ENV SKIP_MIGRATIONS=true + +# System dependencies +RUN apt-get update && apt-get install -y \ + gcc \ + libpq-dev \ + git \ + curl \ + build-essential \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +COPY pyproject.toml poetry.lock README.md /app/ + +RUN pip install poetry + +RUN poetry config virtualenvs.create false + +RUN poetry install --extras distributed --extras evals --extras deepeval --no-root + +COPY cognee/ /app/cognee +COPY distributed/ /app/distributed diff --git a/cognee/eval_framework/answer_generation/answer_generation_executor.py b/cognee/eval_framework/answer_generation/answer_generation_executor.py index 6f166657e..29b3ede68 100644 --- a/cognee/eval_framework/answer_generation/answer_generation_executor.py +++ b/cognee/eval_framework/answer_generation/answer_generation_executor.py @@ -35,6 +35,16 @@ class AnswerGeneratorExecutor: retrieval_context = await retriever.get_context(query_text) search_results = await retriever.get_completion(query_text, retrieval_context) + ############ + #:TODO This is a quick fix until we don't structure retriever results properly but lets not leave it like this...this is needed now due to the changed combined retriever structure.. + if isinstance(retrieval_context, list): + retrieval_context = await retriever.convert_retrieved_objects_to_context( + triplets=retrieval_context + ) + + if isinstance(search_results, str): + search_results = [search_results] + ############# answer = { "question": query_text, "answer": search_results[0], diff --git a/cognee/eval_framework/answer_generation/run_question_answering_module.py b/cognee/eval_framework/answer_generation/run_question_answering_module.py index d0a2ebe1e..6b55d84b2 100644 --- a/cognee/eval_framework/answer_generation/run_question_answering_module.py +++ b/cognee/eval_framework/answer_generation/run_question_answering_module.py @@ -35,7 +35,7 @@ async def create_and_insert_answers_table(questions_payload): async def run_question_answering( - params: dict, system_prompt="answer_simple_question.txt", top_k: Optional[int] = None + params: dict, system_prompt="answer_simple_question_benchmark.txt", top_k: Optional[int] = None ) -> List[dict]: if params.get("answering_questions"): logger.info("Question answering started...") diff --git a/cognee/eval_framework/eval_config.py b/cognee/eval_framework/eval_config.py index 6edcc0454..9e6f26688 100644 --- a/cognee/eval_framework/eval_config.py +++ b/cognee/eval_framework/eval_config.py @@ -14,7 +14,7 @@ class EvalConfig(BaseSettings): # Question answering params answering_questions: bool = True - qa_engine: str = "cognee_completion" # Options: 'cognee_completion' or 'cognee_graph_completion' or 'cognee_graph_completion_cot' or 'cognee_graph_completion_context_extension' + qa_engine: str = "cognee_graph_completion" # Options: 'cognee_completion' or 'cognee_graph_completion' or 'cognee_graph_completion_cot' or 'cognee_graph_completion_context_extension' # Evaluation params evaluating_answers: bool = True @@ -25,7 +25,7 @@ class EvalConfig(BaseSettings): "EM", "f1", ] # Use only 'correctness' for DirectLLM - deepeval_model: str = "gpt-5-mini" + deepeval_model: str = "gpt-4o-mini" # Metrics params calculate_metrics: bool = True diff --git a/cognee/eval_framework/modal_run_eval.py b/cognee/eval_framework/modal_run_eval.py index aca2686a5..bc2ff77c5 100644 --- a/cognee/eval_framework/modal_run_eval.py +++ b/cognee/eval_framework/modal_run_eval.py @@ -2,7 +2,6 @@ import modal import os import asyncio import datetime -import hashlib import json from cognee.shared.logging_utils import get_logger from cognee.eval_framework.eval_config import EvalConfig @@ -10,6 +9,9 @@ from cognee.eval_framework.corpus_builder.run_corpus_builder import run_corpus_b from cognee.eval_framework.answer_generation.run_question_answering_module import ( run_question_answering, ) +import pathlib +from os import path +from modal import Image from cognee.eval_framework.evaluation.run_evaluation_module import run_evaluation from cognee.eval_framework.metrics_dashboard import create_dashboard @@ -38,22 +40,19 @@ def read_and_combine_metrics(eval_params: dict) -> dict: app = modal.App("modal-run-eval") -image = ( - modal.Image.from_dockerfile(path="Dockerfile_modal", force_build=False) - .copy_local_file("pyproject.toml", "pyproject.toml") - .copy_local_file("poetry.lock", "poetry.lock") - .env( - { - "ENV": os.getenv("ENV"), - "LLM_API_KEY": os.getenv("LLM_API_KEY"), - "OPENAI_API_KEY": os.getenv("OPENAI_API_KEY"), - } - ) - .pip_install("protobuf", "h2", "deepeval", "gdown", "plotly") +image = Image.from_dockerfile( + path=pathlib.Path(path.join(path.dirname(__file__), "Dockerfile")).resolve(), + force_build=False, +).add_local_python_source("cognee") + + +@app.function( + image=image, + max_containers=10, + timeout=86400, + volumes={"/data": vol}, + secrets=[modal.Secret.from_name("eval_secrets")], ) - - -@app.function(image=image, concurrency_limit=10, timeout=86400, volumes={"/data": vol}) async def modal_run_eval(eval_params=None): """Runs evaluation pipeline and returns combined metrics results.""" if eval_params is None: @@ -105,18 +104,7 @@ async def main(): configs = [ EvalConfig( task_getter_type="Default", - number_of_samples_in_corpus=10, - benchmark="HotPotQA", - qa_engine="cognee_graph_completion", - building_corpus_from_scratch=True, - answering_questions=True, - evaluating_answers=True, - calculate_metrics=True, - dashboard=True, - ), - EvalConfig( - task_getter_type="Default", - number_of_samples_in_corpus=10, + number_of_samples_in_corpus=25, benchmark="TwoWikiMultiHop", qa_engine="cognee_graph_completion", building_corpus_from_scratch=True, @@ -127,7 +115,7 @@ async def main(): ), EvalConfig( task_getter_type="Default", - number_of_samples_in_corpus=10, + number_of_samples_in_corpus=25, benchmark="Musique", qa_engine="cognee_graph_completion", building_corpus_from_scratch=True, diff --git a/cognee/infrastructure/databases/graph/graph_db_interface.py b/cognee/infrastructure/databases/graph/graph_db_interface.py index 67df1a27c..8f8c96e79 100644 --- a/cognee/infrastructure/databases/graph/graph_db_interface.py +++ b/cognee/infrastructure/databases/graph/graph_db_interface.py @@ -398,3 +398,18 @@ class GraphDBInterface(ABC): - node_id (Union[str, UUID]): Unique identifier of the node for which to retrieve connections. """ raise NotImplementedError + + @abstractmethod + async def get_filtered_graph_data( + self, attribute_filters: List[Dict[str, List[Union[str, int]]]] + ) -> Tuple[List[Node], List[EdgeData]]: + """ + Retrieve nodes and edges filtered by the provided attribute criteria. + + Parameters: + ----------- + + - attribute_filters: A list of dictionaries where keys are attribute names and values + are lists of attribute values to filter by. + """ + raise NotImplementedError diff --git a/cognee/infrastructure/databases/graph/kuzu/adapter.py b/cognee/infrastructure/databases/graph/kuzu/adapter.py index 8dd160665..9dbc9c1bc 100644 --- a/cognee/infrastructure/databases/graph/kuzu/adapter.py +++ b/cognee/infrastructure/databases/graph/kuzu/adapter.py @@ -12,6 +12,7 @@ from contextlib import asynccontextmanager from concurrent.futures import ThreadPoolExecutor from typing import Dict, Any, List, Union, Optional, Tuple, Type +from cognee.exceptions import CogneeValidationError from cognee.shared.logging_utils import get_logger from cognee.infrastructure.utils.run_sync import run_sync from cognee.infrastructure.files.storage import get_file_storage @@ -1186,6 +1187,11 @@ class KuzuAdapter(GraphDBInterface): A tuple with two elements: a list of tuples of (node_id, properties) and a list of tuples of (source_id, target_id, relationship_name, properties). """ + + import time + + start_time = time.time() + try: nodes_query = """ MATCH (n:Node) @@ -1249,6 +1255,11 @@ class KuzuAdapter(GraphDBInterface): }, ) ) + + retrieval_time = time.time() - start_time + logger.info( + f"Retrieved {len(nodes)} nodes and {len(edges)} edges in {retrieval_time:.2f} seconds" + ) return formatted_nodes, formatted_edges except Exception as e: logger.error(f"Failed to get graph data: {e}") @@ -1417,6 +1428,92 @@ class KuzuAdapter(GraphDBInterface): formatted_edges.append((source_id, target_id, rel_type, props)) return formatted_nodes, formatted_edges + async def get_id_filtered_graph_data(self, target_ids: list[str]): + """ + Retrieve graph data filtered by specific node IDs, including their direct neighbors + and only edges where one endpoint matches those IDs. + + Returns: + nodes: List[dict] -> Each dict includes "id" and all node properties + edges: List[dict] -> Each dict includes "source", "target", "type", "properties" + """ + import time + + start_time = time.time() + + try: + if not target_ids: + logger.warning("No target IDs provided for ID-filtered graph retrieval.") + return [], [] + + if not all(isinstance(x, str) for x in target_ids): + raise CogneeValidationError("target_ids must be a list of strings") + + query = """ + MATCH (n:Node)-[r]->(m:Node) + WHERE n.id IN $target_ids OR m.id IN $target_ids + RETURN n.id, { + name: n.name, + type: n.type, + properties: n.properties + }, m.id, { + name: m.name, + type: m.type, + properties: m.properties + }, r.relationship_name, r.properties + """ + + result = await self.query(query, {"target_ids": target_ids}) + + if not result: + logger.info("No data returned for the supplied IDs") + return [], [] + + nodes_dict = {} + edges = [] + + for n_id, n_props, m_id, m_props, r_type, r_props_raw in result: + if n_props.get("properties"): + try: + additional_props = json.loads(n_props["properties"]) + n_props.update(additional_props) + del n_props["properties"] + except json.JSONDecodeError: + logger.warning(f"Failed to parse properties JSON for node {n_id}") + + if m_props.get("properties"): + try: + additional_props = json.loads(m_props["properties"]) + m_props.update(additional_props) + del m_props["properties"] + except json.JSONDecodeError: + logger.warning(f"Failed to parse properties JSON for node {m_id}") + + nodes_dict[n_id] = (n_id, n_props) + nodes_dict[m_id] = (m_id, m_props) + + edge_props = {} + if r_props_raw: + try: + edge_props = json.loads(r_props_raw) + except (json.JSONDecodeError, TypeError): + logger.warning(f"Failed to parse edge properties for {n_id}->{m_id}") + + source_id = edge_props.get("source_node_id", n_id) + target_id = edge_props.get("target_node_id", m_id) + edges.append((source_id, target_id, r_type, edge_props)) + + retrieval_time = time.time() - start_time + logger.info( + f"ID-filtered retrieval: {len(nodes_dict)} nodes and {len(edges)} edges in {retrieval_time:.2f}s" + ) + + return list(nodes_dict.values()), edges + + except Exception as e: + logger.error(f"Error during ID-filtered graph data retrieval: {str(e)}") + raise + async def get_graph_metrics(self, include_optional=False) -> Dict[str, Any]: """ Get metrics on graph structure and connectivity. diff --git a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py index 6216e107e..f3bb8e173 100644 --- a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +++ b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py @@ -964,6 +964,63 @@ class Neo4jAdapter(GraphDBInterface): logger.error(f"Error during graph data retrieval: {str(e)}") raise + async def get_id_filtered_graph_data(self, target_ids: list[str]): + """ + Retrieve graph data filtered by specific node IDs, including their direct neighbors + and only edges where one endpoint matches those IDs. + + This version uses a single Cypher query for efficiency. + """ + import time + + start_time = time.time() + + try: + if not target_ids: + logger.warning("No target IDs provided for ID-filtered graph retrieval.") + return [], [] + + query = """ + MATCH ()-[r]-() + WHERE startNode(r).id IN $target_ids + OR endNode(r).id IN $target_ids + WITH DISTINCT r, startNode(r) AS a, endNode(r) AS b + RETURN + properties(a) AS n_properties, + properties(b) AS m_properties, + type(r) AS type, + properties(r) AS properties + """ + + result = await self.query(query, {"target_ids": target_ids}) + + nodes_dict = {} + edges = [] + + for record in result: + n_props = record["n_properties"] + m_props = record["m_properties"] + r_props = record["properties"] + r_type = record["type"] + + nodes_dict[n_props["id"]] = (n_props["id"], n_props) + nodes_dict[m_props["id"]] = (m_props["id"], m_props) + + source_id = r_props.get("source_node_id", n_props["id"]) + target_id = r_props.get("target_node_id", m_props["id"]) + edges.append((source_id, target_id, r_type, r_props)) + + retrieval_time = time.time() - start_time + logger.info( + f"ID-filtered retrieval: {len(nodes_dict)} nodes and {len(edges)} edges in {retrieval_time:.2f}s" + ) + + return list(nodes_dict.values()), edges + + except Exception as e: + logger.error(f"Error during ID-filtered graph data retrieval: {str(e)}") + raise + async def get_nodeset_subgraph( self, node_type: Type[Any], node_name: List[str] ) -> Tuple[List[Tuple[int, dict]], List[Tuple[int, int, str, dict]]]: diff --git a/cognee/modules/graph/cognee_graph/CogneeGraph.py b/cognee/modules/graph/cognee_graph/CogneeGraph.py index cb7562422..2e0b82e8d 100644 --- a/cognee/modules/graph/cognee_graph/CogneeGraph.py +++ b/cognee/modules/graph/cognee_graph/CogneeGraph.py @@ -56,6 +56,68 @@ class CogneeGraph(CogneeAbstractGraph): def get_edges(self) -> List[Edge]: return self.edges + async def _get_nodeset_subgraph( + self, + adapter, + node_type, + node_name, + ): + """Retrieve subgraph based on node type and name.""" + logger.info("Retrieving graph filtered by node type and node name (NodeSet).") + nodes_data, edges_data = await adapter.get_nodeset_subgraph( + node_type=node_type, node_name=node_name + ) + if not nodes_data or not edges_data: + raise EntityNotFoundError( + message="Nodeset does not exist, or empty nodeset projected from the database." + ) + return nodes_data, edges_data + + async def _get_full_or_id_filtered_graph( + self, + adapter, + relevant_ids_to_filter, + ): + """Retrieve full or ID-filtered graph with fallback.""" + if relevant_ids_to_filter is None: + logger.info("Retrieving full graph.") + nodes_data, edges_data = await adapter.get_graph_data() + if not nodes_data or not edges_data: + raise EntityNotFoundError(message="Empty graph projected from the database.") + return nodes_data, edges_data + + get_graph_data_fn = getattr(adapter, "get_id_filtered_graph_data", adapter.get_graph_data) + if getattr(adapter.__class__, "get_id_filtered_graph_data", None): + logger.info("Retrieving ID-filtered graph from database.") + nodes_data, edges_data = await get_graph_data_fn(target_ids=relevant_ids_to_filter) + else: + logger.info("Retrieving full graph from database.") + nodes_data, edges_data = await get_graph_data_fn() + if hasattr(adapter, "get_id_filtered_graph_data") and (not nodes_data or not edges_data): + logger.warning( + "Id filtered graph returned empty, falling back to full graph retrieval." + ) + logger.info("Retrieving full graph") + nodes_data, edges_data = await adapter.get_graph_data() + + if not nodes_data or not edges_data: + raise EntityNotFoundError("Empty graph projected from the database.") + return nodes_data, edges_data + + async def _get_filtered_graph( + self, + adapter, + memory_fragment_filter, + ): + """Retrieve graph filtered by attributes.""" + logger.info("Retrieving graph filtered by memory fragment") + nodes_data, edges_data = await adapter.get_filtered_graph_data( + attribute_filters=memory_fragment_filter + ) + if not nodes_data or not edges_data: + raise EntityNotFoundError(message="Empty filtered graph projected from the database.") + return nodes_data, edges_data + async def project_graph_from_db( self, adapter: Union[GraphDBInterface], @@ -67,40 +129,39 @@ class CogneeGraph(CogneeAbstractGraph): memory_fragment_filter=[], node_type: Optional[Type] = None, node_name: Optional[List[str]] = None, + relevant_ids_to_filter: Optional[List[str]] = None, + triplet_distance_penalty: float = 3.5, ) -> None: if node_dimension < 1 or edge_dimension < 1: raise InvalidDimensionsError() try: + if node_type is not None and node_name not in [None, [], ""]: + nodes_data, edges_data = await self._get_nodeset_subgraph( + adapter, node_type, node_name + ) + elif len(memory_fragment_filter) == 0: + nodes_data, edges_data = await self._get_full_or_id_filtered_graph( + adapter, relevant_ids_to_filter + ) + else: + nodes_data, edges_data = await self._get_filtered_graph( + adapter, memory_fragment_filter + ) + import time start_time = time.time() - - # Determine projection strategy - if node_type is not None and node_name not in [None, [], ""]: - nodes_data, edges_data = await adapter.get_nodeset_subgraph( - node_type=node_type, node_name=node_name - ) - if not nodes_data or not edges_data: - raise EntityNotFoundError( - message="Nodeset does not exist, or empty nodetes projected from the database." - ) - elif len(memory_fragment_filter) == 0: - nodes_data, edges_data = await adapter.get_graph_data() - if not nodes_data or not edges_data: - raise EntityNotFoundError(message="Empty graph projected from the database.") - else: - nodes_data, edges_data = await adapter.get_filtered_graph_data( - attribute_filters=memory_fragment_filter - ) - if not nodes_data or not edges_data: - raise EntityNotFoundError( - message="Empty filtered graph projected from the database." - ) - # Process nodes for node_id, properties in nodes_data: node_attributes = {key: properties.get(key) for key in node_properties_to_project} - self.add_node(Node(str(node_id), node_attributes, dimension=node_dimension)) + self.add_node( + Node( + str(node_id), + node_attributes, + dimension=node_dimension, + node_penalty=triplet_distance_penalty, + ) + ) # Process edges for source_id, target_id, relationship_type, properties in edges_data: @@ -118,6 +179,7 @@ class CogneeGraph(CogneeAbstractGraph): attributes=edge_attributes, directed=directed, dimension=edge_dimension, + edge_penalty=triplet_distance_penalty, ) self.add_edge(edge) diff --git a/cognee/modules/graph/cognee_graph/CogneeGraphElements.py b/cognee/modules/graph/cognee_graph/CogneeGraphElements.py index 0ca9c4fb9..62ef8d9fd 100644 --- a/cognee/modules/graph/cognee_graph/CogneeGraphElements.py +++ b/cognee/modules/graph/cognee_graph/CogneeGraphElements.py @@ -20,13 +20,17 @@ class Node: status: np.ndarray def __init__( - self, node_id: str, attributes: Optional[Dict[str, Any]] = None, dimension: int = 1 + self, + node_id: str, + attributes: Optional[Dict[str, Any]] = None, + dimension: int = 1, + node_penalty: float = 3.5, ): if dimension <= 0: raise InvalidDimensionsError() self.id = node_id self.attributes = attributes if attributes is not None else {} - self.attributes["vector_distance"] = float("inf") + self.attributes["vector_distance"] = node_penalty self.skeleton_neighbours = [] self.skeleton_edges = [] self.status = np.ones(dimension, dtype=int) @@ -105,13 +109,14 @@ class Edge: attributes: Optional[Dict[str, Any]] = None, directed: bool = True, dimension: int = 1, + edge_penalty: float = 3.5, ): if dimension <= 0: raise InvalidDimensionsError() self.node1 = node1 self.node2 = node2 self.attributes = attributes if attributes is not None else {} - self.attributes["vector_distance"] = float("inf") + self.attributes["vector_distance"] = edge_penalty self.directed = directed self.status = np.ones(dimension, dtype=int) diff --git a/cognee/modules/retrieval/graph_completion_context_extension_retriever.py b/cognee/modules/retrieval/graph_completion_context_extension_retriever.py index b07d11fd2..fc49a139b 100644 --- a/cognee/modules/retrieval/graph_completion_context_extension_retriever.py +++ b/cognee/modules/retrieval/graph_completion_context_extension_retriever.py @@ -39,6 +39,8 @@ class GraphCompletionContextExtensionRetriever(GraphCompletionRetriever): node_type: Optional[Type] = None, node_name: Optional[List[str]] = None, save_interaction: bool = False, + wide_search_top_k: Optional[int] = 100, + triplet_distance_penalty: Optional[float] = 3.5, ): super().__init__( user_prompt_path=user_prompt_path, @@ -48,6 +50,8 @@ class GraphCompletionContextExtensionRetriever(GraphCompletionRetriever): node_name=node_name, save_interaction=save_interaction, system_prompt=system_prompt, + wide_search_top_k=wide_search_top_k, + triplet_distance_penalty=triplet_distance_penalty, ) async def get_completion( diff --git a/cognee/modules/retrieval/graph_completion_cot_retriever.py b/cognee/modules/retrieval/graph_completion_cot_retriever.py index eb8f502cb..70fcb6cdb 100644 --- a/cognee/modules/retrieval/graph_completion_cot_retriever.py +++ b/cognee/modules/retrieval/graph_completion_cot_retriever.py @@ -65,6 +65,8 @@ class GraphCompletionCotRetriever(GraphCompletionRetriever): node_type: Optional[Type] = None, node_name: Optional[List[str]] = None, save_interaction: bool = False, + wide_search_top_k: Optional[int] = 100, + triplet_distance_penalty: Optional[float] = 3.5, ): super().__init__( user_prompt_path=user_prompt_path, @@ -74,6 +76,8 @@ class GraphCompletionCotRetriever(GraphCompletionRetriever): node_type=node_type, node_name=node_name, save_interaction=save_interaction, + wide_search_top_k=wide_search_top_k, + triplet_distance_penalty=triplet_distance_penalty, ) self.validation_system_prompt_path = validation_system_prompt_path self.validation_user_prompt_path = validation_user_prompt_path diff --git a/cognee/modules/retrieval/graph_completion_retriever.py b/cognee/modules/retrieval/graph_completion_retriever.py index df77a11ac..89e9e47ce 100644 --- a/cognee/modules/retrieval/graph_completion_retriever.py +++ b/cognee/modules/retrieval/graph_completion_retriever.py @@ -47,6 +47,8 @@ class GraphCompletionRetriever(BaseGraphRetriever): node_type: Optional[Type] = None, node_name: Optional[List[str]] = None, save_interaction: bool = False, + wide_search_top_k: Optional[int] = 100, + triplet_distance_penalty: Optional[float] = 3.5, ): """Initialize retriever with prompt paths and search parameters.""" self.save_interaction = save_interaction @@ -54,8 +56,10 @@ class GraphCompletionRetriever(BaseGraphRetriever): self.system_prompt_path = system_prompt_path self.system_prompt = system_prompt self.top_k = top_k if top_k is not None else 5 + self.wide_search_top_k = wide_search_top_k self.node_type = node_type self.node_name = node_name + self.triplet_distance_penalty = triplet_distance_penalty async def resolve_edges_to_text(self, retrieved_edges: list) -> str: """ @@ -105,6 +109,8 @@ class GraphCompletionRetriever(BaseGraphRetriever): collections=vector_index_collections or None, node_type=self.node_type, node_name=self.node_name, + wide_search_top_k=self.wide_search_top_k, + triplet_distance_penalty=self.triplet_distance_penalty, ) return found_triplets @@ -141,6 +147,10 @@ class GraphCompletionRetriever(BaseGraphRetriever): return triplets + async def convert_retrieved_objects_to_context(self, triplets: List[Edge]): + context = await self.resolve_edges_to_text(triplets) + return context + async def get_completion( self, query: str, diff --git a/cognee/modules/retrieval/graph_summary_completion_retriever.py b/cognee/modules/retrieval/graph_summary_completion_retriever.py index 051f39b22..e31ad126e 100644 --- a/cognee/modules/retrieval/graph_summary_completion_retriever.py +++ b/cognee/modules/retrieval/graph_summary_completion_retriever.py @@ -26,6 +26,8 @@ class GraphSummaryCompletionRetriever(GraphCompletionRetriever): node_type: Optional[Type] = None, node_name: Optional[List[str]] = None, save_interaction: bool = False, + wide_search_top_k: Optional[int] = 100, + triplet_distance_penalty: Optional[float] = 3.5, ): """Initialize retriever with default prompt paths and search parameters.""" super().__init__( @@ -36,6 +38,8 @@ class GraphSummaryCompletionRetriever(GraphCompletionRetriever): node_name=node_name, save_interaction=save_interaction, system_prompt=system_prompt, + wide_search_top_k=wide_search_top_k, + triplet_distance_penalty=triplet_distance_penalty, ) self.summarize_prompt_path = summarize_prompt_path diff --git a/cognee/modules/retrieval/temporal_retriever.py b/cognee/modules/retrieval/temporal_retriever.py index f3da02c15..87d2ab009 100644 --- a/cognee/modules/retrieval/temporal_retriever.py +++ b/cognee/modules/retrieval/temporal_retriever.py @@ -47,6 +47,8 @@ class TemporalRetriever(GraphCompletionRetriever): top_k: Optional[int] = 5, node_type: Optional[Type] = None, node_name: Optional[List[str]] = None, + wide_search_top_k: Optional[int] = 100, + triplet_distance_penalty: Optional[float] = 3.5, ): super().__init__( user_prompt_path=user_prompt_path, @@ -54,6 +56,8 @@ class TemporalRetriever(GraphCompletionRetriever): top_k=top_k, node_type=node_type, node_name=node_name, + wide_search_top_k=wide_search_top_k, + triplet_distance_penalty=triplet_distance_penalty, ) self.user_prompt_path = user_prompt_path self.system_prompt_path = system_prompt_path diff --git a/cognee/modules/retrieval/utils/brute_force_triplet_search.py b/cognee/modules/retrieval/utils/brute_force_triplet_search.py index f8bdbb97d..2f8a545f7 100644 --- a/cognee/modules/retrieval/utils/brute_force_triplet_search.py +++ b/cognee/modules/retrieval/utils/brute_force_triplet_search.py @@ -58,6 +58,8 @@ async def get_memory_fragment( properties_to_project: Optional[List[str]] = None, node_type: Optional[Type] = None, node_name: Optional[List[str]] = None, + relevant_ids_to_filter: Optional[List[str]] = None, + triplet_distance_penalty: Optional[float] = 3.5, ) -> CogneeGraph: """Creates and initializes a CogneeGraph memory fragment with optional property projections.""" if properties_to_project is None: @@ -74,6 +76,8 @@ async def get_memory_fragment( edge_properties_to_project=["relationship_name", "edge_text"], node_type=node_type, node_name=node_name, + relevant_ids_to_filter=relevant_ids_to_filter, + triplet_distance_penalty=triplet_distance_penalty, ) except EntityNotFoundError: @@ -95,6 +99,8 @@ async def brute_force_triplet_search( memory_fragment: Optional[CogneeGraph] = None, node_type: Optional[Type] = None, node_name: Optional[List[str]] = None, + wide_search_top_k: Optional[int] = 100, + triplet_distance_penalty: Optional[float] = 3.5, ) -> List[Edge]: """ Performs a brute force search to retrieve the top triplets from the graph. @@ -107,6 +113,8 @@ async def brute_force_triplet_search( memory_fragment (Optional[CogneeGraph]): Existing memory fragment to reuse. node_type: node type to filter node_name: node name to filter + wide_search_top_k (Optional[int]): Number of initial elements to retrieve from collections + triplet_distance_penalty (Optional[float]): Default distance penalty in graph projection Returns: list: The top triplet results. @@ -116,10 +124,10 @@ async def brute_force_triplet_search( if top_k <= 0: raise ValueError("top_k must be a positive integer.") - if memory_fragment is None: - memory_fragment = await get_memory_fragment( - properties_to_project, node_type=node_type, node_name=node_name - ) + # Setting wide search limit based on the parameters + non_global_search = node_name is None + + wide_search_limit = wide_search_top_k if non_global_search else None if collections is None: collections = [ @@ -140,7 +148,7 @@ async def brute_force_triplet_search( async def search_in_collection(collection_name: str): try: return await vector_engine.search( - collection_name=collection_name, query_vector=query_vector, limit=None + collection_name=collection_name, query_vector=query_vector, limit=wide_search_limit ) except CollectionNotFoundError: return [] @@ -156,15 +164,38 @@ async def brute_force_triplet_search( return [] # Final statistics - projection_time = time.time() - start_time + vector_collection_search_time = time.time() - start_time logger.info( - f"Vector collection retrieval completed: Retrieved distances from {sum(1 for res in results if res)} collections in {projection_time:.2f}s" + f"Vector collection retrieval completed: Retrieved distances from {sum(1 for res in results if res)} collections in {vector_collection_search_time:.2f}s" ) node_distances = {collection: result for collection, result in zip(collections, results)} edge_distances = node_distances.get("EdgeType_relationship_name", None) + if wide_search_limit is not None: + relevant_ids_to_filter = list( + { + str(getattr(scored_node, "id")) + for collection_name, score_collection in node_distances.items() + if collection_name != "EdgeType_relationship_name" + and isinstance(score_collection, (list, tuple)) + for scored_node in score_collection + if getattr(scored_node, "id", None) + } + ) + else: + relevant_ids_to_filter = None + + if memory_fragment is None: + memory_fragment = await get_memory_fragment( + properties_to_project=properties_to_project, + node_type=node_type, + node_name=node_name, + relevant_ids_to_filter=relevant_ids_to_filter, + triplet_distance_penalty=triplet_distance_penalty, + ) + await memory_fragment.map_vector_distances_to_graph_nodes(node_distances=node_distances) await memory_fragment.map_vector_distances_to_graph_edges( vector_engine=vector_engine, query_vector=query_vector, edge_distances=edge_distances diff --git a/cognee/modules/search/methods/get_search_type_tools.py b/cognee/modules/search/methods/get_search_type_tools.py index 72e2db89a..165ec379b 100644 --- a/cognee/modules/search/methods/get_search_type_tools.py +++ b/cognee/modules/search/methods/get_search_type_tools.py @@ -37,6 +37,8 @@ async def get_search_type_tools( node_name: Optional[List[str]] = None, save_interaction: bool = False, last_k: Optional[int] = None, + wide_search_top_k: Optional[int] = 100, + triplet_distance_penalty: Optional[float] = 3.5, ) -> list: search_tasks: dict[SearchType, List[Callable]] = { SearchType.SUMMARIES: [ @@ -67,6 +69,8 @@ async def get_search_type_tools( node_name=node_name, save_interaction=save_interaction, system_prompt=system_prompt, + wide_search_top_k=wide_search_top_k, + triplet_distance_penalty=triplet_distance_penalty, ).get_completion, GraphCompletionRetriever( system_prompt_path=system_prompt_path, @@ -75,6 +79,8 @@ async def get_search_type_tools( node_name=node_name, save_interaction=save_interaction, system_prompt=system_prompt, + wide_search_top_k=wide_search_top_k, + triplet_distance_penalty=triplet_distance_penalty, ).get_context, ], SearchType.GRAPH_COMPLETION_COT: [ @@ -85,6 +91,8 @@ async def get_search_type_tools( node_name=node_name, save_interaction=save_interaction, system_prompt=system_prompt, + wide_search_top_k=wide_search_top_k, + triplet_distance_penalty=triplet_distance_penalty, ).get_completion, GraphCompletionCotRetriever( system_prompt_path=system_prompt_path, @@ -93,6 +101,8 @@ async def get_search_type_tools( node_name=node_name, save_interaction=save_interaction, system_prompt=system_prompt, + wide_search_top_k=wide_search_top_k, + triplet_distance_penalty=triplet_distance_penalty, ).get_context, ], SearchType.GRAPH_COMPLETION_CONTEXT_EXTENSION: [ @@ -103,6 +113,8 @@ async def get_search_type_tools( node_name=node_name, save_interaction=save_interaction, system_prompt=system_prompt, + wide_search_top_k=wide_search_top_k, + triplet_distance_penalty=triplet_distance_penalty, ).get_completion, GraphCompletionContextExtensionRetriever( system_prompt_path=system_prompt_path, @@ -111,6 +123,8 @@ async def get_search_type_tools( node_name=node_name, save_interaction=save_interaction, system_prompt=system_prompt, + wide_search_top_k=wide_search_top_k, + triplet_distance_penalty=triplet_distance_penalty, ).get_context, ], SearchType.GRAPH_SUMMARY_COMPLETION: [ @@ -121,6 +135,8 @@ async def get_search_type_tools( node_name=node_name, save_interaction=save_interaction, system_prompt=system_prompt, + wide_search_top_k=wide_search_top_k, + triplet_distance_penalty=triplet_distance_penalty, ).get_completion, GraphSummaryCompletionRetriever( system_prompt_path=system_prompt_path, @@ -129,6 +145,8 @@ async def get_search_type_tools( node_name=node_name, save_interaction=save_interaction, system_prompt=system_prompt, + wide_search_top_k=wide_search_top_k, + triplet_distance_penalty=triplet_distance_penalty, ).get_context, ], SearchType.CODE: [ @@ -145,8 +163,16 @@ async def get_search_type_tools( ], SearchType.FEEDBACK: [UserQAFeedback(last_k=last_k).add_feedback], SearchType.TEMPORAL: [ - TemporalRetriever(top_k=top_k).get_completion, - TemporalRetriever(top_k=top_k).get_context, + TemporalRetriever( + top_k=top_k, + wide_search_top_k=wide_search_top_k, + triplet_distance_penalty=triplet_distance_penalty, + ).get_completion, + TemporalRetriever( + top_k=top_k, + wide_search_top_k=wide_search_top_k, + triplet_distance_penalty=triplet_distance_penalty, + ).get_context, ], SearchType.CHUNKS_LEXICAL: ( lambda _r=JaccardChunksRetriever(top_k=top_k): [ diff --git a/cognee/modules/search/methods/no_access_control_search.py b/cognee/modules/search/methods/no_access_control_search.py index fcb02da46..3a703bbc9 100644 --- a/cognee/modules/search/methods/no_access_control_search.py +++ b/cognee/modules/search/methods/no_access_control_search.py @@ -24,6 +24,8 @@ async def no_access_control_search( last_k: Optional[int] = None, only_context: bool = False, session_id: Optional[str] = None, + wide_search_top_k: Optional[int] = 100, + triplet_distance_penalty: Optional[float] = 3.5, ) -> Tuple[Any, Union[str, List[Edge]], List[Dataset]]: search_tools = await get_search_type_tools( query_type=query_type, @@ -35,6 +37,8 @@ async def no_access_control_search( node_name=node_name, save_interaction=save_interaction, last_k=last_k, + wide_search_top_k=wide_search_top_k, + triplet_distance_penalty=triplet_distance_penalty, ) graph_engine = await get_graph_engine() is_empty = await graph_engine.is_empty() diff --git a/cognee/modules/search/methods/search.py b/cognee/modules/search/methods/search.py index b4278424b..9f180d607 100644 --- a/cognee/modules/search/methods/search.py +++ b/cognee/modules/search/methods/search.py @@ -47,6 +47,8 @@ async def search( only_context: bool = False, use_combined_context: bool = False, session_id: Optional[str] = None, + wide_search_top_k: Optional[int] = 100, + triplet_distance_penalty: Optional[float] = 3.5, ) -> Union[CombinedSearchResult, List[SearchResult]]: """ @@ -90,6 +92,8 @@ async def search( only_context=only_context, use_combined_context=use_combined_context, session_id=session_id, + wide_search_top_k=wide_search_top_k, + triplet_distance_penalty=triplet_distance_penalty, ) else: search_results = [ @@ -105,6 +109,8 @@ async def search( last_k=last_k, only_context=only_context, session_id=session_id, + wide_search_top_k=wide_search_top_k, + triplet_distance_penalty=triplet_distance_penalty, ) ] @@ -219,6 +225,8 @@ async def authorized_search( only_context: bool = False, use_combined_context: bool = False, session_id: Optional[str] = None, + wide_search_top_k: Optional[int] = 100, + triplet_distance_penalty: Optional[float] = 3.5, ) -> Union[ Tuple[Any, Union[List[Edge], str], List[Dataset]], List[Tuple[Any, Union[List[Edge], str], List[Dataset]]], @@ -246,6 +254,8 @@ async def authorized_search( last_k=last_k, only_context=True, session_id=session_id, + wide_search_top_k=wide_search_top_k, + triplet_distance_penalty=triplet_distance_penalty, ) context = {} @@ -267,6 +277,8 @@ async def authorized_search( node_name=node_name, save_interaction=save_interaction, last_k=last_k, + wide_search_top_k=wide_search_top_k, + triplet_distance_penalty=triplet_distance_penalty, ) search_tools = specific_search_tools if len(search_tools) == 2: @@ -306,6 +318,7 @@ async def authorized_search( last_k=last_k, only_context=only_context, session_id=session_id, + wide_search_top_k=wide_search_top_k, ) return search_results @@ -325,6 +338,8 @@ async def search_in_datasets_context( only_context: bool = False, context: Optional[Any] = None, session_id: Optional[str] = None, + wide_search_top_k: Optional[int] = 100, + triplet_distance_penalty: Optional[float] = 3.5, ) -> List[Tuple[Any, Union[str, List[Edge]], List[Dataset]]]: """ Searches all provided datasets and handles setting up of appropriate database context based on permissions. @@ -345,6 +360,8 @@ async def search_in_datasets_context( only_context: bool = False, context: Optional[Any] = None, session_id: Optional[str] = None, + wide_search_top_k: Optional[int] = 100, + triplet_distance_penalty: Optional[float] = 3.5, ) -> Tuple[Any, Union[str, List[Edge]], List[Dataset]]: # Set database configuration in async context for each dataset user has access for await set_database_global_context_variables(dataset.id, dataset.owner_id) @@ -378,6 +395,8 @@ async def search_in_datasets_context( node_name=node_name, save_interaction=save_interaction, last_k=last_k, + wide_search_top_k=wide_search_top_k, + triplet_distance_penalty=triplet_distance_penalty, ) search_tools = specific_search_tools if len(search_tools) == 2: @@ -413,6 +432,8 @@ async def search_in_datasets_context( only_context=only_context, context=context, session_id=session_id, + wide_search_top_k=wide_search_top_k, + triplet_distance_penalty=triplet_distance_penalty, ) ) diff --git a/cognee/tests/unit/modules/graph/cognee_graph_elements_test.py b/cognee/tests/unit/modules/graph/cognee_graph_elements_test.py index 37ba113b5..1d2b79cf9 100644 --- a/cognee/tests/unit/modules/graph/cognee_graph_elements_test.py +++ b/cognee/tests/unit/modules/graph/cognee_graph_elements_test.py @@ -9,7 +9,7 @@ def test_node_initialization(): """Test that a Node is initialized correctly.""" node = Node("node1", {"attr1": "value1"}, dimension=2) assert node.id == "node1" - assert node.attributes == {"attr1": "value1", "vector_distance": np.inf} + assert node.attributes == {"attr1": "value1", "vector_distance": 3.5} assert len(node.status) == 2 assert np.all(node.status == 1) @@ -96,7 +96,7 @@ def test_edge_initialization(): edge = Edge(node1, node2, {"weight": 10}, directed=False, dimension=2) assert edge.node1 == node1 assert edge.node2 == node2 - assert edge.attributes == {"vector_distance": np.inf, "weight": 10} + assert edge.attributes == {"vector_distance": 3.5, "weight": 10} assert edge.directed is False assert len(edge.status) == 2 assert np.all(edge.status == 1) diff --git a/cognee/tests/unit/modules/graph/cognee_graph_test.py b/cognee/tests/unit/modules/graph/cognee_graph_test.py index 6888648c3..711479387 100644 --- a/cognee/tests/unit/modules/graph/cognee_graph_test.py +++ b/cognee/tests/unit/modules/graph/cognee_graph_test.py @@ -1,4 +1,5 @@ import pytest +from unittest.mock import AsyncMock from cognee.modules.graph.exceptions import EntityNotFoundError, EntityAlreadyExistsError from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph @@ -11,6 +12,30 @@ def setup_graph(): return CogneeGraph() +@pytest.fixture +def mock_adapter(): + """Fixture to create a mock adapter for database operations.""" + adapter = AsyncMock() + return adapter + + +@pytest.fixture +def mock_vector_engine(): + """Fixture to create a mock vector engine.""" + engine = AsyncMock() + engine.search = AsyncMock() + return engine + + +class MockScoredResult: + """Mock class for vector search results.""" + + def __init__(self, id, score, payload=None): + self.id = id + self.score = score + self.payload = payload or {} + + def test_add_node_success(setup_graph): """Test successful addition of a node.""" graph = setup_graph @@ -73,3 +98,433 @@ def test_get_edges_nonexistent_node(setup_graph): graph = setup_graph with pytest.raises(EntityNotFoundError, match="Node with id nonexistent does not exist."): graph.get_edges_from_node("nonexistent") + + +@pytest.mark.asyncio +async def test_project_graph_from_db_full_graph(setup_graph, mock_adapter): + """Test projecting a full graph from database.""" + graph = setup_graph + + nodes_data = [ + ("1", {"name": "Node1", "description": "First node"}), + ("2", {"name": "Node2", "description": "Second node"}), + ] + edges_data = [ + ("1", "2", "CONNECTS_TO", {"relationship_name": "connects"}), + ] + + mock_adapter.get_graph_data = AsyncMock(return_value=(nodes_data, edges_data)) + + await graph.project_graph_from_db( + adapter=mock_adapter, + node_properties_to_project=["name", "description"], + edge_properties_to_project=["relationship_name"], + ) + + assert len(graph.nodes) == 2 + assert len(graph.edges) == 1 + assert graph.get_node("1") is not None + assert graph.get_node("2") is not None + assert graph.edges[0].node1.id == "1" + assert graph.edges[0].node2.id == "2" + + +@pytest.mark.asyncio +async def test_project_graph_from_db_id_filtered(setup_graph, mock_adapter): + """Test projecting an ID-filtered graph from database.""" + graph = setup_graph + + nodes_data = [ + ("1", {"name": "Node1"}), + ("2", {"name": "Node2"}), + ] + edges_data = [ + ("1", "2", "CONNECTS_TO", {"relationship_name": "connects"}), + ] + + mock_adapter.get_id_filtered_graph_data = AsyncMock(return_value=(nodes_data, edges_data)) + + await graph.project_graph_from_db( + adapter=mock_adapter, + node_properties_to_project=["name"], + edge_properties_to_project=["relationship_name"], + relevant_ids_to_filter=["1", "2"], + ) + + assert len(graph.nodes) == 2 + assert len(graph.edges) == 1 + mock_adapter.get_id_filtered_graph_data.assert_called_once() + + +@pytest.mark.asyncio +async def test_project_graph_from_db_nodeset_subgraph(setup_graph, mock_adapter): + """Test projecting a nodeset subgraph filtered by node type and name.""" + graph = setup_graph + + nodes_data = [ + ("1", {"name": "Alice", "type": "Person"}), + ("2", {"name": "Bob", "type": "Person"}), + ] + edges_data = [ + ("1", "2", "KNOWS", {"relationship_name": "knows"}), + ] + + mock_adapter.get_nodeset_subgraph = AsyncMock(return_value=(nodes_data, edges_data)) + + await graph.project_graph_from_db( + adapter=mock_adapter, + node_properties_to_project=["name", "type"], + edge_properties_to_project=["relationship_name"], + node_type="Person", + node_name=["Alice"], + ) + + assert len(graph.nodes) == 2 + assert graph.get_node("1") is not None + assert len(graph.edges) == 1 + mock_adapter.get_nodeset_subgraph.assert_called_once() + + +@pytest.mark.asyncio +async def test_project_graph_from_db_empty_graph(setup_graph, mock_adapter): + """Test projecting empty graph raises EntityNotFoundError.""" + graph = setup_graph + + mock_adapter.get_graph_data = AsyncMock(return_value=([], [])) + + with pytest.raises(EntityNotFoundError, match="Empty graph projected from the database."): + await graph.project_graph_from_db( + adapter=mock_adapter, + node_properties_to_project=["name"], + edge_properties_to_project=[], + ) + + +@pytest.mark.asyncio +async def test_project_graph_from_db_missing_nodes(setup_graph, mock_adapter): + """Test that edges referencing missing nodes raise error.""" + graph = setup_graph + + nodes_data = [ + ("1", {"name": "Node1"}), + ] + edges_data = [ + ("1", "999", "CONNECTS_TO", {"relationship_name": "connects"}), + ] + + mock_adapter.get_graph_data = AsyncMock(return_value=(nodes_data, edges_data)) + + with pytest.raises(EntityNotFoundError, match="Edge references nonexistent nodes"): + await graph.project_graph_from_db( + adapter=mock_adapter, + node_properties_to_project=["name"], + edge_properties_to_project=["relationship_name"], + ) + + +@pytest.mark.asyncio +async def test_map_vector_distances_to_graph_nodes(setup_graph): + """Test mapping vector distances to graph nodes.""" + graph = setup_graph + + node1 = Node("1", {"name": "Node1"}) + node2 = Node("2", {"name": "Node2"}) + graph.add_node(node1) + graph.add_node(node2) + + node_distances = { + "Entity_name": [ + MockScoredResult("1", 0.95), + MockScoredResult("2", 0.87), + ] + } + + await graph.map_vector_distances_to_graph_nodes(node_distances) + + assert graph.get_node("1").attributes.get("vector_distance") == 0.95 + assert graph.get_node("2").attributes.get("vector_distance") == 0.87 + + +@pytest.mark.asyncio +async def test_map_vector_distances_partial_node_coverage(setup_graph): + """Test mapping vector distances when only some nodes have results.""" + graph = setup_graph + + node1 = Node("1", {"name": "Node1"}) + node2 = Node("2", {"name": "Node2"}) + node3 = Node("3", {"name": "Node3"}) + graph.add_node(node1) + graph.add_node(node2) + graph.add_node(node3) + + node_distances = { + "Entity_name": [ + MockScoredResult("1", 0.95), + MockScoredResult("2", 0.87), + ] + } + + await graph.map_vector_distances_to_graph_nodes(node_distances) + + assert graph.get_node("1").attributes.get("vector_distance") == 0.95 + assert graph.get_node("2").attributes.get("vector_distance") == 0.87 + assert graph.get_node("3").attributes.get("vector_distance") == 3.5 + + +@pytest.mark.asyncio +async def test_map_vector_distances_multiple_categories(setup_graph): + """Test mapping vector distances from multiple collection categories.""" + graph = setup_graph + + # Create nodes + node1 = Node("1") + node2 = Node("2") + node3 = Node("3") + node4 = Node("4") + graph.add_node(node1) + graph.add_node(node2) + graph.add_node(node3) + graph.add_node(node4) + + node_distances = { + "Entity_name": [ + MockScoredResult("1", 0.95), + MockScoredResult("2", 0.87), + ], + "TextSummary_text": [ + MockScoredResult("3", 0.92), + ], + } + + await graph.map_vector_distances_to_graph_nodes(node_distances) + + assert graph.get_node("1").attributes.get("vector_distance") == 0.95 + assert graph.get_node("2").attributes.get("vector_distance") == 0.87 + assert graph.get_node("3").attributes.get("vector_distance") == 0.92 + assert graph.get_node("4").attributes.get("vector_distance") == 3.5 + + +@pytest.mark.asyncio +async def test_map_vector_distances_to_graph_edges_with_payload(setup_graph, mock_vector_engine): + """Test mapping vector distances to edges when edge_distances provided.""" + graph = setup_graph + + node1 = Node("1") + node2 = Node("2") + graph.add_node(node1) + graph.add_node(node2) + + edge = Edge( + node1, + node2, + attributes={"edge_text": "CONNECTS_TO", "relationship_type": "connects"}, + ) + graph.add_edge(edge) + + edge_distances = [ + MockScoredResult("e1", 0.92, payload={"text": "CONNECTS_TO"}), + ] + + await graph.map_vector_distances_to_graph_edges( + vector_engine=mock_vector_engine, + query_vector=[0.1, 0.2, 0.3], + edge_distances=edge_distances, + ) + + assert graph.edges[0].attributes.get("vector_distance") == 0.92 + + +@pytest.mark.asyncio +async def test_map_vector_distances_to_graph_edges_search(setup_graph, mock_vector_engine): + """Test mapping edge distances when searching for them.""" + graph = setup_graph + + node1 = Node("1") + node2 = Node("2") + graph.add_node(node1) + graph.add_node(node2) + + edge = Edge( + node1, + node2, + attributes={"edge_text": "CONNECTS_TO", "relationship_type": "connects"}, + ) + graph.add_edge(edge) + + mock_vector_engine.search.return_value = [ + MockScoredResult("e1", 0.88, payload={"text": "CONNECTS_TO"}), + ] + + await graph.map_vector_distances_to_graph_edges( + vector_engine=mock_vector_engine, + query_vector=[0.1, 0.2, 0.3], + edge_distances=None, + ) + + mock_vector_engine.search.assert_called_once() + assert graph.edges[0].attributes.get("vector_distance") == 0.88 + + +@pytest.mark.asyncio +async def test_map_vector_distances_partial_edge_coverage(setup_graph, mock_vector_engine): + """Test mapping edge distances when only some edges have results.""" + graph = setup_graph + + node1 = Node("1") + node2 = Node("2") + node3 = Node("3") + graph.add_node(node1) + graph.add_node(node2) + graph.add_node(node3) + + edge1 = Edge(node1, node2, attributes={"edge_text": "CONNECTS_TO"}) + edge2 = Edge(node2, node3, attributes={"edge_text": "DEPENDS_ON"}) + graph.add_edge(edge1) + graph.add_edge(edge2) + + edge_distances = [ + MockScoredResult("e1", 0.92, payload={"text": "CONNECTS_TO"}), + ] + + await graph.map_vector_distances_to_graph_edges( + vector_engine=mock_vector_engine, + query_vector=[0.1, 0.2, 0.3], + edge_distances=edge_distances, + ) + + assert graph.edges[0].attributes.get("vector_distance") == 0.92 + assert graph.edges[1].attributes.get("vector_distance") == 3.5 + + +@pytest.mark.asyncio +async def test_map_vector_distances_edges_fallback_to_relationship_type( + setup_graph, mock_vector_engine +): + """Test that edge mapping falls back to relationship_type when edge_text is missing.""" + graph = setup_graph + + node1 = Node("1") + node2 = Node("2") + graph.add_node(node1) + graph.add_node(node2) + + edge = Edge( + node1, + node2, + attributes={"relationship_type": "KNOWS"}, + ) + graph.add_edge(edge) + + edge_distances = [ + MockScoredResult("e1", 0.85, payload={"text": "KNOWS"}), + ] + + await graph.map_vector_distances_to_graph_edges( + vector_engine=mock_vector_engine, + query_vector=[0.1, 0.2, 0.3], + edge_distances=edge_distances, + ) + + assert graph.edges[0].attributes.get("vector_distance") == 0.85 + + +@pytest.mark.asyncio +async def test_map_vector_distances_no_edge_matches(setup_graph, mock_vector_engine): + """Test edge mapping when no edges match the distance results.""" + graph = setup_graph + + node1 = Node("1") + node2 = Node("2") + graph.add_node(node1) + graph.add_node(node2) + + edge = Edge( + node1, + node2, + attributes={"edge_text": "CONNECTS_TO", "relationship_type": "connects"}, + ) + graph.add_edge(edge) + + edge_distances = [ + MockScoredResult("e1", 0.92, payload={"text": "SOME_OTHER_EDGE"}), + ] + + await graph.map_vector_distances_to_graph_edges( + vector_engine=mock_vector_engine, + query_vector=[0.1, 0.2, 0.3], + edge_distances=edge_distances, + ) + + assert graph.edges[0].attributes.get("vector_distance") == 3.5 + + +@pytest.mark.asyncio +async def test_map_vector_distances_invalid_query_vector(setup_graph, mock_vector_engine): + """Test that invalid query vector raises error.""" + graph = setup_graph + + with pytest.raises(ValueError, match="Failed to generate query embedding"): + await graph.map_vector_distances_to_graph_edges( + vector_engine=mock_vector_engine, + query_vector=[], + edge_distances=None, + ) + + +@pytest.mark.asyncio +async def test_calculate_top_triplet_importances(setup_graph): + """Test calculating top triplet importances by score.""" + graph = setup_graph + + node1 = Node("1") + node2 = Node("2") + node3 = Node("3") + node4 = Node("4") + + node1.add_attribute("vector_distance", 0.9) + node2.add_attribute("vector_distance", 0.8) + node3.add_attribute("vector_distance", 0.7) + node4.add_attribute("vector_distance", 0.6) + + graph.add_node(node1) + graph.add_node(node2) + graph.add_node(node3) + graph.add_node(node4) + + edge1 = Edge(node1, node2) + edge2 = Edge(node2, node3) + edge3 = Edge(node3, node4) + + edge1.add_attribute("vector_distance", 0.85) + edge2.add_attribute("vector_distance", 0.75) + edge3.add_attribute("vector_distance", 0.65) + + graph.add_edge(edge1) + graph.add_edge(edge2) + graph.add_edge(edge3) + + top_triplets = await graph.calculate_top_triplet_importances(k=2) + + assert len(top_triplets) == 2 + + assert top_triplets[0] == edge3 + assert top_triplets[1] == edge2 + + +@pytest.mark.asyncio +async def test_calculate_top_triplet_importances_default_distances(setup_graph): + """Test calculating importances when nodes/edges have no vector distances.""" + graph = setup_graph + + node1 = Node("1") + node2 = Node("2") + graph.add_node(node1) + graph.add_node(node2) + + edge = Edge(node1, node2) + graph.add_edge(edge) + + top_triplets = await graph.calculate_top_triplet_importances(k=1) + + assert len(top_triplets) == 1 + assert top_triplets[0] == edge diff --git a/cognee/tests/unit/modules/retrieval/test_brute_force_triplet_search.py b/cognee/tests/unit/modules/retrieval/test_brute_force_triplet_search.py new file mode 100644 index 000000000..5eb6fb105 --- /dev/null +++ b/cognee/tests/unit/modules/retrieval/test_brute_force_triplet_search.py @@ -0,0 +1,582 @@ +import pytest +from unittest.mock import AsyncMock, patch + +from cognee.modules.retrieval.utils.brute_force_triplet_search import ( + brute_force_triplet_search, + get_memory_fragment, +) +from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph +from cognee.modules.graph.exceptions.exceptions import EntityNotFoundError + + +class MockScoredResult: + """Mock class for vector search results.""" + + def __init__(self, id, score, payload=None): + self.id = id + self.score = score + self.payload = payload or {} + + +@pytest.mark.asyncio +async def test_brute_force_triplet_search_empty_query(): + """Test that empty query raises ValueError.""" + with pytest.raises(ValueError, match="The query must be a non-empty string."): + await brute_force_triplet_search(query="") + + +@pytest.mark.asyncio +async def test_brute_force_triplet_search_none_query(): + """Test that None query raises ValueError.""" + with pytest.raises(ValueError, match="The query must be a non-empty string."): + await brute_force_triplet_search(query=None) + + +@pytest.mark.asyncio +async def test_brute_force_triplet_search_negative_top_k(): + """Test that negative top_k raises ValueError.""" + with pytest.raises(ValueError, match="top_k must be a positive integer."): + await brute_force_triplet_search(query="test query", top_k=-1) + + +@pytest.mark.asyncio +async def test_brute_force_triplet_search_zero_top_k(): + """Test that zero top_k raises ValueError.""" + with pytest.raises(ValueError, match="top_k must be a positive integer."): + await brute_force_triplet_search(query="test query", top_k=0) + + +@pytest.mark.asyncio +async def test_brute_force_triplet_search_wide_search_limit_global_search(): + """Test that wide_search_limit is applied for global search (node_name=None).""" + mock_vector_engine = AsyncMock() + mock_vector_engine.embedding_engine = AsyncMock() + mock_vector_engine.embedding_engine.embed_text = AsyncMock(return_value=[[0.1, 0.2, 0.3]]) + mock_vector_engine.search = AsyncMock(return_value=[]) + + with patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_vector_engine", + return_value=mock_vector_engine, + ): + await brute_force_triplet_search( + query="test", + node_name=None, # Global search + wide_search_top_k=75, + ) + + for call in mock_vector_engine.search.call_args_list: + assert call[1]["limit"] == 75 + + +@pytest.mark.asyncio +async def test_brute_force_triplet_search_wide_search_limit_filtered_search(): + """Test that wide_search_limit is None for filtered search (node_name provided).""" + mock_vector_engine = AsyncMock() + mock_vector_engine.embedding_engine = AsyncMock() + mock_vector_engine.embedding_engine.embed_text = AsyncMock(return_value=[[0.1, 0.2, 0.3]]) + mock_vector_engine.search = AsyncMock(return_value=[]) + + with patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_vector_engine", + return_value=mock_vector_engine, + ): + await brute_force_triplet_search( + query="test", + node_name=["Node1"], + wide_search_top_k=50, + ) + + for call in mock_vector_engine.search.call_args_list: + assert call[1]["limit"] is None + + +@pytest.mark.asyncio +async def test_brute_force_triplet_search_wide_search_default(): + """Test that wide_search_top_k defaults to 100.""" + mock_vector_engine = AsyncMock() + mock_vector_engine.embedding_engine = AsyncMock() + mock_vector_engine.embedding_engine.embed_text = AsyncMock(return_value=[[0.1, 0.2, 0.3]]) + mock_vector_engine.search = AsyncMock(return_value=[]) + + with patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_vector_engine", + return_value=mock_vector_engine, + ): + await brute_force_triplet_search(query="test", node_name=None) + + for call in mock_vector_engine.search.call_args_list: + assert call[1]["limit"] == 100 + + +@pytest.mark.asyncio +async def test_brute_force_triplet_search_default_collections(): + """Test that default collections are used when none provided.""" + mock_vector_engine = AsyncMock() + mock_vector_engine.embedding_engine = AsyncMock() + mock_vector_engine.embedding_engine.embed_text = AsyncMock(return_value=[[0.1, 0.2, 0.3]]) + mock_vector_engine.search = AsyncMock(return_value=[]) + + with patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_vector_engine", + return_value=mock_vector_engine, + ): + await brute_force_triplet_search(query="test") + + expected_collections = [ + "Entity_name", + "TextSummary_text", + "EntityType_name", + "DocumentChunk_text", + ] + + call_collections = [ + call[1]["collection_name"] for call in mock_vector_engine.search.call_args_list + ] + assert call_collections == expected_collections + + +@pytest.mark.asyncio +async def test_brute_force_triplet_search_custom_collections(): + """Test that custom collections are used when provided.""" + mock_vector_engine = AsyncMock() + mock_vector_engine.embedding_engine = AsyncMock() + mock_vector_engine.embedding_engine.embed_text = AsyncMock(return_value=[[0.1, 0.2, 0.3]]) + mock_vector_engine.search = AsyncMock(return_value=[]) + + custom_collections = ["CustomCol1", "CustomCol2"] + + with patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_vector_engine", + return_value=mock_vector_engine, + ): + await brute_force_triplet_search(query="test", collections=custom_collections) + + call_collections = [ + call[1]["collection_name"] for call in mock_vector_engine.search.call_args_list + ] + assert call_collections == custom_collections + + +@pytest.mark.asyncio +async def test_brute_force_triplet_search_all_collections_empty(): + """Test that empty list is returned when all collections return no results.""" + mock_vector_engine = AsyncMock() + mock_vector_engine.embedding_engine = AsyncMock() + mock_vector_engine.embedding_engine.embed_text = AsyncMock(return_value=[[0.1, 0.2, 0.3]]) + mock_vector_engine.search = AsyncMock(return_value=[]) + + with patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_vector_engine", + return_value=mock_vector_engine, + ): + results = await brute_force_triplet_search(query="test") + assert results == [] + + +# Tests for query embedding + + +@pytest.mark.asyncio +async def test_brute_force_triplet_search_embeds_query(): + """Test that query is embedded before searching.""" + query_text = "test query" + expected_vector = [0.1, 0.2, 0.3] + + mock_vector_engine = AsyncMock() + mock_vector_engine.embedding_engine = AsyncMock() + mock_vector_engine.embedding_engine.embed_text = AsyncMock(return_value=[expected_vector]) + mock_vector_engine.search = AsyncMock(return_value=[]) + + with patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_vector_engine", + return_value=mock_vector_engine, + ): + await brute_force_triplet_search(query=query_text) + + mock_vector_engine.embedding_engine.embed_text.assert_called_once_with([query_text]) + + for call in mock_vector_engine.search.call_args_list: + assert call[1]["query_vector"] == expected_vector + + +@pytest.mark.asyncio +async def test_brute_force_triplet_search_extracts_node_ids_global_search(): + """Test that node IDs are extracted from search results for global search.""" + scored_results = [ + MockScoredResult("node1", 0.95), + MockScoredResult("node2", 0.87), + MockScoredResult("node3", 0.92), + ] + + mock_vector_engine = AsyncMock() + mock_vector_engine.embedding_engine = AsyncMock() + mock_vector_engine.embedding_engine.embed_text = AsyncMock(return_value=[[0.1, 0.2, 0.3]]) + mock_vector_engine.search = AsyncMock(return_value=scored_results) + + mock_fragment = AsyncMock( + map_vector_distances_to_graph_nodes=AsyncMock(), + map_vector_distances_to_graph_edges=AsyncMock(), + calculate_top_triplet_importances=AsyncMock(return_value=[]), + ) + + with ( + patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_vector_engine", + return_value=mock_vector_engine, + ), + patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_memory_fragment", + return_value=mock_fragment, + ) as mock_get_fragment_fn, + ): + await brute_force_triplet_search(query="test", node_name=None) + + call_kwargs = mock_get_fragment_fn.call_args[1] + assert set(call_kwargs["relevant_ids_to_filter"]) == {"node1", "node2", "node3"} + + +@pytest.mark.asyncio +async def test_brute_force_triplet_search_reuses_provided_fragment(): + """Test that provided memory fragment is reused instead of creating new one.""" + provided_fragment = AsyncMock( + map_vector_distances_to_graph_nodes=AsyncMock(), + map_vector_distances_to_graph_edges=AsyncMock(), + calculate_top_triplet_importances=AsyncMock(return_value=[]), + ) + + mock_vector_engine = AsyncMock() + mock_vector_engine.embedding_engine = AsyncMock() + mock_vector_engine.embedding_engine.embed_text = AsyncMock(return_value=[[0.1, 0.2, 0.3]]) + mock_vector_engine.search = AsyncMock(return_value=[MockScoredResult("n1", 0.95)]) + + with ( + patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_vector_engine", + return_value=mock_vector_engine, + ), + patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_memory_fragment" + ) as mock_get_fragment, + ): + await brute_force_triplet_search( + query="test", + memory_fragment=provided_fragment, + node_name=["node"], + ) + + mock_get_fragment.assert_not_called() + + +@pytest.mark.asyncio +async def test_brute_force_triplet_search_creates_fragment_when_not_provided(): + """Test that memory fragment is created when not provided.""" + mock_vector_engine = AsyncMock() + mock_vector_engine.embedding_engine = AsyncMock() + mock_vector_engine.embedding_engine.embed_text = AsyncMock(return_value=[[0.1, 0.2, 0.3]]) + mock_vector_engine.search = AsyncMock(return_value=[MockScoredResult("n1", 0.95)]) + + mock_fragment = AsyncMock( + map_vector_distances_to_graph_nodes=AsyncMock(), + map_vector_distances_to_graph_edges=AsyncMock(), + calculate_top_triplet_importances=AsyncMock(return_value=[]), + ) + + with ( + patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_vector_engine", + return_value=mock_vector_engine, + ), + patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_memory_fragment", + return_value=mock_fragment, + ) as mock_get_fragment, + ): + await brute_force_triplet_search(query="test", node_name=["node"]) + + mock_get_fragment.assert_called_once() + + +@pytest.mark.asyncio +async def test_brute_force_triplet_search_passes_top_k_to_importance_calculation(): + """Test that custom top_k is passed to importance calculation.""" + mock_vector_engine = AsyncMock() + mock_vector_engine.embedding_engine = AsyncMock() + mock_vector_engine.embedding_engine.embed_text = AsyncMock(return_value=[[0.1, 0.2, 0.3]]) + mock_vector_engine.search = AsyncMock(return_value=[MockScoredResult("n1", 0.95)]) + + mock_fragment = AsyncMock( + map_vector_distances_to_graph_nodes=AsyncMock(), + map_vector_distances_to_graph_edges=AsyncMock(), + calculate_top_triplet_importances=AsyncMock(return_value=[]), + ) + + with ( + patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_vector_engine", + return_value=mock_vector_engine, + ), + patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_memory_fragment", + return_value=mock_fragment, + ), + ): + custom_top_k = 15 + await brute_force_triplet_search(query="test", top_k=custom_top_k, node_name=["n"]) + + mock_fragment.calculate_top_triplet_importances.assert_called_once_with(k=custom_top_k) + + +@pytest.mark.asyncio +async def test_get_memory_fragment_returns_empty_graph_on_entity_not_found(): + """Test that get_memory_fragment returns empty graph when entity not found.""" + mock_graph_engine = AsyncMock() + mock_graph_engine.project_graph_from_db = AsyncMock( + side_effect=EntityNotFoundError("Entity not found") + ) + + with patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_graph_engine", + return_value=mock_graph_engine, + ): + fragment = await get_memory_fragment() + + assert isinstance(fragment, CogneeGraph) + assert len(fragment.nodes) == 0 + + +@pytest.mark.asyncio +async def test_get_memory_fragment_returns_empty_graph_on_error(): + """Test that get_memory_fragment returns empty graph on generic error.""" + mock_graph_engine = AsyncMock() + mock_graph_engine.project_graph_from_db = AsyncMock(side_effect=Exception("Generic error")) + + with patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_graph_engine", + return_value=mock_graph_engine, + ): + fragment = await get_memory_fragment() + + assert isinstance(fragment, CogneeGraph) + assert len(fragment.nodes) == 0 + + +@pytest.mark.asyncio +async def test_brute_force_triplet_search_deduplicates_node_ids(): + """Test that duplicate node IDs across collections are deduplicated.""" + + def search_side_effect(*args, **kwargs): + collection_name = kwargs.get("collection_name") + if collection_name == "Entity_name": + return [ + MockScoredResult("node1", 0.95), + MockScoredResult("node2", 0.87), + ] + elif collection_name == "TextSummary_text": + return [ + MockScoredResult("node1", 0.90), + MockScoredResult("node3", 0.92), + ] + else: + return [] + + mock_vector_engine = AsyncMock() + mock_vector_engine.embedding_engine = AsyncMock() + mock_vector_engine.embedding_engine.embed_text = AsyncMock(return_value=[[0.1, 0.2, 0.3]]) + mock_vector_engine.search = AsyncMock(side_effect=search_side_effect) + + mock_fragment = AsyncMock( + map_vector_distances_to_graph_nodes=AsyncMock(), + map_vector_distances_to_graph_edges=AsyncMock(), + calculate_top_triplet_importances=AsyncMock(return_value=[]), + ) + + with ( + patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_vector_engine", + return_value=mock_vector_engine, + ), + patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_memory_fragment", + return_value=mock_fragment, + ) as mock_get_fragment_fn, + ): + await brute_force_triplet_search(query="test", node_name=None) + + call_kwargs = mock_get_fragment_fn.call_args[1] + assert set(call_kwargs["relevant_ids_to_filter"]) == {"node1", "node2", "node3"} + assert len(call_kwargs["relevant_ids_to_filter"]) == 3 + + +@pytest.mark.asyncio +async def test_brute_force_triplet_search_excludes_edge_collection(): + """Test that EdgeType_relationship_name collection is excluded from ID extraction.""" + + def search_side_effect(*args, **kwargs): + collection_name = kwargs.get("collection_name") + if collection_name == "Entity_name": + return [MockScoredResult("node1", 0.95)] + elif collection_name == "EdgeType_relationship_name": + return [MockScoredResult("edge1", 0.88)] + else: + return [] + + mock_vector_engine = AsyncMock() + mock_vector_engine.embedding_engine = AsyncMock() + mock_vector_engine.embedding_engine.embed_text = AsyncMock(return_value=[[0.1, 0.2, 0.3]]) + mock_vector_engine.search = AsyncMock(side_effect=search_side_effect) + + mock_fragment = AsyncMock( + map_vector_distances_to_graph_nodes=AsyncMock(), + map_vector_distances_to_graph_edges=AsyncMock(), + calculate_top_triplet_importances=AsyncMock(return_value=[]), + ) + + with ( + patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_vector_engine", + return_value=mock_vector_engine, + ), + patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_memory_fragment", + return_value=mock_fragment, + ) as mock_get_fragment_fn, + ): + await brute_force_triplet_search( + query="test", + node_name=None, + collections=["Entity_name", "EdgeType_relationship_name"], + ) + + call_kwargs = mock_get_fragment_fn.call_args[1] + assert call_kwargs["relevant_ids_to_filter"] == ["node1"] + + +@pytest.mark.asyncio +async def test_brute_force_triplet_search_skips_nodes_without_ids(): + """Test that nodes without ID attribute are skipped.""" + + class ScoredResultNoId: + """Mock result without id attribute.""" + + def __init__(self, score): + self.score = score + + def search_side_effect(*args, **kwargs): + collection_name = kwargs.get("collection_name") + if collection_name == "Entity_name": + return [ + MockScoredResult("node1", 0.95), + ScoredResultNoId(0.90), + MockScoredResult("node2", 0.87), + ] + else: + return [] + + mock_vector_engine = AsyncMock() + mock_vector_engine.embedding_engine = AsyncMock() + mock_vector_engine.embedding_engine.embed_text = AsyncMock(return_value=[[0.1, 0.2, 0.3]]) + mock_vector_engine.search = AsyncMock(side_effect=search_side_effect) + + mock_fragment = AsyncMock( + map_vector_distances_to_graph_nodes=AsyncMock(), + map_vector_distances_to_graph_edges=AsyncMock(), + calculate_top_triplet_importances=AsyncMock(return_value=[]), + ) + + with ( + patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_vector_engine", + return_value=mock_vector_engine, + ), + patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_memory_fragment", + return_value=mock_fragment, + ) as mock_get_fragment_fn, + ): + await brute_force_triplet_search(query="test", node_name=None) + + call_kwargs = mock_get_fragment_fn.call_args[1] + assert set(call_kwargs["relevant_ids_to_filter"]) == {"node1", "node2"} + + +@pytest.mark.asyncio +async def test_brute_force_triplet_search_handles_tuple_results(): + """Test that both list and tuple results are handled correctly.""" + + def search_side_effect(*args, **kwargs): + collection_name = kwargs.get("collection_name") + if collection_name == "Entity_name": + return ( + MockScoredResult("node1", 0.95), + MockScoredResult("node2", 0.87), + ) + else: + return [] + + mock_vector_engine = AsyncMock() + mock_vector_engine.embedding_engine = AsyncMock() + mock_vector_engine.embedding_engine.embed_text = AsyncMock(return_value=[[0.1, 0.2, 0.3]]) + mock_vector_engine.search = AsyncMock(side_effect=search_side_effect) + + mock_fragment = AsyncMock( + map_vector_distances_to_graph_nodes=AsyncMock(), + map_vector_distances_to_graph_edges=AsyncMock(), + calculate_top_triplet_importances=AsyncMock(return_value=[]), + ) + + with ( + patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_vector_engine", + return_value=mock_vector_engine, + ), + patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_memory_fragment", + return_value=mock_fragment, + ) as mock_get_fragment_fn, + ): + await brute_force_triplet_search(query="test", node_name=None) + + call_kwargs = mock_get_fragment_fn.call_args[1] + assert set(call_kwargs["relevant_ids_to_filter"]) == {"node1", "node2"} + + +@pytest.mark.asyncio +async def test_brute_force_triplet_search_mixed_empty_collections(): + """Test ID extraction with mixed empty and non-empty collections.""" + + def search_side_effect(*args, **kwargs): + collection_name = kwargs.get("collection_name") + if collection_name == "Entity_name": + return [MockScoredResult("node1", 0.95)] + elif collection_name == "TextSummary_text": + return [] + elif collection_name == "EntityType_name": + return [MockScoredResult("node2", 0.92)] + else: + return [] + + mock_vector_engine = AsyncMock() + mock_vector_engine.embedding_engine = AsyncMock() + mock_vector_engine.embedding_engine.embed_text = AsyncMock(return_value=[[0.1, 0.2, 0.3]]) + mock_vector_engine.search = AsyncMock(side_effect=search_side_effect) + + mock_fragment = AsyncMock( + map_vector_distances_to_graph_nodes=AsyncMock(), + map_vector_distances_to_graph_edges=AsyncMock(), + calculate_top_triplet_importances=AsyncMock(return_value=[]), + ) + + with ( + patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_vector_engine", + return_value=mock_vector_engine, + ), + patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_memory_fragment", + return_value=mock_fragment, + ) as mock_get_fragment_fn, + ): + await brute_force_triplet_search(query="test", node_name=None) + + call_kwargs = mock_get_fragment_fn.call_args[1] + assert set(call_kwargs["relevant_ids_to_filter"]) == {"node1", "node2"}