From ac5118ee34c4bd149ac26d042e2ffe5292ee3459 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Wed, 15 Oct 2025 17:28:51 +0200 Subject: [PATCH 001/129] test:Add load test --- cognee/tests/load_test.py | 61 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 cognee/tests/load_test.py diff --git a/cognee/tests/load_test.py b/cognee/tests/load_test.py new file mode 100644 index 000000000..da9b74ab9 --- /dev/null +++ b/cognee/tests/load_test.py @@ -0,0 +1,61 @@ +import os +import pathlib +import asyncio +import time + +import cognee +from cognee.modules.search.types import SearchType +from cognee.shared.logging_utils import get_logger + +logger = get_logger() + +async def helper_func(num_of_searches): + + start_time = time.time() + + await cognee.cognify() + + await asyncio.gather( + *[ + cognee.search(query_text="Tell me about AI", query_type=SearchType.GRAPH_COMPLETION) + for _ in range(num_of_searches) + ] + ) + + end_time = time.time() + + return end_time - start_time + +async def main(): + + file_path = os.path.join( + pathlib.Path(__file__).resolve().parent, "test_data/artificial-intelligence.pdf" + ) + + num_of_pdfs = 10 + num_of_reps = 5 + upper_boundary_minutes = 3 + average_minutes = 1.5 + + await asyncio.gather( + *[ + cognee.add(file_path, dataset_name=f"dataset_{i}") + for i in range(num_of_pdfs) + ] + ) + + recorded_times = await asyncio.gather( + *[helper_func(num_of_pdfs) for _ in range(num_of_reps)] + ) + + average_recorded_time = sum(recorded_times) / len(recorded_times) + + assert average_recorded_time <= average_minutes * 60 + + assert all(rec_time <= upper_boundary_minutes * 60 for rec_time in recorded_times) + + return + + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file From c16459d236a6e07b9267d323387b2be217fd5b46 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Wed, 15 Oct 2025 17:58:05 +0200 Subject: [PATCH 002/129] test: Add prune step to the test --- 
cognee/tests/load_test.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cognee/tests/load_test.py b/cognee/tests/load_test.py index da9b74ab9..c44efad00 100644 --- a/cognee/tests/load_test.py +++ b/cognee/tests/load_test.py @@ -9,7 +9,7 @@ from cognee.shared.logging_utils import get_logger logger = get_logger() -async def helper_func(num_of_searches): +async def process_and_search(num_of_searches): start_time = time.time() @@ -37,6 +37,9 @@ async def main(): upper_boundary_minutes = 3 average_minutes = 1.5 + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + await asyncio.gather( *[ cognee.add(file_path, dataset_name=f"dataset_{i}") @@ -45,7 +48,7 @@ async def main(): ) recorded_times = await asyncio.gather( - *[helper_func(num_of_pdfs) for _ in range(num_of_reps)] + *[process_and_search(num_of_pdfs) for _ in range(num_of_reps)] ) average_recorded_time = sum(recorded_times) / len(recorded_times) @@ -54,8 +57,6 @@ async def main(): assert all(rec_time <= upper_boundary_minutes * 60 for rec_time in recorded_times) - return - if __name__ == "__main__": asyncio.run(main()) \ No newline at end of file From a8ff50ceae262868cb303707f5396abe07cfed38 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Fri, 17 Oct 2025 18:09:01 +0200 Subject: [PATCH 003/129] feat: Initial multi-tenancy commit --- cognee/modules/data/methods/create_dataset.py | 5 +-- cognee/modules/data/models/Dataset.py | 1 + cognee/modules/users/methods/create_user.py | 35 +++++-------------- .../modules/users/methods/get_default_user.py | 7 +--- cognee/modules/users/methods/get_user.py | 2 +- .../users/methods/get_user_by_email.py | 2 +- cognee/modules/users/models/Tenant.py | 13 ++++--- cognee/modules/users/models/User.py | 11 +++--- cognee/modules/users/models/UserTenant.py | 12 +++++++ cognee/modules/users/models/__init__.py | 1 + .../get_all_user_permission_datasets.py | 20 +++++------ .../tenants/methods/add_user_to_tenant.py | 25 +++++++++---- 
.../users/tenants/methods/create_tenant.py | 16 ++++++--- examples/python/permissions_example.py | 4 ++- 14 files changed, 82 insertions(+), 72 deletions(-) create mode 100644 cognee/modules/users/models/UserTenant.py diff --git a/cognee/modules/data/methods/create_dataset.py b/cognee/modules/data/methods/create_dataset.py index c080de0e8..280c9e105 100644 --- a/cognee/modules/data/methods/create_dataset.py +++ b/cognee/modules/data/methods/create_dataset.py @@ -22,8 +22,9 @@ async def create_dataset(dataset_name: str, user: User, session: AsyncSession) - if dataset is None: # Dataset id should be generated based on dataset_name and owner_id/user so multiple users can use the same dataset_name dataset_id = await get_unique_dataset_id(dataset_name=dataset_name, user=user) - dataset = Dataset(id=dataset_id, name=dataset_name, data=[]) - dataset.owner_id = owner_id + dataset = Dataset( + id=dataset_id, name=dataset_name, data=[], owner_id=owner_id, tenant_id=user.tenant_id + ) session.add(dataset) diff --git a/cognee/modules/data/models/Dataset.py b/cognee/modules/data/models/Dataset.py index 797401d5a..00ed4da96 100644 --- a/cognee/modules/data/models/Dataset.py +++ b/cognee/modules/data/models/Dataset.py @@ -18,6 +18,7 @@ class Dataset(Base): updated_at = Column(DateTime(timezone=True), onupdate=lambda: datetime.now(timezone.utc)) owner_id = Column(UUID, index=True) + tenant_id = Column(UUID, index=True, nullable=True) acls = relationship("ACL", back_populates="dataset", cascade="all, delete-orphan") diff --git a/cognee/modules/users/methods/create_user.py b/cognee/modules/users/methods/create_user.py index 1b303bd36..953c70cd6 100644 --- a/cognee/modules/users/methods/create_user.py +++ b/cognee/modules/users/methods/create_user.py @@ -18,7 +18,6 @@ from typing import Optional async def create_user( email: str, password: str, - tenant_id: Optional[str] = None, is_superuser: bool = False, is_active: bool = True, is_verified: bool = False, @@ -30,33 +29,15 @@ async 
def create_user( async with relational_engine.get_async_session() as session: async with get_user_db_context(session) as user_db: async with get_user_manager_context(user_db) as user_manager: - if tenant_id: - # Check if the tenant already exists - result = await session.execute(select(Tenant).where(Tenant.id == tenant_id)) - tenant = result.scalars().first() - if not tenant: - raise TenantNotFoundError - - user = await user_manager.create( - UserCreate( - email=email, - password=password, - tenant_id=tenant.id, - is_superuser=is_superuser, - is_active=is_active, - is_verified=is_verified, - ) - ) - else: - user = await user_manager.create( - UserCreate( - email=email, - password=password, - is_superuser=is_superuser, - is_active=is_active, - is_verified=is_verified, - ) + user = await user_manager.create( + UserCreate( + email=email, + password=password, + is_superuser=is_superuser, + is_active=is_active, + is_verified=is_verified, ) + ) if auto_login: await session.refresh(user) diff --git a/cognee/modules/users/methods/get_default_user.py b/cognee/modules/users/methods/get_default_user.py index 48073a884..773545f8e 100644 --- a/cognee/modules/users/methods/get_default_user.py +++ b/cognee/modules/users/methods/get_default_user.py @@ -27,12 +27,7 @@ async def get_default_user() -> SimpleNamespace: if user is None: return await create_default_user() - # We return a SimpleNamespace to have the same user type as our SaaS - # SimpleNamespace is just a dictionary which can be accessed through attributes - auth_data = SimpleNamespace( - id=user.id, email=user.email, tenant_id=user.tenant_id, roles=[] - ) - return auth_data + return user except Exception as error: if "principals" in str(error.args): raise DatabaseNotCreatedError() from error diff --git a/cognee/modules/users/methods/get_user.py b/cognee/modules/users/methods/get_user.py index 2678a5a01..a1c87aab7 100644 --- a/cognee/modules/users/methods/get_user.py +++ b/cognee/modules/users/methods/get_user.py @@ 
-14,7 +14,7 @@ async def get_user(user_id: UUID): user = ( await session.execute( select(User) - .options(selectinload(User.roles), selectinload(User.tenant)) + .options(selectinload(User.roles), selectinload(User.tenants)) .where(User.id == user_id) ) ).scalar() diff --git a/cognee/modules/users/methods/get_user_by_email.py b/cognee/modules/users/methods/get_user_by_email.py index c4bd5b48e..6df989251 100644 --- a/cognee/modules/users/methods/get_user_by_email.py +++ b/cognee/modules/users/methods/get_user_by_email.py @@ -13,7 +13,7 @@ async def get_user_by_email(user_email: str): user = ( await session.execute( select(User) - .options(joinedload(User.roles), joinedload(User.tenant)) + .options(joinedload(User.roles), joinedload(User.tenants)) .where(User.email == user_email) ) ).scalar() diff --git a/cognee/modules/users/models/Tenant.py b/cognee/modules/users/models/Tenant.py index 95023a6ee..b8fa158c5 100644 --- a/cognee/modules/users/models/Tenant.py +++ b/cognee/modules/users/models/Tenant.py @@ -1,7 +1,7 @@ -from sqlalchemy.orm import relationship +from sqlalchemy.orm import relationship, Mapped from sqlalchemy import Column, String, ForeignKey, UUID from .Principal import Principal -from .User import User +from .UserTenant import UserTenant from .Role import Role @@ -13,14 +13,13 @@ class Tenant(Principal): owner_id = Column(UUID, index=True) - # One-to-Many relationship with User; specify the join via User.tenant_id - users = relationship( + users: Mapped[list["User"]] = relationship( # noqa: F821 "User", - back_populates="tenant", - foreign_keys=lambda: [User.tenant_id], + secondary=UserTenant.__tablename__, + back_populates="tenants", ) - # One-to-Many relationship with Role (if needed; similar fix) + # One-to-Many relationship with Role roles = relationship( "Role", back_populates="tenant", diff --git a/cognee/modules/users/models/User.py b/cognee/modules/users/models/User.py index 8972a5932..a98abd3bc 100644 --- a/cognee/modules/users/models/User.py 
+++ b/cognee/modules/users/models/User.py @@ -6,8 +6,10 @@ from sqlalchemy import ForeignKey, Column, UUID from sqlalchemy.orm import relationship, Mapped from .Principal import Principal +from .UserTenant import UserTenant from .UserRole import UserRole from .Role import Role +from .Tenant import Tenant class User(SQLAlchemyBaseUserTableUUID, Principal): @@ -15,7 +17,7 @@ class User(SQLAlchemyBaseUserTableUUID, Principal): id = Column(UUID, ForeignKey("principals.id", ondelete="CASCADE"), primary_key=True) - # Foreign key to Tenant (Many-to-One relationship) + # Foreign key to current Tenant (Many-to-One relationship) tenant_id = Column(UUID, ForeignKey("tenants.id")) # Many-to-Many Relationship with Roles @@ -25,11 +27,11 @@ class User(SQLAlchemyBaseUserTableUUID, Principal): back_populates="users", ) - # Relationship to Tenant - tenant = relationship( + # Many-to-Many Relationship with Tenants user is a part of + tenants: Mapped[list["Tenant"]] = relationship( "Tenant", + secondary=UserTenant.__tablename__, back_populates="users", - foreign_keys=[tenant_id], ) # ACL Relationship (One-to-Many) @@ -46,7 +48,6 @@ class UserRead(schemas.BaseUser[uuid_UUID]): class UserCreate(schemas.BaseUserCreate): - tenant_id: Optional[uuid_UUID] = None is_verified: bool = True diff --git a/cognee/modules/users/models/UserTenant.py b/cognee/modules/users/models/UserTenant.py new file mode 100644 index 000000000..bfb852aa5 --- /dev/null +++ b/cognee/modules/users/models/UserTenant.py @@ -0,0 +1,12 @@ +from datetime import datetime, timezone +from sqlalchemy import Column, ForeignKey, DateTime, UUID +from cognee.infrastructure.databases.relational import Base + + +class UserTenant(Base): + __tablename__ = "user_tenants" + + created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc)) + + user_id = Column(UUID, ForeignKey("users.id"), primary_key=True) + tenant_id = Column(UUID, ForeignKey("tenants.id"), primary_key=True) diff --git 
a/cognee/modules/users/models/__init__.py b/cognee/modules/users/models/__init__.py index ba2f40e49..5114cc45a 100644 --- a/cognee/modules/users/models/__init__.py +++ b/cognee/modules/users/models/__init__.py @@ -1,6 +1,7 @@ from .User import User from .Role import Role from .UserRole import UserRole +from .UserTenant import UserTenant from .DatasetDatabase import DatasetDatabase from .RoleDefaultPermissions import RoleDefaultPermissions from .UserDefaultPermissions import UserDefaultPermissions diff --git a/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py b/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py index 1185dd7ad..a4f538259 100644 --- a/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py +++ b/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py @@ -1,11 +1,8 @@ -from types import SimpleNamespace - from cognee.shared.logging_utils import get_logger from ...models.User import User from cognee.modules.data.models.Dataset import Dataset from cognee.modules.users.permissions.methods import get_principal_datasets -from cognee.modules.users.permissions.methods import get_role, get_tenant logger = get_logger() @@ -25,17 +22,15 @@ async def get_all_user_permission_datasets(user: User, permission_type: str) -> # Get all datasets User has explicit access to datasets.extend(await get_principal_datasets(user, permission_type)) - if user.tenant_id: - # Get all datasets all tenants have access to - tenant = await get_tenant(user.tenant_id) + # Get all tenants user is a part of + tenants = await user.awaitable_attrs.tenants + + for tenant in tenants: + # Get all datasets all tenant members have access to datasets.extend(await get_principal_datasets(tenant, permission_type)) - # Get all datasets Users roles have access to - if isinstance(user, SimpleNamespace): - # If simple namespace use roles defined in user - roles = user.roles - else: - roles = await 
user.awaitable_attrs.roles + # Get all datasets accessible by roles user is a part of + roles = await user.awaitable_attrs.roles for role in roles: datasets.extend(await get_principal_datasets(role, permission_type)) @@ -45,4 +40,5 @@ async def get_all_user_permission_datasets(user: User, permission_type: str) -> # If the dataset id key already exists, leave the dictionary unchanged. unique.setdefault(dataset.id, dataset) + # TODO: Add filtering out of datasets that aren't currently selected tenant of user return list(unique.values()) diff --git a/cognee/modules/users/tenants/methods/add_user_to_tenant.py b/cognee/modules/users/tenants/methods/add_user_to_tenant.py index 1374067a7..b9f5898d0 100644 --- a/cognee/modules/users/tenants/methods/add_user_to_tenant.py +++ b/cognee/modules/users/tenants/methods/add_user_to_tenant.py @@ -1,8 +1,11 @@ +from typing import Optional from uuid import UUID from sqlalchemy.exc import IntegrityError +from sqlalchemy import insert from cognee.infrastructure.databases.exceptions import EntityAlreadyExistsError from cognee.infrastructure.databases.relational import get_relational_engine +from cognee.modules.users.models.UserTenant import UserTenant from cognee.modules.users.methods import get_user from cognee.modules.users.permissions.methods import get_tenant from cognee.modules.users.exceptions import ( @@ -12,14 +15,19 @@ from cognee.modules.users.exceptions import ( ) -async def add_user_to_tenant(user_id: UUID, tenant_id: UUID, owner_id: UUID): +async def add_user_to_tenant( + user_id: UUID, tenant_id: UUID, owner_id: UUID, set_active_tenant: Optional[bool] = True +): """ Add a user with the given id to the tenant with the given id. This can only be successful if the request owner with the given id is the tenant owner. + + If set_active_tenant is true it will automatically set the users active tenant to provided tenant. Args: user_id: Id of the user. tenant_id: Id of the tenant. owner_id: Id of the request owner. 
+ set_active_tenant: If set_active_tenant is true it will automatically set the users active tenant to provided tenant. Returns: None @@ -41,12 +49,17 @@ async def add_user_to_tenant(user_id: UUID, tenant_id: UUID, owner_id: UUID): ) try: - if user.tenant_id is None: + try: + # Add association directly to the association table + create_user_tenant_statement = insert(UserTenant).values( + user_id=user_id, tenant_id=tenant_id + ) + await session.execute(create_user_tenant_statement) + except IntegrityError: + raise EntityAlreadyExistsError(message="User is already part of group.") + + if set_active_tenant: user.tenant_id = tenant_id - elif user.tenant_id == tenant_id: - return - else: - raise IntegrityError await session.merge(user) await session.commit() diff --git a/cognee/modules/users/tenants/methods/create_tenant.py b/cognee/modules/users/tenants/methods/create_tenant.py index bfd23e08f..665e3cc18 100644 --- a/cognee/modules/users/tenants/methods/create_tenant.py +++ b/cognee/modules/users/tenants/methods/create_tenant.py @@ -1,6 +1,8 @@ from uuid import UUID +from sqlalchemy import insert from sqlalchemy.exc import IntegrityError +from cognee.modules.users.models.UserTenant import UserTenant from cognee.infrastructure.databases.exceptions import EntityAlreadyExistsError from cognee.infrastructure.databases.relational import get_relational_engine from cognee.modules.users.models import Tenant @@ -22,16 +24,22 @@ async def create_tenant(tenant_name: str, user_id: UUID) -> UUID: async with db_engine.get_async_session() as session: try: user = await get_user(user_id) - if user.tenant_id: - raise EntityAlreadyExistsError( - message="User already has a tenant. New tenant cannot be created." 
- ) tenant = Tenant(name=tenant_name, owner_id=user_id) session.add(tenant) await session.flush() user.tenant_id = tenant.id + + try: + # Add association directly to the association table + create_user_tenant_statement = insert(UserTenant).values( + user_id=user_id, tenant_id=tenant.id + ) + await session.execute(create_user_tenant_statement) + except IntegrityError: + raise EntityAlreadyExistsError(message="User is already part of group.") + await session.merge(user) await session.commit() return tenant.id diff --git a/examples/python/permissions_example.py b/examples/python/permissions_example.py index 4f51b660f..7c140845c 100644 --- a/examples/python/permissions_example.py +++ b/examples/python/permissions_example.py @@ -150,7 +150,9 @@ async def main(): # To add a user to a role he must be part of the same tenant/organization print("\nOperation started as user_2 to add user_3 to CogneeLab tenant/organization") - await add_user_to_tenant(user_id=user_3.id, tenant_id=tenant_id, owner_id=user_2.id) + await add_user_to_tenant( + user_id=user_3.id, tenant_id=tenant_id, owner_id=user_2.id, set_active_tenant=True + ) print( "\nOperation started by user_2, as tenant owner, to add user_3 to Researcher role inside the tenant/organization" From 0c4e3e1f5295746db287eff5101d60c4cf89c1df Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Sun, 19 Oct 2025 20:13:22 +0200 Subject: [PATCH 004/129] fix: Load tenants to default user --- cognee/modules/users/methods/get_default_user.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cognee/modules/users/methods/get_default_user.py b/cognee/modules/users/methods/get_default_user.py index 773545f8e..a48bd8928 100644 --- a/cognee/modules/users/methods/get_default_user.py +++ b/cognee/modules/users/methods/get_default_user.py @@ -18,7 +18,9 @@ async def get_default_user() -> SimpleNamespace: try: async with db_engine.get_async_session() as session: query = ( - 
select(User).options(selectinload(User.roles)).where(User.email == default_email) + select(User) + .options(selectinload(User.roles), selectinload(User.tenants)) + .where(User.email == default_email) ) result = await session.execute(query) From 12785e31ea327135a4e1968c90f1cb1e5891fad3 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Sun, 19 Oct 2025 21:11:14 +0200 Subject: [PATCH 005/129] fix: Resolve issue with adding user to tenants --- .../tenants/methods/add_user_to_tenant.py | 28 ++++++++----------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/cognee/modules/users/tenants/methods/add_user_to_tenant.py b/cognee/modules/users/tenants/methods/add_user_to_tenant.py index b9f5898d0..dabab6b6b 100644 --- a/cognee/modules/users/tenants/methods/add_user_to_tenant.py +++ b/cognee/modules/users/tenants/methods/add_user_to_tenant.py @@ -48,22 +48,18 @@ async def add_user_to_tenant( message="Only tenant owner can add other users to organization." ) - try: - try: - # Add association directly to the association table - create_user_tenant_statement = insert(UserTenant).values( - user_id=user_id, tenant_id=tenant_id - ) - await session.execute(create_user_tenant_statement) - except IntegrityError: - raise EntityAlreadyExistsError(message="User is already part of group.") - - if set_active_tenant: - user.tenant_id = tenant_id - + if set_active_tenant: + user.tenant_id = tenant_id await session.merge(user) await session.commit() - except IntegrityError: - raise EntityAlreadyExistsError( - message="User is already part of a tenant. Only one tenant can be assigned to user." 
+ + try: + # Add association directly to the association table + create_user_tenant_statement = insert(UserTenant).values( + user_id=user_id, tenant_id=tenant_id ) + await session.execute(create_user_tenant_statement) + await session.commit() + + except IntegrityError: + raise EntityAlreadyExistsError(message="User is already part of group.") From 13f0423a55720debc1d04fbe9f005855c008ae53 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Sun, 19 Oct 2025 21:35:50 +0200 Subject: [PATCH 006/129] refactor: Add better TODO message --- .../permissions/methods/get_all_user_permission_datasets.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py b/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py index a4f538259..ff0f52d27 100644 --- a/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py +++ b/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py @@ -40,5 +40,6 @@ async def get_all_user_permission_datasets(user: User, permission_type: str) -> # If the dataset id key already exists, leave the dictionary unchanged. 
unique.setdefault(dataset.id, dataset) - # TODO: Add filtering out of datasets that aren't currently selected tenant of user + # TODO: Add filtering out of datasets that aren't currently selected tenant of user (currently selected tenant is the tenant_id value in the User model) + # TODO: Add endpoint/method to select current Tenant for a user (This UUID value should be stored in tenant_id of User model) return list(unique.values()) From d6bb95e3798984bee76a0e5cd92308097f153649 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Sun, 19 Oct 2025 21:57:39 +0200 Subject: [PATCH 007/129] fix: load tenants and roles when creating user --- cognee/modules/users/methods/create_user.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cognee/modules/users/methods/create_user.py b/cognee/modules/users/methods/create_user.py index 953c70cd6..ef325fb6f 100644 --- a/cognee/modules/users/methods/create_user.py +++ b/cognee/modules/users/methods/create_user.py @@ -42,6 +42,10 @@ async def create_user( if auto_login: await session.refresh(user) + # Update tenants and roles information for User object + _ = await user.awaitable_attrs.tenants + _ = await user.awaitable_attrs.roles + return user except UserAlreadyExists as error: print(f"User {email} already exists") From 4f874deace3b55072bf97c35b45e158d03c5d844 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Sun, 19 Oct 2025 23:50:17 +0200 Subject: [PATCH 008/129] feat: Add tenant select method/endpoint for users --- .../routers/get_permissions_router.py | 32 ++++++++++++ .../get_all_user_permission_datasets.py | 1 - .../modules/users/tenants/methods/__init__.py | 1 + .../users/tenants/methods/create_tenant.py | 7 +-- .../users/tenants/methods/select_tenant.py | 50 +++++++++++++++++++ 5 files changed, 87 insertions(+), 4 deletions(-) create mode 100644 cognee/modules/users/tenants/methods/select_tenant.py diff --git a/cognee/api/v1/permissions/routers/get_permissions_router.py 
b/cognee/api/v1/permissions/routers/get_permissions_router.py index 637293268..7959415da 100644 --- a/cognee/api/v1/permissions/routers/get_permissions_router.py +++ b/cognee/api/v1/permissions/routers/get_permissions_router.py @@ -220,4 +220,36 @@ def get_permissions_router() -> APIRouter: status_code=200, content={"message": "Tenant created.", "tenant_id": str(tenant_id)} ) + @permissions_router.post("/tenants/{tenant_id}") + async def select_tenant(tenant_id: UUID, user: User = Depends(get_authenticated_user)): + """ + Select current tenant. + + This endpoint selects a tenant with the specified UUID. Tenants are used + to organize users and resources in multi-tenant environments, providing + isolation and access control between different groups or organizations. + + ## Request Parameters + - **tenant_id** (UUID): UUID of the tenant to create + + ## Response + Returns a success message indicating the tenant was created. + """ + send_telemetry( + "Permissions API Endpoint Invoked", + user.id, + additional_properties={ + "endpoint": f"POST /v1/permissions/tenants/{str(tenant_id)}", + "tenant_id": tenant_id, + }, + ) + + from cognee.modules.users.tenants.methods import select_tenant as select_tenant_method + + await select_tenant_method(user_id=user.id, tenant_id=tenant_id) + + return JSONResponse( + status_code=200, content={"message": "Tenant selected.", "tenant_id": str(tenant_id)} + ) + return permissions_router diff --git a/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py b/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py index ff0f52d27..e5dbb0e4b 100644 --- a/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py +++ b/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py @@ -41,5 +41,4 @@ async def get_all_user_permission_datasets(user: User, permission_type: str) -> unique.setdefault(dataset.id, dataset) # TODO: Add filtering out of datasets that aren't 
currently selected tenant of user (currently selected tenant is the tenant_id value in the User model) - # TODO: Add endpoint/method to select current Tenant for a user (This UUID value should be stored in tenant_id of User model) return list(unique.values()) diff --git a/cognee/modules/users/tenants/methods/__init__.py b/cognee/modules/users/tenants/methods/__init__.py index 9a052e9c6..39e2b31bb 100644 --- a/cognee/modules/users/tenants/methods/__init__.py +++ b/cognee/modules/users/tenants/methods/__init__.py @@ -1,2 +1,3 @@ from .create_tenant import create_tenant from .add_user_to_tenant import add_user_to_tenant +from .select_tenant import select_tenant diff --git a/cognee/modules/users/tenants/methods/create_tenant.py b/cognee/modules/users/tenants/methods/create_tenant.py index 665e3cc18..60e10db5c 100644 --- a/cognee/modules/users/tenants/methods/create_tenant.py +++ b/cognee/modules/users/tenants/methods/create_tenant.py @@ -30,6 +30,8 @@ async def create_tenant(tenant_name: str, user_id: UUID) -> UUID: await session.flush() user.tenant_id = tenant.id + await session.merge(user) + await session.commit() try: # Add association directly to the association table @@ -37,11 +39,10 @@ async def create_tenant(tenant_name: str, user_id: UUID) -> UUID: user_id=user_id, tenant_id=tenant.id ) await session.execute(create_user_tenant_statement) + await session.commit() except IntegrityError: - raise EntityAlreadyExistsError(message="User is already part of group.") + raise EntityAlreadyExistsError(message="User is already part of tenant.") - await session.merge(user) - await session.commit() return tenant.id except IntegrityError as e: raise EntityAlreadyExistsError(message="Tenant already exists.") from e diff --git a/cognee/modules/users/tenants/methods/select_tenant.py b/cognee/modules/users/tenants/methods/select_tenant.py new file mode 100644 index 000000000..709e46bf2 --- /dev/null +++ b/cognee/modules/users/tenants/methods/select_tenant.py @@ -0,0 +1,50 @@ 
+from uuid import UUID + +import sqlalchemy.exc +from sqlalchemy import select + +from cognee.infrastructure.databases.relational import get_relational_engine +from cognee.modules.users.models.UserTenant import UserTenant +from cognee.modules.users.methods import get_user +from cognee.modules.users.permissions.methods import get_tenant +from cognee.modules.users.exceptions import UserNotFoundError, TenantNotFoundError + + +async def select_tenant(user_id: UUID, tenant_id: UUID): + """ + Set the users active tenant to provided tenant. + Args: + user_id: Id of the user. + tenant_id: Id of the tenant. + + Returns: + None + + """ + db_engine = get_relational_engine() + async with db_engine.get_async_session() as session: + user = await get_user(user_id) + tenant = await get_tenant(tenant_id) + + if not user: + raise UserNotFoundError + elif not tenant: + raise TenantNotFoundError + + # Check if User is part of Tenant + result = await session.execute( + select(UserTenant) + .where(UserTenant.user_id == user_id) + .where(UserTenant.tenant_id == tenant_id) + ) + + try: + result = result.scalar_one() + except sqlalchemy.exc.NoResultFound as e: + raise TenantNotFoundError("User Tenant relationship not found.") from e + + if result: + # If user is part of tenant update current tenant of user + user.tenant_id = tenant_id + await session.merge(user) + await session.commit() From 6934692e1b7646a493f47ec6a189e041db6cb14a Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Mon, 20 Oct 2025 15:07:13 +0200 Subject: [PATCH 009/129] refactor: Enable selection of default single user tenant --- .../routers/get_permissions_router.py | 24 ++++++++++++------- .../users/tenants/methods/select_tenant.py | 13 +++++++++- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/cognee/api/v1/permissions/routers/get_permissions_router.py b/cognee/api/v1/permissions/routers/get_permissions_router.py index 7959415da..eeea9b653 100644 --- 
a/cognee/api/v1/permissions/routers/get_permissions_router.py +++ b/cognee/api/v1/permissions/routers/get_permissions_router.py @@ -1,14 +1,19 @@ from uuid import UUID -from typing import List +from typing import List, Union from fastapi import APIRouter, Depends from fastapi.responses import JSONResponse from cognee.modules.users.models import User +from cognee.api.DTO import InDTO from cognee.modules.users.methods import get_authenticated_user from cognee.shared.utils import send_telemetry +class SelectTenantDTO(InDTO): + tenant_id: UUID | None = None + + def get_permissions_router() -> APIRouter: permissions_router = APIRouter() @@ -220,8 +225,8 @@ def get_permissions_router() -> APIRouter: status_code=200, content={"message": "Tenant created.", "tenant_id": str(tenant_id)} ) - @permissions_router.post("/tenants/{tenant_id}") - async def select_tenant(tenant_id: UUID, user: User = Depends(get_authenticated_user)): + @permissions_router.post("/tenants/select") + async def select_tenant(payload: SelectTenantDTO, user: User = Depends(get_authenticated_user)): """ Select current tenant. @@ -229,8 +234,10 @@ def get_permissions_router() -> APIRouter: to organize users and resources in multi-tenant environments, providing isolation and access control between different groups or organizations. + Sending a null/None value as tenant_id selects his default single user tenant + ## Request Parameters - - **tenant_id** (UUID): UUID of the tenant to create + - **tenant_id** (Union[UUID, None]): UUID of the tenant to select, If null/None is provided use the default single user tenant ## Response Returns a success message indicating the tenant was created. 
@@ -239,17 +246,18 @@ def get_permissions_router() -> APIRouter: "Permissions API Endpoint Invoked", user.id, additional_properties={ - "endpoint": f"POST /v1/permissions/tenants/{str(tenant_id)}", - "tenant_id": tenant_id, + "endpoint": f"POST /v1/permissions/tenants/{str(payload.tenant_id)}", + "tenant_id": str(payload.tenant_id), }, ) from cognee.modules.users.tenants.methods import select_tenant as select_tenant_method - await select_tenant_method(user_id=user.id, tenant_id=tenant_id) + await select_tenant_method(user_id=user.id, tenant_id=payload.tenant_id) return JSONResponse( - status_code=200, content={"message": "Tenant selected.", "tenant_id": str(tenant_id)} + status_code=200, + content={"message": "Tenant selected.", "tenant_id": str(payload.tenant_id)}, ) return permissions_router diff --git a/cognee/modules/users/tenants/methods/select_tenant.py b/cognee/modules/users/tenants/methods/select_tenant.py index 709e46bf2..732b24858 100644 --- a/cognee/modules/users/tenants/methods/select_tenant.py +++ b/cognee/modules/users/tenants/methods/select_tenant.py @@ -1,4 +1,5 @@ from uuid import UUID +from typing import Union import sqlalchemy.exc from sqlalchemy import select @@ -10,9 +11,11 @@ from cognee.modules.users.permissions.methods import get_tenant from cognee.modules.users.exceptions import UserNotFoundError, TenantNotFoundError -async def select_tenant(user_id: UUID, tenant_id: UUID): +async def select_tenant(user_id: UUID, tenant_id: Union[UUID, None]): """ Set the users active tenant to provided tenant. + + If None tenant_id is provided set current Tenant to the default single user-tenant Args: user_id: Id of the user. tenant_id: Id of the tenant. 
@@ -24,6 +27,14 @@ async def select_tenant(user_id: UUID, tenant_id: UUID): db_engine = get_relational_engine() async with db_engine.get_async_session() as session: user = await get_user(user_id) + + if tenant_id is None: + # If no tenant_id is provided set current Tenant to the single user-tenant + user.tenant_id = None + await session.merge(user) + await session.commit() + return + tenant = await get_tenant(tenant_id) if not user: From c5648e63375d9eb1520f5a007dda520f70c9c145 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Wed, 22 Oct 2025 09:22:11 +0200 Subject: [PATCH 010/129] test: Add load test. --- .github/workflows/e2e_tests.yml | 31 ++++++++++++++++++++- cognee/tests/{load_test.py => test_load.py} | 30 ++++++++++++-------- 2 files changed, 49 insertions(+), 12 deletions(-) rename cognee/tests/{load_test.py => test_load.py} (65%) diff --git a/.github/workflows/e2e_tests.yml b/.github/workflows/e2e_tests.yml index 9582a3f3b..5f66e71d2 100644 --- a/.github/workflows/e2e_tests.yml +++ b/.github/workflows/e2e_tests.yml @@ -330,4 +330,33 @@ jobs: DB_PORT: 5432 DB_USERNAME: cognee DB_PASSWORD: cognee - run: uv run python ./cognee/tests/test_concurrent_subprocess_access.py \ No newline at end of file + run: uv run python ./cognee/tests/test_concurrent_subprocess_access.py + + test-load: + name: Test Load + runs-on: ubuntu-22.04 + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Cognee Setup + uses: ./.github/actions/cognee_setup + with: + python-version: '3.11.x' + + - name: Dependencies already installed + run: echo "Dependencies already installed in setup" + + - name: Run Load Test + env: + ENV: 'dev' + ENABLE_BACKEND_ACCESS_CONTROL: True + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + 
EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} + run: uv run python ./cognee/tests/test_load.py \ No newline at end of file diff --git a/cognee/tests/load_test.py b/cognee/tests/test_load.py similarity index 65% rename from cognee/tests/load_test.py rename to cognee/tests/test_load.py index c44efad00..09e2db084 100644 --- a/cognee/tests/load_test.py +++ b/cognee/tests/test_load.py @@ -9,8 +9,8 @@ from cognee.shared.logging_utils import get_logger logger = get_logger() -async def process_and_search(num_of_searches): +async def process_and_search(num_of_searches): start_time = time.time() await cognee.cognify() @@ -26,26 +26,34 @@ async def process_and_search(num_of_searches): return end_time - start_time -async def main(): +async def main(): file_path = os.path.join( pathlib.Path(__file__).resolve().parent, "test_data/artificial-intelligence.pdf" ) + data_directory_path = str( + pathlib.Path( + os.path.join(pathlib.Path(__file__).parent, ".data_storage/test_load") + ).resolve() + ) + cognee.config.data_root_directory(data_directory_path) + cognee_directory_path = str( + pathlib.Path( + os.path.join(pathlib.Path(__file__).parent, ".cognee_system/test_load") + ).resolve() + ) + cognee.config.system_root_directory(cognee_directory_path) num_of_pdfs = 10 num_of_reps = 5 - upper_boundary_minutes = 3 - average_minutes = 1.5 + upper_boundary_minutes = 10 + average_minutes = 8 await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) - await asyncio.gather( - *[ - cognee.add(file_path, dataset_name=f"dataset_{i}") - for i in range(num_of_pdfs) - ] - ) + for i in range(num_of_pdfs): + await cognee.add(file_path, dataset_name=f"dataset_{i}") recorded_times = await asyncio.gather( *[process_and_search(num_of_pdfs) for _ in range(num_of_reps)] @@ -59,4 +67,4 @@ async def main(): if __name__ == "__main__": - asyncio.run(main()) \ No newline at end of file + asyncio.run(main()) From 
742866b4c9f1d4aa53ab60fb54b79474fbfea0d2 Mon Sep 17 00:00:00 2001 From: EricXiao Date: Wed, 22 Oct 2025 16:56:46 +0800 Subject: [PATCH 011/129] feat: csv ingestion loader & chunk Signed-off-by: EricXiao --- cognee/cli/commands/cognify_command.py | 9 +- cognee/cli/config.py | 2 +- .../files/utils/guess_file_type.py | 43 +++++ .../files/utils/is_csv_content.py | 181 ++++++++++++++++++ cognee/infrastructure/loaders/LoaderEngine.py | 1 + .../infrastructure/loaders/core/__init__.py | 3 +- .../infrastructure/loaders/core/csv_loader.py | 93 +++++++++ .../loaders/core/text_loader.py | 3 +- .../loaders/supported_loaders.py | 3 +- cognee/modules/chunking/CsvChunker.py | 35 ++++ .../processing/document_types/CsvDocument.py | 33 ++++ .../processing/document_types/__init__.py | 1 + cognee/tasks/chunks/__init__.py | 1 + cognee/tasks/chunks/chunk_by_row.py | 94 +++++++++ cognee/tasks/documents/classify_documents.py | 2 + .../integration/documents/CsvDocument_test.py | 70 +++++++ .../tests/test_data/example_with_header.csv | 3 + .../processing/chunks/chunk_by_row_test.py | 52 +++++ 18 files changed, 623 insertions(+), 6 deletions(-) create mode 100644 cognee/infrastructure/files/utils/is_csv_content.py create mode 100644 cognee/infrastructure/loaders/core/csv_loader.py create mode 100644 cognee/modules/chunking/CsvChunker.py create mode 100644 cognee/modules/data/processing/document_types/CsvDocument.py create mode 100644 cognee/tasks/chunks/chunk_by_row.py create mode 100644 cognee/tests/integration/documents/CsvDocument_test.py create mode 100644 cognee/tests/test_data/example_with_header.csv create mode 100644 cognee/tests/unit/processing/chunks/chunk_by_row_test.py diff --git a/cognee/cli/commands/cognify_command.py b/cognee/cli/commands/cognify_command.py index 16eaf0454..b89c1f70e 100644 --- a/cognee/cli/commands/cognify_command.py +++ b/cognee/cli/commands/cognify_command.py @@ -22,7 +22,7 @@ relationships, and creates semantic connections for enhanced search and reasonin 
Processing Pipeline: 1. **Document Classification**: Identifies document types and structures -2. **Permission Validation**: Ensures user has processing rights +2. **Permission Validation**: Ensures user has processing rights 3. **Text Chunking**: Breaks content into semantically meaningful segments 4. **Entity Extraction**: Identifies key concepts, people, places, organizations 5. **Relationship Detection**: Discovers connections between entities @@ -97,6 +97,13 @@ After successful cognify processing, use `cognee search` to query the knowledge chunker_class = LangchainChunker except ImportError: fmt.warning("LangchainChunker not available, using TextChunker") + elif args.chunker == "CsvChunker": + try: + from cognee.modules.chunking.CsvChunker import CsvChunker + + chunker_class = CsvChunker + except ImportError: + fmt.warning("CsvChunker not available, using TextChunker") result = await cognee.cognify( datasets=datasets, diff --git a/cognee/cli/config.py b/cognee/cli/config.py index d016608c1..082adbaec 100644 --- a/cognee/cli/config.py +++ b/cognee/cli/config.py @@ -26,7 +26,7 @@ SEARCH_TYPE_CHOICES = [ ] # Chunker choices -CHUNKER_CHOICES = ["TextChunker", "LangchainChunker"] +CHUNKER_CHOICES = ["TextChunker", "LangchainChunker", "CsvChunker"] # Output format choices OUTPUT_FORMAT_CHOICES = ["json", "pretty", "simple"] diff --git a/cognee/infrastructure/files/utils/guess_file_type.py b/cognee/infrastructure/files/utils/guess_file_type.py index dcdd68cad..10f59a400 100644 --- a/cognee/infrastructure/files/utils/guess_file_type.py +++ b/cognee/infrastructure/files/utils/guess_file_type.py @@ -1,6 +1,8 @@ from typing import BinaryIO import filetype + from .is_text_content import is_text_content +from .is_csv_content import is_csv_content class FileTypeException(Exception): @@ -134,3 +136,44 @@ def guess_file_type(file: BinaryIO) -> filetype.Type: raise FileTypeException(f"Unknown file detected: {file.name}.") return file_type + + +class CsvFileType(filetype.Type): 
+ """ + Match CSV file types based on MIME type and extension. + + Public methods: + - match + + Instance variables: + - MIME: The MIME type of the CSV. + - EXTENSION: The file extension of the CSV. + """ + + MIME = "text/csv" + EXTENSION = "csv" + + def __init__(self): + super().__init__(mime=self.MIME, extension=self.EXTENSION) + + def match(self, buf): + """ + Determine if the given buffer contains csv content. + + Parameters: + ----------- + + - buf: The buffer to check for csv content. + + Returns: + -------- + + Returns True if the buffer is identified as csv content, otherwise False. + """ + + return is_csv_content(buf) + + +csv_file_type = CsvFileType() + +filetype.add_type(csv_file_type) diff --git a/cognee/infrastructure/files/utils/is_csv_content.py b/cognee/infrastructure/files/utils/is_csv_content.py new file mode 100644 index 000000000..07b7ea69b --- /dev/null +++ b/cognee/infrastructure/files/utils/is_csv_content.py @@ -0,0 +1,181 @@ +import csv +from collections import Counter + + +def is_csv_content(content): + """ + Heuristically determine whether a bytes-like object is CSV text. + + Strategy (fail-fast and cheap to expensive): + 1) Decode: Try a small ordered list of common encodings with strict errors. + 2) Line sampling: require >= 2 non-empty lines; sample up to 50 lines. + 3) Delimiter detection: + - Prefer csv.Sniffer() with common delimiters. + - Fallback to a lightweight consistency heuristic. + 4) Lightweight parse check: + - Parse a few lines with the delimiter. + - Ensure at least 2 valid rows and relatively stable column counts. + + Returns: + bool: True if the buffer looks like CSV; False otherwise. + """ + try: + encoding_list = [ + "utf-8", + "utf-8-sig", + "utf-32-le", + "utf-32-be", + "utf-16-le", + "utf-16-be", + "gb18030", + "shift_jis", + "cp949", + "cp1252", + "iso-8859-1", + ] + + # Try to decode strictly—if decoding fails for all encodings, it's not text/CSV. 
+ text = None + for enc in encoding_list: + try: + text = content.decode(enc, errors="strict") + break + except UnicodeDecodeError: + continue + if text is None: + return False + + # Reject empty/whitespace-only payloads. + stripped = text.strip() + if not stripped: + return False + + # Split into logical lines and drop empty ones. Require at least two lines. + lines = [ln for ln in text.splitlines() if ln.strip()] + if len(lines) < 2: + return False + + # Take a small sample to keep sniffing cheap and predictable. + sample_lines = lines[:50] + + # Detect delimiter using csv.Sniffer first; if that fails, use our heuristic. + delimiter = _sniff_delimiter(sample_lines) or _heuristic_delimiter(sample_lines) + if not delimiter: + return False + + # Finally, do a lightweight parse sanity check with the chosen delimiter. + return _lightweight_parse_check(sample_lines, delimiter) + except Exception: + return False + + +def _sniff_delimiter(lines): + """ + Try Python's built-in csv.Sniffer on a sample. + + Args: + lines (list[str]): Sample lines (already decoded). + + Returns: + str | None: The detected delimiter if sniffing succeeds; otherwise None. + """ + # Join up to 50 lines to form the sample string Sniffer will inspect. + sample = "\n".join(lines[:50]) + try: + dialect = csv.Sniffer().sniff(sample, delimiters=",\t;|") + return dialect.delimiter + except Exception: + # Sniffer is known to be brittle on small/dirty samples—silently fallback. + return None + + +def _heuristic_delimiter(lines): + """ + Fallback delimiter detection based on count consistency per line. + + Heuristic: + - For each candidate delimiter, count occurrences per line. + - Keep only lines with count > 0 (line must contain the delimiter). + - Require at least half of lines to contain the delimiter (min 2). + - Compute the mode (most common count). If the proportion of lines that + exhibit the modal count is >= 80%, accept that delimiter. + + Args: + lines (list[str]): Sample lines. 
+ + Returns: + str | None: Best delimiter if one meets the consistency threshold; else None. + """ + candidates = [",", "\t", ";", "|"] + best = None + best_score = 0.0 + + for d in candidates: + # Count how many times the delimiter appears in each line. + counts = [ln.count(d) for ln in lines] + # Consider only lines that actually contain the delimiter at least once. + nonzero = [c for c in counts if c > 0] + + # Require that more than half of lines (and at least 2) contain the delimiter. + if len(nonzero) < max(2, int(0.5 * len(lines))): + continue + + # Find the modal count and its frequency. + cnt = Counter(nonzero) + pairs = cnt.most_common(1) + if not pairs: + continue + + mode, mode_freq = pairs[0] + # Consistency ratio: lines with the modal count / total lines in the sample. + consistency = mode_freq / len(lines) + # Accept if consistent enough and better than any previous candidate. + if mode >= 1 and consistency >= 0.80 and consistency > best_score: + best = d + best_score = consistency + + return best + + +def _lightweight_parse_check(lines, delimiter): + """ + Parse a few lines with csv.reader and check structural stability. + + Heuristic: + - Parse up to 5 lines with the given delimiter. + - Count column widths per parsed row. + - Require at least 2 non-empty rows. + - Allow at most 1 row whose width deviates by >2 columns from the first row. + + Args: + lines (list[str]): Sample lines (decoded). + delimiter (str): Delimiter chosen by sniffing/heuristics. + + Returns: + bool: True if parsing looks stable; False otherwise. + """ + try: + # csv.reader accepts any iterable of strings; feeding the first 10 lines is fine. + reader = csv.reader(lines[:10], delimiter=delimiter) + widths = [] + valid_rows = 0 + for row in reader: + if not row: + continue + + widths.append(len(row)) + valid_rows += 1 + + # Need at least two meaningful rows to make a judgment. 
+ if valid_rows < 2: + return False + + if widths: + first = widths[0] + # Count rows whose width deviates significantly (>2) from the first row. + unstable = sum(1 for w in widths if abs(w - first) > 2) + # Permit at most 1 unstable row among the parsed sample. + return unstable <= 1 + return False + except Exception: + return False diff --git a/cognee/infrastructure/loaders/LoaderEngine.py b/cognee/infrastructure/loaders/LoaderEngine.py index 6b62f7641..37e63c9fc 100644 --- a/cognee/infrastructure/loaders/LoaderEngine.py +++ b/cognee/infrastructure/loaders/LoaderEngine.py @@ -30,6 +30,7 @@ class LoaderEngine: "pypdf_loader", "image_loader", "audio_loader", + "csv_loader", "unstructured_loader", "advanced_pdf_loader", ] diff --git a/cognee/infrastructure/loaders/core/__init__.py b/cognee/infrastructure/loaders/core/__init__.py index 8a2df80f9..09819fbd2 100644 --- a/cognee/infrastructure/loaders/core/__init__.py +++ b/cognee/infrastructure/loaders/core/__init__.py @@ -3,5 +3,6 @@ from .text_loader import TextLoader from .audio_loader import AudioLoader from .image_loader import ImageLoader +from .csv_loader import CsvLoader -__all__ = ["TextLoader", "AudioLoader", "ImageLoader"] +__all__ = ["TextLoader", "AudioLoader", "ImageLoader", "CsvLoader"] diff --git a/cognee/infrastructure/loaders/core/csv_loader.py b/cognee/infrastructure/loaders/core/csv_loader.py new file mode 100644 index 000000000..a314a7a24 --- /dev/null +++ b/cognee/infrastructure/loaders/core/csv_loader.py @@ -0,0 +1,93 @@ +import os +from typing import List +import csv +from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface +from cognee.infrastructure.files.storage import get_file_storage, get_storage_config +from cognee.infrastructure.files.utils.get_file_metadata import get_file_metadata + + +class CsvLoader(LoaderInterface): + """ + Core CSV file loader that handles basic CSV file formats. 
+ """ + + @property + def supported_extensions(self) -> List[str]: + """Supported text file extensions.""" + return [ + "csv", + ] + + @property + def supported_mime_types(self) -> List[str]: + """Supported MIME types for text content.""" + return [ + "text/csv", + ] + + @property + def loader_name(self) -> str: + """Unique identifier for this loader.""" + return "csv_loader" + + def can_handle(self, extension: str, mime_type: str) -> bool: + """ + Check if this loader can handle the given file. + + Args: + extension: File extension + mime_type: Optional MIME type + + Returns: + True if file can be handled, False otherwise + """ + if extension in self.supported_extensions and mime_type in self.supported_mime_types: + return True + + return False + + async def load(self, file_path: str, encoding: str = "utf-8", **kwargs): + """ + Load and process the csv file. + + Args: + file_path: Path to the file to load + encoding: Text encoding to use (default: utf-8) + **kwargs: Additional configuration (unused) + + Returns: + LoaderResult containing the file content and metadata + + Raises: + FileNotFoundError: If file doesn't exist + UnicodeDecodeError: If file cannot be decoded with specified encoding + OSError: If file cannot be read + """ + if not os.path.exists(file_path): + raise FileNotFoundError(f"File not found: {file_path}") + + with open(file_path, "rb") as f: + file_metadata = await get_file_metadata(f) + # Name ingested file of current loader based on original file content hash + storage_file_name = "text_" + file_metadata["content_hash"] + ".txt" + + row_texts = [] + row_index = 1 + + with open(file_path, "r", encoding=encoding, newline="") as file: + reader = csv.DictReader(file) + for row in reader: + pairs = [f"{str(k)}: {str(v)}" for k, v in row.items()] + row_text = ", ".join(pairs) + row_texts.append(f"Row {row_index}:\n{row_text}\n") + row_index += 1 + + content = "\n".join(row_texts) + + storage_config = get_storage_config() + data_root_directory = 
storage_config["data_root_directory"] + storage = get_file_storage(data_root_directory) + + full_file_path = await storage.store(storage_file_name, content) + + return full_file_path diff --git a/cognee/infrastructure/loaders/core/text_loader.py b/cognee/infrastructure/loaders/core/text_loader.py index a6f94be9b..e478edb22 100644 --- a/cognee/infrastructure/loaders/core/text_loader.py +++ b/cognee/infrastructure/loaders/core/text_loader.py @@ -16,7 +16,7 @@ class TextLoader(LoaderInterface): @property def supported_extensions(self) -> List[str]: """Supported text file extensions.""" - return ["txt", "md", "csv", "json", "xml", "yaml", "yml", "log"] + return ["txt", "md", "json", "xml", "yaml", "yml", "log"] @property def supported_mime_types(self) -> List[str]: @@ -24,7 +24,6 @@ class TextLoader(LoaderInterface): return [ "text/plain", "text/markdown", - "text/csv", "application/json", "text/xml", "application/xml", diff --git a/cognee/infrastructure/loaders/supported_loaders.py b/cognee/infrastructure/loaders/supported_loaders.py index d103babe3..b506df5f3 100644 --- a/cognee/infrastructure/loaders/supported_loaders.py +++ b/cognee/infrastructure/loaders/supported_loaders.py @@ -1,5 +1,5 @@ from cognee.infrastructure.loaders.external import PyPdfLoader -from cognee.infrastructure.loaders.core import TextLoader, AudioLoader, ImageLoader +from cognee.infrastructure.loaders.core import TextLoader, AudioLoader, ImageLoader, CsvLoader # Registry for loader implementations supported_loaders = { @@ -7,6 +7,7 @@ supported_loaders = { TextLoader.loader_name: TextLoader, ImageLoader.loader_name: ImageLoader, AudioLoader.loader_name: AudioLoader, + CsvLoader.loader_name: CsvLoader, } # Try adding optional loaders diff --git a/cognee/modules/chunking/CsvChunker.py b/cognee/modules/chunking/CsvChunker.py new file mode 100644 index 000000000..4ba4a969e --- /dev/null +++ b/cognee/modules/chunking/CsvChunker.py @@ -0,0 +1,35 @@ +from cognee.shared.logging_utils import get_logger 
+ + +from cognee.tasks.chunks import chunk_by_row +from cognee.modules.chunking.Chunker import Chunker +from .models.DocumentChunk import DocumentChunk + +logger = get_logger() + + +class CsvChunker(Chunker): + async def read(self): + async for content_text in self.get_text(): + if content_text is None: + continue + + for chunk_data in chunk_by_row(content_text, self.max_chunk_size): + if chunk_data["chunk_size"] <= self.max_chunk_size: + yield DocumentChunk( + id=chunk_data["chunk_id"], + text=chunk_data["text"], + chunk_size=chunk_data["chunk_size"], + is_part_of=self.document, + chunk_index=self.chunk_index, + cut_type=chunk_data["cut_type"], + contains=[], + metadata={ + "index_fields": ["text"], + }, + ) + self.chunk_index += 1 + else: + raise ValueError( + f"Chunk size is larger than the maximum chunk size {self.max_chunk_size}" + ) diff --git a/cognee/modules/data/processing/document_types/CsvDocument.py b/cognee/modules/data/processing/document_types/CsvDocument.py new file mode 100644 index 000000000..3381275bd --- /dev/null +++ b/cognee/modules/data/processing/document_types/CsvDocument.py @@ -0,0 +1,33 @@ +import io +import csv +from typing import Type + +from cognee.modules.chunking.Chunker import Chunker +from cognee.infrastructure.files.utils.open_data_file import open_data_file +from .Document import Document + + +class CsvDocument(Document): + type: str = "csv" + mime_type: str = "text/csv" + + async def read(self, chunker_cls: Type[Chunker], max_chunk_size: int): + async def get_text(): + async with open_data_file( + self.raw_data_location, mode="r", encoding="utf-8", newline="" + ) as file: + content = file.read() + file_like_obj = io.StringIO(content) + reader = csv.DictReader(file_like_obj) + + for row in reader: + pairs = [f"{str(k)}: {str(v)}" for k, v in row.items()] + row_text = ", ".join(pairs) + if not row_text.strip(): + break + yield row_text + + chunker = chunker_cls(self, max_chunk_size=max_chunk_size, get_text=get_text) + + async for 
chunk in chunker.read(): + yield chunk diff --git a/cognee/modules/data/processing/document_types/__init__.py b/cognee/modules/data/processing/document_types/__init__.py index 2e862f4ba..133dd53f8 100644 --- a/cognee/modules/data/processing/document_types/__init__.py +++ b/cognee/modules/data/processing/document_types/__init__.py @@ -4,3 +4,4 @@ from .TextDocument import TextDocument from .ImageDocument import ImageDocument from .AudioDocument import AudioDocument from .UnstructuredDocument import UnstructuredDocument +from .CsvDocument import CsvDocument diff --git a/cognee/tasks/chunks/__init__.py b/cognee/tasks/chunks/__init__.py index 22ce96be8..37d4de73e 100644 --- a/cognee/tasks/chunks/__init__.py +++ b/cognee/tasks/chunks/__init__.py @@ -1,4 +1,5 @@ from .chunk_by_word import chunk_by_word from .chunk_by_sentence import chunk_by_sentence from .chunk_by_paragraph import chunk_by_paragraph +from .chunk_by_row import chunk_by_row from .remove_disconnected_chunks import remove_disconnected_chunks diff --git a/cognee/tasks/chunks/chunk_by_row.py b/cognee/tasks/chunks/chunk_by_row.py new file mode 100644 index 000000000..8daf13689 --- /dev/null +++ b/cognee/tasks/chunks/chunk_by_row.py @@ -0,0 +1,94 @@ +from typing import Any, Dict, Iterator +from uuid import NAMESPACE_OID, uuid5 + +from cognee.infrastructure.databases.vector.embeddings import get_embedding_engine + + +def _get_pair_size(pair_text: str) -> int: + """ + Calculate the size of a given text in terms of tokens. + + If an embedding engine's tokenizer is available, count the tokens for the provided word. + If the tokenizer is not available, assume the word counts as one token. + + Parameters: + ----------- + + - pair_text (str): The key:value pair text for which the token size is to be calculated. + + Returns: + -------- + + - int: The number of tokens representing the text, typically an integer, depending + on the tokenizer's output. 
+ """ + embedding_engine = get_embedding_engine() + if embedding_engine.tokenizer: + return embedding_engine.tokenizer.count_tokens(pair_text) + else: + return 3 + + +def chunk_by_row( + data: str, + max_chunk_size, +) -> Iterator[Dict[str, Any]]: + """ + Chunk the input text by row while enabling exact text reconstruction. + + This function divides the given text data into smaller chunks on a line-by-line basis, + ensuring that the size of each chunk is less than or equal to the specified maximum + chunk size. It guarantees that when the generated chunks are concatenated, they + reproduce the original text accurately. The tokenization process is handled by + adapters compatible with the vector engine's embedding model. + + Parameters: + ----------- + + - data (str): The input text to be chunked. + - max_chunk_size: The maximum allowed size for each chunk, in terms of tokens or + words. + """ + current_chunk_list = [] + chunk_index = 0 + current_chunk_size = 0 + + lines = data.split("\n\n") + for line in lines: + pairs_text = line.split(", ") + + for pair_text in pairs_text: + pair_size = _get_pair_size(pair_text) + if current_chunk_size > 0 and (current_chunk_size + pair_size > max_chunk_size): + # Yield current cut chunk + current_chunk = ", ".join(current_chunk_list) + chunk_dict = { + "text": current_chunk, + "chunk_size": current_chunk_size, + "chunk_id": uuid5(NAMESPACE_OID, current_chunk), + "chunk_index": chunk_index, + "cut_type": "row_cut", + } + + yield chunk_dict + + # Start new chunk with current pair text + current_chunk_list = [] + current_chunk_size = 0 + chunk_index += 1 + + current_chunk_list.append(pair_text) + current_chunk_size += pair_size + + # Yield row chunk + current_chunk = ", ".join(current_chunk_list) + if current_chunk: + chunk_dict = { + "text": current_chunk, + "chunk_size": current_chunk_size, + "chunk_id": uuid5(NAMESPACE_OID, current_chunk), + "chunk_index": chunk_index, + "cut_type": "row_end", + } + + yield chunk_dict diff --git 
a/cognee/tasks/documents/classify_documents.py b/cognee/tasks/documents/classify_documents.py index 9fa512906..e4f13ebd1 100644 --- a/cognee/tasks/documents/classify_documents.py +++ b/cognee/tasks/documents/classify_documents.py @@ -7,6 +7,7 @@ from cognee.modules.data.processing.document_types import ( ImageDocument, TextDocument, UnstructuredDocument, + CsvDocument, ) from cognee.modules.engine.models.node_set import NodeSet from cognee.modules.engine.utils.generate_node_id import generate_node_id @@ -15,6 +16,7 @@ from cognee.tasks.documents.exceptions import WrongDataDocumentInputError EXTENSION_TO_DOCUMENT_CLASS = { "pdf": PdfDocument, # Text documents "txt": TextDocument, + "csv": CsvDocument, "docx": UnstructuredDocument, "doc": UnstructuredDocument, "odt": UnstructuredDocument, diff --git a/cognee/tests/integration/documents/CsvDocument_test.py b/cognee/tests/integration/documents/CsvDocument_test.py new file mode 100644 index 000000000..421bb81bd --- /dev/null +++ b/cognee/tests/integration/documents/CsvDocument_test.py @@ -0,0 +1,70 @@ +import os +import sys +import uuid +import pytest +import pathlib +from unittest.mock import patch + +from cognee.modules.chunking.CsvChunker import CsvChunker +from cognee.modules.data.processing.document_types.CsvDocument import CsvDocument +from cognee.tests.integration.documents.AudioDocument_test import mock_get_embedding_engine +from cognee.tests.integration.documents.async_gen_zip import async_gen_zip + +chunk_by_row_module = sys.modules.get("cognee.tasks.chunks.chunk_by_row") + + +GROUND_TRUTH = { + "chunk_size_10": [ + {"token_count": 9, "len_text": 26, "cut_type": "row_cut", "chunk_index": 0}, + {"token_count": 6, "len_text": 29, "cut_type": "row_end", "chunk_index": 1}, + {"token_count": 9, "len_text": 25, "cut_type": "row_cut", "chunk_index": 2}, + {"token_count": 6, "len_text": 30, "cut_type": "row_end", "chunk_index": 3}, + ], + "chunk_size_128": [ + {"token_count": 15, "len_text": 57, "cut_type": "row_end", 
"chunk_index": 0}, + {"token_count": 15, "len_text": 57, "cut_type": "row_end", "chunk_index": 1}, + ], +} + + +@pytest.mark.parametrize( + "input_file,chunk_size", + [("example_with_header.csv", 10), ("example_with_header.csv", 128)], +) +@patch.object(chunk_by_row_module, "get_embedding_engine", side_effect=mock_get_embedding_engine) +@pytest.mark.asyncio +async def test_CsvDocument(mock_engine, input_file, chunk_size): + # Define file paths of test data + csv_file_path = os.path.join( + pathlib.Path(__file__).parent.parent.parent, + "test_data", + input_file, + ) + + # Define test documents + csv_document = CsvDocument( + id=uuid.uuid4(), + name="example_with_header.csv", + raw_data_location=csv_file_path, + external_metadata="", + mime_type="text/csv", + ) + + # TEST CSV + ground_truth_key = f"chunk_size_{chunk_size}" + async for ground_truth, row_data in async_gen_zip( + GROUND_TRUTH[ground_truth_key], + csv_document.read(chunker_cls=CsvChunker, max_chunk_size=chunk_size), + ): + assert ground_truth["token_count"] == row_data.chunk_size, ( + f'{ground_truth["token_count"] = } != {row_data.chunk_size = }' + ) + assert ground_truth["len_text"] == len(row_data.text), ( + f'{ground_truth["len_text"] = } != {len(row_data.text) = }' + ) + assert ground_truth["cut_type"] == row_data.cut_type, ( + f'{ground_truth["cut_type"] = } != {row_data.cut_type = }' + ) + assert ground_truth["chunk_index"] == row_data.chunk_index, ( + f'{ground_truth["chunk_index"] = } != {row_data.chunk_index = }' + ) diff --git a/cognee/tests/test_data/example_with_header.csv b/cognee/tests/test_data/example_with_header.csv new file mode 100644 index 000000000..dc900e5ef --- /dev/null +++ b/cognee/tests/test_data/example_with_header.csv @@ -0,0 +1,3 @@ +id,name,age,city,country +1,Eric,30,Beijing,China +2,Joe,35,Berlin,Germany diff --git a/cognee/tests/unit/processing/chunks/chunk_by_row_test.py b/cognee/tests/unit/processing/chunks/chunk_by_row_test.py new file mode 100644 index 
000000000..7d6a73a06 --- /dev/null +++ b/cognee/tests/unit/processing/chunks/chunk_by_row_test.py @@ -0,0 +1,52 @@ +from itertools import product + +import numpy as np +import pytest + +from cognee.infrastructure.databases.vector.embeddings import get_embedding_engine +from cognee.tasks.chunks import chunk_by_row + +INPUT_TEXTS = "name: John, age: 30, city: New York, country: USA" +max_chunk_size_vals = [8, 32] + + +@pytest.mark.parametrize( + "input_text,max_chunk_size", + list(product([INPUT_TEXTS], max_chunk_size_vals)), +) +def test_chunk_by_row_isomorphism(input_text, max_chunk_size): + chunks = chunk_by_row(input_text, max_chunk_size) + reconstructed_text = ", ".join([chunk["text"] for chunk in chunks]) + assert reconstructed_text == input_text, ( + f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }" + ) + + +@pytest.mark.parametrize( + "input_text,max_chunk_size", + list(product([INPUT_TEXTS], max_chunk_size_vals)), +) +def test_row_chunk_length(input_text, max_chunk_size): + chunks = list(chunk_by_row(data=input_text, max_chunk_size=max_chunk_size)) + embedding_engine = get_embedding_engine() + + chunk_lengths = np.array( + [embedding_engine.tokenizer.count_tokens(chunk["text"]) for chunk in chunks] + ) + + larger_chunks = chunk_lengths[chunk_lengths > max_chunk_size] + assert np.all(chunk_lengths <= max_chunk_size), ( + f"{max_chunk_size = }: {larger_chunks} are too large" + ) + + +@pytest.mark.parametrize( + "input_text,max_chunk_size", + list(product([INPUT_TEXTS], max_chunk_size_vals)), +) +def test_chunk_by_row_chunk_numbering(input_text, max_chunk_size): + chunks = chunk_by_row(data=input_text, max_chunk_size=max_chunk_size) + chunk_indices = np.array([chunk["chunk_index"] for chunk in chunks]) + assert np.all(chunk_indices == np.arange(len(chunk_indices))), ( + f"{chunk_indices = } are not monotonically increasing" + ) From 8566516ceca89a0e85db6aa5ba967f5d8070b2c7 Mon Sep 17 00:00:00 2001 From: EricXiao Date: Wed, 22 Oct 
2025 16:59:07 +0800 Subject: [PATCH 012/129] chore: Remove local test code Signed-off-by: EricXiao --- .../loaders/external/advanced_pdf_loader.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/cognee/infrastructure/loaders/external/advanced_pdf_loader.py b/cognee/infrastructure/loaders/external/advanced_pdf_loader.py index 6d1412b77..4b3ba296a 100644 --- a/cognee/infrastructure/loaders/external/advanced_pdf_loader.py +++ b/cognee/infrastructure/loaders/external/advanced_pdf_loader.py @@ -227,12 +227,3 @@ class AdvancedPdfLoader(LoaderInterface): if value is None: return "" return str(value).replace("\xa0", " ").strip() - - -if __name__ == "__main__": - loader = AdvancedPdfLoader() - asyncio.run( - loader.load( - "/Users/xiaotao/work/cognee/cognee/infrastructure/loaders/external/attention_is_all_you_need.pdf" - ) - ) From eb40945c6d7394ebb9e997b9bb19631411c2a3a1 Mon Sep 17 00:00:00 2001 From: vasilije Date: Sat, 25 Oct 2025 10:26:46 +0200 Subject: [PATCH 013/129] added logs --- cognee/api/client.py | 5 +++++ docker-compose.yml | 2 +- entrypoint.sh | 6 +++--- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/cognee/api/client.py b/cognee/api/client.py index 6766c12de..19a607ff0 100644 --- a/cognee/api/client.py +++ b/cognee/api/client.py @@ -39,6 +39,8 @@ from cognee.api.v1.users.routers import ( ) from cognee.modules.users.methods.get_authenticated_user import REQUIRE_AUTHENTICATION +# Ensure application logging is configured for container stdout/stderr +setup_logging() logger = get_logger() if os.getenv("ENV", "prod") == "prod": @@ -74,6 +76,9 @@ async def lifespan(app: FastAPI): await get_default_user() + # Emit a clear startup message for docker logs + logger.info("Backend server has started") + yield diff --git a/docker-compose.yml b/docker-compose.yml index 43d9b2607..472f24c21 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -13,7 +13,7 @@ services: - DEBUG=false # Change to true if debugging - HOST=0.0.0.0 - 
ENVIRONMENT=local - - LOG_LEVEL=ERROR + - LOG_LEVEL=INFO extra_hosts: # Allows the container to reach your local machine using "host.docker.internal" instead of "localhost" - "host.docker.internal:host-gateway" diff --git a/entrypoint.sh b/entrypoint.sh index bad9b7aa3..496825408 100755 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -43,10 +43,10 @@ sleep 2 if [ "$ENVIRONMENT" = "dev" ] || [ "$ENVIRONMENT" = "local" ]; then if [ "$DEBUG" = "true" ]; then echo "Waiting for the debugger to attach..." - debugpy --wait-for-client --listen 0.0.0.0:$DEBUG_PORT -m gunicorn -w 1 -k uvicorn.workers.UvicornWorker -t 30000 --bind=0.0.0.0:$HTTP_PORT --log-level debug --reload cognee.api.client:app + exec debugpy --wait-for-client --listen 0.0.0.0:$DEBUG_PORT -m gunicorn -w 1 -k uvicorn.workers.UvicornWorker -t 30000 --bind=0.0.0.0:$HTTP_PORT --log-level debug --reload --access-logfile - --error-logfile - cognee.api.client:app else - gunicorn -w 1 -k uvicorn.workers.UvicornWorker -t 30000 --bind=0.0.0.0:$HTTP_PORT --log-level debug --reload cognee.api.client:app + exec gunicorn -w 1 -k uvicorn.workers.UvicornWorker -t 30000 --bind=0.0.0.0:$HTTP_PORT --log-level debug --reload --access-logfile - --error-logfile - cognee.api.client:app fi else - gunicorn -w 1 -k uvicorn.workers.UvicornWorker -t 30000 --bind=0.0.0.0:$HTTP_PORT --log-level error cognee.api.client:app + exec gunicorn -w 1 -k uvicorn.workers.UvicornWorker -t 30000 --bind=0.0.0.0:$HTTP_PORT --log-level error --access-logfile - --error-logfile - cognee.api.client:app fi From 897fbd2f09abfc1c3c5cc30fc2fcf17ed549ae80 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Mon, 27 Oct 2025 15:42:09 +0100 Subject: [PATCH 014/129] load test now uses s3 bucket --- cognee/tests/test_load.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/cognee/tests/test_load.py b/cognee/tests/test_load.py index 09e2db084..f8d007d28 100644 --- a/cognee/tests/test_load.py +++ b/cognee/tests/test_load.py @@ -17,7 +17,9 
@@ async def process_and_search(num_of_searches): await asyncio.gather( *[ - cognee.search(query_text="Tell me about AI", query_type=SearchType.GRAPH_COMPLETION) + cognee.search( + query_text="Tell me about the document", query_type=SearchType.GRAPH_COMPLETION + ) for _ in range(num_of_searches) ] ) @@ -28,9 +30,6 @@ async def process_and_search(num_of_searches): async def main(): - file_path = os.path.join( - pathlib.Path(__file__).resolve().parent, "test_data/artificial-intelligence.pdf" - ) data_directory_path = str( pathlib.Path( os.path.join(pathlib.Path(__file__).parent, ".data_storage/test_load") @@ -52,8 +51,8 @@ async def main(): await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) - for i in range(num_of_pdfs): - await cognee.add(file_path, dataset_name=f"dataset_{i}") + s3_input = "s3://cognee-load-test-s3-bucket" + await cognee.add(s3_input) recorded_times = await asyncio.gather( *[process_and_search(num_of_pdfs) for _ in range(num_of_reps)] From 2b083dd0f110e44341d30a6228abb18591cfabac Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Tue, 28 Oct 2025 09:27:33 +0100 Subject: [PATCH 015/129] small changes to load test --- cognee/tests/test_load.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/cognee/tests/test_load.py b/cognee/tests/test_load.py index f8d007d28..a09ce053d 100644 --- a/cognee/tests/test_load.py +++ b/cognee/tests/test_load.py @@ -48,15 +48,15 @@ async def main(): upper_boundary_minutes = 10 average_minutes = 8 - await cognee.prune.prune_data() - await cognee.prune.prune_system(metadata=True) + recorded_times = [] + for _ in range(num_of_reps): + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) - s3_input = "s3://cognee-load-test-s3-bucket" - await cognee.add(s3_input) + s3_input = "s3://cognee-test-load-s3-bucket" + await cognee.add(s3_input) - recorded_times = await asyncio.gather( - *[process_and_search(num_of_pdfs) for _ in 
range(num_of_reps)] - ) + recorded_times.append(await process_and_search(num_of_pdfs)) average_recorded_time = sum(recorded_times) / len(recorded_times) From fb7e74eaa8d8bdae50021e0a24e7e6bf6bfc2a09 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 29 Oct 2025 16:28:09 +0100 Subject: [PATCH 016/129] refactor: Enable multi user mode by default if graph and vector db providers support it --- .env.template | 5 ++- cognee/context_global_variables.py | 43 ++++++++++++++++++- cognee/modules/search/methods/search.py | 6 +-- .../users/methods/get_authenticated_user.py | 3 +- .../relational_database_migration_example.py | 3 ++ logs/.gitkeep | 0 logs/README.md | 31 ------------- 7 files changed, 53 insertions(+), 38 deletions(-) delete mode 100644 logs/.gitkeep delete mode 100644 logs/README.md diff --git a/.env.template b/.env.template index 89ac06830..8e1bdd23f 100644 --- a/.env.template +++ b/.env.template @@ -169,8 +169,9 @@ REQUIRE_AUTHENTICATION=False # Vector: LanceDB # Graph: KuzuDB # -# It enforces LanceDB and KuzuDB use and uses them to create databases per Cognee user + dataset -ENABLE_BACKEND_ACCESS_CONTROL=False +# It enforces creation of databases per Cognee user + dataset. Does not work with some graph and database providers. +# Disable mode when using not supported graph/vector databases. 
+ENABLE_BACKEND_ACCESS_CONTROL=True ################################################################################ # ☁️ Cloud Sync Settings diff --git a/cognee/context_global_variables.py b/cognee/context_global_variables.py index d52de4b4e..8ad855724 100644 --- a/cognee/context_global_variables.py +++ b/cognee/context_global_variables.py @@ -4,6 +4,8 @@ from typing import Union from uuid import UUID from cognee.base_config import get_base_config +from cognee.infrastructure.databases.vector.config import get_vectordb_context_config +from cognee.infrastructure.databases.graph.config import get_graph_context_config from cognee.infrastructure.databases.utils import get_or_create_dataset_database from cognee.infrastructure.files.storage.config import file_storage_config from cognee.modules.users.methods import get_user @@ -14,11 +16,50 @@ vector_db_config = ContextVar("vector_db_config", default=None) graph_db_config = ContextVar("graph_db_config", default=None) session_user = ContextVar("session_user", default=None) +vector_dbs_with_multi_user_support = ["lancedb"] +graph_dbs_with_multi_user_support = ["kuzu"] + async def set_session_user_context_variable(user): session_user.set(user) +def check_multi_user_support(): + graph_db_config = get_graph_context_config() + vector_db_config = get_vectordb_context_config() + if ( + graph_db_config["graph_database_provider"] in graph_dbs_with_multi_user_support + and vector_db_config["vector_db_provider"] in vector_dbs_with_multi_user_support + ): + return True + else: + return False + + +def check_backend_access_control_mode(): + backend_access_control = os.environ.get("ENABLE_BACKEND_ACCESS_CONTROL", None) + if backend_access_control is None: + # If backend access control is not defined in environment variables, + # enable it by default if graph and vector DBs can support it, otherwise disable it + multi_user_support = check_multi_user_support() + if multi_user_support: + return "true" + else: + return "false" + elif 
backend_access_control.lower() == "true": + # If enabled, ensure that the current graph and vector DBs can support it + multi_user_support = check_multi_user_support() + if not multi_user_support: + raise EnvironmentError( + "ENABLE_BACKEND_ACCESS_CONTROL is set to true but the current graph and/or vector databases do not support multi-user access control. Please use supported databases or disable backend access control." + ) + else: + return "true" + else: + # If explicitly disabled, return false + return "false" + + async def set_database_global_context_variables(dataset: Union[str, UUID], user_id: UUID): """ If backend access control is enabled this function will ensure all datasets have their own databases, @@ -40,7 +81,7 @@ async def set_database_global_context_variables(dataset: Union[str, UUID], user_ base_config = get_base_config() - if not os.getenv("ENABLE_BACKEND_ACCESS_CONTROL", "false").lower() == "true": + if not check_backend_access_control_mode() == "true": return user = await get_user(user_id) diff --git a/cognee/modules/search/methods/search.py b/cognee/modules/search/methods/search.py index aab004924..e3d7c220e 100644 --- a/cognee/modules/search/methods/search.py +++ b/cognee/modules/search/methods/search.py @@ -1,4 +1,3 @@ -import os import json import asyncio from uuid import UUID @@ -9,6 +8,7 @@ from cognee.infrastructure.databases.graph import get_graph_engine from cognee.shared.logging_utils import get_logger from cognee.shared.utils import send_telemetry from cognee.context_global_variables import set_database_global_context_variables +from cognee.context_global_variables import check_backend_access_control_mode from cognee.modules.engine.models.node_set import NodeSet from cognee.modules.graph.cognee_graph.CogneeGraphElements import Edge @@ -74,7 +74,7 @@ async def search( ) # Use search function filtered by permissions if access control is enabled - if os.getenv("ENABLE_BACKEND_ACCESS_CONTROL", "false").lower() == "true": + if 
check_backend_access_control_mode() == "true": search_results = await authorized_search( query_type=query_type, query_text=query_text, @@ -156,7 +156,7 @@ async def search( ) else: # This is for maintaining backwards compatibility - if os.getenv("ENABLE_BACKEND_ACCESS_CONTROL", "false").lower() == "true": + if check_backend_access_control_mode() == "true": return_value = [] for search_result in search_results: prepared_search_results = await prepare_search_result(search_result) diff --git a/cognee/modules/users/methods/get_authenticated_user.py b/cognee/modules/users/methods/get_authenticated_user.py index d78215892..34d82586e 100644 --- a/cognee/modules/users/methods/get_authenticated_user.py +++ b/cognee/modules/users/methods/get_authenticated_user.py @@ -5,6 +5,7 @@ from ..models import User from ..get_fastapi_users import get_fastapi_users from .get_default_user import get_default_user from cognee.shared.logging_utils import get_logger +from cognee.context_global_variables import check_backend_access_control_mode logger = get_logger("get_authenticated_user") @@ -12,7 +13,7 @@ logger = get_logger("get_authenticated_user") # Check environment variable to determine authentication requirement REQUIRE_AUTHENTICATION = ( os.getenv("REQUIRE_AUTHENTICATION", "false").lower() == "true" - or os.getenv("ENABLE_BACKEND_ACCESS_CONTROL", "false").lower() == "true" + or check_backend_access_control_mode() == "true" ) fastapi_users = get_fastapi_users() diff --git a/examples/python/relational_database_migration_example.py b/examples/python/relational_database_migration_example.py index 7e87347bc..98482cb4b 100644 --- a/examples/python/relational_database_migration_example.py +++ b/examples/python/relational_database_migration_example.py @@ -31,6 +31,9 @@ from cognee.infrastructure.databases.vector.pgvector import ( async def main(): + # Disable backend access control to migrate relational data + os.environ["ENABLE_BACKEND_ACCESS_CONTROL"] = "false" + # Clean all data stored in 
Cognee await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) diff --git a/logs/.gitkeep b/logs/.gitkeep deleted file mode 100644 index e69de29bb..000000000 diff --git a/logs/README.md b/logs/README.md deleted file mode 100644 index 96ef613b5..000000000 --- a/logs/README.md +++ /dev/null @@ -1,31 +0,0 @@ -# Logs Directory - -This directory contains the application logs for Cognee. - -## Log Files - -- Log files are named by date in the format `YYYY-MM-DD_HH-MM-SS.log` -- Logs are stored in plain text format with a consistent structure -- Each log entry includes: - - Timestamp (ISO format) - - Log level (padded to consistent width) - - Message - - Additional context (if any) - - Logger name (in square brackets) -- Exception tracebacks are included for error logs - -## Sample Log Entry - -``` -2025-03-27T13:05:27.481446Z [INFO ] Structured log message user_id=user123 action=login status=success [TestLogger] -``` - -## Retention Policy - -The system automatically keeps only the 10 most recent log files. Older log files are automatically deleted when new log files are created. This prevents excessive disk usage in long-running deployments. - -## Usage - -Logs are automatically generated by the application's logging mechanism. No manual actions are required to use this feature. - -The logs directory structure is preserved in version control, but the log files themselves are gitignored. 
From 6a7660a7c10892657422307b90788d0d6f80b8ab Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 29 Oct 2025 16:31:42 +0100 Subject: [PATCH 017/129] refactor: Return logs folder --- logs/.gitkeep | 0 logs/README.md | 31 +++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) create mode 100644 logs/.gitkeep create mode 100644 logs/README.md diff --git a/logs/.gitkeep b/logs/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/logs/README.md b/logs/README.md new file mode 100644 index 000000000..96ef613b5 --- /dev/null +++ b/logs/README.md @@ -0,0 +1,31 @@ +# Logs Directory + +This directory contains the application logs for Cognee. + +## Log Files + +- Log files are named by date in the format `YYYY-MM-DD_HH-MM-SS.log` +- Logs are stored in plain text format with a consistent structure +- Each log entry includes: + - Timestamp (ISO format) + - Log level (padded to consistent width) + - Message + - Additional context (if any) + - Logger name (in square brackets) +- Exception tracebacks are included for error logs + +## Sample Log Entry + +``` +2025-03-27T13:05:27.481446Z [INFO ] Structured log message user_id=user123 action=login status=success [TestLogger] +``` + +## Retention Policy + +The system automatically keeps only the 10 most recent log files. Older log files are automatically deleted when new log files are created. This prevents excessive disk usage in long-running deployments. + +## Usage + +Logs are automatically generated by the application's logging mechanism. No manual actions are required to use this feature. + +The logs directory structure is preserved in version control, but the log files themselves are gitignored. 
From 6572cf5cb9bcd7bc3906dc9149a22759069e79b2 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 29 Oct 2025 16:35:44 +0100 Subject: [PATCH 018/129] refactor: use boolean instead of string --- cognee/context_global_variables.py | 10 +++++----- cognee/modules/search/methods/search.py | 4 ++-- cognee/modules/users/methods/get_authenticated_user.py | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cognee/context_global_variables.py b/cognee/context_global_variables.py index 8ad855724..b4b848192 100644 --- a/cognee/context_global_variables.py +++ b/cognee/context_global_variables.py @@ -43,9 +43,9 @@ def check_backend_access_control_mode(): # enable it by default if graph and vector DBs can support it, otherwise disable it multi_user_support = check_multi_user_support() if multi_user_support: - return "true" + return True else: - return "false" + return False elif backend_access_control.lower() == "true": # If enabled, ensure that the current graph and vector DBs can support it multi_user_support = check_multi_user_support() @@ -54,10 +54,10 @@ def check_backend_access_control_mode(): "ENABLE_BACKEND_ACCESS_CONTROL is set to true but the current graph and/or vector databases do not support multi-user access control. Please use supported databases or disable backend access control." 
) else: - return "true" + return True else: # If explicitly disabled, return false - return "false" + return False async def set_database_global_context_variables(dataset: Union[str, UUID], user_id: UUID): @@ -81,7 +81,7 @@ async def set_database_global_context_variables(dataset: Union[str, UUID], user_ base_config = get_base_config() - if not check_backend_access_control_mode() == "true": + if not check_backend_access_control_mode(): return user = await get_user(user_id) diff --git a/cognee/modules/search/methods/search.py b/cognee/modules/search/methods/search.py index e3d7c220e..4a67093e8 100644 --- a/cognee/modules/search/methods/search.py +++ b/cognee/modules/search/methods/search.py @@ -74,7 +74,7 @@ async def search( ) # Use search function filtered by permissions if access control is enabled - if check_backend_access_control_mode() == "true": + if check_backend_access_control_mode(): search_results = await authorized_search( query_type=query_type, query_text=query_text, @@ -156,7 +156,7 @@ async def search( ) else: # This is for maintaining backwards compatibility - if check_backend_access_control_mode() == "true": + if check_backend_access_control_mode(): return_value = [] for search_result in search_results: prepared_search_results = await prepare_search_result(search_result) diff --git a/cognee/modules/users/methods/get_authenticated_user.py b/cognee/modules/users/methods/get_authenticated_user.py index 34d82586e..3cc16f3a8 100644 --- a/cognee/modules/users/methods/get_authenticated_user.py +++ b/cognee/modules/users/methods/get_authenticated_user.py @@ -13,7 +13,7 @@ logger = get_logger("get_authenticated_user") # Check environment variable to determine authentication requirement REQUIRE_AUTHENTICATION = ( os.getenv("REQUIRE_AUTHENTICATION", "false").lower() == "true" - or check_backend_access_control_mode() == "true" + or check_backend_access_control_mode() ) fastapi_users = get_fastapi_users() From d1581e9ebab143930acf1cec404dda083fb06a9f Mon Sep 17 
00:00:00 2001 From: Igor Ilic Date: Wed, 29 Oct 2025 17:36:56 +0100 Subject: [PATCH 019/129] refactor: disable permissions for code graph example --- examples/python/code_graph_example.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/examples/python/code_graph_example.py b/examples/python/code_graph_example.py index 431069050..1b476a2c3 100644 --- a/examples/python/code_graph_example.py +++ b/examples/python/code_graph_example.py @@ -1,5 +1,7 @@ import argparse import asyncio +import os + import cognee from cognee import SearchType from cognee.shared.logging_utils import setup_logging, ERROR @@ -8,6 +10,9 @@ from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline async def main(repo_path, include_docs): + # Disable permissions feature for this example + os.environ["ENABLE_BACKEND_ACCESS_CONTROL"] = "false" + run_status = False async for run_status in run_code_graph_pipeline(repo_path, include_docs=include_docs): run_status = run_status From eec96e4f1fb30e692b6e448aa9b1e553c0fead98 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 29 Oct 2025 19:14:53 +0100 Subject: [PATCH 020/129] refactor: fix search result for library test --- cognee/tests/test_library.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/cognee/tests/test_library.py b/cognee/tests/test_library.py index 81f81ee61..893b836c0 100755 --- a/cognee/tests/test_library.py +++ b/cognee/tests/test_library.py @@ -90,15 +90,17 @@ async def main(): ) search_results = await cognee.search( - query_type=SearchType.GRAPH_COMPLETION, query_text="What information do you contain?" 
+ query_type=SearchType.GRAPH_COMPLETION, + query_text="What information do you contain?", + dataset_ids=[pipeline_run_obj.dataset_id], ) - assert "Mark" in search_results[0], ( + assert "Mark" in search_results[0]["search_result"][0], ( "Failed to update document, no mention of Mark in search results" ) - assert "Cindy" in search_results[0], ( + assert "Cindy" in search_results[0]["search_result"][0], ( "Failed to update document, no mention of Cindy in search results" ) - assert "Artificial intelligence" not in search_results[0], ( + assert "Artificial intelligence" not in search_results[0]["search_result"][0], ( "Failed to update document, Artificial intelligence still mentioned in search results" ) From e5df629ff32d84a240b7a6473b3add1b95f91691 Mon Sep 17 00:00:00 2001 From: Pavel Zorin Date: Thu, 30 Oct 2025 12:51:43 +0300 Subject: [PATCH 021/129] CI: Extract Windows and MacOS tests to separate job --- .../test_different_operating_systems.yml | 16 ++++++++----- .github/workflows/test_suites.yml | 23 +++++++++++++++---- 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/.github/workflows/test_different_operating_systems.yml b/.github/workflows/test_different_operating_systems.yml index 64f1a14f9..53745ebee 100644 --- a/.github/workflows/test_different_operating_systems.yml +++ b/.github/workflows/test_different_operating_systems.yml @@ -10,6 +10,10 @@ on: required: false type: string default: '["3.10.x", "3.12.x", "3.13.x"]' + os: + required: false + type: string + default: '["ubuntu-22.04", "macos-15", "windows-latest"]' secrets: LLM_PROVIDER: required: true @@ -43,7 +47,7 @@ jobs: strategy: matrix: python-version: ${{ fromJSON(inputs.python-versions) }} - os: [ubuntu-22.04, macos-15, windows-latest] + os: ${{ fromJSON(inputs.os) }} fail-fast: false steps: - name: Check out @@ -79,7 +83,7 @@ jobs: strategy: matrix: python-version: ${{ fromJSON(inputs.python-versions) }} - os: [ ubuntu-22.04, macos-15, windows-latest ] + os: ${{ fromJSON(inputs.os) 
}} fail-fast: false steps: - name: Check out @@ -115,7 +119,7 @@ jobs: strategy: matrix: python-version: ${{ fromJSON(inputs.python-versions) }} - os: [ ubuntu-22.04, macos-15, windows-latest ] + os: ${{ fromJSON(inputs.os) }} fail-fast: false steps: - name: Check out @@ -151,7 +155,7 @@ jobs: strategy: matrix: python-version: ${{ fromJSON(inputs.python-versions) }} - os: [ ubuntu-22.04, macos-15, windows-latest ] + os: ${{ fromJSON(inputs.os) }} fail-fast: false steps: - name: Check out @@ -180,7 +184,7 @@ jobs: strategy: matrix: python-version: ${{ fromJSON(inputs.python-versions) }} - os: [ ubuntu-22.04, macos-15, windows-latest ] + os: ${{ fromJSON(inputs.os) }} fail-fast: false steps: - name: Check out @@ -217,7 +221,7 @@ jobs: strategy: matrix: python-version: ${{ fromJSON(inputs.python-versions) }} - os: [ ubuntu-22.04, macos-15, windows-latest ] + os: ${{ fromJSON(inputs.os) }} fail-fast: false steps: - name: Check out diff --git a/.github/workflows/test_suites.yml b/.github/workflows/test_suites.yml index 5c1597a93..43099c03b 100644 --- a/.github/workflows/test_suites.yml +++ b/.github/workflows/test_suites.yml @@ -80,14 +80,24 @@ jobs: uses: ./.github/workflows/notebooks_tests.yml secrets: inherit - different-operating-systems-tests: - name: Operating System and Python Tests + different-os-tests-basic: + name: OS and Python Tests Ubuntu needs: [basic-tests, e2e-tests] uses: ./.github/workflows/test_different_operating_systems.yml with: python-versions: '["3.10.x", "3.11.x", "3.12.x", "3.13.x"]' + os: '["ubuntu-22.04"]' secrets: inherit + different-os-tests-extended: + name: OS and Python Tests Extended + needs: [basic-tests, e2e-tests] + uses: ./.github/workflows/test_different_operating_systems.yml + with: + python-versions: '["3.13.x"]' + os: '["macos-15", "windows-latest"]' + secrets: inherit + # Matrix-based vector database tests vector-db-tests: name: Vector DB Tests @@ -135,7 +145,8 @@ jobs: e2e-tests, graph-db-tests, notebook-tests, - 
different-operating-systems-tests, + different-os-tests-basic, + different-os-tests-extended, vector-db-tests, example-tests, llm-tests, @@ -155,7 +166,8 @@ jobs: cli-tests, graph-db-tests, notebook-tests, - different-operating-systems-tests, + different-os-tests-basic, + different-os-tests-extended, vector-db-tests, example-tests, db-examples-tests, @@ -176,7 +188,8 @@ jobs: "${{ needs.cli-tests.result }}" == "success" && "${{ needs.graph-db-tests.result }}" == "success" && "${{ needs.notebook-tests.result }}" == "success" && - "${{ needs.different-operating-systems-tests.result }}" == "success" && + "${{ needs.different-os-tests-basic.result }}" == "success" && + "${{ needs.different-os-tests-extended.result }}" == "success" && "${{ needs.vector-db-tests.result }}" == "success" && "${{ needs.example-tests.result }}" == "success" && "${{ needs.db-examples-tests.result }}" == "success" && From b30b52921a41132980f77f140e19519d0de23f4a Mon Sep 17 00:00:00 2001 From: Pavel Zorin Date: Thu, 30 Oct 2025 13:08:09 +0300 Subject: [PATCH 022/129] Remove trailing whitespaces --- .github/workflows/test_different_operating_systems.yml | 4 ++-- .github/workflows/test_suites.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test_different_operating_systems.yml b/.github/workflows/test_different_operating_systems.yml index 53745ebee..b6b3f7b3c 100644 --- a/.github/workflows/test_different_operating_systems.yml +++ b/.github/workflows/test_different_operating_systems.yml @@ -13,7 +13,7 @@ on: os: required: false type: string - default: '["ubuntu-22.04", "macos-15", "windows-latest"]' + default: '["ubuntu-22.04", "macos-15", "windows-latest"]' secrets: LLM_PROVIDER: required: true @@ -119,7 +119,7 @@ jobs: strategy: matrix: python-version: ${{ fromJSON(inputs.python-versions) }} - os: ${{ fromJSON(inputs.os) }} + os: ${{ fromJSON(inputs.os) }} fail-fast: false steps: - name: Check out diff --git a/.github/workflows/test_suites.yml 
b/.github/workflows/test_suites.yml index 43099c03b..44cfb03cf 100644 --- a/.github/workflows/test_suites.yml +++ b/.github/workflows/test_suites.yml @@ -96,7 +96,7 @@ jobs: with: python-versions: '["3.13.x"]' os: '["macos-15", "windows-latest"]' - secrets: inherit + secrets: inherit # Matrix-based vector database tests vector-db-tests: From 6223ecf05ba662f08b7d560196f43df248114adb Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Thu, 30 Oct 2025 13:56:06 +0100 Subject: [PATCH 023/129] feat: optimize repeated entity extraction (#1682) ## Description - Added an `edge_text` field to edges that auto-fills from `relationship_type` if not provided. - Containts edges now store descriptions for better embedding - Updated and refactored indexing so that edge_text gets embedded and exposed - Updated retrieval to use the new embeddings - Added a test to verify edge_text exists in the graph with the correct format. ## Type of Change - [ ] Bug fix (non-breaking change that fixes an issue) - [x] New feature (non-breaking change that adds functionality) - [ ] Breaking change (fix or feature that would cause existing functionality to change) - [ ] Documentation update - [x] Code refactoring - [x] Performance improvement - [ ] Other (please specify): ## Screenshots/Videos (if applicable) ## Pre-submission Checklist - [x] **I have tested my changes thoroughly before submitting this PR** - [x] **This PR contains minimal changes necessary to address the issue/feature** - [x] My code follows the project's coding standards and style guidelines - [x] I have added tests that prove my fix is effective or that my feature works - [ ] I have added necessary documentation (if applicable) - [ ] All new and existing tests pass - [x] I have searched existing PRs to ensure this change hasn't been submitted already - [ ] I have linked any relevant issues in the description - [ ] My commits have clear and descriptive messages ## DCO Affirmation I affirm that all 
code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin. --- cognee/infrastructure/engine/models/Edge.py | 14 ++- .../modules/chunking/models/DocumentChunk.py | 3 +- .../modules/graph/cognee_graph/CogneeGraph.py | 6 +- .../utils/expand_with_nodes_and_edges.py | 21 +++- .../graph/utils/resolve_edges_to_text.py | 119 +++++++++--------- .../utils/brute_force_triplet_search.py | 2 +- cognee/tasks/storage/index_data_points.py | 55 ++++---- cognee/tasks/storage/index_graph_edges.py | 94 ++++++-------- cognee/tests/test_edge_ingestion.py | 27 ++++ .../databases/test_index_data_points.py | 27 ++++ .../databases/test_index_graph_edges.py | 30 +++-- 11 files changed, 236 insertions(+), 162 deletions(-) create mode 100644 cognee/tests/unit/infrastructure/databases/test_index_data_points.py diff --git a/cognee/infrastructure/engine/models/Edge.py b/cognee/infrastructure/engine/models/Edge.py index 5ad9c84dd..59f01a9ab 100644 --- a/cognee/infrastructure/engine/models/Edge.py +++ b/cognee/infrastructure/engine/models/Edge.py @@ -1,4 +1,4 @@ -from pydantic import BaseModel +from pydantic import BaseModel, field_validator from typing import Optional, Any, Dict @@ -18,9 +18,21 @@ class Edge(BaseModel): # Mixed usage has_items: (Edge(weight=0.5, weights={"confidence": 0.9}), list[Item]) + + # With edge_text for rich embedding representation + contains: (Edge(relationship_type="contains", edge_text="relationship_name: contains; entity_description: Alice"), Entity) """ weight: Optional[float] = None weights: Optional[Dict[str, float]] = None relationship_type: Optional[str] = None properties: Optional[Dict[str, Any]] = None + edge_text: Optional[str] = None + + @field_validator("edge_text", mode="before") + @classmethod + def ensure_edge_text(cls, v, info): + """Auto-populate edge_text from relationship_type if not explicitly provided.""" + if v is None and info.data.get("relationship_type"): + return 
info.data["relationship_type"] + return v diff --git a/cognee/modules/chunking/models/DocumentChunk.py b/cognee/modules/chunking/models/DocumentChunk.py index 9f8c57486..e024bf00b 100644 --- a/cognee/modules/chunking/models/DocumentChunk.py +++ b/cognee/modules/chunking/models/DocumentChunk.py @@ -1,6 +1,7 @@ from typing import List, Union from cognee.infrastructure.engine import DataPoint +from cognee.infrastructure.engine.models.Edge import Edge from cognee.modules.data.processing.document_types import Document from cognee.modules.engine.models import Entity from cognee.tasks.temporal_graph.models import Event @@ -31,6 +32,6 @@ class DocumentChunk(DataPoint): chunk_index: int cut_type: str is_part_of: Document - contains: List[Union[Entity, Event]] = None + contains: List[Union[Entity, Event, tuple[Edge, Entity]]] = None metadata: dict = {"index_fields": ["text"]} diff --git a/cognee/modules/graph/cognee_graph/CogneeGraph.py b/cognee/modules/graph/cognee_graph/CogneeGraph.py index 9703928f0..cb7562422 100644 --- a/cognee/modules/graph/cognee_graph/CogneeGraph.py +++ b/cognee/modules/graph/cognee_graph/CogneeGraph.py @@ -171,8 +171,10 @@ class CogneeGraph(CogneeAbstractGraph): embedding_map = {result.payload["text"]: result.score for result in edge_distances} for edge in self.edges: - relationship_type = edge.attributes.get("relationship_type") - distance = embedding_map.get(relationship_type, None) + edge_key = edge.attributes.get("edge_text") or edge.attributes.get( + "relationship_type" + ) + distance = embedding_map.get(edge_key, None) if distance is not None: edge.attributes["vector_distance"] = distance diff --git a/cognee/modules/graph/utils/expand_with_nodes_and_edges.py b/cognee/modules/graph/utils/expand_with_nodes_and_edges.py index 3b01f5af4..c68eb494d 100644 --- a/cognee/modules/graph/utils/expand_with_nodes_and_edges.py +++ b/cognee/modules/graph/utils/expand_with_nodes_and_edges.py @@ -1,5 +1,6 @@ from typing import Optional +from 
cognee.infrastructure.engine.models.Edge import Edge from cognee.modules.chunking.models import DocumentChunk from cognee.modules.engine.models import Entity, EntityType from cognee.modules.engine.utils import ( @@ -243,10 +244,26 @@ def _process_graph_nodes( ontology_relationships, ) - # Add entity to data chunk if data_chunk.contains is None: data_chunk.contains = [] - data_chunk.contains.append(entity_node) + + edge_text = "; ".join( + [ + "relationship_name: contains", + f"entity_name: {entity_node.name}", + f"entity_description: {entity_node.description}", + ] + ) + + data_chunk.contains.append( + ( + Edge( + relationship_type="contains", + edge_text=edge_text, + ), + entity_node, + ) + ) def _process_graph_edges( diff --git a/cognee/modules/graph/utils/resolve_edges_to_text.py b/cognee/modules/graph/utils/resolve_edges_to_text.py index eb5bedd2c..5deb13ba8 100644 --- a/cognee/modules/graph/utils/resolve_edges_to_text.py +++ b/cognee/modules/graph/utils/resolve_edges_to_text.py @@ -1,71 +1,70 @@ +import string from typing import List +from collections import Counter + from cognee.modules.graph.cognee_graph.CogneeGraphElements import Edge +from cognee.modules.retrieval.utils.stop_words import DEFAULT_STOP_WORDS + + +def _get_top_n_frequent_words( + text: str, stop_words: set = None, top_n: int = 3, separator: str = ", " +) -> str: + """Concatenates the top N frequent words in text.""" + if stop_words is None: + stop_words = DEFAULT_STOP_WORDS + + words = [word.lower().strip(string.punctuation) for word in text.split()] + words = [word for word in words if word and word not in stop_words] + + top_words = [word for word, freq in Counter(words).most_common(top_n)] + return separator.join(top_words) + + +def _create_title_from_text(text: str, first_n_words: int = 7, top_n_words: int = 3) -> str: + """Creates a title by combining first words with most frequent words from the text.""" + first_words = text.split()[:first_n_words] + top_words = 
_get_top_n_frequent_words(text, top_n=top_n_words) + return f"{' '.join(first_words)}... [{top_words}]" + + +def _extract_nodes_from_edges(retrieved_edges: List[Edge]) -> dict: + """Creates a dictionary of nodes with their names and content.""" + nodes = {} + + for edge in retrieved_edges: + for node in (edge.node1, edge.node2): + if node.id in nodes: + continue + + text = node.attributes.get("text") + if text: + name = _create_title_from_text(text) + content = text + else: + name = node.attributes.get("name", "Unnamed Node") + content = node.attributes.get("description", name) + + nodes[node.id] = {"node": node, "name": name, "content": content} + + return nodes async def resolve_edges_to_text(retrieved_edges: List[Edge]) -> str: - """ - Converts retrieved graph edges into a human-readable string format. + """Converts retrieved graph edges into a human-readable string format.""" + nodes = _extract_nodes_from_edges(retrieved_edges) - Parameters: - ----------- - - - retrieved_edges (list): A list of edges retrieved from the graph. - - Returns: - -------- - - - str: A formatted string representation of the nodes and their connections. 
- """ - - def _get_nodes(retrieved_edges: List[Edge]) -> dict: - def _get_title(text: str, first_n_words: int = 7, top_n_words: int = 3) -> str: - def _top_n_words(text, stop_words=None, top_n=3, separator=", "): - """Concatenates the top N frequent words in text.""" - if stop_words is None: - from cognee.modules.retrieval.utils.stop_words import DEFAULT_STOP_WORDS - - stop_words = DEFAULT_STOP_WORDS - - import string - - words = [word.lower().strip(string.punctuation) for word in text.split()] - - if stop_words: - words = [word for word in words if word and word not in stop_words] - - from collections import Counter - - top_words = [word for word, freq in Counter(words).most_common(top_n)] - - return separator.join(top_words) - - """Creates a title, by combining first words with most frequent words from the text.""" - first_words = text.split()[:first_n_words] - top_words = _top_n_words(text, top_n=first_n_words) - return f"{' '.join(first_words)}... [{top_words}]" - - """Creates a dictionary of nodes with their names and content.""" - nodes = {} - for edge in retrieved_edges: - for node in (edge.node1, edge.node2): - if node.id not in nodes: - text = node.attributes.get("text") - if text: - name = _get_title(text) - content = text - else: - name = node.attributes.get("name", "Unnamed Node") - content = node.attributes.get("description", name) - nodes[node.id] = {"node": node, "name": name, "content": content} - return nodes - - nodes = _get_nodes(retrieved_edges) node_section = "\n".join( f"Node: {info['name']}\n__node_content_start__\n{info['content']}\n__node_content_end__\n" for info in nodes.values() ) - connection_section = "\n".join( - f"{nodes[edge.node1.id]['name']} --[{edge.attributes['relationship_type']}]--> {nodes[edge.node2.id]['name']}" - for edge in retrieved_edges - ) + + connections = [] + for edge in retrieved_edges: + source_name = nodes[edge.node1.id]["name"] + target_name = nodes[edge.node2.id]["name"] + edge_label = 
edge.attributes.get("edge_text") or edge.attributes.get("relationship_type") + connections.append(f"{source_name} --[{edge_label}]--> {target_name}") + + connection_section = "\n".join(connections) + return f"Nodes:\n{node_section}\n\nConnections:\n{connection_section}" diff --git a/cognee/modules/retrieval/utils/brute_force_triplet_search.py b/cognee/modules/retrieval/utils/brute_force_triplet_search.py index 1ef7545c2..f8bdbb97d 100644 --- a/cognee/modules/retrieval/utils/brute_force_triplet_search.py +++ b/cognee/modules/retrieval/utils/brute_force_triplet_search.py @@ -71,7 +71,7 @@ async def get_memory_fragment( await memory_fragment.project_graph_from_db( graph_engine, node_properties_to_project=properties_to_project, - edge_properties_to_project=["relationship_name"], + edge_properties_to_project=["relationship_name", "edge_text"], node_type=node_type, node_name=node_name, ) diff --git a/cognee/tasks/storage/index_data_points.py b/cognee/tasks/storage/index_data_points.py index 902789c80..b0ec3a5b4 100644 --- a/cognee/tasks/storage/index_data_points.py +++ b/cognee/tasks/storage/index_data_points.py @@ -8,47 +8,58 @@ logger = get_logger("index_data_points") async def index_data_points(data_points: list[DataPoint]): - created_indexes = {} - index_points = {} + """Index data points in the vector engine by creating embeddings for specified fields. + + Process: + 1. Groups data points into a nested dict: {type_name: {field_name: [points]}} + 2. Creates vector indexes for each (type, field) combination on first encounter + 3. Batches points per (type, field) and creates async indexing tasks + 4. Executes all indexing tasks in parallel for efficient embedding generation + + Args: + data_points: List of DataPoint objects to index. Each DataPoint's metadata must + contain an 'index_fields' list specifying which fields to embed. + + Returns: + The original data_points list. 
+ """ + data_points_by_type = {} vector_engine = get_vector_engine() for data_point in data_points: data_point_type = type(data_point) + type_name = data_point_type.__name__ for field_name in data_point.metadata["index_fields"]: if getattr(data_point, field_name, None) is None: continue - index_name = f"{data_point_type.__name__}_{field_name}" + if type_name not in data_points_by_type: + data_points_by_type[type_name] = {} - if index_name not in created_indexes: - await vector_engine.create_vector_index(data_point_type.__name__, field_name) - created_indexes[index_name] = True - - if index_name not in index_points: - index_points[index_name] = [] + if field_name not in data_points_by_type[type_name]: + await vector_engine.create_vector_index(type_name, field_name) + data_points_by_type[type_name][field_name] = [] indexed_data_point = data_point.model_copy() indexed_data_point.metadata["index_fields"] = [field_name] - index_points[index_name].append(indexed_data_point) + data_points_by_type[type_name][field_name].append(indexed_data_point) - tasks: list[asyncio.Task] = [] batch_size = vector_engine.embedding_engine.get_batch_size() - for index_name_and_field, points in index_points.items(): - first = index_name_and_field.index("_") - index_name = index_name_and_field[:first] - field_name = index_name_and_field[first + 1 :] + batches = ( + (type_name, field_name, points[i : i + batch_size]) + for type_name, fields in data_points_by_type.items() + for field_name, points in fields.items() + for i in range(0, len(points), batch_size) + ) - # Create embedding requests per batch to run in parallel later - for i in range(0, len(points), batch_size): - batch = points[i : i + batch_size] - tasks.append( - asyncio.create_task(vector_engine.index_data_points(index_name, field_name, batch)) - ) + tasks = [ + asyncio.create_task(vector_engine.index_data_points(type_name, field_name, batch_points)) + for type_name, field_name, batch_points in batches + ] - # Run all embedding 
requests in parallel await asyncio.gather(*tasks) return data_points diff --git a/cognee/tasks/storage/index_graph_edges.py b/cognee/tasks/storage/index_graph_edges.py index 4fa8cfc75..03b5a25a5 100644 --- a/cognee/tasks/storage/index_graph_edges.py +++ b/cognee/tasks/storage/index_graph_edges.py @@ -1,17 +1,44 @@ -import asyncio +from collections import Counter +from typing import Optional, Dict, Any, List, Tuple, Union from cognee.modules.engine.utils.generate_edge_id import generate_edge_id from cognee.shared.logging_utils import get_logger -from collections import Counter -from typing import Optional, Dict, Any, List, Tuple, Union -from cognee.infrastructure.databases.vector import get_vector_engine from cognee.infrastructure.databases.graph import get_graph_engine from cognee.modules.graph.models.EdgeType import EdgeType from cognee.infrastructure.databases.graph.graph_db_interface import EdgeData +from cognee.tasks.storage.index_data_points import index_data_points logger = get_logger() +def _get_edge_text(item: dict) -> str: + """Extract edge text for embedding - prefers edge_text field with fallback.""" + if "edge_text" in item: + return item["edge_text"] + + if "relationship_name" in item: + return item["relationship_name"] + + return "" + + +def create_edge_type_datapoints(edges_data) -> list[EdgeType]: + """Transform raw edge data into EdgeType datapoints.""" + edge_texts = [ + _get_edge_text(item) + for edge in edges_data + for item in edge + if isinstance(item, dict) and "relationship_name" in item + ] + + edge_types = Counter(edge_texts) + + return [ + EdgeType(id=generate_edge_id(edge_id=text), relationship_name=text, number_of_edges=count) + for text, count in edge_types.items() + ] + + async def index_graph_edges( edges_data: Union[List[EdgeData], List[Tuple[str, str, str, Optional[Dict[str, Any]]]]] = None, ): @@ -23,24 +50,17 @@ async def index_graph_edges( the `relationship_name` field. Steps: - 1. Initialize the vector engine and graph engine. 
- 2. Retrieve graph edge data and count relationship types (`relationship_name`). - 3. Create vector indexes for `relationship_name` if they don't exist. - 4. Transform the counted relationships into `EdgeType` objects. - 5. Index the transformed data points in the vector engine. + 1. Initialize the graph engine if needed and retrieve edge data. + 2. Transform edge data into EdgeType datapoints. + 3. Index the EdgeType datapoints using the standard indexing function. Raises: - RuntimeError: If initialization of the vector engine or graph engine fails. + RuntimeError: If initialization of the graph engine fails. Returns: None """ try: - created_indexes = {} - index_points = {} - - vector_engine = get_vector_engine() - if edges_data is None: graph_engine = await get_graph_engine() _, edges_data = await graph_engine.get_graph_data() @@ -51,47 +71,7 @@ async def index_graph_edges( logger.error("Failed to initialize engines: %s", e) raise RuntimeError("Initialization error") from e - edge_types = Counter( - item.get("relationship_name") - for edge in edges_data - for item in edge - if isinstance(item, dict) and "relationship_name" in item - ) - - for text, count in edge_types.items(): - edge = EdgeType( - id=generate_edge_id(edge_id=text), relationship_name=text, number_of_edges=count - ) - data_point_type = type(edge) - - for field_name in edge.metadata["index_fields"]: - index_name = f"{data_point_type.__name__}.{field_name}" - - if index_name not in created_indexes: - await vector_engine.create_vector_index(data_point_type.__name__, field_name) - created_indexes[index_name] = True - - if index_name not in index_points: - index_points[index_name] = [] - - indexed_data_point = edge.model_copy() - indexed_data_point.metadata["index_fields"] = [field_name] - index_points[index_name].append(indexed_data_point) - - # Get maximum batch size for embedding model - batch_size = vector_engine.embedding_engine.get_batch_size() - tasks: list[asyncio.Task] = [] - - for index_name, 
indexable_points in index_points.items(): - index_name, field_name = index_name.split(".") - - # Create embedding tasks to run in parallel later - for start in range(0, len(indexable_points), batch_size): - batch = indexable_points[start : start + batch_size] - - tasks.append(vector_engine.index_data_points(index_name, field_name, batch)) - - # Start all embedding tasks and wait for completion - await asyncio.gather(*tasks) + edge_type_datapoints = create_edge_type_datapoints(edges_data) + await index_data_points(edge_type_datapoints) return None diff --git a/cognee/tests/test_edge_ingestion.py b/cognee/tests/test_edge_ingestion.py index 5b23f7819..0d1407fab 100755 --- a/cognee/tests/test_edge_ingestion.py +++ b/cognee/tests/test_edge_ingestion.py @@ -52,6 +52,33 @@ async def test_edge_ingestion(): edge_type_counts = Counter(edge_type[2] for edge_type in graph[1]) + "Tests edge_text presence and format" + contains_edges = [edge for edge in graph[1] if edge[2] == "contains"] + assert len(contains_edges) > 0, "Expected at least one contains edge for edge_text verification" + + edge_properties = contains_edges[0][3] + assert "edge_text" in edge_properties, "Expected edge_text in edge properties" + + edge_text = edge_properties["edge_text"] + assert "relationship_name: contains" in edge_text, ( + f"Expected 'relationship_name: contains' in edge_text, got: {edge_text}" + ) + assert "entity_name:" in edge_text, f"Expected 'entity_name:' in edge_text, got: {edge_text}" + assert "entity_description:" in edge_text, ( + f"Expected 'entity_description:' in edge_text, got: {edge_text}" + ) + + all_edge_texts = [ + edge[3].get("edge_text", "") for edge in contains_edges if "edge_text" in edge[3] + ] + expected_entities = ["dave", "ana", "bob", "dexter", "apples", "cognee"] + found_entity = any( + any(entity in text.lower() for entity in expected_entities) for text in all_edge_texts + ) + assert found_entity, ( + f"Expected to find at least one entity name in edge_text: 
{all_edge_texts[:3]}" + ) + "Tests the presence of basic nested edges" for basic_nested_edge in basic_nested_edges: assert edge_type_counts.get(basic_nested_edge, 0) >= 1, ( diff --git a/cognee/tests/unit/infrastructure/databases/test_index_data_points.py b/cognee/tests/unit/infrastructure/databases/test_index_data_points.py new file mode 100644 index 000000000..21a5695de --- /dev/null +++ b/cognee/tests/unit/infrastructure/databases/test_index_data_points.py @@ -0,0 +1,27 @@ +import pytest +from unittest.mock import AsyncMock, patch, MagicMock +from cognee.tasks.storage.index_data_points import index_data_points +from cognee.infrastructure.engine import DataPoint + + +class TestDataPoint(DataPoint): + name: str + metadata: dict = {"index_fields": ["name"]} + + +@pytest.mark.asyncio +async def test_index_data_points_calls_vector_engine(): + """Test that index_data_points creates vector index and indexes data.""" + data_points = [TestDataPoint(name="test1")] + + mock_vector_engine = AsyncMock() + mock_vector_engine.embedding_engine.get_batch_size = MagicMock(return_value=100) + + with patch.dict( + index_data_points.__globals__, + {"get_vector_engine": lambda: mock_vector_engine}, + ): + await index_data_points(data_points) + + assert mock_vector_engine.create_vector_index.await_count >= 1 + assert mock_vector_engine.index_data_points.await_count >= 1 diff --git a/cognee/tests/unit/infrastructure/databases/test_index_graph_edges.py b/cognee/tests/unit/infrastructure/databases/test_index_graph_edges.py index 48bbc53e3..cee0896c2 100644 --- a/cognee/tests/unit/infrastructure/databases/test_index_graph_edges.py +++ b/cognee/tests/unit/infrastructure/databases/test_index_graph_edges.py @@ -5,8 +5,7 @@ from cognee.tasks.storage.index_graph_edges import index_graph_edges @pytest.mark.asyncio async def test_index_graph_edges_success(): - """Test that index_graph_edges uses the index datapoints and creates vector index.""" - # Create the mocks for the graph and vector 
engines. + """Test that index_graph_edges retrieves edges and delegates to index_data_points.""" mock_graph_engine = AsyncMock() mock_graph_engine.get_graph_data.return_value = ( None, @@ -15,26 +14,23 @@ async def test_index_graph_edges_success(): [{"relationship_name": "rel2"}], ], ) - mock_vector_engine = AsyncMock() - mock_vector_engine.embedding_engine.get_batch_size = MagicMock(return_value=100) + mock_index_data_points = AsyncMock() - # Patch the globals of the function so that when it does: - # vector_engine = get_vector_engine() - # graph_engine = await get_graph_engine() - # it uses the mocked versions. with patch.dict( index_graph_edges.__globals__, { "get_graph_engine": AsyncMock(return_value=mock_graph_engine), - "get_vector_engine": lambda: mock_vector_engine, + "index_data_points": mock_index_data_points, }, ): await index_graph_edges() - # Assertions on the mock calls. mock_graph_engine.get_graph_data.assert_awaited_once() - assert mock_vector_engine.create_vector_index.await_count == 1 - assert mock_vector_engine.index_data_points.await_count == 1 + mock_index_data_points.assert_awaited_once() + + call_args = mock_index_data_points.call_args[0][0] + assert len(call_args) == 2 + assert all(hasattr(item, "relationship_name") for item in call_args) @pytest.mark.asyncio @@ -42,20 +38,22 @@ async def test_index_graph_edges_no_relationships(): """Test that index_graph_edges handles empty relationships correctly.""" mock_graph_engine = AsyncMock() mock_graph_engine.get_graph_data.return_value = (None, []) - mock_vector_engine = AsyncMock() + mock_index_data_points = AsyncMock() with patch.dict( index_graph_edges.__globals__, { "get_graph_engine": AsyncMock(return_value=mock_graph_engine), - "get_vector_engine": lambda: mock_vector_engine, + "index_data_points": mock_index_data_points, }, ): await index_graph_edges() mock_graph_engine.get_graph_data.assert_awaited_once() - mock_vector_engine.create_vector_index.assert_not_awaited() - 
mock_vector_engine.index_data_points.assert_not_awaited() + mock_index_data_points.assert_awaited_once() + + call_args = mock_index_data_points.call_args[0][0] + assert len(call_args) == 0 @pytest.mark.asyncio From ee0ecd52d8115396523c1a62b2c99b3178f5d182 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 30 Oct 2025 16:25:34 +0100 Subject: [PATCH 024/129] refactor: Rewrite tests to work with multi-user mode by default --- .../users/test_conditional_authentication.py | 63 ------------------- 1 file changed, 63 deletions(-) diff --git a/cognee/tests/unit/modules/users/test_conditional_authentication.py b/cognee/tests/unit/modules/users/test_conditional_authentication.py index c4368d796..6568c3cb0 100644 --- a/cognee/tests/unit/modules/users/test_conditional_authentication.py +++ b/cognee/tests/unit/modules/users/test_conditional_authentication.py @@ -107,29 +107,10 @@ class TestConditionalAuthenticationIntegration: # REQUIRE_AUTHENTICATION should be a boolean assert isinstance(REQUIRE_AUTHENTICATION, bool) - # Currently should be False (optional authentication) - assert not REQUIRE_AUTHENTICATION - class TestConditionalAuthenticationEnvironmentVariables: """Test environment variable handling.""" - def test_require_authentication_default_false(self): - """Test that REQUIRE_AUTHENTICATION defaults to false when imported with no env vars.""" - with patch.dict(os.environ, {}, clear=True): - # Remove module from cache to force fresh import - module_name = "cognee.modules.users.methods.get_authenticated_user" - if module_name in sys.modules: - del sys.modules[module_name] - - # Import after patching environment - module will see empty environment - from cognee.modules.users.methods.get_authenticated_user import ( - REQUIRE_AUTHENTICATION, - ) - - importlib.invalidate_caches() - assert not REQUIRE_AUTHENTICATION - def test_require_authentication_true(self): """Test that REQUIRE_AUTHENTICATION=true is parsed correctly when imported.""" with patch.dict(os.environ, 
{"REQUIRE_AUTHENTICATION": "true"}): @@ -145,50 +126,6 @@ class TestConditionalAuthenticationEnvironmentVariables: assert REQUIRE_AUTHENTICATION - def test_require_authentication_false_explicit(self): - """Test that REQUIRE_AUTHENTICATION=false is parsed correctly when imported.""" - with patch.dict(os.environ, {"REQUIRE_AUTHENTICATION": "false"}): - # Remove module from cache to force fresh import - module_name = "cognee.modules.users.methods.get_authenticated_user" - if module_name in sys.modules: - del sys.modules[module_name] - - # Import after patching environment - module will see REQUIRE_AUTHENTICATION=false - from cognee.modules.users.methods.get_authenticated_user import ( - REQUIRE_AUTHENTICATION, - ) - - assert not REQUIRE_AUTHENTICATION - - def test_require_authentication_case_insensitive(self): - """Test that environment variable parsing is case insensitive when imported.""" - test_cases = ["TRUE", "True", "tRuE", "FALSE", "False", "fAlSe"] - - for case in test_cases: - with patch.dict(os.environ, {"REQUIRE_AUTHENTICATION": case}): - # Remove module from cache to force fresh import - module_name = "cognee.modules.users.methods.get_authenticated_user" - if module_name in sys.modules: - del sys.modules[module_name] - - # Import after patching environment - from cognee.modules.users.methods.get_authenticated_user import ( - REQUIRE_AUTHENTICATION, - ) - - expected = case.lower() == "true" - assert REQUIRE_AUTHENTICATION == expected, f"Failed for case: {case}" - - def test_current_require_authentication_value(self): - """Test that the current REQUIRE_AUTHENTICATION module value is as expected.""" - from cognee.modules.users.methods.get_authenticated_user import ( - REQUIRE_AUTHENTICATION, - ) - - # The module-level variable should currently be False (set at import time) - assert isinstance(REQUIRE_AUTHENTICATION, bool) - assert not REQUIRE_AUTHENTICATION - class TestConditionalAuthenticationEdgeCases: """Test edge cases and error scenarios.""" From 
9d8430cfb08e17467390ff53e57d330885c6c7d9 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 30 Oct 2025 16:52:04 +0100 Subject: [PATCH 025/129] refactor: Update unit tests for require auth --- .../test_conditional_authentication_endpoints.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/cognee/tests/unit/api/test_conditional_authentication_endpoints.py b/cognee/tests/unit/api/test_conditional_authentication_endpoints.py index 2eabee91a..6cc37ef38 100644 --- a/cognee/tests/unit/api/test_conditional_authentication_endpoints.py +++ b/cognee/tests/unit/api/test_conditional_authentication_endpoints.py @@ -1,3 +1,4 @@ +import os import pytest from unittest.mock import patch, AsyncMock, MagicMock from uuid import uuid4 @@ -5,8 +6,6 @@ from fastapi.testclient import TestClient from types import SimpleNamespace import importlib -from cognee.api.client import app - # Fixtures for reuse across test classes @pytest.fixture @@ -32,6 +31,10 @@ def mock_authenticated_user(): ) +# To turn off authentication we need to set the environment variable before importing the module +# Also both require_authentication and backend access control must be false +os.environ["REQUIRE_AUTHENTICATION"] = "false" +os.environ["ENABLE_BACKEND_ACCESS_CONTROL"] = "false" gau_mod = importlib.import_module("cognee.modules.users.methods.get_authenticated_user") @@ -40,6 +43,8 @@ class TestConditionalAuthenticationEndpoints: @pytest.fixture def client(self): + from cognee.api.client import app + """Create a test client.""" return TestClient(app) @@ -133,6 +138,8 @@ class TestConditionalAuthenticationBehavior: @pytest.fixture def client(self): + from cognee.api.client import app + return TestClient(app) @pytest.mark.parametrize( @@ -209,6 +216,8 @@ class TestConditionalAuthenticationErrorHandling: @pytest.fixture def client(self): + from cognee.api.client import app + return TestClient(app) @patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) @@ -232,7 
+241,7 @@ class TestConditionalAuthenticationErrorHandling: # The exact error message may vary depending on the actual database connection # The important thing is that we get a 500 error when user creation fails - def test_current_environment_configuration(self): + def test_current_environment_configuration(self, client): """Test that current environment configuration is working properly.""" # This tests the actual module state without trying to change it from cognee.modules.users.methods.get_authenticated_user import ( From e061f34a28bf9cd27453ea6aac8abf215c7bde8f Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 30 Oct 2025 17:13:10 +0100 Subject: [PATCH 026/129] fix: Resolve issue with dataset names for example --- examples/python/feedback_enrichment_minimal_example.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/python/feedback_enrichment_minimal_example.py b/examples/python/feedback_enrichment_minimal_example.py index 11ef20830..8954bd5f6 100644 --- a/examples/python/feedback_enrichment_minimal_example.py +++ b/examples/python/feedback_enrichment_minimal_example.py @@ -67,7 +67,6 @@ async def run_feedback_enrichment_memify(last_n: int = 5): extraction_tasks=extraction_tasks, enrichment_tasks=enrichment_tasks, data=[{}], # A placeholder to prevent fetching the entire graph - dataset="feedback_enrichment_minimal", ) From 995e7aa4834bd3967342fded1b7ad6af10b4da0e Mon Sep 17 00:00:00 2001 From: Hande <159312713+hande-k@users.noreply.github.com> Date: Thu, 30 Oct 2025 17:38:28 +0100 Subject: [PATCH 027/129] fix: update unsupported vector db log (#1708) ## Description ## Type of Change - [ ] Bug fix (non-breaking change that fixes an issue) - [ ] New feature (non-breaking change that adds functionality) - [ ] Breaking change (fix or feature that would cause existing functionality to change) - [ ] Documentation update - [ ] Code refactoring - [ ] Performance improvement - [ ] Other (please specify): ## Screenshots/Videos (if applicable) ## 
Pre-submission Checklist - [ ] **I have tested my changes thoroughly before submitting this PR** - [ ] **This PR contains minimal changes necessary to address the issue/feature** - [ ] My code follows the project's coding standards and style guidelines - [ ] I have added tests that prove my fix is effective or that my feature works - [ ] I have added necessary documentation (if applicable) - [ ] All new and existing tests pass - [ ] I have searched existing PRs to ensure this change hasn't been submitted already - [ ] I have linked any relevant issues in the description - [ ] My commits have clear and descriptive messages ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin. --- cognee/infrastructure/databases/vector/create_vector_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cognee/infrastructure/databases/vector/create_vector_engine.py b/cognee/infrastructure/databases/vector/create_vector_engine.py index d1cf855d7..c54d94f6c 100644 --- a/cognee/infrastructure/databases/vector/create_vector_engine.py +++ b/cognee/infrastructure/databases/vector/create_vector_engine.py @@ -133,6 +133,6 @@ def create_vector_engine( else: raise EnvironmentError( - f"Unsupported graph database provider: {vector_db_provider}. " + f"Unsupported vector database provider: {vector_db_provider}. 
" f"Supported providers are: {', '.join(list(supported_databases.keys()) + ['LanceDB', 'PGVector', 'neptune_analytics', 'ChromaDB'])}" ) From 45bb3130c695260af9ccb00e756a0b4d22f0a85b Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 30 Oct 2025 17:40:00 +0100 Subject: [PATCH 028/129] fix: Use same dataset name accross cognee calls --- cognee/tests/test_feedback_enrichment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cognee/tests/test_feedback_enrichment.py b/cognee/tests/test_feedback_enrichment.py index 02d90db32..378cb0e45 100644 --- a/cognee/tests/test_feedback_enrichment.py +++ b/cognee/tests/test_feedback_enrichment.py @@ -133,7 +133,7 @@ async def main(): extraction_tasks=extraction_tasks, enrichment_tasks=enrichment_tasks, data=[{}], - dataset="feedback_enrichment_test_memify", + dataset=dataset_name, ) nodes_after, edges_after = await graph_engine.get_graph_data() From 1b483276b0077c57eefdc7e7aa93d58501cf7840 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 30 Oct 2025 18:04:27 +0100 Subject: [PATCH 029/129] fix: disable backend access control for rel db test --- cognee/tests/test_relational_db_migration.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cognee/tests/test_relational_db_migration.py b/cognee/tests/test_relational_db_migration.py index 2b69ce854..4557e9e2f 100644 --- a/cognee/tests/test_relational_db_migration.py +++ b/cognee/tests/test_relational_db_migration.py @@ -27,6 +27,9 @@ def normalize_node_name(node_name: str) -> str: async def setup_test_db(): + # Disable backend access control to migrate relational data + os.environ["ENABLE_BACKEND_ACCESS_CONTROL"] = "false" + await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) From a60e53964c7eaea79680508bf33422dc35bd9056 Mon Sep 17 00:00:00 2001 From: Pavel Zorin Date: Fri, 31 Oct 2025 12:37:38 +0100 Subject: [PATCH 030/129] Potential fix for code scanning alert no. 
399: Workflow does not contain permissions Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> --- .github/workflows/test_suites.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/test_suites.yml b/.github/workflows/test_suites.yml index 44cfb03cf..be1e354fc 100644 --- a/.github/workflows/test_suites.yml +++ b/.github/workflows/test_suites.yml @@ -1,4 +1,6 @@ name: Test Suites +permissions: + contents: read on: push: From 3c09433adead92f8a09093069a6d88de63c28409 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Fri, 31 Oct 2025 13:57:12 +0100 Subject: [PATCH 031/129] fix: Resolve docling test --- cognee/tests/test_add_docling_document.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cognee/tests/test_add_docling_document.py b/cognee/tests/test_add_docling_document.py index 2c82af66f..c5aa4e9d1 100644 --- a/cognee/tests/test_add_docling_document.py +++ b/cognee/tests/test_add_docling_document.py @@ -39,12 +39,12 @@ async def main(): answer = await cognee.search("Do programmers change light bulbs?") assert len(answer) != 0 - lowercase_answer = answer[0].lower() + lowercase_answer = answer[0]["search_result"][0].lower() assert ("no" in lowercase_answer) or ("none" in lowercase_answer) answer = await cognee.search("What colours are there in the presentation table?") assert len(answer) != 0 - lowercase_answer = answer[0].lower() + lowercase_answer = answer[0]["search_result"][0].lower() assert ( ("red" in lowercase_answer) and ("blue" in lowercase_answer) From 00a1fe71d76ae62deac5832751366061f21bc96f Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Fri, 31 Oct 2025 14:33:07 +0100 Subject: [PATCH 032/129] fix: Use multi-user mode search --- examples/python/agentic_reasoning_procurement_example.py | 2 +- examples/python/memify_coding_agent_example.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git 
a/examples/python/agentic_reasoning_procurement_example.py b/examples/python/agentic_reasoning_procurement_example.py index 5aa3caa70..4e9d2d7e4 100644 --- a/examples/python/agentic_reasoning_procurement_example.py +++ b/examples/python/agentic_reasoning_procurement_example.py @@ -168,7 +168,7 @@ async def run_procurement_example(): for q in questions: print(f"Question: \n{q}") results = await procurement_system.search_memory(q, search_categories=[category]) - top_answer = results[category][0] + top_answer = results[category][0]["search_result"][0] print(f"Answer: \n{top_answer.strip()}\n") research_notes[category].append({"question": q, "answer": top_answer}) diff --git a/examples/python/memify_coding_agent_example.py b/examples/python/memify_coding_agent_example.py index 1fd3b1528..4a087ba61 100644 --- a/examples/python/memify_coding_agent_example.py +++ b/examples/python/memify_coding_agent_example.py @@ -89,7 +89,7 @@ async def main(): ) print("Coding rules created by memify:") - for coding_rule in coding_rules: + for coding_rule in coding_rules[0]["search_result"][0]: print("- " + coding_rule) # Visualize new graph with added memify context From 4c8b8211979fc12b6e46a911eba1c2e2610187d8 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Fri, 31 Oct 2025 14:55:52 +0100 Subject: [PATCH 033/129] fix: resolve test failing --- cognee/tests/test_search_db.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cognee/tests/test_search_db.py b/cognee/tests/test_search_db.py index e24abd0f5..bcc4529a9 100644 --- a/cognee/tests/test_search_db.py +++ b/cognee/tests/test_search_db.py @@ -146,7 +146,7 @@ async def main(): assert len(search_results) == 1, ( f"{name}: expected single-element list, got {len(search_results)}" ) - text = search_results[0] + text = search_results[0]["search_result"][0] assert isinstance(text, str), f"{name}: element should be a string" assert text.strip(), f"{name}: string should not be empty" assert "netherlands" in text.lower(), ( 
From f368a1a4d5d79a4485bc52ff5d9f16fc909942d2 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Fri, 31 Oct 2025 20:10:05 +0100 Subject: [PATCH 034/129] fix: set tests to not use multi-user mode --- .github/workflows/search_db_tests.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/search_db_tests.yml b/.github/workflows/search_db_tests.yml index e3e46dd97..118c1c06c 100644 --- a/.github/workflows/search_db_tests.yml +++ b/.github/workflows/search_db_tests.yml @@ -84,6 +84,7 @@ jobs: GRAPH_DATABASE_PROVIDER: 'neo4j' VECTOR_DB_PROVIDER: 'lancedb' DB_PROVIDER: 'sqlite' + ENABLE_BACKEND_ACCESS_CONTROL: 'false' GRAPH_DATABASE_URL: ${{ steps.neo4j.outputs.neo4j-url }} GRAPH_DATABASE_USERNAME: ${{ steps.neo4j.outputs.neo4j-username }} GRAPH_DATABASE_PASSWORD: ${{ steps.neo4j.outputs.neo4j-password }} @@ -135,6 +136,7 @@ jobs: EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} GRAPH_DATABASE_PROVIDER: 'kuzu' VECTOR_DB_PROVIDER: 'pgvector' + ENABLE_BACKEND_ACCESS_CONTROL: 'false' DB_PROVIDER: 'postgres' DB_NAME: 'cognee_db' DB_HOST: '127.0.0.1' @@ -197,6 +199,7 @@ jobs: GRAPH_DATABASE_URL: ${{ steps.neo4j.outputs.neo4j-url }} GRAPH_DATABASE_USERNAME: ${{ steps.neo4j.outputs.neo4j-username }} GRAPH_DATABASE_PASSWORD: ${{ steps.neo4j.outputs.neo4j-password }} + ENABLE_BACKEND_ACCESS_CONTROL: 'false' DB_NAME: cognee_db DB_HOST: 127.0.0.1 DB_PORT: 5432 From a4a9e762465ecaf0dcdb9b0132db7951d11b437c Mon Sep 17 00:00:00 2001 From: Fahad Shoaib Date: Sun, 2 Nov 2025 17:05:03 +0500 Subject: [PATCH 035/129] feat: add ontology endpoint in REST API - Add POST /api/v1/ontologies endpoint for file upload - Add GET /api/v1/ontologies endpoint for listing ontologies - Implement OntologyService for file management and metadata - Integrate ontology_key parameter in cognify endpoint - Update RDFLibOntologyResolver to support file-like objects - Add essential test suite for ontology endpoints --- cognee/api/client.py | 3 + 
.../v1/cognify/routers/get_cognify_router.py | 31 +++++- cognee/api/v1/ontologies/__init__.py | 4 + cognee/api/v1/ontologies/ontologies.py | 101 ++++++++++++++++++ cognee/api/v1/ontologies/routers/__init__.py | 0 .../ontologies/routers/get_ontology_router.py | 89 +++++++++++++++ .../rdf_xml/RDFLibOntologyResolver.py | 69 +++++++----- cognee/tests/test_ontology_endpoint.py | 89 +++++++++++++++ 8 files changed, 356 insertions(+), 30 deletions(-) create mode 100644 cognee/api/v1/ontologies/__init__.py create mode 100644 cognee/api/v1/ontologies/ontologies.py create mode 100644 cognee/api/v1/ontologies/routers/__init__.py create mode 100644 cognee/api/v1/ontologies/routers/get_ontology_router.py create mode 100644 cognee/tests/test_ontology_endpoint.py diff --git a/cognee/api/client.py b/cognee/api/client.py index 6766c12de..89e9eb2f5 100644 --- a/cognee/api/client.py +++ b/cognee/api/client.py @@ -23,6 +23,7 @@ from cognee.api.v1.settings.routers import get_settings_router from cognee.api.v1.datasets.routers import get_datasets_router from cognee.api.v1.cognify.routers import get_code_pipeline_router, get_cognify_router from cognee.api.v1.search.routers import get_search_router +from cognee.api.v1.ontologies.routers.get_ontology_router import get_ontology_router from cognee.api.v1.memify.routers import get_memify_router from cognee.api.v1.add.routers import get_add_router from cognee.api.v1.delete.routers import get_delete_router @@ -258,6 +259,8 @@ app.include_router( app.include_router(get_datasets_router(), prefix="/api/v1/datasets", tags=["datasets"]) +app.include_router(get_ontology_router(), prefix="/api/v1/ontologies", tags=["ontologies"]) + app.include_router(get_settings_router(), prefix="/api/v1/settings", tags=["settings"]) app.include_router(get_visualize_router(), prefix="/api/v1/visualize", tags=["visualize"]) diff --git a/cognee/api/v1/cognify/routers/get_cognify_router.py b/cognee/api/v1/cognify/routers/get_cognify_router.py index 231bbcd11..246cc6c56 
100644 --- a/cognee/api/v1/cognify/routers/get_cognify_router.py +++ b/cognee/api/v1/cognify/routers/get_cognify_router.py @@ -41,6 +41,10 @@ class CognifyPayloadDTO(InDTO): custom_prompt: Optional[str] = Field( default="", description="Custom prompt for entity extraction and graph generation" ) + ontology_key: Optional[str] = Field( + default=None, + description="Reference to previously uploaded ontology" + ) def get_cognify_router() -> APIRouter: @@ -68,6 +72,7 @@ def get_cognify_router() -> APIRouter: - **dataset_ids** (Optional[List[UUID]]): List of existing dataset UUIDs to process. UUIDs allow processing of datasets not owned by the user (if permitted). - **run_in_background** (Optional[bool]): Whether to execute processing asynchronously. Defaults to False (blocking). - **custom_prompt** (Optional[str]): Custom prompt for entity extraction and graph generation. If provided, this prompt will be used instead of the default prompts for knowledge graph extraction. + - **ontology_key** (Optional[str]): Reference to a previously uploaded ontology file to use for knowledge graph construction. ## Response - **Blocking execution**: Complete pipeline run information with entity counts, processing duration, and success/failure status @@ -82,7 +87,8 @@ def get_cognify_router() -> APIRouter: { "datasets": ["research_papers", "documentation"], "run_in_background": false, - "custom_prompt": "Extract entities focusing on technical concepts and their relationships. Identify key technologies, methodologies, and their interconnections." + "custom_prompt": "Extract entities focusing on technical concepts and their relationships. 
Identify key technologies, methodologies, and their interconnections.", + "ontology_key": "medical_ontology_v1" } ``` @@ -108,13 +114,36 @@ def get_cognify_router() -> APIRouter: ) from cognee.api.v1.cognify import cognify as cognee_cognify + from cognee.api.v1.ontologies.ontologies import OntologyService try: datasets = payload.dataset_ids if payload.dataset_ids else payload.datasets + config_to_use = None + + if payload.ontology_key: + ontology_service = OntologyService() + try: + ontology_content = ontology_service.get_ontology_content(payload.ontology_key, user) + + from cognee.modules.ontology.ontology_config import Config + from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver + from io import StringIO + + ontology_stream = StringIO(ontology_content) + config_to_use: Config = { + "ontology_config": { + "ontology_resolver": RDFLibOntologyResolver(ontology_file=ontology_stream) + } + } + except ValueError as e: + return JSONResponse( + status_code=400, content={"error": f"Ontology error: {str(e)}"} + ) cognify_run = await cognee_cognify( datasets, user, + config=config_to_use, run_in_background=payload.run_in_background, custom_prompt=payload.custom_prompt, ) diff --git a/cognee/api/v1/ontologies/__init__.py b/cognee/api/v1/ontologies/__init__.py new file mode 100644 index 000000000..c25064edc --- /dev/null +++ b/cognee/api/v1/ontologies/__init__.py @@ -0,0 +1,4 @@ +from .ontologies import OntologyService +from .routers.get_ontology_router import get_ontology_router + +__all__ = ["OntologyService", "get_ontology_router"] \ No newline at end of file diff --git a/cognee/api/v1/ontologies/ontologies.py b/cognee/api/v1/ontologies/ontologies.py new file mode 100644 index 000000000..fb7f3cd9a --- /dev/null +++ b/cognee/api/v1/ontologies/ontologies.py @@ -0,0 +1,101 @@ +import os +import json +import tempfile +from pathlib import Path +from datetime import datetime, timezone +from typing import Optional +from dataclasses import 
dataclass + +@dataclass +class OntologyMetadata: + ontology_key: str + filename: str + size_bytes: int + uploaded_at: str + description: Optional[str] = None + +class OntologyService: + def __init__(self): + pass + + @property + def base_dir(self) -> Path: + return Path(tempfile.gettempdir()) / "ontologies" + + def _get_user_dir(self, user_id: str) -> Path: + user_dir = self.base_dir / str(user_id) + user_dir.mkdir(parents=True, exist_ok=True) + return user_dir + + def _get_metadata_path(self, user_dir: Path) -> Path: + return user_dir / "metadata.json" + + def _load_metadata(self, user_dir: Path) -> dict: + metadata_path = self._get_metadata_path(user_dir) + if metadata_path.exists(): + with open(metadata_path, 'r') as f: + return json.load(f) + return {} + + def _save_metadata(self, user_dir: Path, metadata: dict): + metadata_path = self._get_metadata_path(user_dir) + with open(metadata_path, 'w') as f: + json.dump(metadata, f, indent=2) + + async def upload_ontology(self, ontology_key: str, file, user, description: Optional[str] = None) -> OntologyMetadata: + # Validate file format + if not file.filename.lower().endswith('.owl'): + raise ValueError("File must be in .owl format") + + user_dir = self._get_user_dir(str(user.id)) + metadata = self._load_metadata(user_dir) + + # Check for duplicate key + if ontology_key in metadata: + raise ValueError(f"Ontology key '{ontology_key}' already exists") + + # Read file content + content = await file.read() + if len(content) > 10 * 1024 * 1024: # 10MB limit + raise ValueError("File size exceeds 10MB limit") + + # Save file + file_path = user_dir / f"{ontology_key}.owl" + with open(file_path, 'wb') as f: + f.write(content) + + # Update metadata + ontology_metadata = { + "filename": file.filename, + "size_bytes": len(content), + "uploaded_at": datetime.now(timezone.utc).isoformat(), + "description": description + } + metadata[ontology_key] = ontology_metadata + self._save_metadata(user_dir, metadata) + + return 
OntologyMetadata( + ontology_key=ontology_key, + filename=file.filename, + size_bytes=len(content), + uploaded_at=ontology_metadata["uploaded_at"], + description=description + ) + + def get_ontology_content(self, ontology_key: str, user) -> str: + user_dir = self._get_user_dir(str(user.id)) + metadata = self._load_metadata(user_dir) + + if ontology_key not in metadata: + raise ValueError(f"Ontology key '{ontology_key}' not found") + + file_path = user_dir / f"{ontology_key}.owl" + if not file_path.exists(): + raise ValueError(f"Ontology file for key '{ontology_key}' not found") + + with open(file_path, 'r', encoding='utf-8') as f: + return f.read() + + def list_ontologies(self, user) -> dict: + user_dir = self._get_user_dir(str(user.id)) + return self._load_metadata(user_dir) \ No newline at end of file diff --git a/cognee/api/v1/ontologies/routers/__init__.py b/cognee/api/v1/ontologies/routers/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/cognee/api/v1/ontologies/routers/get_ontology_router.py b/cognee/api/v1/ontologies/routers/get_ontology_router.py new file mode 100644 index 000000000..c171fa7bb --- /dev/null +++ b/cognee/api/v1/ontologies/routers/get_ontology_router.py @@ -0,0 +1,89 @@ +from fastapi import APIRouter, File, Form, UploadFile, Depends, HTTPException +from fastapi.responses import JSONResponse +from typing import Optional + +from cognee.modules.users.models import User +from cognee.modules.users.methods import get_authenticated_user +from cognee.shared.utils import send_telemetry +from cognee import __version__ as cognee_version +from ..ontologies import OntologyService + +def get_ontology_router() -> APIRouter: + router = APIRouter() + ontology_service = OntologyService() + + @router.post("", response_model=dict) + async def upload_ontology( + ontology_key: str = Form(...), + ontology_file: UploadFile = File(...), + description: Optional[str] = Form(None), + user: User = Depends(get_authenticated_user) + ): + """ + 
Upload an ontology file with a named key for later use in cognify operations. + + ## Request Parameters + - **ontology_key** (str): User-defined identifier for the ontology + - **ontology_file** (UploadFile): OWL format ontology file + - **description** (Optional[str]): Optional description of the ontology + + ## Response + Returns metadata about the uploaded ontology including key, filename, size, and upload timestamp. + + ## Error Codes + - **400 Bad Request**: Invalid file format, duplicate key, file size exceeded + - **500 Internal Server Error**: File system or processing errors + """ + send_telemetry( + "Ontology Upload API Endpoint Invoked", + user.id, + additional_properties={ + "endpoint": "POST /api/v1/ontologies", + "cognee_version": cognee_version, + }, + ) + + try: + result = await ontology_service.upload_ontology( + ontology_key, ontology_file, user, description + ) + return { + "ontology_key": result.ontology_key, + "filename": result.filename, + "size_bytes": result.size_bytes, + "uploaded_at": result.uploaded_at + } + except ValueError as e: + return JSONResponse(status_code=400, content={"error": str(e)}) + except Exception as e: + return JSONResponse(status_code=500, content={"error": str(e)}) + + @router.get("", response_model=dict) + async def list_ontologies( + user: User = Depends(get_authenticated_user) + ): + """ + List all uploaded ontologies for the authenticated user. + + ## Response + Returns a dictionary mapping ontology keys to their metadata including filename, size, and upload timestamp. 
+ + ## Error Codes + - **500 Internal Server Error**: File system or processing errors + """ + send_telemetry( + "Ontology List API Endpoint Invoked", + user.id, + additional_properties={ + "endpoint": "GET /api/v1/ontologies", + "cognee_version": cognee_version, + }, + ) + + try: + metadata = ontology_service.list_ontologies(user) + return metadata + except Exception as e: + return JSONResponse(status_code=500, content={"error": str(e)}) + + return router \ No newline at end of file diff --git a/cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py b/cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py index 45e32936a..4acc8861b 100644 --- a/cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py +++ b/cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py @@ -2,7 +2,7 @@ import os import difflib from cognee.shared.logging_utils import get_logger from collections import deque -from typing import List, Tuple, Dict, Optional, Any, Union +from typing import List, Tuple, Dict, Optional, Any, Union, IO from rdflib import Graph, URIRef, RDF, RDFS, OWL from cognee.modules.ontology.exceptions import ( @@ -26,44 +26,55 @@ class RDFLibOntologyResolver(BaseOntologyResolver): def __init__( self, - ontology_file: Optional[Union[str, List[str]]] = None, + ontology_file: Optional[Union[str, List[str], IO]] = None, matching_strategy: Optional[MatchingStrategy] = None, ) -> None: super().__init__(matching_strategy) self.ontology_file = ontology_file try: - files_to_load = [] + self.graph = None if ontology_file is not None: - if isinstance(ontology_file, str): - files_to_load = [ontology_file] - elif isinstance(ontology_file, list): - files_to_load = ontology_file + if hasattr(ontology_file, "read"): + self.graph = Graph() + content = ontology_file.read() + self.graph.parse(data=content, format="xml") + logger.info("Ontology loaded successfully from file object") else: - raise ValueError( - f"ontology_file must be a string, list of strings, or None. 
Got: {type(ontology_file)}" - ) - - if files_to_load: - self.graph = Graph() - loaded_files = [] - for file_path in files_to_load: - if os.path.exists(file_path): - self.graph.parse(file_path) - loaded_files.append(file_path) - logger.info("Ontology loaded successfully from file: %s", file_path) + files_to_load = [] + if isinstance(ontology_file, str): + files_to_load = [ontology_file] + elif isinstance(ontology_file, list): + files_to_load = ontology_file else: - logger.warning( - "Ontology file '%s' not found. Skipping this file.", - file_path, + raise ValueError( + f"ontology_file must be a string, list of strings, file-like object, or None. Got: {type(ontology_file)}" ) - if not loaded_files: - logger.info( - "No valid ontology files found. No owl ontology will be attached to the graph." - ) - self.graph = None - else: - logger.info("Total ontology files loaded: %d", len(loaded_files)) + if files_to_load: + self.graph = Graph() + loaded_files = [] + for file_path in files_to_load: + if os.path.exists(file_path): + self.graph.parse(file_path) + loaded_files.append(file_path) + logger.info("Ontology loaded successfully from file: %s", file_path) + else: + logger.warning( + "Ontology file '%s' not found. Skipping this file.", + file_path, + ) + + if not loaded_files: + logger.info( + "No valid ontology files found. No owl ontology will be attached to the graph." + ) + self.graph = None + else: + logger.info("Total ontology files loaded: %d", len(loaded_files)) + else: + logger.info( + "No ontology file provided. No owl ontology will be attached to the graph." + ) else: logger.info( "No ontology file provided. No owl ontology will be attached to the graph." 
diff --git a/cognee/tests/test_ontology_endpoint.py b/cognee/tests/test_ontology_endpoint.py new file mode 100644 index 000000000..4849f8649 --- /dev/null +++ b/cognee/tests/test_ontology_endpoint.py @@ -0,0 +1,89 @@ +import pytest +import uuid +from fastapi.testclient import TestClient +from unittest.mock import patch, Mock, AsyncMock +from types import SimpleNamespace +import importlib +from cognee.api.client import app + +gau_mod = importlib.import_module("cognee.modules.users.methods.get_authenticated_user") + +@pytest.fixture +def client(): + return TestClient(app) + +@pytest.fixture +def mock_user(): + user = Mock() + user.id = "test-user-123" + return user + +@pytest.fixture +def mock_default_user(): + """Mock default user for testing.""" + return SimpleNamespace( + id=uuid.uuid4(), + email="default@example.com", + is_active=True, + tenant_id=uuid.uuid4() + ) + +@patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) +def test_upload_ontology_success(mock_get_default_user, client, mock_default_user): + """Test successful ontology upload""" + mock_get_default_user.return_value = mock_default_user + ontology_content = b"" + unique_key = f"test_ontology_{uuid.uuid4().hex[:8]}" + + response = client.post( + "/api/v1/ontologies", + files={"ontology_file": ("test.owl", ontology_content)}, + data={"ontology_key": unique_key, "description": "Test"} + ) + + assert response.status_code == 200 + data = response.json() + assert data["ontology_key"] == unique_key + assert "uploaded_at" in data + +@patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) +def test_upload_ontology_invalid_file(mock_get_default_user, client, mock_default_user): + """Test 400 response for non-.owl files""" + mock_get_default_user.return_value = mock_default_user + unique_key = f"test_ontology_{uuid.uuid4().hex[:8]}" + response = client.post( + "/api/v1/ontologies", + files={"ontology_file": ("test.txt", b"not xml")}, + data={"ontology_key": unique_key} + ) + assert 
response.status_code == 400 + +@patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) +def test_upload_ontology_missing_data(mock_get_default_user, client, mock_default_user): + """Test 400 response for missing file or key""" + mock_get_default_user.return_value = mock_default_user + # Missing file + response = client.post("/api/v1/ontologies", data={"ontology_key": "test"}) + assert response.status_code == 400 + + # Missing key + response = client.post("/api/v1/ontologies", files={"ontology_file": ("test.owl", b"xml")}) + assert response.status_code == 400 + +@patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) +def test_upload_ontology_unauthorized(mock_get_default_user, client, mock_default_user): + """Test behavior when default user is provided (no explicit authentication)""" + unique_key = f"test_ontology_{uuid.uuid4().hex[:8]}" + mock_get_default_user.return_value = mock_default_user + response = client.post( + "/api/v1/ontologies", + files={"ontology_file": ("test.owl", b"")}, + data={"ontology_key": unique_key} + ) + + # The current system provides a default user when no explicit authentication is given + # This test verifies the system works with conditional authentication + assert response.status_code == 200 + data = response.json() + assert data["ontology_key"] == unique_key + assert "uploaded_at" in data \ No newline at end of file From b8241e58e5794f006122d4b61c4805a669fc217f Mon Sep 17 00:00:00 2001 From: Pavel Zorin Date: Mon, 3 Nov 2025 16:20:03 +0100 Subject: [PATCH 036/129] CI: Limit deletion integration tests to 60 minutes --- .github/workflows/test_different_operating_systems.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/test_different_operating_systems.yml b/.github/workflows/test_different_operating_systems.yml index b6b3f7b3c..7a5288cf6 100644 --- a/.github/workflows/test_different_operating_systems.yml +++ b/.github/workflows/test_different_operating_systems.yml @@ -181,6 +181,7 @@ jobs: 
run-soft-deletion-test: name: Soft Delete test ${{ matrix.python-version }} on ${{ matrix.os }} runs-on: ${{ matrix.os }} + timeout-minutes: 60 strategy: matrix: python-version: ${{ fromJSON(inputs.python-versions) }} @@ -218,6 +219,7 @@ jobs: run-hard-deletion-test: name: Hard Delete test ${{ matrix.python-version }} on ${{ matrix.os }} runs-on: ${{ matrix.os }} + timeout-minutes: 60 strategy: matrix: python-version: ${{ fromJSON(inputs.python-versions) }} From 2ab2cffd07ed35a268362af48c2fb668674509f1 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Mon, 3 Nov 2025 16:37:03 +0100 Subject: [PATCH 037/129] chore: update test_search_db to work with all graph providers --- cognee/tests/test_search_db.py | 8 +++++++- examples/python/simple_example.py | 8 -------- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/cognee/tests/test_search_db.py b/cognee/tests/test_search_db.py index bcc4529a9..ea3f0ea44 100644 --- a/cognee/tests/test_search_db.py +++ b/cognee/tests/test_search_db.py @@ -146,7 +146,13 @@ async def main(): assert len(search_results) == 1, ( f"{name}: expected single-element list, got {len(search_results)}" ) - text = search_results[0]["search_result"][0] + + from cognee.context_global_variables import check_backend_access_control_mode + + if check_backend_access_control_mode(): + text = search_results[0]["search_result"][0] + else: + text = search_results[0] assert isinstance(text, str), f"{name}: element should be a string" assert text.strip(), f"{name}: string should not be empty" assert "netherlands" in text.lower(), ( diff --git a/examples/python/simple_example.py b/examples/python/simple_example.py index c13e48f85..237a8295e 100644 --- a/examples/python/simple_example.py +++ b/examples/python/simple_example.py @@ -59,14 +59,6 @@ async def main(): for result_text in search_results: print(result_text) - # Example output: - # ({'id': UUID('bc338a39-64d6-549a-acec-da60846dd90d'), 'updated_at': datetime.datetime(2024, 11, 21, 12, 23, 1, 211808, 
tzinfo=datetime.timezone.utc), 'name': 'natural language processing', 'description': 'An interdisciplinary subfield of computer science and information retrieval.'}, {'relationship_name': 'is_a_subfield_of', 'source_node_id': UUID('bc338a39-64d6-549a-acec-da60846dd90d'), 'target_node_id': UUID('6218dbab-eb6a-5759-a864-b3419755ffe0'), 'updated_at': datetime.datetime(2024, 11, 21, 12, 23, 15, 473137, tzinfo=datetime.timezone.utc)}, {'id': UUID('6218dbab-eb6a-5759-a864-b3419755ffe0'), 'updated_at': datetime.datetime(2024, 11, 21, 12, 23, 1, 211808, tzinfo=datetime.timezone.utc), 'name': 'computer science', 'description': 'The study of computation and information processing.'}) - # (...) - # It represents nodes and relationships in the knowledge graph: - # - The first element is the source node (e.g., 'natural language processing'). - # - The second element is the relationship between nodes (e.g., 'is_a_subfield_of'). - # - The third element is the target node (e.g., 'computer science'). - if __name__ == "__main__": logger = setup_logging(log_level=ERROR) From 4424bdc76471342119d59943c38d392dac9d72b0 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Mon, 3 Nov 2025 17:06:51 +0100 Subject: [PATCH 038/129] test: fix path based on pr comment --- cognee/tests/test_load.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/cognee/tests/test_load.py b/cognee/tests/test_load.py index a09ce053d..b38466bc7 100644 --- a/cognee/tests/test_load.py +++ b/cognee/tests/test_load.py @@ -30,17 +30,10 @@ async def process_and_search(num_of_searches): async def main(): - data_directory_path = str( - pathlib.Path( - os.path.join(pathlib.Path(__file__).parent, ".data_storage/test_load") - ).resolve() - ) + data_directory_path = os.path.join(pathlib.Path(__file__).parent, ".data_storage/test_load") cognee.config.data_root_directory(data_directory_path) - cognee_directory_path = str( - pathlib.Path( - os.path.join(pathlib.Path(__file__).parent, 
".cognee_system/test_load") - ).resolve() - ) + + cognee_directory_path = os.path.join(pathlib.Path(__file__).parent, ".cognee_system/test_load") cognee.config.system_root_directory(cognee_directory_path) num_of_pdfs = 10 From eb8df45dab2cb7a07de7eb97570d364790f80080 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Mon, 3 Nov 2025 18:10:19 +0100 Subject: [PATCH 039/129] test: increase file descriptor limit on workflow load test --- .github/workflows/e2e_tests.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/.github/workflows/e2e_tests.yml b/.github/workflows/e2e_tests.yml index cf704c76a..79df3ff6b 100644 --- a/.github/workflows/e2e_tests.yml +++ b/.github/workflows/e2e_tests.yml @@ -463,6 +463,12 @@ jobs: with: python-version: '3.11.x' + - name: Set File Descriptor Limit + run: sudo prlimit --pid $$ --nofile=4096:4096 + + - name: Verify File Descriptor Limit + run: ulimit -n + - name: Dependencies already installed run: echo "Dependencies already installed in setup" @@ -478,4 +484,9 @@ jobs: EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} + STORAGE_BACKEND: s3 + AWS_REGION: eu-west-1 + AWS_ENDPOINT_URL: https://s3-eu-west-1.amazonaws.com + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_S3_DEV_USER_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_S3_DEV_USER_SECRET_KEY }} run: uv run python ./cognee/tests/test_load.py \ No newline at end of file From a7d63df98c7ad21a40679c1b821acb30690c65d8 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Mon, 3 Nov 2025 18:15:18 +0100 Subject: [PATCH 040/129] test: add extra aws dependency to load test --- .github/workflows/e2e_tests.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/e2e_tests.yml b/.github/workflows/e2e_tests.yml index 79df3ff6b..0596f22d3 100644 --- a/.github/workflows/e2e_tests.yml +++ b/.github/workflows/e2e_tests.yml @@ -267,8 +267,6 @@ 
jobs: EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} run: uv run python ./cognee/tests/test_edge_ingestion.py - - run_concurrent_subprocess_access_test: name: Concurrent Subprocess access test runs-on: ubuntu-latest @@ -450,7 +448,6 @@ jobs: DB_PASSWORD: cognee run: uv run python ./cognee/tests/test_conversation_history.py - test-load: name: Test Load runs-on: ubuntu-22.04 @@ -462,6 +459,7 @@ jobs: uses: ./.github/actions/cognee_setup with: python-version: '3.11.x' + extra-dependencies: "aws" - name: Set File Descriptor Limit run: sudo prlimit --pid $$ --nofile=4096:4096 From c81d06d364d3b1f114fb2e7e81db856e7389386d Mon Sep 17 00:00:00 2001 From: Igor Ilic <30923996+dexters1@users.noreply.github.com> Date: Mon, 3 Nov 2025 19:37:52 +0100 Subject: [PATCH 041/129] Update cognee/context_global_variables.py Co-authored-by: Pavel Zorin --- cognee/context_global_variables.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/cognee/context_global_variables.py b/cognee/context_global_variables.py index b4b848192..6f3965441 100644 --- a/cognee/context_global_variables.py +++ b/cognee/context_global_variables.py @@ -41,11 +41,7 @@ def check_backend_access_control_mode(): if backend_access_control is None: # If backend access control is not defined in environment variables, # enable it by default if graph and vector DBs can support it, otherwise disable it - multi_user_support = check_multi_user_support() - if multi_user_support: - return True - else: - return False + return check_multi_user_support() elif backend_access_control.lower() == "true": # If enabled, ensure that the current graph and vector DBs can support it multi_user_support = check_multi_user_support() From 53521c2068319d340c3f2b396dbbc7f3f9c80523 Mon Sep 17 00:00:00 2001 From: Igor Ilic <30923996+dexters1@users.noreply.github.com> Date: Mon, 3 Nov 2025 19:42:51 +0100 Subject: [PATCH 042/129] Update cognee/context_global_variables.py Co-authored-by: Pavel Zorin --- 
cognee/context_global_variables.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/cognee/context_global_variables.py b/cognee/context_global_variables.py index 6f3965441..3afbf6ff2 100644 --- a/cognee/context_global_variables.py +++ b/cognee/context_global_variables.py @@ -27,13 +27,10 @@ async def set_session_user_context_variable(user): def check_multi_user_support(): graph_db_config = get_graph_context_config() vector_db_config = get_vectordb_context_config() - if ( + return ( graph_db_config["graph_database_provider"] in graph_dbs_with_multi_user_support and vector_db_config["vector_db_provider"] in vector_dbs_with_multi_user_support - ): - return True - else: - return False + ) def check_backend_access_control_mode(): From 46c509778f89d5bebbcbe5f7578159e45841ca2d Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Tue, 4 Nov 2025 12:06:16 +0100 Subject: [PATCH 043/129] refactor: Rename access control functions --- cognee/context_global_variables.py | 17 +++++++---------- cognee/modules/search/methods/search.py | 6 +++--- .../users/methods/get_authenticated_user.py | 4 ++-- cognee/tests/test_search_db.py | 4 ++-- 4 files changed, 14 insertions(+), 17 deletions(-) diff --git a/cognee/context_global_variables.py b/cognee/context_global_variables.py index 3afbf6ff2..f17c9187a 100644 --- a/cognee/context_global_variables.py +++ b/cognee/context_global_variables.py @@ -24,7 +24,7 @@ async def set_session_user_context_variable(user): session_user.set(user) -def check_multi_user_support(): +def multi_user_support_possible(): graph_db_config = get_graph_context_config() vector_db_config = get_vectordb_context_config() return ( @@ -33,24 +33,21 @@ def check_multi_user_support(): ) -def check_backend_access_control_mode(): +def backend_access_control_enabled(): backend_access_control = os.environ.get("ENABLE_BACKEND_ACCESS_CONTROL", None) if backend_access_control is None: # If backend access control is not defined in environment variables, # enable 
it by default if graph and vector DBs can support it, otherwise disable it - return check_multi_user_support() + return multi_user_support_possible() elif backend_access_control.lower() == "true": # If enabled, ensure that the current graph and vector DBs can support it - multi_user_support = check_multi_user_support() + multi_user_support = multi_user_support_possible() if not multi_user_support: raise EnvironmentError( "ENABLE_BACKEND_ACCESS_CONTROL is set to true but the current graph and/or vector databases do not support multi-user access control. Please use supported databases or disable backend access control." ) - else: - return True - else: - # If explicitly disabled, return false - return False + return True + return False async def set_database_global_context_variables(dataset: Union[str, UUID], user_id: UUID): @@ -74,7 +71,7 @@ async def set_database_global_context_variables(dataset: Union[str, UUID], user_ base_config = get_base_config() - if not check_backend_access_control_mode(): + if not backend_access_control_enabled(): return user = await get_user(user_id) diff --git a/cognee/modules/search/methods/search.py b/cognee/modules/search/methods/search.py index 4a67093e8..5e465b239 100644 --- a/cognee/modules/search/methods/search.py +++ b/cognee/modules/search/methods/search.py @@ -8,7 +8,7 @@ from cognee.infrastructure.databases.graph import get_graph_engine from cognee.shared.logging_utils import get_logger from cognee.shared.utils import send_telemetry from cognee.context_global_variables import set_database_global_context_variables -from cognee.context_global_variables import check_backend_access_control_mode +from cognee.context_global_variables import backend_access_control_enabled from cognee.modules.engine.models.node_set import NodeSet from cognee.modules.graph.cognee_graph.CogneeGraphElements import Edge @@ -74,7 +74,7 @@ async def search( ) # Use search function filtered by permissions if access control is enabled - if 
check_backend_access_control_mode(): + if backend_access_control_enabled(): search_results = await authorized_search( query_type=query_type, query_text=query_text, @@ -156,7 +156,7 @@ async def search( ) else: # This is for maintaining backwards compatibility - if check_backend_access_control_mode(): + if backend_access_control_enabled(): return_value = [] for search_result in search_results: prepared_search_results = await prepare_search_result(search_result) diff --git a/cognee/modules/users/methods/get_authenticated_user.py b/cognee/modules/users/methods/get_authenticated_user.py index 3cc16f3a8..d6d701737 100644 --- a/cognee/modules/users/methods/get_authenticated_user.py +++ b/cognee/modules/users/methods/get_authenticated_user.py @@ -5,7 +5,7 @@ from ..models import User from ..get_fastapi_users import get_fastapi_users from .get_default_user import get_default_user from cognee.shared.logging_utils import get_logger -from cognee.context_global_variables import check_backend_access_control_mode +from cognee.context_global_variables import backend_access_control_enabled logger = get_logger("get_authenticated_user") @@ -13,7 +13,7 @@ logger = get_logger("get_authenticated_user") # Check environment variable to determine authentication requirement REQUIRE_AUTHENTICATION = ( os.getenv("REQUIRE_AUTHENTICATION", "false").lower() == "true" - or check_backend_access_control_mode() + or backend_access_control_enabled() ) fastapi_users = get_fastapi_users() diff --git a/cognee/tests/test_search_db.py b/cognee/tests/test_search_db.py index ea3f0ea44..bd11dc62e 100644 --- a/cognee/tests/test_search_db.py +++ b/cognee/tests/test_search_db.py @@ -147,9 +147,9 @@ async def main(): f"{name}: expected single-element list, got {len(search_results)}" ) - from cognee.context_global_variables import check_backend_access_control_mode + from cognee.context_global_variables import backend_access_control_enabled - if check_backend_access_control_mode(): + if 
backend_access_control_enabled(): text = search_results[0]["search_result"][0] else: text = search_results[0] From e11a8b9a51fa4c4f3fea43cd8b2f9585a2a27997 Mon Sep 17 00:00:00 2001 From: Pavel Zorin Date: Tue, 4 Nov 2025 12:13:48 +0100 Subject: [PATCH 044/129] CI: Added timeouts for all OS tests --- .github/workflows/test_different_operating_systems.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/test_different_operating_systems.yml b/.github/workflows/test_different_operating_systems.yml index 7a5288cf6..02651b474 100644 --- a/.github/workflows/test_different_operating_systems.yml +++ b/.github/workflows/test_different_operating_systems.yml @@ -44,6 +44,7 @@ jobs: run-unit-tests: name: Unit tests ${{ matrix.python-version }} on ${{ matrix.os }} runs-on: ${{ matrix.os }} + timeout-minutes: 60 strategy: matrix: python-version: ${{ fromJSON(inputs.python-versions) }} @@ -80,6 +81,7 @@ jobs: run-integration-tests: name: Integration tests ${{ matrix.python-version }} on ${{ matrix.os }} runs-on: ${{ matrix.os }} + timeout-minutes: 60 strategy: matrix: python-version: ${{ fromJSON(inputs.python-versions) }} @@ -116,6 +118,7 @@ jobs: run-library-test: name: Library test ${{ matrix.python-version }} on ${{ matrix.os }} runs-on: ${{ matrix.os }} + timeout-minutes: 60 strategy: matrix: python-version: ${{ fromJSON(inputs.python-versions) }} @@ -152,6 +155,7 @@ jobs: run-build-test: name: Build test ${{ matrix.python-version }} on ${{ matrix.os }} runs-on: ${{ matrix.os }} + timeout-minutes: 60 strategy: matrix: python-version: ${{ fromJSON(inputs.python-versions) }} From e3b707a0c242fb7268d56d2b485990f120fe0462 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Tue, 4 Nov 2025 12:20:17 +0100 Subject: [PATCH 045/129] refactor: Change variable names, add setting of current tenant to be optional for tenant creation --- .../users/tenants/methods/add_user_to_tenant.py | 8 ++++---- .../modules/users/tenants/methods/create_tenant.py | 13 +++++++++---- 
.../modules/users/tenants/methods/select_tenant.py | 2 +- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/cognee/modules/users/tenants/methods/add_user_to_tenant.py b/cognee/modules/users/tenants/methods/add_user_to_tenant.py index dabab6b6b..edadfe66b 100644 --- a/cognee/modules/users/tenants/methods/add_user_to_tenant.py +++ b/cognee/modules/users/tenants/methods/add_user_to_tenant.py @@ -16,18 +16,18 @@ from cognee.modules.users.exceptions import ( async def add_user_to_tenant( - user_id: UUID, tenant_id: UUID, owner_id: UUID, set_active_tenant: Optional[bool] = True + user_id: UUID, tenant_id: UUID, owner_id: UUID, set_as_active_tenant: Optional[bool] = True ): """ Add a user with the given id to the tenant with the given id. This can only be successful if the request owner with the given id is the tenant owner. - If set_active_tenant is true it will automatically set the users active tenant to provided tenant. + If set_as_active_tenant is true it will automatically set the users active tenant to provided tenant. Args: user_id: Id of the user. tenant_id: Id of the tenant. owner_id: Id of the request owner. - set_active_tenant: If set_active_tenant is true it will automatically set the users active tenant to provided tenant. + set_as_active_tenant: If set_as_active_tenant is true it will automatically set the users active tenant to provided tenant. Returns: None @@ -48,7 +48,7 @@ async def add_user_to_tenant( message="Only tenant owner can add other users to organization." 
) - if set_active_tenant: + if set_as_active_tenant: user.tenant_id = tenant_id await session.merge(user) await session.commit() diff --git a/cognee/modules/users/tenants/methods/create_tenant.py b/cognee/modules/users/tenants/methods/create_tenant.py index 60e10db5c..32baa05fd 100644 --- a/cognee/modules/users/tenants/methods/create_tenant.py +++ b/cognee/modules/users/tenants/methods/create_tenant.py @@ -1,6 +1,7 @@ from uuid import UUID from sqlalchemy import insert from sqlalchemy.exc import IntegrityError +from typing import Optional from cognee.modules.users.models.UserTenant import UserTenant from cognee.infrastructure.databases.exceptions import EntityAlreadyExistsError @@ -9,13 +10,16 @@ from cognee.modules.users.models import Tenant from cognee.modules.users.methods import get_user -async def create_tenant(tenant_name: str, user_id: UUID) -> UUID: +async def create_tenant( + tenant_name: str, user_id: UUID, set_as_active_tenant: Optional[bool] = True +) -> UUID: """ Create a new tenant with the given name, for the user with the given id. This user is the owner of the tenant. Args: tenant_name: Name of the new tenant. user_id: Id of the user. + set_as_active_tenant: If true, set the newly created tenant as the active tenant for the user. 
Returns: None @@ -29,9 +33,10 @@ async def create_tenant(tenant_name: str, user_id: UUID) -> UUID: session.add(tenant) await session.flush() - user.tenant_id = tenant.id - await session.merge(user) - await session.commit() + if set_as_active_tenant: + user.tenant_id = tenant.id + await session.merge(user) + await session.commit() try: # Add association directly to the association table diff --git a/cognee/modules/users/tenants/methods/select_tenant.py b/cognee/modules/users/tenants/methods/select_tenant.py index 732b24858..6e72fea2f 100644 --- a/cognee/modules/users/tenants/methods/select_tenant.py +++ b/cognee/modules/users/tenants/methods/select_tenant.py @@ -52,7 +52,7 @@ async def select_tenant(user_id: UUID, tenant_id: Union[UUID, None]): try: result = result.scalar_one() except sqlalchemy.exc.NoResultFound as e: - raise TenantNotFoundError("User Tenant relationship not found.") from e + raise TenantNotFoundError("User is not part of the tenant.") from e if result: # If user is part of tenant update current tenant of user From b0f85c9e990f8dd20e6fce8dcd6f29c4050050e8 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Tue, 4 Nov 2025 13:01:10 +0100 Subject: [PATCH 046/129] feat: add legacy and modern data_id calculating --- cognee/modules/data/methods/__init__.py | 1 + .../data/methods/get_unique_data_id.py | 71 +++++++++++++++++++ cognee/modules/ingestion/identify.py | 10 +-- 3 files changed, 78 insertions(+), 4 deletions(-) create mode 100644 cognee/modules/data/methods/get_unique_data_id.py diff --git a/cognee/modules/data/methods/__init__.py b/cognee/modules/data/methods/__init__.py index 83913085c..7936a9afd 100644 --- a/cognee/modules/data/methods/__init__.py +++ b/cognee/modules/data/methods/__init__.py @@ -10,6 +10,7 @@ from .get_authorized_dataset import get_authorized_dataset from .get_authorized_dataset_by_name import get_authorized_dataset_by_name from .get_data import get_data from .get_unique_dataset_id import get_unique_dataset_id +from 
.get_unique_data_id import get_unique_data_id from .get_authorized_existing_datasets import get_authorized_existing_datasets from .get_dataset_ids import get_dataset_ids diff --git a/cognee/modules/data/methods/get_unique_data_id.py b/cognee/modules/data/methods/get_unique_data_id.py new file mode 100644 index 000000000..3fc184ce4 --- /dev/null +++ b/cognee/modules/data/methods/get_unique_data_id.py @@ -0,0 +1,71 @@ +from uuid import uuid5, NAMESPACE_OID, UUID +from typing import Optional +from sqlalchemy import select + +from cognee.modules.data.models.Data import Data +from cognee.infrastructure.databases.relational import get_relational_engine +from cognee.modules.users.models import User + + +async def get_unique_data_id(data_identifier: str, user: User, tenant_id: Optional[UUID]) -> UUID: + """ + Function returns a unique UUID for data based on data identifier, user id and tenant id. + If data with legacy ID exists, return that ID to maintain compatibility. + + Args: + data_identifier: A way to uniquely identify data (e.g. file hash, data name, etc.) + user: User object adding the data + tenant_id: UUID of the tenant for which data is being added + + Returns: + UUID: Unique identifier for the data + """ + + def _get_deprecated_unique_data_id(data_identifier: str, user: User) -> UUID: + """ + Deprecated function, returns a unique UUID for data based on data identifier and user id. + Needed to support legacy data without tenant information. + Args: + data_identifier: A way to uniquely identify data (e.g. file hash, data name, etc.) + user: User object adding the data + + Returns: + UUID: Unique identifier for the data + """ + # return UUID hash of file contents + owner id + tenant_id + return uuid5(NAMESPACE_OID, f"{data_identifier}{str(user.id)}") + + def _get_modern_unique_data_id(data_identifier: str, user: User, tenant_id: UUID) -> UUID: + """ + Function returns a unique UUID for data based on data identifier, user id and tenant id. 
+ Args: + data_identifier: A way to uniquely identify data (e.g. file hash, data name, etc.) + user: User object adding the data + tenant_id: UUID of the tenant for which data is being added + + Returns: + UUID: Unique identifier for the data + """ + # return UUID hash of file contents + owner id + tenant_id + return uuid5(NAMESPACE_OID, f"{data_identifier}{str(user.id)}{str(tenant_id)}") + + # Get all possible data_id values + data_id = { + "modern_data_id": _get_modern_unique_data_id( + data_identifier=data_identifier, user=user, tenant_id=tenant_id + ), + "legacy_data_id": _get_deprecated_unique_data_id( + data_identifier=data_identifier, user=user + ), + } + + # Check if data item with legacy_data_id exists, if so use that one, else use modern_data_id + db_engine = get_relational_engine() + async with db_engine.get_async_session() as session: + legacy_data_point = ( + await session.execute(select(Data).filter(Data.id == data_id["legacy_data_id"])) + ).scalar_one_or_none() + + if not legacy_data_point: + return data_id["modern_data_id"] + return data_id["legacy_data_id"] diff --git a/cognee/modules/ingestion/identify.py b/cognee/modules/ingestion/identify.py index 977ff3f0b..5a0fe379e 100644 --- a/cognee/modules/ingestion/identify.py +++ b/cognee/modules/ingestion/identify.py @@ -1,11 +1,13 @@ -from uuid import uuid5, NAMESPACE_OID +from uuid import UUID from .data_types import IngestionData from cognee.modules.users.models import User +from cognee.modules.data.methods import get_unique_data_id -def identify(data: IngestionData, user: User) -> str: +async def identify(data: IngestionData, user: User) -> UUID: data_content_hash: str = data.get_identifier() - # return UUID hash of file contents + owner id - return uuid5(NAMESPACE_OID, f"{data_content_hash}{user.id}") + return await get_unique_data_id( + data_identifier=data_content_hash, user=user, tenant_id=user.tenant_id + ) From ff388179fb38cd82a59b3a45ea3f343b16c56c86 Mon Sep 17 00:00:00 2001 From: Igor Ilic 
Date: Tue, 4 Nov 2025 13:11:57 +0100 Subject: [PATCH 047/129] feat: Add dataset_id calculation that handles legacy dataset_id --- .../data/methods/get_unique_data_id.py | 11 ++- .../data/methods/get_unique_dataset_id.py | 70 +++++++++++++++++-- cognee/modules/ingestion/identify.py | 4 +- 3 files changed, 71 insertions(+), 14 deletions(-) diff --git a/cognee/modules/data/methods/get_unique_data_id.py b/cognee/modules/data/methods/get_unique_data_id.py index 3fc184ce4..877b5930c 100644 --- a/cognee/modules/data/methods/get_unique_data_id.py +++ b/cognee/modules/data/methods/get_unique_data_id.py @@ -1,5 +1,4 @@ from uuid import uuid5, NAMESPACE_OID, UUID -from typing import Optional from sqlalchemy import select from cognee.modules.data.models.Data import Data @@ -7,7 +6,7 @@ from cognee.infrastructure.databases.relational import get_relational_engine from cognee.modules.users.models import User -async def get_unique_data_id(data_identifier: str, user: User, tenant_id: Optional[UUID]) -> UUID: +async def get_unique_data_id(data_identifier: str, user: User) -> UUID: """ Function returns a unique UUID for data based on data identifier, user id and tenant id. If data with legacy ID exists, return that ID to maintain compatibility. @@ -35,7 +34,7 @@ async def get_unique_data_id(data_identifier: str, user: User, tenant_id: Option # return UUID hash of file contents + owner id + tenant_id return uuid5(NAMESPACE_OID, f"{data_identifier}{str(user.id)}") - def _get_modern_unique_data_id(data_identifier: str, user: User, tenant_id: UUID) -> UUID: + def _get_modern_unique_data_id(data_identifier: str, user: User) -> UUID: """ Function returns a unique UUID for data based on data identifier, user id and tenant id. 
Args: @@ -47,13 +46,11 @@ async def get_unique_data_id(data_identifier: str, user: User, tenant_id: Option UUID: Unique identifier for the data """ # return UUID hash of file contents + owner id + tenant_id - return uuid5(NAMESPACE_OID, f"{data_identifier}{str(user.id)}{str(tenant_id)}") + return uuid5(NAMESPACE_OID, f"{data_identifier}{str(user.id)}{str(user.tenant_id)}") # Get all possible data_id values data_id = { - "modern_data_id": _get_modern_unique_data_id( - data_identifier=data_identifier, user=user, tenant_id=tenant_id - ), + "modern_data_id": _get_modern_unique_data_id(data_identifier=data_identifier, user=user), "legacy_data_id": _get_deprecated_unique_data_id( data_identifier=data_identifier, user=user ), diff --git a/cognee/modules/data/methods/get_unique_dataset_id.py b/cognee/modules/data/methods/get_unique_dataset_id.py index 2caf5fb55..274f24d1a 100644 --- a/cognee/modules/data/methods/get_unique_dataset_id.py +++ b/cognee/modules/data/methods/get_unique_dataset_id.py @@ -1,9 +1,71 @@ from uuid import UUID, uuid5, NAMESPACE_OID -from cognee.modules.users.models import User from typing import Union +from sqlalchemy import select + +from cognee.modules.data.models.Dataset import Dataset +from cognee.modules.users.models import User +from cognee.infrastructure.databases.relational import get_relational_engine async def get_unique_dataset_id(dataset_name: Union[str, UUID], user: User) -> UUID: - if isinstance(dataset_name, UUID): - return dataset_name - return uuid5(NAMESPACE_OID, f"{dataset_name}{str(user.id)}") + """ + Function returns a unique UUID for dataset based on dataset name, user id and tenant id. + If dataset with legacy ID exists, return that ID to maintain compatibility. 
+ + Args: + dataset_name: string representing the dataset name + user: User object adding the dataset + tenant_id: UUID of the tenant for which dataset is being added + + Returns: + UUID: Unique identifier for the dataset + """ + + def _get_legacy_unique_dataset_id(dataset_name: Union[str, UUID], user: User) -> UUID: + """ + Legacy function, returns a unique UUID for dataset based on dataset name and user id. + Needed to support legacy datasets without tenant information. + Args: + dataset_name: string representing the dataset name + user: Current User object adding the dataset + + Returns: + UUID: Unique identifier for the dataset + """ + if isinstance(dataset_name, UUID): + return dataset_name + return uuid5(NAMESPACE_OID, f"{dataset_name}{str(user.id)}") + + def _get_modern_unique_dataset_id(dataset_name: Union[str, UUID], user: User) -> UUID: + """ + Returns a unique UUID for dataset based on dataset name, user id and tenant_id. + Args: + dataset_name: string representing the dataset name + user: Current User object adding the dataset + tenant_id: UUID of the tenant for which dataset is being added + + Returns: + UUID: Unique identifier for the dataset + """ + if isinstance(dataset_name, UUID): + return dataset_name + return uuid5(NAMESPACE_OID, f"{dataset_name}{str(user.id)}{str(user.tenant_id)}") + + # Get all possible dataset_id values + dataset_id = { + "modern_dataset_id": _get_modern_unique_dataset_id(dataset_name=dataset_name, user=user), + "legacy_dataset_id": _get_legacy_unique_dataset_id(dataset_name=dataset_name, user=user), + } + + # Check if dataset with legacy_dataset_id exists, if so use that one, else use modern_dataset_id + db_engine = get_relational_engine() + async with db_engine.get_async_session() as session: + legacy_dataset = ( + await session.execute( + select(Dataset).filter(Dataset.id == dataset_id["legacy_data_id"]) + ) + ).scalar_one_or_none() + + if not legacy_dataset: + return dataset_id["modern_dataset_id"] + return 
dataset_id["legacy_dataset_id"] diff --git a/cognee/modules/ingestion/identify.py b/cognee/modules/ingestion/identify.py index 5a0fe379e..640fce4a2 100644 --- a/cognee/modules/ingestion/identify.py +++ b/cognee/modules/ingestion/identify.py @@ -8,6 +8,4 @@ from cognee.modules.data.methods import get_unique_data_id async def identify(data: IngestionData, user: User) -> UUID: data_content_hash: str = data.get_identifier() - return await get_unique_data_id( - data_identifier=data_content_hash, user=user, tenant_id=user.tenant_id - ) + return await get_unique_data_id(data_identifier=data_content_hash, user=user) From ac257dca1db4123cf97abacf32e1ecd85dab9afd Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Tue, 4 Nov 2025 13:13:42 +0100 Subject: [PATCH 048/129] refactor: Account for async change for identify function --- cognee/modules/pipelines/operations/run_tasks_data_item.py | 2 +- cognee/tasks/ingestion/ingest_data.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cognee/modules/pipelines/operations/run_tasks_data_item.py b/cognee/modules/pipelines/operations/run_tasks_data_item.py index 152e72d7f..2cc449df6 100644 --- a/cognee/modules/pipelines/operations/run_tasks_data_item.py +++ b/cognee/modules/pipelines/operations/run_tasks_data_item.py @@ -69,7 +69,7 @@ async def run_tasks_data_item_incremental( async with open_data_file(file_path) as file: classified_data = ingestion.classify(file) # data_id is the hash of file contents + owner id to avoid duplicate data - data_id = ingestion.identify(classified_data, user) + data_id = await ingestion.identify(classified_data, user) else: # If data was already processed by Cognee get data id data_id = data_item.id diff --git a/cognee/tasks/ingestion/ingest_data.py b/cognee/tasks/ingestion/ingest_data.py index 0572d0f1e..5987f38d5 100644 --- a/cognee/tasks/ingestion/ingest_data.py +++ b/cognee/tasks/ingestion/ingest_data.py @@ -99,7 +99,7 @@ async def ingest_data( # data_id is the hash of original file 
contents + owner id to avoid duplicate data - data_id = ingestion.identify(classified_data, user) + data_id = await ingestion.identify(classified_data, user) original_file_metadata = classified_data.get_metadata() # Find metadata from Cognee data storage text file From ea675f29d65dcf354d8999106ff3b8db3a8149f2 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Tue, 4 Nov 2025 13:15:49 +0100 Subject: [PATCH 049/129] fix: Resolve typo in accessing dictionary for dataset_id --- cognee/modules/data/methods/get_unique_dataset_id.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cognee/modules/data/methods/get_unique_dataset_id.py b/cognee/modules/data/methods/get_unique_dataset_id.py index 274f24d1a..2b765ec78 100644 --- a/cognee/modules/data/methods/get_unique_dataset_id.py +++ b/cognee/modules/data/methods/get_unique_dataset_id.py @@ -62,7 +62,7 @@ async def get_unique_dataset_id(dataset_name: Union[str, UUID], user: User) -> U async with db_engine.get_async_session() as session: legacy_dataset = ( await session.execute( - select(Dataset).filter(Dataset.id == dataset_id["legacy_data_id"]) + select(Dataset).filter(Dataset.id == dataset_id["legacy_dataset_id"]) ) ).scalar_one_or_none() From 9d771acc2427592f40caf0e9727c8e8151c5af64 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Tue, 4 Nov 2025 13:35:50 +0100 Subject: [PATCH 050/129] refactor: filter out search results --- .../methods/get_all_user_permission_datasets.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py b/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py index e5dbb0e4b..a8cb96fbb 100644 --- a/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py +++ b/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py @@ -26,13 +26,16 @@ async def get_all_user_permission_datasets(user: User, permission_type: str) -> tenants = 
await user.awaitable_attrs.tenants for tenant in tenants: - # Get all datasets all tenant members have access to - datasets.extend(await get_principal_datasets(tenant, permission_type)) + # If tenant is the user's selected tenant add datasets that users roles in the tenant and the tenant itself + # have access for + if tenant.id == user.tenant_id: + # Get all datasets all tenant members have access to + datasets.extend(await get_principal_datasets(tenant, permission_type)) - # Get all datasets accessible by roles user is a part of - roles = await user.awaitable_attrs.roles - for role in roles: - datasets.extend(await get_principal_datasets(role, permission_type)) + # Get all datasets accessible by roles user is a part of + roles = await user.awaitable_attrs.roles + for role in roles: + datasets.extend(await get_principal_datasets(role, permission_type)) # Deduplicate datasets with same ID unique = {} From 7e3c24100b0606ecf7d9b71ab76bba9be9c64e68 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Tue, 4 Nov 2025 15:09:33 +0100 Subject: [PATCH 051/129] refactor: add structured output to completion retrievers --- .../retrieval/EntityCompletionRetriever.py | 11 +++- .../modules/retrieval/completion_retriever.py | 13 +++- ..._completion_context_extension_retriever.py | 4 ++ .../graph_completion_cot_retriever.py | 65 ++++--------------- .../retrieval/graph_completion_retriever.py | 3 + .../modules/retrieval/temporal_retriever.py | 9 ++- cognee/modules/retrieval/utils/completion.py | 26 +------- .../feedback/generate_improved_answers.py | 6 +- .../graph_completion_retriever_cot_test.py | 20 ++++-- 9 files changed, 67 insertions(+), 90 deletions(-) diff --git a/cognee/modules/retrieval/EntityCompletionRetriever.py b/cognee/modules/retrieval/EntityCompletionRetriever.py index 6086977ce..1f1ddad0a 100644 --- a/cognee/modules/retrieval/EntityCompletionRetriever.py +++ b/cognee/modules/retrieval/EntityCompletionRetriever.py @@ -1,5 +1,5 @@ import asyncio -from typing import 
Any, Optional, List +from typing import Any, Optional, List, Type from cognee.shared.logging_utils import get_logger from cognee.infrastructure.entities.BaseEntityExtractor import BaseEntityExtractor @@ -85,7 +85,11 @@ class EntityCompletionRetriever(BaseRetriever): return None async def get_completion( - self, query: str, context: Optional[Any] = None, session_id: Optional[str] = None + self, + query: str, + context: Optional[Any] = None, + session_id: Optional[str] = None, + response_model: Type = str, ) -> List[str]: """ Generate completion using provided context or fetch new context. @@ -102,6 +106,7 @@ class EntityCompletionRetriever(BaseRetriever): fetched if not provided. (default None) - session_id (Optional[str]): Optional session identifier for caching. If None, defaults to 'default_session'. (default None) + - response_model (Type): The Pydantic model type for structured output. (default str) Returns: -------- @@ -133,6 +138,7 @@ class EntityCompletionRetriever(BaseRetriever): user_prompt_path=self.user_prompt_path, system_prompt_path=self.system_prompt_path, conversation_history=conversation_history, + response_model=response_model, ), ) else: @@ -141,6 +147,7 @@ class EntityCompletionRetriever(BaseRetriever): context=context, user_prompt_path=self.user_prompt_path, system_prompt_path=self.system_prompt_path, + response_model=response_model, ) if session_save: diff --git a/cognee/modules/retrieval/completion_retriever.py b/cognee/modules/retrieval/completion_retriever.py index bb568924d..f071e41de 100644 --- a/cognee/modules/retrieval/completion_retriever.py +++ b/cognee/modules/retrieval/completion_retriever.py @@ -1,5 +1,5 @@ import asyncio -from typing import Any, Optional +from typing import Any, Optional, Type from cognee.shared.logging_utils import get_logger from cognee.infrastructure.databases.vector import get_vector_engine @@ -75,7 +75,11 @@ class CompletionRetriever(BaseRetriever): raise NoDataError("No data found in the system, please add 
data first.") from error async def get_completion( - self, query: str, context: Optional[Any] = None, session_id: Optional[str] = None + self, + query: str, + context: Optional[Any] = None, + session_id: Optional[str] = None, + response_model: Type = str, ) -> str: """ Generates an LLM completion using the context. @@ -91,6 +95,7 @@ class CompletionRetriever(BaseRetriever): completion; if None, it retrieves the context for the query. (default None) - session_id (Optional[str]): Optional session identifier for caching. If None, defaults to 'default_session'. (default None) + - response_model (Type): The Pydantic model type for structured output. (default str) Returns: -------- @@ -118,6 +123,7 @@ class CompletionRetriever(BaseRetriever): system_prompt_path=self.system_prompt_path, system_prompt=self.system_prompt, conversation_history=conversation_history, + response_model=response_model, ), ) else: @@ -127,6 +133,7 @@ class CompletionRetriever(BaseRetriever): user_prompt_path=self.user_prompt_path, system_prompt_path=self.system_prompt_path, system_prompt=self.system_prompt, + response_model=response_model, ) if session_save: @@ -137,4 +144,4 @@ class CompletionRetriever(BaseRetriever): session_id=session_id, ) - return completion + return [completion] diff --git a/cognee/modules/retrieval/graph_completion_context_extension_retriever.py b/cognee/modules/retrieval/graph_completion_context_extension_retriever.py index 58b6b586f..6b2c6a9e6 100644 --- a/cognee/modules/retrieval/graph_completion_context_extension_retriever.py +++ b/cognee/modules/retrieval/graph_completion_context_extension_retriever.py @@ -56,6 +56,7 @@ class GraphCompletionContextExtensionRetriever(GraphCompletionRetriever): context: Optional[List[Edge]] = None, session_id: Optional[str] = None, context_extension_rounds=4, + response_model: Type = str, ) -> List[str]: """ Extends the context for a given query by retrieving related triplets and generating new @@ -76,6 +77,7 @@ class 
GraphCompletionContextExtensionRetriever(GraphCompletionRetriever): defaults to 'default_session'. (default None) - context_extension_rounds: The maximum number of rounds to extend the context with new triplets before halting. (default 4) + - response_model (Type): The Pydantic model type for structured output. (default str) Returns: -------- @@ -143,6 +145,7 @@ class GraphCompletionContextExtensionRetriever(GraphCompletionRetriever): system_prompt_path=self.system_prompt_path, system_prompt=self.system_prompt, conversation_history=conversation_history, + response_model=response_model, ), ) else: @@ -152,6 +155,7 @@ class GraphCompletionContextExtensionRetriever(GraphCompletionRetriever): user_prompt_path=self.user_prompt_path, system_prompt_path=self.system_prompt_path, system_prompt=self.system_prompt, + response_model=response_model, ) if self.save_interaction and context_text and triplets and completion: diff --git a/cognee/modules/retrieval/graph_completion_cot_retriever.py b/cognee/modules/retrieval/graph_completion_cot_retriever.py index 299db6855..39255fe68 100644 --- a/cognee/modules/retrieval/graph_completion_cot_retriever.py +++ b/cognee/modules/retrieval/graph_completion_cot_retriever.py @@ -7,7 +7,7 @@ from cognee.shared.logging_utils import get_logger from cognee.modules.retrieval.graph_completion_retriever import GraphCompletionRetriever from cognee.modules.retrieval.utils.completion import ( - generate_structured_completion, + generate_completion, summarize_text, ) from cognee.modules.retrieval.utils.session_cache import ( @@ -44,7 +44,6 @@ class GraphCompletionCotRetriever(GraphCompletionRetriever): questions based on reasoning. 
The public methods are: - get_completion - - get_structured_completion Instance variables include: - validation_system_prompt_path @@ -121,7 +120,7 @@ class GraphCompletionCotRetriever(GraphCompletionRetriever): triplets += await self.get_context(followup_question) context_text = await self.resolve_edges_to_text(list(set(triplets))) - completion = await generate_structured_completion( + completion = await generate_completion( query=query, context=context_text, user_prompt_path=self.user_prompt_path, @@ -165,24 +164,28 @@ class GraphCompletionCotRetriever(GraphCompletionRetriever): return completion, context_text, triplets - async def get_structured_completion( + async def get_completion( self, query: str, context: Optional[List[Edge]] = None, session_id: Optional[str] = None, - max_iter: int = 4, + max_iter=4, response_model: Type = str, - ) -> Any: + ) -> List[str]: """ - Generate structured completion responses based on a user query and contextual information. + Generate completion responses based on a user query and contextual information. - This method applies the same chain-of-thought logic as get_completion but returns + This method interacts with a language model client to retrieve a structured response, + using a series of iterations to refine the answers and generate follow-up questions + based on reasoning derived from previous outputs. It raises exceptions if the context + retrieval fails or if the model encounters issues in generating outputs. It returns structured output using the provided response model. Parameters: ----------- + - query (str): The user's query to be processed and answered. - - context (Optional[List[Edge]]): Optional context that may assist in answering the query. + - context (Optional[Any]): Optional context that may assist in answering the query. If not provided, it will be fetched based on the query. (default None) - session_id (Optional[str]): Optional session identifier for caching. If None, defaults to 'default_session'. 
(default None) @@ -192,7 +195,8 @@ class GraphCompletionCotRetriever(GraphCompletionRetriever): Returns: -------- - - Any: The generated structured completion based on the response model. + + - List[str]: A list containing the generated answer to the user's query. """ # Check if session saving is enabled cache_config = CacheConfig() @@ -228,45 +232,4 @@ class GraphCompletionCotRetriever(GraphCompletionRetriever): session_id=session_id, ) - return completion - - async def get_completion( - self, - query: str, - context: Optional[List[Edge]] = None, - session_id: Optional[str] = None, - max_iter=4, - ) -> List[str]: - """ - Generate completion responses based on a user query and contextual information. - - This method interacts with a language model client to retrieve a structured response, - using a series of iterations to refine the answers and generate follow-up questions - based on reasoning derived from previous outputs. It raises exceptions if the context - retrieval fails or if the model encounters issues in generating outputs. - - Parameters: - ----------- - - - query (str): The user's query to be processed and answered. - - context (Optional[Any]): Optional context that may assist in answering the query. - If not provided, it will be fetched based on the query. (default None) - - session_id (Optional[str]): Optional session identifier for caching. If None, - defaults to 'default_session'. (default None) - - max_iter: The maximum number of iterations to refine the answer and generate - follow-up questions. (default 4) - - Returns: - -------- - - - List[str]: A list containing the generated answer to the user's query. 
- """ - completion = await self.get_structured_completion( - query=query, - context=context, - session_id=session_id, - max_iter=max_iter, - response_model=str, - ) - return [completion] diff --git a/cognee/modules/retrieval/graph_completion_retriever.py b/cognee/modules/retrieval/graph_completion_retriever.py index b7ab4edae..b544e8ead 100644 --- a/cognee/modules/retrieval/graph_completion_retriever.py +++ b/cognee/modules/retrieval/graph_completion_retriever.py @@ -146,6 +146,7 @@ class GraphCompletionRetriever(BaseGraphRetriever): query: str, context: Optional[List[Edge]] = None, session_id: Optional[str] = None, + response_model: Type = str, ) -> List[str]: """ Generates a completion using graph connections context based on a query. @@ -188,6 +189,7 @@ class GraphCompletionRetriever(BaseGraphRetriever): system_prompt_path=self.system_prompt_path, system_prompt=self.system_prompt, conversation_history=conversation_history, + response_model=response_model, ), ) else: @@ -197,6 +199,7 @@ class GraphCompletionRetriever(BaseGraphRetriever): user_prompt_path=self.user_prompt_path, system_prompt_path=self.system_prompt_path, system_prompt=self.system_prompt, + response_model=response_model, ) if self.save_interaction and context and triplets and completion: diff --git a/cognee/modules/retrieval/temporal_retriever.py b/cognee/modules/retrieval/temporal_retriever.py index ec68d37bb..38d69ec80 100644 --- a/cognee/modules/retrieval/temporal_retriever.py +++ b/cognee/modules/retrieval/temporal_retriever.py @@ -146,7 +146,11 @@ class TemporalRetriever(GraphCompletionRetriever): return self.descriptions_to_string(top_k_events) async def get_completion( - self, query: str, context: Optional[str] = None, session_id: Optional[str] = None + self, + query: str, + context: Optional[str] = None, + session_id: Optional[str] = None, + response_model: Type = str, ) -> List[str]: """ Generates a response using the query and optional context. 
@@ -159,6 +163,7 @@ class TemporalRetriever(GraphCompletionRetriever): retrieved based on the query. (default None) - session_id (Optional[str]): Optional session identifier for caching. If None, defaults to 'default_session'. (default None) + - response_model (Type): The Pydantic model type for structured output. (default str) Returns: -------- @@ -186,6 +191,7 @@ class TemporalRetriever(GraphCompletionRetriever): user_prompt_path=self.user_prompt_path, system_prompt_path=self.system_prompt_path, conversation_history=conversation_history, + response_model=response_model, ), ) else: @@ -194,6 +200,7 @@ class TemporalRetriever(GraphCompletionRetriever): context=context, user_prompt_path=self.user_prompt_path, system_prompt_path=self.system_prompt_path, + response_model=response_model, ) if session_save: diff --git a/cognee/modules/retrieval/utils/completion.py b/cognee/modules/retrieval/utils/completion.py index db7a10252..b77d7ef90 100644 --- a/cognee/modules/retrieval/utils/completion.py +++ b/cognee/modules/retrieval/utils/completion.py @@ -3,7 +3,7 @@ from cognee.infrastructure.llm.LLMGateway import LLMGateway from cognee.infrastructure.llm.prompts import render_prompt, read_query_prompt -async def generate_structured_completion( +async def generate_completion( query: str, context: str, user_prompt_path: str, @@ -11,8 +11,8 @@ async def generate_structured_completion( system_prompt: Optional[str] = None, conversation_history: Optional[str] = None, response_model: Type = str, -) -> Any: - """Generates a structured completion using LLM with given context and prompts.""" +) -> str: + """Generates a completion using LLM with given context and prompts.""" args = {"question": query, "context": context} user_prompt = render_prompt(user_prompt_path, args) system_prompt = system_prompt if system_prompt else read_query_prompt(system_prompt_path) @@ -28,26 +28,6 @@ async def generate_structured_completion( ) -async def generate_completion( - query: str, - context: str, - 
user_prompt_path: str, - system_prompt_path: str, - system_prompt: Optional[str] = None, - conversation_history: Optional[str] = None, -) -> str: - """Generates a completion using LLM with given context and prompts.""" - return await generate_structured_completion( - query=query, - context=context, - user_prompt_path=user_prompt_path, - system_prompt_path=system_prompt_path, - system_prompt=system_prompt, - conversation_history=conversation_history, - response_model=str, - ) - - async def summarize_text( text: str, system_prompt_path: str = "summarize_search_results.txt", diff --git a/cognee/tasks/feedback/generate_improved_answers.py b/cognee/tasks/feedback/generate_improved_answers.py index e439cf9e5..d2b143d29 100644 --- a/cognee/tasks/feedback/generate_improved_answers.py +++ b/cognee/tasks/feedback/generate_improved_answers.py @@ -61,7 +61,7 @@ async def _generate_improved_answer_for_single_interaction( ) retrieved_context = await retriever.get_context(query_text) - completion = await retriever.get_structured_completion( + completion = await retriever.get_completion( query=query_text, context=retrieved_context, response_model=ImprovedAnswerResponse, @@ -70,9 +70,9 @@ async def _generate_improved_answer_for_single_interaction( new_context_text = await retriever.resolve_edges_to_text(retrieved_context) if completion: - enrichment.improved_answer = completion.answer + enrichment.improved_answer = completion[0].answer enrichment.new_context = new_context_text - enrichment.explanation = completion.explanation + enrichment.explanation = completion[0].explanation return enrichment else: logger.warning( diff --git a/cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py b/cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py index 7fcfe0d6b..bf10dc023 100644 --- a/cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py +++ b/cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py @@ -206,16 
+206,22 @@ class TestGraphCompletionCoTRetriever: retriever = GraphCompletionCotRetriever() # Test with string response model (default) - string_answer = await retriever.get_structured_completion("Who works at Figma?") - assert isinstance(string_answer, str), f"Expected str, got {type(string_answer).__name__}" - assert string_answer.strip(), "Answer should not be empty" + string_answer = await retriever.get_completion("Who works at Figma?") + assert isinstance(string_answer, list), f"Expected str, got {type(string_answer).__name__}" + assert all(isinstance(item, str) and item.strip() for item in string_answer), ( + "Answer should not be empty" + ) # Test with structured response model - structured_answer = await retriever.get_structured_completion( + structured_answer = await retriever.get_completion( "Who works at Figma?", response_model=TestAnswer ) - assert isinstance(structured_answer, TestAnswer), ( + assert isinstance(structured_answer, list), ( + f"Expected list, got {type(structured_answer).__name__}" + ) + assert all(isinstance(item, TestAnswer) for item in string_answer), ( f"Expected TestAnswer, got {type(structured_answer).__name__}" ) - assert structured_answer.answer.strip(), "Answer field should not be empty" - assert structured_answer.explanation.strip(), "Explanation field should not be empty" + + assert structured_answer[0].answer.strip(), "Answer field should not be empty" + assert structured_answer[0].explanation.strip(), "Explanation field should not be empty" From 33b05163811c7f7f74db39abfea711103b823132 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Tue, 4 Nov 2025 15:27:03 +0100 Subject: [PATCH 052/129] test: fix completion tests --- ...letion_retriever_context_extension_test.py | 59 +++++++++++++++++++ .../graph_completion_retriever_cot_test.py | 2 +- 2 files changed, 60 insertions(+), 1 deletion(-) diff --git a/cognee/tests/unit/modules/retrieval/graph_completion_retriever_context_extension_test.py 
b/cognee/tests/unit/modules/retrieval/graph_completion_retriever_context_extension_test.py index 0e21fe351..5335a3ca7 100644 --- a/cognee/tests/unit/modules/retrieval/graph_completion_retriever_context_extension_test.py +++ b/cognee/tests/unit/modules/retrieval/graph_completion_retriever_context_extension_test.py @@ -2,6 +2,7 @@ import os import pytest import pathlib from typing import Optional, Union +from pydantic import BaseModel import cognee from cognee.low_level import setup, DataPoint @@ -12,6 +13,11 @@ from cognee.modules.retrieval.graph_completion_context_extension_retriever impor ) +class TestAnswer(BaseModel): + answer: str + explanation: str + + class TestGraphCompletionWithContextExtensionRetriever: @pytest.mark.asyncio async def test_graph_completion_extension_context_simple(self): @@ -175,3 +181,56 @@ class TestGraphCompletionWithContextExtensionRetriever: assert all(isinstance(item, str) and item.strip() for item in answer), ( "Answer must contain only non-empty strings" ) + + @pytest.mark.asyncio + async def test_get_structured_completion_extension_context(self): + system_directory_path = os.path.join( + pathlib.Path(__file__).parent, + ".cognee_system/test_get_structured_completion_extension_context", + ) + cognee.config.system_root_directory(system_directory_path) + data_directory_path = os.path.join( + pathlib.Path(__file__).parent, + ".data_storage/test_get_structured_completion_extension_context", + ) + cognee.config.data_root_directory(data_directory_path) + + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + await setup() + + class Company(DataPoint): + name: str + + class Person(DataPoint): + name: str + works_for: Company + + company1 = Company(name="Figma") + person1 = Person(name="Steve Rodger", works_for=company1) + + entities = [company1, person1] + await add_data_points(entities) + + retriever = GraphCompletionContextExtensionRetriever() + + # Test with string response model (default) + string_answer = 
await retriever.get_completion("Who works at Figma?") + assert isinstance(string_answer, list), f"Expected str, got {type(string_answer).__name__}" + assert all(isinstance(item, str) and item.strip() for item in string_answer), ( + "Answer should not be empty" + ) + + # Test with structured response model + structured_answer = await retriever.get_completion( + "Who works at Figma?", response_model=TestAnswer + ) + assert isinstance(structured_answer, list), ( + f"Expected list, got {type(structured_answer).__name__}" + ) + assert all(isinstance(item, TestAnswer) for item in structured_answer), ( + f"Expected TestAnswer, got {type(structured_answer).__name__}" + ) + + assert structured_answer[0].answer.strip(), "Answer field should not be empty" + assert structured_answer[0].explanation.strip(), "Explanation field should not be empty" diff --git a/cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py b/cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py index bf10dc023..731e9fccf 100644 --- a/cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py +++ b/cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py @@ -219,7 +219,7 @@ class TestGraphCompletionCoTRetriever: assert isinstance(structured_answer, list), ( f"Expected list, got {type(structured_answer).__name__}" ) - assert all(isinstance(item, TestAnswer) for item in string_answer), ( + assert all(isinstance(item, TestAnswer) for item in structured_answer), ( f"Expected TestAnswer, got {type(structured_answer).__name__}" ) From f4117c42e9c1bd0630a333bb789faba8686ba5b0 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Tue, 4 Nov 2025 16:43:41 +0100 Subject: [PATCH 053/129] fix: Resolve issue with entity extraction test --- cognee/tests/tasks/entity_extraction/entity_extraction_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cognee/tests/tasks/entity_extraction/entity_extraction_test.py 
b/cognee/tests/tasks/entity_extraction/entity_extraction_test.py index 39e883e09..41a9254ca 100644 --- a/cognee/tests/tasks/entity_extraction/entity_extraction_test.py +++ b/cognee/tests/tasks/entity_extraction/entity_extraction_test.py @@ -55,7 +55,7 @@ async def main(): classified_data = ingestion.classify(file) # data_id is the hash of original file contents + owner id to avoid duplicate data - data_id = ingestion.identify(classified_data, await get_default_user()) + data_id = await ingestion.identify(classified_data, await get_default_user()) await cognee.add(file_path) From cd32b492a469c9bfac14d4b3f20ed99a727a9460 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Tue, 4 Nov 2025 17:56:01 +0100 Subject: [PATCH 054/129] refactor: Add filtering of non current tenant results when authorizing dataset --- .../get_all_user_permission_datasets.py | 25 ++++++++++--------- .../users/roles/methods/add_user_to_role.py | 4 ++- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py b/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py index a8cb96fbb..ee1de3c72 100644 --- a/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py +++ b/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py @@ -24,18 +24,14 @@ async def get_all_user_permission_datasets(user: User, permission_type: str) -> # Get all tenants user is a part of tenants = await user.awaitable_attrs.tenants - for tenant in tenants: - # If tenant is the user's selected tenant add datasets that users roles in the tenant and the tenant itself - # have access for - if tenant.id == user.tenant_id: - # Get all datasets all tenant members have access to - datasets.extend(await get_principal_datasets(tenant, permission_type)) + # Get all datasets all tenant members have access to + datasets.extend(await get_principal_datasets(tenant, permission_type)) - # Get all datasets 
accessible by roles user is a part of - roles = await user.awaitable_attrs.roles - for role in roles: - datasets.extend(await get_principal_datasets(role, permission_type)) + # Get all datasets accessible by roles user is a part of + roles = await user.awaitable_attrs.roles + for role in roles: + datasets.extend(await get_principal_datasets(role, permission_type)) # Deduplicate datasets with same ID unique = {} @@ -43,5 +39,10 @@ async def get_all_user_permission_datasets(user: User, permission_type: str) -> # If the dataset id key already exists, leave the dictionary unchanged. unique.setdefault(dataset.id, dataset) - # TODO: Add filtering out of datasets that aren't currently selected tenant of user (currently selected tenant is the tenant_id value in the User model) - return list(unique.values()) + # Filter out dataset that aren't part of the current user's tenant + filtered_datasets = [] + for dataset in list(unique.values()): + if dataset.tenant_id == user.tenant_id: + filtered_datasets.append(dataset) + + return filtered_datasets diff --git a/cognee/modules/users/roles/methods/add_user_to_role.py b/cognee/modules/users/roles/methods/add_user_to_role.py index de5e47775..d764ac900 100644 --- a/cognee/modules/users/roles/methods/add_user_to_role.py +++ b/cognee/modules/users/roles/methods/add_user_to_role.py @@ -42,11 +42,13 @@ async def add_user_to_role(user_id: UUID, role_id: UUID, owner_id: UUID): .first() ) + user_tenants = await user.awaitable_attrs.tenants + if not user: raise UserNotFoundError elif not role: raise RoleNotFoundError - elif user.tenant_id != role.tenant_id: + elif role.tenant_id not in [tenant.id for tenant in user_tenants]: # TESTME raise TenantNotFoundError( message="User tenant does not match role tenant. User cannot be added to role." 
) From bee2fe3ba70307c4a5ea03a5cdc1e07e927708b4 Mon Sep 17 00:00:00 2001 From: Igor Ilic <30923996+dexters1@users.noreply.github.com> Date: Tue, 4 Nov 2025 17:58:34 +0100 Subject: [PATCH 055/129] feat: Add initial custom pipeline (#1716) ## Description Add run_custom_pipeline to have a way to execute a custom collection of tasks in Cognee ## Type of Change - [ ] Bug fix (non-breaking change that fixes an issue) - [ ] New feature (non-breaking change that adds functionality) - [ ] Breaking change (fix or feature that would cause existing functionality to change) - [ ] Documentation update - [ ] Code refactoring - [ ] Performance improvement - [ ] Other (please specify): ## Screenshots/Videos (if applicable) ## Pre-submission Checklist - [ ] **I have tested my changes thoroughly before submitting this PR** - [ ] **This PR contains minimal changes necessary to address the issue/feature** - [ ] My code follows the project's coding standards and style guidelines - [ ] I have added tests that prove my fix is effective or that my feature works - [ ] I have added necessary documentation (if applicable) - [ ] All new and existing tests pass - [ ] I have searched existing PRs to ensure this change hasn't been submitted already - [ ] I have linked any relevant issues in the description - [ ] My commits have clear and descriptive messages ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin. 
--- .github/workflows/examples_tests.yml | 25 ++++++ cognee/__init__.py | 1 + .../modules/run_custom_pipeline/__init__.py | 1 + .../run_custom_pipeline.py | 69 +++++++++++++++ .../modules/users/methods/get_default_user.py | 2 +- .../python/run_custom_pipeline_example.py | 84 +++++++++++++++++++ 6 files changed, 181 insertions(+), 1 deletion(-) create mode 100644 cognee/modules/run_custom_pipeline/__init__.py create mode 100644 cognee/modules/run_custom_pipeline/run_custom_pipeline.py create mode 100644 examples/python/run_custom_pipeline_example.py diff --git a/.github/workflows/examples_tests.yml b/.github/workflows/examples_tests.yml index 57bc88157..36953e259 100644 --- a/.github/workflows/examples_tests.yml +++ b/.github/workflows/examples_tests.yml @@ -210,6 +210,31 @@ jobs: EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} run: uv run python ./examples/python/memify_coding_agent_example.py + test-custom-pipeline: + name: Run Custom Pipeline Example + runs-on: ubuntu-22.04 + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Cognee Setup + uses: ./.github/actions/cognee_setup + with: + python-version: '3.11.x' + + - name: Run Custom Pipeline Example + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} + run: uv run python ./examples/python/run_custom_pipeline_example.py + test-permissions-example: name: Run Permissions Example runs-on: ubuntu-22.04 diff --git a/cognee/__init__.py b/cognee/__init__.py index 6e4d2a903..4d150ce4e 100644 --- a/cognee/__init__.py +++ b/cognee/__init__.py @@ -19,6 +19,7 @@ from .api.v1.add import add from 
.api.v1.delete import delete from .api.v1.cognify import cognify from .modules.memify import memify +from .modules.run_custom_pipeline import run_custom_pipeline from .api.v1.update import update from .api.v1.config.config import config from .api.v1.datasets.datasets import datasets diff --git a/cognee/modules/run_custom_pipeline/__init__.py b/cognee/modules/run_custom_pipeline/__init__.py new file mode 100644 index 000000000..2d30e2e0c --- /dev/null +++ b/cognee/modules/run_custom_pipeline/__init__.py @@ -0,0 +1 @@ +from .run_custom_pipeline import run_custom_pipeline diff --git a/cognee/modules/run_custom_pipeline/run_custom_pipeline.py b/cognee/modules/run_custom_pipeline/run_custom_pipeline.py new file mode 100644 index 000000000..d3df1c060 --- /dev/null +++ b/cognee/modules/run_custom_pipeline/run_custom_pipeline.py @@ -0,0 +1,69 @@ +from typing import Union, Optional, List, Type, Any +from uuid import UUID + +from cognee.shared.logging_utils import get_logger + +from cognee.modules.pipelines import run_pipeline +from cognee.modules.pipelines.tasks.task import Task +from cognee.modules.users.models import User +from cognee.modules.pipelines.layers.pipeline_execution_mode import get_pipeline_executor + +logger = get_logger() + + +async def run_custom_pipeline( + tasks: Union[List[Task], List[str]] = None, + data: Any = None, + dataset: Union[str, UUID] = "main_dataset", + user: User = None, + vector_db_config: Optional[dict] = None, + graph_db_config: Optional[dict] = None, + data_per_batch: int = 20, + run_in_background: bool = False, + pipeline_name: str = "custom_pipeline", +): + """ + Custom pipeline in Cognee, can work with already built graphs. Data needs to be provided which can be processed + with provided tasks. + + Provided tasks and data will be arranged to run the Cognee pipeline and execute graph enrichment/creation. + + This is the core processing step in Cognee that converts raw text and documents + into an intelligent knowledge graph. 
It analyzes content, extracts entities and + relationships, and creates semantic connections for enhanced search and reasoning. + + Args: + tasks: List of Cognee Tasks to execute. + data: The data to ingest. Can be anything when custom extraction and enrichment tasks are used. + Data provided here will be forwarded to the first extraction task in the pipeline as input. + dataset: Dataset name or dataset uuid to process. + user: User context for authentication and data access. Uses default if None. + vector_db_config: Custom vector database configuration for embeddings storage. + graph_db_config: Custom graph database configuration for relationship storage. + data_per_batch: Number of data items to be processed in parallel. + run_in_background: If True, starts processing asynchronously and returns immediately. + If False, waits for completion before returning. + Background mode recommended for large datasets (>100MB). + Use pipeline_run_id from return value to monitor progress. + """ + + custom_tasks = [ + *tasks, + ] + + # By calling get pipeline executor we get a function that will have the run_pipeline run in the background or a function that we will need to wait for + pipeline_executor_func = get_pipeline_executor(run_in_background=run_in_background) + + # Run the run_pipeline in the background or blocking based on executor + return await pipeline_executor_func( + pipeline=run_pipeline, + tasks=custom_tasks, + user=user, + data=data, + datasets=dataset, + vector_db_config=vector_db_config, + graph_db_config=graph_db_config, + incremental_loading=False, + data_per_batch=data_per_batch, + pipeline_name=pipeline_name, + ) diff --git a/cognee/modules/users/methods/get_default_user.py b/cognee/modules/users/methods/get_default_user.py index 773545f8e..9e3940617 100644 --- a/cognee/modules/users/methods/get_default_user.py +++ b/cognee/modules/users/methods/get_default_user.py @@ -10,7 +10,7 @@ from cognee.infrastructure.databases.relational import 
get_relational_engine from cognee.modules.users.methods.create_default_user import create_default_user -async def get_default_user() -> SimpleNamespace: +async def get_default_user() -> User: db_engine = get_relational_engine() base_config = get_base_config() default_email = base_config.default_user_email or "default_user@example.com" diff --git a/examples/python/run_custom_pipeline_example.py b/examples/python/run_custom_pipeline_example.py new file mode 100644 index 000000000..1ca1b4402 --- /dev/null +++ b/examples/python/run_custom_pipeline_example.py @@ -0,0 +1,84 @@ +import asyncio +import cognee +from cognee.modules.engine.operations.setup import setup +from cognee.modules.users.methods import get_default_user +from cognee.shared.logging_utils import setup_logging, INFO +from cognee.modules.pipelines import Task +from cognee.api.v1.search import SearchType + +# Prerequisites: +# 1. Copy `.env.template` and rename it to `.env`. +# 2. Add your OpenAI API key to the `.env` file in the `LLM_API_KEY` field: +# LLM_API_KEY = "your_key_here" + + +async def main(): + # Create a clean slate for cognee -- reset data and system state + print("Resetting cognee data...") + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + print("Data reset complete.\n") + + # Create relational database and tables + await setup() + + # cognee knowledge graph will be created based on this text + text = """ + Natural language processing (NLP) is an interdisciplinary + subfield of computer science and information retrieval. 
+ """ + + print("Adding text to cognee:") + print(text.strip()) + + # Let's recreate the cognee add pipeline through the custom pipeline framework + from cognee.tasks.ingestion import ingest_data, resolve_data_directories + + user = await get_default_user() + + # Values for tasks need to be filled before calling the pipeline + add_tasks = [ + Task(resolve_data_directories, include_subdirectories=True), + Task( + ingest_data, + "main_dataset", + user, + ), + ] + # Forward tasks to custom pipeline along with data and user information + await cognee.run_custom_pipeline( + tasks=add_tasks, data=text, user=user, dataset="main_dataset", pipeline_name="add_pipeline" + ) + print("Text added successfully.\n") + + # Use LLMs and cognee to create knowledge graph + from cognee.api.v1.cognify.cognify import get_default_tasks + + cognify_tasks = await get_default_tasks(user=user) + print("Recreating existing cognify pipeline in custom pipeline to create knowledge graph...\n") + await cognee.run_custom_pipeline( + tasks=cognify_tasks, user=user, dataset="main_dataset", pipeline_name="cognify_pipeline" + ) + print("Cognify process complete.\n") + + query_text = "Tell me about NLP" + print(f"Searching cognee for insights with query: '{query_text}'") + # Query cognee for insights on the added text + search_results = await cognee.search( + query_type=SearchType.GRAPH_COMPLETION, query_text=query_text + ) + + print("Search results:") + # Display results + for result_text in search_results: + print(result_text) + + +if __name__ == "__main__": + logger = setup_logging(log_level=INFO) + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + try: + loop.run_until_complete(main()) + finally: + loop.run_until_complete(loop.shutdown_asyncgens()) From fb102f29a8fbbfa941641208f41a55e1eb370fb5 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Tue, 4 Nov 2025 19:03:56 +0100 Subject: [PATCH 056/129] chore: Add alembic migration for multi-tenant system --- 
.../c946955da633_multi_tenant_support.py | 113 ++++++++++++++++++ 1 file changed, 113 insertions(+) create mode 100644 alembic/versions/c946955da633_multi_tenant_support.py diff --git a/alembic/versions/c946955da633_multi_tenant_support.py b/alembic/versions/c946955da633_multi_tenant_support.py new file mode 100644 index 000000000..2ad230974 --- /dev/null +++ b/alembic/versions/c946955da633_multi_tenant_support.py @@ -0,0 +1,113 @@ +"""Multi Tenant Support + +Revision ID: c946955da633 +Revises: 211ab850ef3d +Create Date: 2025-11-04 18:11:09.325158 + +""" + +from typing import Sequence, Union +from datetime import datetime, timezone +from uuid import uuid4 + +from alembic import op +import sqlalchemy as sa + +# revision identifiers, used by Alembic. +revision: str = "c946955da633" +down_revision: Union[str, None] = "211ab850ef3d" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def _define_user_table() -> sa.Table: + table = sa.Table( + "users", + sa.MetaData(), + sa.Column( + "id", + sa.UUID, + sa.ForeignKey("principals.id", ondelete="CASCADE"), + primary_key=True, + nullable=False, + ), + sa.Column("tenant_id", sa.UUID, sa.ForeignKey("tenants.id"), index=True, nullable=True), + ) + return table + + +def _define_dataset_table() -> sa.Table: + # Note: We can't use any Cognee model info to gather data (as it can change) in database so we must use our own table + # definition or load what is in the database + table = sa.Table( + "datasets", + sa.MetaData(), + sa.Column("id", sa.UUID, primary_key=True, default=uuid4), + sa.Column("name", sa.Text), + sa.Column( + "created_at", + sa.DateTime(timezone=True), + default=lambda: datetime.now(timezone.utc), + ), + sa.Column( + "updated_at", + sa.DateTime(timezone=True), + onupdate=lambda: datetime.now(timezone.utc), + ), + sa.Column("owner_id", sa.UUID(), sa.ForeignKey("principals.id"), index=True), + sa.Column("tenant_id", sa.UUID(), sa.ForeignKey("tenants.id"), 
index=True, nullable=True), + ) + + return table + + +def _get_column(inspector, table, name, schema=None): + for col in inspector.get_columns(table, schema=schema): + if col["name"] == name: + return col + return None + + +def upgrade() -> None: + conn = op.get_bind() + insp = sa.inspect(conn) + + dataset = _define_dataset_table() + user = _define_user_table() + + tenant_id_column = _get_column(insp, "datasets", "tenant_id") + if not tenant_id_column: + op.add_column("datasets", sa.Column("tenant_id", sa.UUID(), nullable=True)) + + # Build correlated subquery: select users.tenant_id for each dataset.owner_id + tenant_id_from_dataset_owner = ( + sa.select(user.c.tenant_id).where(user.c.id == dataset.c.owner_id).scalar_subquery() + ) + + # Update statement; restrict to rows where tenant_id is currently NULL + # update_stmt = ( + # sa.update(dataset) + # .values(tenant_id=subq) + # ) + + user = _define_user_table() + if op.get_context().dialect.name == "sqlite": + # If column doesn't exist create new original_extension column and update from values of extension column + with op.batch_alter_table("datasets") as batch_op: + batch_op.execute( + dataset.update().values( + tenant_id=tenant_id_from_dataset_owner, + ) + ) + else: + conn = op.get_bind() + conn.execute(dataset.update().values(tenant_id=tenant_id_from_dataset_owner)) + + op.create_index(op.f("ix_datasets_tenant_id"), "datasets", ["tenant_id"], unique=False) + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + + op.drop_column("datasets", "tenant_id") + # ### end Alembic commands ### From db2a32dd171a7db53487bec4c29474d1f36d1aa2 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Tue, 4 Nov 2025 19:17:02 +0100 Subject: [PATCH 057/129] test: Resolve issue permission example --- alembic/versions/c946955da633_multi_tenant_support.py | 9 +-------- examples/python/permissions_example.py | 2 +- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/alembic/versions/c946955da633_multi_tenant_support.py b/alembic/versions/c946955da633_multi_tenant_support.py index 2ad230974..09781c85c 100644 --- a/alembic/versions/c946955da633_multi_tenant_support.py +++ b/alembic/versions/c946955da633_multi_tenant_support.py @@ -79,18 +79,11 @@ def upgrade() -> None: if not tenant_id_column: op.add_column("datasets", sa.Column("tenant_id", sa.UUID(), nullable=True)) - # Build correlated subquery: select users.tenant_id for each dataset.owner_id + # Build subquery, select users.tenant_id for each dataset.owner_id tenant_id_from_dataset_owner = ( sa.select(user.c.tenant_id).where(user.c.id == dataset.c.owner_id).scalar_subquery() ) - # Update statement; restrict to rows where tenant_id is currently NULL - # update_stmt = ( - # sa.update(dataset) - # .values(tenant_id=subq) - # ) - - user = _define_user_table() if op.get_context().dialect.name == "sqlite": # If column doesn't exist create new original_extension column and update from values of extension column with op.batch_alter_table("datasets") as batch_op: diff --git a/examples/python/permissions_example.py b/examples/python/permissions_example.py index 7c140845c..5d1195a11 100644 --- a/examples/python/permissions_example.py +++ b/examples/python/permissions_example.py @@ -151,7 +151,7 @@ async def main(): # To add a user to a role he must be part of the same tenant/organization print("\nOperation started as user_2 to add user_3 to CogneeLab tenant/organization") await add_user_to_tenant( - user_id=user_3.id, tenant_id=tenant_id, 
owner_id=user_2.id, set_active_tenant=True + user_id=user_3.id, tenant_id=tenant_id, owner_id=user_2.id, set_as_active_tenant=True ) print( From f002d3bf0ef24e8db113625971bfb98e6473e6b7 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Tue, 4 Nov 2025 20:24:16 +0100 Subject: [PATCH 058/129] refactor: Update permissions example --- .../tenants/methods/add_user_to_tenant.py | 2 +- .../users/tenants/methods/select_tenant.py | 6 ++- examples/python/permissions_example.py | 45 ++++++++++++++++--- 3 files changed, 44 insertions(+), 9 deletions(-) diff --git a/cognee/modules/users/tenants/methods/add_user_to_tenant.py b/cognee/modules/users/tenants/methods/add_user_to_tenant.py index edadfe66b..eecc49f6f 100644 --- a/cognee/modules/users/tenants/methods/add_user_to_tenant.py +++ b/cognee/modules/users/tenants/methods/add_user_to_tenant.py @@ -16,7 +16,7 @@ from cognee.modules.users.exceptions import ( async def add_user_to_tenant( - user_id: UUID, tenant_id: UUID, owner_id: UUID, set_as_active_tenant: Optional[bool] = True + user_id: UUID, tenant_id: UUID, owner_id: UUID, set_as_active_tenant: Optional[bool] = False ): """ Add a user with the given id to the tenant with the given id. 
diff --git a/cognee/modules/users/tenants/methods/select_tenant.py b/cognee/modules/users/tenants/methods/select_tenant.py index 6e72fea2f..b444e9b1e 100644 --- a/cognee/modules/users/tenants/methods/select_tenant.py +++ b/cognee/modules/users/tenants/methods/select_tenant.py @@ -7,11 +7,12 @@ from sqlalchemy import select from cognee.infrastructure.databases.relational import get_relational_engine from cognee.modules.users.models.UserTenant import UserTenant from cognee.modules.users.methods import get_user +from cognee.modules.users.models.User import User from cognee.modules.users.permissions.methods import get_tenant from cognee.modules.users.exceptions import UserNotFoundError, TenantNotFoundError -async def select_tenant(user_id: UUID, tenant_id: Union[UUID, None]): +async def select_tenant(user_id: UUID, tenant_id: Union[UUID, None]) -> User: """ Set the users active tenant to provided tenant. @@ -33,7 +34,7 @@ async def select_tenant(user_id: UUID, tenant_id: Union[UUID, None]): user.tenant_id = None await session.merge(user) await session.commit() - return + return user tenant = await get_tenant(tenant_id) @@ -59,3 +60,4 @@ async def select_tenant(user_id: UUID, tenant_id: Union[UUID, None]): user.tenant_id = tenant_id await session.merge(user) await session.commit() + return user diff --git a/examples/python/permissions_example.py b/examples/python/permissions_example.py index 5d1195a11..fdbde00f0 100644 --- a/examples/python/permissions_example.py +++ b/examples/python/permissions_example.py @@ -3,6 +3,7 @@ import cognee import pathlib from cognee.modules.users.exceptions import PermissionDeniedError +from cognee.modules.users.tenants.methods import select_tenant from cognee.shared.logging_utils import get_logger from cognee.modules.search.types import SearchType from cognee.modules.users.methods import create_user @@ -116,6 +117,7 @@ async def main(): print( "\nOperation started as user_2 to give read permission to user_1 for the dataset owned by 
user_2" ) + await authorized_give_permission_on_datasets( user_1.id, [quantum_dataset_id], @@ -142,6 +144,9 @@ async def main(): print("User 2 is creating CogneeLab tenant/organization") tenant_id = await create_tenant("CogneeLab", user_2.id) + print("User 2 is selecting CogneeLab tenant/organization as active tenant") + await select_tenant(user_id=user_2.id, tenant_id=tenant_id) + print("\nUser 2 is creating Researcher role") role_id = await create_role(role_name="Researcher", owner_id=user_2.id) @@ -150,27 +155,55 @@ async def main(): # To add a user to a role he must be part of the same tenant/organization print("\nOperation started as user_2 to add user_3 to CogneeLab tenant/organization") - await add_user_to_tenant( - user_id=user_3.id, tenant_id=tenant_id, owner_id=user_2.id, set_as_active_tenant=True - ) + await add_user_to_tenant(user_id=user_3.id, tenant_id=tenant_id, owner_id=user_2.id) print( "\nOperation started by user_2, as tenant owner, to add user_3 to Researcher role inside the tenant/organization" ) await add_user_to_role(user_id=user_3.id, role_id=role_id, owner_id=user_2.id) + print("\nOperation as user_3 to select CogneeLab tenant/organization as active tenant") + await select_tenant(user_id=user_3.id, tenant_id=tenant_id) + print( - "\nOperation started as user_2 to give read permission to Researcher role for the dataset owned by user_2" + "\nOperation started as user_2, with CogneeLab as its active tenant, to give read permission to Researcher role for the dataset QUANTUM owned by user_2" + ) + # Even though the dataset owner is user_2, the dataset doesn't belong to the tenant/organization CogneeLab. + # So we can't assign permissions to it when we're acting in the CogneeLab tenant. 
+ try: + await authorized_give_permission_on_datasets( + role_id, + [quantum_dataset_id], + "read", + user_2.id, + ) + except PermissionDeniedError: + print( + "User 2 could not give permission to the role as the QUANTUM dataset is not part of the CogneeLab tenant" + ) + + print( + "We will now create a new QUANTUM dataset in the CogneeLab tenant so that permissions can be assigned to the Researcher role inside the tenant/organization" + ) + # Re-create the QUANTUM dataset in the CogneeLab tenant. The old QUANTUM dataset is still owned by user_2 personally + # and can still be accessed by selecting the personal tenant for user 2. + await cognee.add([text], dataset_name="QUANTUM", user=user_2) + quantum_cognify_result = await cognee.cognify(["QUANTUM"], user=user_2) + + # The recreated Quantum dataset will now have a different dataset_id as it's a new dataset in a different organization + quantum_dataset_id_cognee_lab_tenant = extract_dataset_id_from_cognify(quantum_cognify_result) + print( + "\nOperation started as user_2, with CogneeLab as its active tenant, to give read permission to Researcher role for the dataset QUANTUM owned by the CogneeLab tenant" ) await authorized_give_permission_on_datasets( role_id, - [quantum_dataset_id], + [quantum_dataset_id_cognee_lab_tenant], "read", user_2.id, ) # Now user_3 can read from QUANTUM dataset as part of the Researcher role after proper permissions have been assigned by the QUANTUM dataset owner, user_2. 
- print("\nSearch result as user_3 on the dataset owned by user_2:") + print("\nSearch result as user_3 on the QUANTUM dataset owned by the CogneeLab organization:") search_results = await cognee.search( query_type=SearchType.GRAPH_COMPLETION, query_text="What is in the document?", From 7782f246d30f159e23c4ae46afa7896936a8a677 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Tue, 4 Nov 2025 20:54:00 +0100 Subject: [PATCH 059/129] refactor: Update permissions example to work with new changes --- .../routers/get_permissions_router.py | 2 +- .../users/tenants/methods/select_tenant.py | 9 +++----- examples/python/permissions_example.py | 22 ++++++++++--------- 3 files changed, 16 insertions(+), 17 deletions(-) diff --git a/cognee/api/v1/permissions/routers/get_permissions_router.py b/cognee/api/v1/permissions/routers/get_permissions_router.py index 20d35e748..db2c72705 100644 --- a/cognee/api/v1/permissions/routers/get_permissions_router.py +++ b/cognee/api/v1/permissions/routers/get_permissions_router.py @@ -259,7 +259,7 @@ def get_permissions_router() -> APIRouter: from cognee.modules.users.tenants.methods import select_tenant as select_tenant_method - await select_tenant_method(user_id=user.id, tenant_id=payload.tenant_id) + await select_tenant_method(user=user, tenant_id=payload.tenant_id) return JSONResponse( status_code=200, diff --git a/cognee/modules/users/tenants/methods/select_tenant.py b/cognee/modules/users/tenants/methods/select_tenant.py index b444e9b1e..cb291d5f2 100644 --- a/cognee/modules/users/tenants/methods/select_tenant.py +++ b/cognee/modules/users/tenants/methods/select_tenant.py @@ -6,19 +6,18 @@ from sqlalchemy import select from cognee.infrastructure.databases.relational import get_relational_engine from cognee.modules.users.models.UserTenant import UserTenant -from cognee.modules.users.methods import get_user from cognee.modules.users.models.User import User from cognee.modules.users.permissions.methods import get_tenant from 
cognee.modules.users.exceptions import UserNotFoundError, TenantNotFoundError -async def select_tenant(user_id: UUID, tenant_id: Union[UUID, None]) -> User: +async def select_tenant(user: User, tenant_id: Union[UUID, None]) -> User: """ Set the users active tenant to provided tenant. If None tenant_id is provided set current Tenant to the default single user-tenant Args: - user_id: Id of the user. + user: User object. tenant_id: Id of the tenant. Returns: @@ -27,8 +26,6 @@ async def select_tenant(user_id: UUID, tenant_id: Union[UUID, None]) -> User: """ db_engine = get_relational_engine() async with db_engine.get_async_session() as session: - user = await get_user(user_id) - if tenant_id is None: # If no tenant_id is provided set current Tenant to the single user-tenant user.tenant_id = None @@ -46,7 +43,7 @@ async def select_tenant(user_id: UUID, tenant_id: Union[UUID, None]) -> User: # Check if User is part of Tenant result = await session.execute( select(UserTenant) - .where(UserTenant.user_id == user_id) + .where(UserTenant.user_id == user.id) .where(UserTenant.tenant_id == tenant_id) ) diff --git a/examples/python/permissions_example.py b/examples/python/permissions_example.py index fdbde00f0..4bbd30bea 100644 --- a/examples/python/permissions_example.py +++ b/examples/python/permissions_example.py @@ -145,7 +145,7 @@ async def main(): tenant_id = await create_tenant("CogneeLab", user_2.id) print("User 2 is selecting CogneeLab tenant/organization as active tenant") - await select_tenant(user_id=user_2.id, tenant_id=tenant_id) + await select_tenant(user=user_2, tenant_id=tenant_id) print("\nUser 2 is creating Researcher role") role_id = await create_role(role_name="Researcher", owner_id=user_2.id) @@ -163,7 +163,7 @@ async def main(): await add_user_to_role(user_id=user_3.id, role_id=role_id, owner_id=user_2.id) print("\nOperation as user_3 to select CogneeLab tenant/organization as active tenant") - await select_tenant(user_id=user_3.id, tenant_id=tenant_id) + 
await select_tenant(user=user_3, tenant_id=tenant_id) print( "\nOperation started as user_2, with CogneeLab as its active tenant, to give read permission to Researcher role for the dataset QUANTUM owned by user_2" @@ -183,21 +183,23 @@ async def main(): ) print( - "We will now create a new QUANTUM dataset in the CogneeLab tenant so that permissions can be assigned to the Researcher role inside the tenant/organization" + "We will now create a new QUANTUM dataset with the QUANTUM_COGNEE_LAB name in the CogneeLab tenant so that permissions can be assigned to the Researcher role inside the tenant/organization" ) - # Re-create the QUANTUM dataset in the CogneeLab tenant. The old QUANTUM dataset is still owned by user_2 personally + # We can re-create the QUANTUM dataset in the CogneeLab tenant. The old QUANTUM dataset is still owned by user_2 personally # and can still be accessed by selecting the personal tenant for user 2. - await cognee.add([text], dataset_name="QUANTUM", user=user_2) - quantum_cognify_result = await cognee.cognify(["QUANTUM"], user=user_2) + await cognee.add([text], dataset_name="QUANTUM_COGNEE_LAB", user=user_2) + quantum_cognee_lab_cognify_result = await cognee.cognify(["QUANTUM_COGNEE_LAB"], user=user_2) # The recreated Quantum dataset will now have a different dataset_id as it's a new dataset in a different organization - quantum_dataset_id_cognee_lab_tenant = extract_dataset_id_from_cognify(quantum_cognify_result) + quantum_cognee_lab_dataset_id = extract_dataset_id_from_cognify( + quantum_cognee_lab_cognify_result + ) print( "\nOperation started as user_2, with CogneeLab as its active tenant, to give read permission to Researcher role for the dataset QUANTUM owned by the CogneeLab tenant" ) await authorized_give_permission_on_datasets( role_id, - [quantum_dataset_id_cognee_lab_tenant], + [quantum_cognee_lab_dataset_id], "read", user_2.id, ) @@ -207,8 +209,8 @@ async def main(): search_results = await cognee.search( 
query_type=SearchType.GRAPH_COMPLETION, query_text="What is in the document?", - user=user_1, - dataset_ids=[quantum_dataset_id], + user=user_3, + dataset_ids=[quantum_cognee_lab_dataset_id], ) for result in search_results: print(f"{result}\n") From c2aaec2a827fbdd8f91747989753b4f62a41fa38 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Tue, 4 Nov 2025 23:34:51 +0100 Subject: [PATCH 060/129] refactor: Resolve issue with permissions example --- .../api/v1/permissions/routers/get_permissions_router.py | 2 +- cognee/modules/users/tenants/methods/select_tenant.py | 6 ++++-- examples/python/permissions_example.py | 8 ++++++-- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/cognee/api/v1/permissions/routers/get_permissions_router.py b/cognee/api/v1/permissions/routers/get_permissions_router.py index db2c72705..20d35e748 100644 --- a/cognee/api/v1/permissions/routers/get_permissions_router.py +++ b/cognee/api/v1/permissions/routers/get_permissions_router.py @@ -259,7 +259,7 @@ def get_permissions_router() -> APIRouter: from cognee.modules.users.tenants.methods import select_tenant as select_tenant_method - await select_tenant_method(user=user, tenant_id=payload.tenant_id) + await select_tenant_method(user_id=user.id, tenant_id=payload.tenant_id) return JSONResponse( status_code=200, diff --git a/cognee/modules/users/tenants/methods/select_tenant.py b/cognee/modules/users/tenants/methods/select_tenant.py index cb291d5f2..83c11dc91 100644 --- a/cognee/modules/users/tenants/methods/select_tenant.py +++ b/cognee/modules/users/tenants/methods/select_tenant.py @@ -5,19 +5,20 @@ import sqlalchemy.exc from sqlalchemy import select from cognee.infrastructure.databases.relational import get_relational_engine +from cognee.modules.users.methods.get_user import get_user from cognee.modules.users.models.UserTenant import UserTenant from cognee.modules.users.models.User import User from cognee.modules.users.permissions.methods import get_tenant from 
cognee.modules.users.exceptions import UserNotFoundError, TenantNotFoundError -async def select_tenant(user: User, tenant_id: Union[UUID, None]) -> User: +async def select_tenant(user_id: UUID, tenant_id: Union[UUID, None]) -> User: """ Set the users active tenant to provided tenant. If None tenant_id is provided set current Tenant to the default single user-tenant Args: - user: User object. + user_id: UUID of the user. tenant_id: Id of the tenant. Returns: @@ -26,6 +27,7 @@ async def select_tenant(user: User, tenant_id: Union[UUID, None]) -> User: """ db_engine = get_relational_engine() async with db_engine.get_async_session() as session: + user = await get_user(user_id) if tenant_id is None: # If no tenant_id is provided set current Tenant to the single user-tenant user.tenant_id = None diff --git a/examples/python/permissions_example.py b/examples/python/permissions_example.py index 4bbd30bea..c0b104023 100644 --- a/examples/python/permissions_example.py +++ b/examples/python/permissions_example.py @@ -145,7 +145,7 @@ async def main(): tenant_id = await create_tenant("CogneeLab", user_2.id) print("User 2 is selecting CogneeLab tenant/organization as active tenant") - await select_tenant(user=user_2, tenant_id=tenant_id) + await select_tenant(user_id=user_2.id, tenant_id=tenant_id) print("\nUser 2 is creating Researcher role") role_id = await create_role(role_name="Researcher", owner_id=user_2.id) @@ -163,7 +163,7 @@ async def main(): await add_user_to_role(user_id=user_3.id, role_id=role_id, owner_id=user_2.id) print("\nOperation as user_3 to select CogneeLab tenant/organization as active tenant") - await select_tenant(user=user_3, tenant_id=tenant_id) + await select_tenant(user_id=user_3.id, tenant_id=tenant_id) print( "\nOperation started as user_2, with CogneeLab as its active tenant, to give read permission to Researcher role for the dataset QUANTUM owned by user_2" @@ -187,6 +187,10 @@ async def main(): ) # We can re-create the QUANTUM dataset in the 
CogneeLab tenant. The old QUANTUM dataset is still owned by user_2 personally # and can still be accessed by selecting the personal tenant for user 2. + from cognee.modules.users.methods import get_user + + # Note: We need to update user_2 from the database to refresh its tenant context changes + user_2 = await get_user(user_2.id) await cognee.add([text], dataset_name="QUANTUM_COGNEE_LAB", user=user_2) quantum_cognee_lab_cognify_result = await cognee.cognify(["QUANTUM_COGNEE_LAB"], user=user_2) From eaf8d718b0fc52ebc2aefaee9e8fa524abb96125 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Wed, 5 Nov 2025 10:27:54 +0100 Subject: [PATCH 061/129] feat: introduces memify pipeline to save cache sessions into cognee (#1731) ## Description This PR introduces a new memify pipeline to save cache sessions in cognee. The QA sessions are added to the main knowledge base as separate documents. ## Type of Change - [ ] Bug fix (non-breaking change that fixes an issue) - [x] New feature (non-breaking change that adds functionality) - [ ] Breaking change (fix or feature that would cause existing functionality to change) - [ ] Documentation update - [ ] Code refactoring - [ ] Performance improvement - [ ] Other (please specify): ## Screenshots/Videos (if applicable) None ## Pre-submission Checklist - [x] **I have tested my changes thoroughly before submitting this PR** - [x] **This PR contains minimal changes necessary to address the issue/feature** - [x] My code follows the project's coding standards and style guidelines - [x] I have added tests that prove my fix is effective or that my feature works - [x] I have added necessary documentation (if applicable) - [x] All new and existing tests pass - [x] I have searched existing PRs to ensure this change hasn't been submitted already - [x] I have linked any relevant issues in the description - [x] My commits have clear and descriptive messages ## DCO Affirmation I affirm that all code in every 
commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin. --- .../persist_sessions_in_knowledge_graph.py | 55 ++++++ cognee/tasks/memify/__init__.py | 2 + cognee/tasks/memify/cognify_session.py | 40 ++++ cognee/tasks/memify/extract_user_sessions.py | 73 ++++++++ cognee/tests/test_conversation_history.py | 43 ++++- .../memify_tasks/test_cognify_session.py | 107 +++++++++++ .../test_extract_user_sessions.py | 175 ++++++++++++++++++ ...onversation_session_persistence_example.py | 98 ++++++++++ 8 files changed, 592 insertions(+), 1 deletion(-) create mode 100644 cognee/memify_pipelines/persist_sessions_in_knowledge_graph.py create mode 100644 cognee/tasks/memify/cognify_session.py create mode 100644 cognee/tasks/memify/extract_user_sessions.py create mode 100644 cognee/tests/unit/modules/memify_tasks/test_cognify_session.py create mode 100644 cognee/tests/unit/modules/memify_tasks/test_extract_user_sessions.py create mode 100644 examples/python/conversation_session_persistence_example.py diff --git a/cognee/memify_pipelines/persist_sessions_in_knowledge_graph.py b/cognee/memify_pipelines/persist_sessions_in_knowledge_graph.py new file mode 100644 index 000000000..c0ba0a4d9 --- /dev/null +++ b/cognee/memify_pipelines/persist_sessions_in_knowledge_graph.py @@ -0,0 +1,55 @@ +from typing import Optional, List + +from cognee import memify +from cognee.context_global_variables import ( + set_database_global_context_variables, + set_session_user_context_variable, +) +from cognee.exceptions import CogneeValidationError +from cognee.modules.data.methods import get_authorized_existing_datasets +from cognee.shared.logging_utils import get_logger +from cognee.modules.pipelines.tasks.task import Task +from cognee.modules.users.models import User +from cognee.tasks.memify import extract_user_sessions, cognify_session + + +logger = get_logger("persist_sessions_in_knowledge_graph") + + +async def persist_sessions_in_knowledge_graph_pipeline( 
+ user: User, + session_ids: Optional[List[str]] = None, + dataset: str = "main_dataset", + run_in_background: bool = False, +): + await set_session_user_context_variable(user) + dataset_to_write = await get_authorized_existing_datasets( + user=user, datasets=[dataset], permission_type="write" + ) + + if not dataset_to_write: + raise CogneeValidationError( + message=f"User (id: {str(user.id)}) does not have write access to dataset: {dataset}", + log=False, + ) + + await set_database_global_context_variables( + dataset_to_write[0].id, dataset_to_write[0].owner_id + ) + + extraction_tasks = [Task(extract_user_sessions, session_ids=session_ids)] + + enrichment_tasks = [ + Task(cognify_session), + ] + + result = await memify( + extraction_tasks=extraction_tasks, + enrichment_tasks=enrichment_tasks, + dataset=dataset_to_write[0].id, + data=[{}], + run_in_background=run_in_background, + ) + + logger.info("Session persistence pipeline completed") + return result diff --git a/cognee/tasks/memify/__init__.py b/cognee/tasks/memify/__init__.py index 692bac443..7e590ed47 100644 --- a/cognee/tasks/memify/__init__.py +++ b/cognee/tasks/memify/__init__.py @@ -1,2 +1,4 @@ from .extract_subgraph import extract_subgraph from .extract_subgraph_chunks import extract_subgraph_chunks +from .cognify_session import cognify_session +from .extract_user_sessions import extract_user_sessions diff --git a/cognee/tasks/memify/cognify_session.py b/cognee/tasks/memify/cognify_session.py new file mode 100644 index 000000000..7c276169a --- /dev/null +++ b/cognee/tasks/memify/cognify_session.py @@ -0,0 +1,40 @@ +import cognee + +from cognee.exceptions import CogneeValidationError, CogneeSystemError +from cognee.shared.logging_utils import get_logger + +logger = get_logger("cognify_session") + + +async def cognify_session(data): + """ + Process and cognify session data into the knowledge graph. 
+ + Adds session content to cognee with a dedicated "user_sessions" node set, + then triggers the cognify pipeline to extract entities and relationships + from the session data. + + Args: + data: Session string containing Question, Context, and Answer information. + + Raises: + CogneeValidationError: If data is None or empty. + CogneeSystemError: If cognee operations fail. + """ + try: + if not data or (isinstance(data, str) and not data.strip()): + logger.warning("Empty session data provided to cognify_session task, skipping") + raise CogneeValidationError(message="Session data cannot be empty", log=False) + + logger.info("Processing session data for cognification") + + await cognee.add(data, node_set=["user_sessions_from_cache"]) + logger.debug("Session data added to cognee with node_set: user_sessions") + await cognee.cognify() + logger.info("Session data successfully cognified") + + except CogneeValidationError: + raise + except Exception as e: + logger.error(f"Error cognifying session data: {str(e)}") + raise CogneeSystemError(message=f"Failed to cognify session data: {str(e)}", log=False) diff --git a/cognee/tasks/memify/extract_user_sessions.py b/cognee/tasks/memify/extract_user_sessions.py new file mode 100644 index 000000000..9779a363e --- /dev/null +++ b/cognee/tasks/memify/extract_user_sessions.py @@ -0,0 +1,73 @@ +from typing import Optional, List + +from cognee.context_global_variables import session_user +from cognee.exceptions import CogneeSystemError +from cognee.infrastructure.databases.cache.get_cache_engine import get_cache_engine +from cognee.shared.logging_utils import get_logger +from cognee.modules.users.models import User + +logger = get_logger("extract_user_sessions") + + +async def extract_user_sessions( + data, + session_ids: Optional[List[str]] = None, +): + """ + Extract Q&A sessions for the current user from cache. 
+ + Retrieves all Q&A triplets from specified session IDs and yields them + as formatted strings combining question, context, and answer. + + Args: + data: Data passed from memify. If empty dict ({}), no external data is provided. + session_ids: Optional list of specific session IDs to extract. + + Yields: + String containing session ID and all Q&A pairs formatted. + + Raises: + CogneeSystemError: If cache engine is unavailable or extraction fails. + """ + try: + if not data or data == [{}]: + logger.info("Fetching session metadata for current user") + + user: User = session_user.get() + if not user: + raise CogneeSystemError(message="No authenticated user found in context", log=False) + + user_id = str(user.id) + + cache_engine = get_cache_engine() + if cache_engine is None: + raise CogneeSystemError( + message="Cache engine not available for session extraction, please enable caching in order to have sessions to save", + log=False, + ) + + if session_ids: + for session_id in session_ids: + try: + qa_data = await cache_engine.get_all_qas(user_id, session_id) + if qa_data: + logger.info(f"Extracted session {session_id} with {len(qa_data)} Q&A pairs") + session_string = f"Session ID: {session_id}\n\n" + for qa_pair in qa_data: + question = qa_pair.get("question", "") + answer = qa_pair.get("answer", "") + session_string += f"Question: {question}\n\nAnswer: {answer}\n\n" + yield session_string + except Exception as e: + logger.warning(f"Failed to extract session {session_id}: {str(e)}") + continue + else: + logger.info( + "No specific session_ids provided. Please specify which sessions to extract." 
+ ) + + except CogneeSystemError: + raise + except Exception as e: + logger.error(f"Error extracting user sessions: {str(e)}") + raise CogneeSystemError(message=f"Failed to extract user sessions: {str(e)}", log=False) diff --git a/cognee/tests/test_conversation_history.py b/cognee/tests/test_conversation_history.py index 30bb54ef1..6b5b737f1 100644 --- a/cognee/tests/test_conversation_history.py +++ b/cognee/tests/test_conversation_history.py @@ -16,9 +16,11 @@ import cognee import pathlib from cognee.infrastructure.databases.cache import get_cache_engine +from cognee.infrastructure.databases.graph import get_graph_engine from cognee.modules.search.types import SearchType from cognee.shared.logging_utils import get_logger from cognee.modules.users.methods import get_default_user +from collections import Counter logger = get_logger() @@ -188,7 +190,6 @@ async def main(): f"GRAPH_SUMMARY_COMPLETION should return non-empty list, got: {result_summary!r}" ) - # Verify saved history_summary = await cache_engine.get_latest_qa(str(user.id), session_id_summary, last_n=10) our_qa_summary = [ h for h in history_summary if h["question"] == "What are the key points about TechCorp?" 
@@ -228,6 +229,46 @@ async def main(): assert "CONTEXT:" in formatted_history, "Formatted history should contain 'CONTEXT:' prefix" assert "ANSWER:" in formatted_history, "Formatted history should contain 'ANSWER:' prefix" + from cognee.memify_pipelines.persist_sessions_in_knowledge_graph import ( + persist_sessions_in_knowledge_graph_pipeline, + ) + + logger.info("Starting persist_sessions_in_knowledge_graph tests") + + await persist_sessions_in_knowledge_graph_pipeline( + user=user, + session_ids=[session_id_1, session_id_2], + dataset=dataset_name, + run_in_background=False, + ) + + graph_engine = await get_graph_engine() + graph = await graph_engine.get_graph_data() + + type_counts = Counter(node_data[1].get("type", {}) for node_data in graph[0]) + + "Tests the correct number of NodeSet nodes after session persistence" + assert type_counts.get("NodeSet", 0) == 1, ( + f"Number of NodeSets in the graph is incorrect, found {type_counts.get('NodeSet', 0)} but there should be exactly 1." + ) + + "Tests the correct number of DocumentChunk nodes after session persistence" + assert type_counts.get("DocumentChunk", 0) == 4, ( + f"Number of DocumentChunk nodes in the graph is incorrect, found {type_counts.get('DocumentChunk', 0)} but there should be exactly 4 (2 original documents, 2 sessions)." 
+ ) + + from cognee.infrastructure.databases.vector.get_vector_engine import get_vector_engine + + vector_engine = get_vector_engine() + collection_size = await vector_engine.search( + collection_name="DocumentChunk_text", + query_text="test", + limit=1000, + ) + assert len(collection_size) == 4, ( + f"DocumentChunk_text collection should have exactly 4 embeddings, found {len(collection_size)}" + ) + await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) diff --git a/cognee/tests/unit/modules/memify_tasks/test_cognify_session.py b/cognee/tests/unit/modules/memify_tasks/test_cognify_session.py new file mode 100644 index 000000000..c23640fbd --- /dev/null +++ b/cognee/tests/unit/modules/memify_tasks/test_cognify_session.py @@ -0,0 +1,107 @@ +import pytest +from unittest.mock import AsyncMock, patch + +from cognee.tasks.memify.cognify_session import cognify_session +from cognee.exceptions import CogneeValidationError, CogneeSystemError + + +@pytest.mark.asyncio +async def test_cognify_session_success(): + """Test successful cognification of session data.""" + session_data = ( + "Session ID: test_session\n\nQuestion: What is AI?\n\nAnswer: AI is artificial intelligence" + ) + + with ( + patch("cognee.add", new_callable=AsyncMock) as mock_add, + patch("cognee.cognify", new_callable=AsyncMock) as mock_cognify, + ): + await cognify_session(session_data) + + mock_add.assert_called_once_with(session_data, node_set=["user_sessions_from_cache"]) + mock_cognify.assert_called_once() + + +@pytest.mark.asyncio +async def test_cognify_session_empty_string(): + """Test cognification fails with empty string.""" + with pytest.raises(CogneeValidationError) as exc_info: + await cognify_session("") + + assert "Session data cannot be empty" in str(exc_info.value) + + +@pytest.mark.asyncio +async def test_cognify_session_whitespace_string(): + """Test cognification fails with whitespace-only string.""" + with pytest.raises(CogneeValidationError) as exc_info: + 
await cognify_session(" \n\t ") + + assert "Session data cannot be empty" in str(exc_info.value) + + +@pytest.mark.asyncio +async def test_cognify_session_none_data(): + """Test cognification fails with None data.""" + with pytest.raises(CogneeValidationError) as exc_info: + await cognify_session(None) + + assert "Session data cannot be empty" in str(exc_info.value) + + +@pytest.mark.asyncio +async def test_cognify_session_add_failure(): + """Test cognification handles cognee.add failure.""" + session_data = "Session ID: test\n\nQuestion: test?" + + with ( + patch("cognee.add", new_callable=AsyncMock) as mock_add, + patch("cognee.cognify", new_callable=AsyncMock), + ): + mock_add.side_effect = Exception("Add operation failed") + + with pytest.raises(CogneeSystemError) as exc_info: + await cognify_session(session_data) + + assert "Failed to cognify session data" in str(exc_info.value) + assert "Add operation failed" in str(exc_info.value) + + +@pytest.mark.asyncio +async def test_cognify_session_cognify_failure(): + """Test cognification handles cognify failure.""" + session_data = "Session ID: test\n\nQuestion: test?" + + with ( + patch("cognee.add", new_callable=AsyncMock), + patch("cognee.cognify", new_callable=AsyncMock) as mock_cognify, + ): + mock_cognify.side_effect = Exception("Cognify operation failed") + + with pytest.raises(CogneeSystemError) as exc_info: + await cognify_session(session_data) + + assert "Failed to cognify session data" in str(exc_info.value) + assert "Cognify operation failed" in str(exc_info.value) + + +@pytest.mark.asyncio +async def test_cognify_session_re_raises_validation_error(): + """Test that CogneeValidationError is re-raised as-is.""" + with pytest.raises(CogneeValidationError): + await cognify_session("") + + +@pytest.mark.asyncio +async def test_cognify_session_with_special_characters(): + """Test cognification with special characters.""" + session_data = "Session: test™ © Question: What's special? Answer: Cognee is special!" 
+ + with ( + patch("cognee.add", new_callable=AsyncMock) as mock_add, + patch("cognee.cognify", new_callable=AsyncMock) as mock_cognify, + ): + await cognify_session(session_data) + + mock_add.assert_called_once_with(session_data, node_set=["user_sessions_from_cache"]) + mock_cognify.assert_called_once() diff --git a/cognee/tests/unit/modules/memify_tasks/test_extract_user_sessions.py b/cognee/tests/unit/modules/memify_tasks/test_extract_user_sessions.py new file mode 100644 index 000000000..8cb27fef3 --- /dev/null +++ b/cognee/tests/unit/modules/memify_tasks/test_extract_user_sessions.py @@ -0,0 +1,175 @@ +import sys +import pytest +from unittest.mock import AsyncMock, MagicMock, patch + +from cognee.tasks.memify.extract_user_sessions import extract_user_sessions +from cognee.exceptions import CogneeSystemError +from cognee.modules.users.models import User + +# Get the actual module object (not the function) for patching +extract_user_sessions_module = sys.modules["cognee.tasks.memify.extract_user_sessions"] + + +@pytest.fixture +def mock_user(): + """Create a mock user.""" + user = MagicMock(spec=User) + user.id = "test-user-123" + return user + + +@pytest.fixture +def mock_qa_data(): + """Create mock Q&A data.""" + return [ + { + "question": "What is cognee?", + "context": "context about cognee", + "answer": "Cognee is a knowledge graph solution", + "time": "2025-01-01T12:00:00", + }, + { + "question": "How does it work?", + "context": "how it works context", + "answer": "It processes data and creates graphs", + "time": "2025-01-01T12:05:00", + }, + ] + + +@pytest.mark.asyncio +async def test_extract_user_sessions_success(mock_user, mock_qa_data): + """Test successful extraction of sessions.""" + mock_cache_engine = AsyncMock() + mock_cache_engine.get_all_qas.return_value = mock_qa_data + + with ( + patch.object(extract_user_sessions_module, "session_user") as mock_session_user, + patch.object( + extract_user_sessions_module, "get_cache_engine", 
return_value=mock_cache_engine + ), + ): + mock_session_user.get.return_value = mock_user + + sessions = [] + async for session in extract_user_sessions([{}], session_ids=["test_session"]): + sessions.append(session) + + assert len(sessions) == 1 + assert "Session ID: test_session" in sessions[0] + assert "Question: What is cognee?" in sessions[0] + assert "Answer: Cognee is a knowledge graph solution" in sessions[0] + assert "Question: How does it work?" in sessions[0] + assert "Answer: It processes data and creates graphs" in sessions[0] + + +@pytest.mark.asyncio +async def test_extract_user_sessions_multiple_sessions(mock_user, mock_qa_data): + """Test extraction of multiple sessions.""" + mock_cache_engine = AsyncMock() + mock_cache_engine.get_all_qas.return_value = mock_qa_data + + with ( + patch.object(extract_user_sessions_module, "session_user") as mock_session_user, + patch.object( + extract_user_sessions_module, "get_cache_engine", return_value=mock_cache_engine + ), + ): + mock_session_user.get.return_value = mock_user + + sessions = [] + async for session in extract_user_sessions([{}], session_ids=["session1", "session2"]): + sessions.append(session) + + assert len(sessions) == 2 + assert mock_cache_engine.get_all_qas.call_count == 2 + + +@pytest.mark.asyncio +async def test_extract_user_sessions_no_data(mock_user, mock_qa_data): + """Test extraction handles empty data parameter.""" + mock_cache_engine = AsyncMock() + mock_cache_engine.get_all_qas.return_value = mock_qa_data + + with ( + patch.object(extract_user_sessions_module, "session_user") as mock_session_user, + patch.object( + extract_user_sessions_module, "get_cache_engine", return_value=mock_cache_engine + ), + ): + mock_session_user.get.return_value = mock_user + + sessions = [] + async for session in extract_user_sessions(None, session_ids=["test_session"]): + sessions.append(session) + + assert len(sessions) == 1 + + +@pytest.mark.asyncio +async def 
test_extract_user_sessions_no_session_ids(mock_user): + """Test extraction handles no session IDs provided.""" + mock_cache_engine = AsyncMock() + + with ( + patch.object(extract_user_sessions_module, "session_user") as mock_session_user, + patch.object( + extract_user_sessions_module, "get_cache_engine", return_value=mock_cache_engine + ), + ): + mock_session_user.get.return_value = mock_user + + sessions = [] + async for session in extract_user_sessions([{}], session_ids=None): + sessions.append(session) + + assert len(sessions) == 0 + mock_cache_engine.get_all_qas.assert_not_called() + + +@pytest.mark.asyncio +async def test_extract_user_sessions_empty_qa_data(mock_user): + """Test extraction handles empty Q&A data.""" + mock_cache_engine = AsyncMock() + mock_cache_engine.get_all_qas.return_value = [] + + with ( + patch.object(extract_user_sessions_module, "session_user") as mock_session_user, + patch.object( + extract_user_sessions_module, "get_cache_engine", return_value=mock_cache_engine + ), + ): + mock_session_user.get.return_value = mock_user + + sessions = [] + async for session in extract_user_sessions([{}], session_ids=["empty_session"]): + sessions.append(session) + + assert len(sessions) == 0 + + +@pytest.mark.asyncio +async def test_extract_user_sessions_cache_error_handling(mock_user, mock_qa_data): + """Test extraction continues on cache error for specific session.""" + mock_cache_engine = AsyncMock() + mock_cache_engine.get_all_qas.side_effect = [ + mock_qa_data, + Exception("Cache error"), + mock_qa_data, + ] + + with ( + patch.object(extract_user_sessions_module, "session_user") as mock_session_user, + patch.object( + extract_user_sessions_module, "get_cache_engine", return_value=mock_cache_engine + ), + ): + mock_session_user.get.return_value = mock_user + + sessions = [] + async for session in extract_user_sessions( + [{}], session_ids=["session1", "session2", "session3"] + ): + sessions.append(session) + + assert len(sessions) == 2 diff --git 
a/examples/python/conversation_session_persistence_example.py b/examples/python/conversation_session_persistence_example.py new file mode 100644 index 000000000..5346f5012 --- /dev/null +++ b/examples/python/conversation_session_persistence_example.py @@ -0,0 +1,98 @@ +import asyncio + +import cognee +from cognee import visualize_graph +from cognee.memify_pipelines.persist_sessions_in_knowledge_graph import ( + persist_sessions_in_knowledge_graph_pipeline, +) +from cognee.modules.search.types import SearchType +from cognee.modules.users.methods import get_default_user +from cognee.shared.logging_utils import get_logger + +logger = get_logger("conversation_session_persistence_example") + + +async def main(): + # NOTE: CACHING has to be enabled for this example to work + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + + text_1 = "Cognee is a solution that can build knowledge graph from text, creating an AI memory system" + text_2 = "Germany is a country located next to the Netherlands" + + await cognee.add([text_1, text_2]) + await cognee.cognify() + + question = "What can I use to create a knowledge graph?" + search_results = await cognee.search( + query_type=SearchType.GRAPH_COMPLETION, + query_text=question, + ) + print("\nSession ID: default_session") + print(f"Question: {question}") + print(f"Answer: {search_results}\n") + + question = "You sure about that?" + search_results = await cognee.search( + query_type=SearchType.GRAPH_COMPLETION, query_text=question + ) + print("\nSession ID: default_session") + print(f"Question: {question}") + print(f"Answer: {search_results}\n") + + question = "This is awesome!" + search_results = await cognee.search( + query_type=SearchType.GRAPH_COMPLETION, query_text=question + ) + print("\nSession ID: default_session") + print(f"Question: {question}") + print(f"Answer: {search_results}\n") + + question = "Where is Germany?" 
+ search_results = await cognee.search( + query_type=SearchType.GRAPH_COMPLETION, + query_text=question, + session_id="different_session", + ) + print("\nSession ID: different_session") + print(f"Question: {question}") + print(f"Answer: {search_results}\n") + + question = "Next to which country again?" + search_results = await cognee.search( + query_type=SearchType.GRAPH_COMPLETION, + query_text=question, + session_id="different_session", + ) + print("\nSession ID: different_session") + print(f"Question: {question}") + print(f"Answer: {search_results}\n") + + question = "So you remember everything I asked from you?" + search_results = await cognee.search( + query_type=SearchType.GRAPH_COMPLETION, + query_text=question, + session_id="different_session", + ) + print("\nSession ID: different_session") + print(f"Question: {question}") + print(f"Answer: {search_results}\n") + + session_ids_to_persist = ["default_session", "different_session"] + default_user = await get_default_user() + + await persist_sessions_in_knowledge_graph_pipeline( + user=default_user, + session_ids=session_ids_to_persist, + ) + + await visualize_graph() + + +if __name__ == "__main__": + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + try: + loop.run_until_complete(main()) + finally: + loop.run_until_complete(loop.shutdown_asyncgens()) From 1643b13c95ba83b08abb0d1afeec80767049db26 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 5 Nov 2025 12:43:01 +0100 Subject: [PATCH 062/129] chore: add table creation for multi-tenancy to migration --- .../c946955da633_multi_tenant_support.py | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/alembic/versions/c946955da633_multi_tenant_support.py b/alembic/versions/c946955da633_multi_tenant_support.py index 09781c85c..fc45644d0 100644 --- a/alembic/versions/c946955da633_multi_tenant_support.py +++ b/alembic/versions/c946955da633_multi_tenant_support.py @@ -75,6 +75,28 @@ def upgrade() -> None: dataset = 
_define_dataset_table() user = _define_user_table() + if "user_tenants" not in insp.get_table_names(): + tenant_id_from_user = sa.select(user.c.tenant_id).scalar_subquery() + # Define table with all necessary columns including primary key + user_tenants = op.create_table( + "user_tenants", + sa.Column("user_id", sa.UUID, sa.ForeignKey("users.id"), primary_key=True), + sa.Column("tenant_id", sa.UUID, sa.ForeignKey("tenants.id"), primary_key=True), + sa.Column("created_at", sa.DateTime(), default=lambda: datetime.now(timezone.utc)), + ) + if op.get_context().dialect.name == "sqlite": + # If column doesn't exist create new original_extension column and update from values of extension column + with op.batch_alter_table("user_tenants") as batch_op: + batch_op.execute( + user_tenants.update().values( + tenant_id=tenant_id_from_user, + user_id=user.c.id, + ) + ) + else: + conn = op.get_bind() + conn.execute(dataset.update().values(tenant_id=tenant_id_from_user, user_id=user.c.id)) + tenant_id_column = _get_column(insp, "datasets", "tenant_id") if not tenant_id_column: op.add_column("datasets", sa.Column("tenant_id", sa.UUID(), nullable=True)) From 9fc4199958045cb5ed06cfcaf783baae247760d6 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 5 Nov 2025 13:18:47 +0100 Subject: [PATCH 063/129] fix: Resolve issue with cleaning acl table --- .../ab7e313804ae_permission_system_rework.py | 72 +++++++++++-------- 1 file changed, 43 insertions(+), 29 deletions(-) diff --git a/alembic/versions/ab7e313804ae_permission_system_rework.py b/alembic/versions/ab7e313804ae_permission_system_rework.py index bd69b9b41..d83f946a6 100644 --- a/alembic/versions/ab7e313804ae_permission_system_rework.py +++ b/alembic/versions/ab7e313804ae_permission_system_rework.py @@ -144,44 +144,58 @@ def _create_data_permission(conn, user_id, data_id, permission_name): ) +def _get_column(inspector, table, name, schema=None): + for col in inspector.get_columns(table, schema=schema): + if col["name"] == name: + 
return col + return None + + def upgrade() -> None: conn = op.get_bind() + insp = sa.inspect(conn) - # Recreate ACLs table with default permissions set to datasets instead of documents - op.drop_table("acls") + dataset_id_column = _get_column(insp, "acls", "dataset_id") + if not dataset_id_column: + # Recreate ACLs table with default permissions set to datasets instead of documents + op.drop_table("acls") - acls_table = op.create_table( - "acls", - sa.Column("id", UUID, primary_key=True, default=uuid4), - sa.Column( - "created_at", sa.DateTime(timezone=True), default=lambda: datetime.now(timezone.utc) - ), - sa.Column( - "updated_at", sa.DateTime(timezone=True), onupdate=lambda: datetime.now(timezone.utc) - ), - sa.Column("principal_id", UUID, sa.ForeignKey("principals.id")), - sa.Column("permission_id", UUID, sa.ForeignKey("permissions.id")), - sa.Column("dataset_id", UUID, sa.ForeignKey("datasets.id", ondelete="CASCADE")), - ) + acls_table = op.create_table( + "acls", + sa.Column("id", UUID, primary_key=True, default=uuid4), + sa.Column( + "created_at", sa.DateTime(timezone=True), default=lambda: datetime.now(timezone.utc) + ), + sa.Column( + "updated_at", + sa.DateTime(timezone=True), + onupdate=lambda: datetime.now(timezone.utc), + ), + sa.Column("principal_id", UUID, sa.ForeignKey("principals.id")), + sa.Column("permission_id", UUID, sa.ForeignKey("permissions.id")), + sa.Column("dataset_id", UUID, sa.ForeignKey("datasets.id", ondelete="CASCADE")), + ) - # Note: We can't use any Cognee model info to gather data (as it can change) in database so we must use our own table - # definition or load what is in the database - dataset_table = _define_dataset_table() - datasets = conn.execute(sa.select(dataset_table)).fetchall() + # Note: We can't use any Cognee model info to gather data (as it can change) in database so we must use our own table + # definition or load what is in the database + dataset_table = _define_dataset_table() + datasets = 
conn.execute(sa.select(dataset_table)).fetchall() - if not datasets: - return + if not datasets: + return - acl_list = [] + acl_list = [] - for dataset in datasets: - acl_list.append(_create_dataset_permission(conn, dataset.owner_id, dataset.id, "read")) - acl_list.append(_create_dataset_permission(conn, dataset.owner_id, dataset.id, "write")) - acl_list.append(_create_dataset_permission(conn, dataset.owner_id, dataset.id, "share")) - acl_list.append(_create_dataset_permission(conn, dataset.owner_id, dataset.id, "delete")) + for dataset in datasets: + acl_list.append(_create_dataset_permission(conn, dataset.owner_id, dataset.id, "read")) + acl_list.append(_create_dataset_permission(conn, dataset.owner_id, dataset.id, "write")) + acl_list.append(_create_dataset_permission(conn, dataset.owner_id, dataset.id, "share")) + acl_list.append( + _create_dataset_permission(conn, dataset.owner_id, dataset.id, "delete") + ) - if acl_list: - op.bulk_insert(acls_table, acl_list) + if acl_list: + op.bulk_insert(acls_table, acl_list) def downgrade() -> None: From 1b4aa5c67b0412b134e903855f1ad3d12c7eab0e Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 5 Nov 2025 15:15:15 +0100 Subject: [PATCH 064/129] fix: resolve issue with cypher search --- cognee/modules/retrieval/cypher_search_retriever.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cognee/modules/retrieval/cypher_search_retriever.py b/cognee/modules/retrieval/cypher_search_retriever.py index 9978f2536..b8cbce558 100644 --- a/cognee/modules/retrieval/cypher_search_retriever.py +++ b/cognee/modules/retrieval/cypher_search_retriever.py @@ -49,8 +49,9 @@ class CypherSearchRetriever(BaseRetriever): if is_empty: logger.warning("Search attempt on an empty knowledge graph") return [] + from fastapi.encoders import jsonable_encoder - result = await graph_engine.query(query) + result = jsonable_encoder(await graph_engine.query(query)) except Exception as e: logger.error("Failed to execture cypher search 
retrieval: %s", str(e)) raise CypherSearchError() from e From 1c142734b4f031e466d5a7b111f3ed2e55e338c3 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 5 Nov 2025 15:22:23 +0100 Subject: [PATCH 065/129] refactor: change import to top of file --- cognee/modules/retrieval/cypher_search_retriever.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cognee/modules/retrieval/cypher_search_retriever.py b/cognee/modules/retrieval/cypher_search_retriever.py index b8cbce558..01816f3df 100644 --- a/cognee/modules/retrieval/cypher_search_retriever.py +++ b/cognee/modules/retrieval/cypher_search_retriever.py @@ -1,4 +1,6 @@ from typing import Any, Optional +from fastapi.encoders import jsonable_encoder + from cognee.infrastructure.databases.graph import get_graph_engine from cognee.modules.retrieval.base_retriever import BaseRetriever from cognee.modules.retrieval.utils.completion import generate_completion @@ -49,7 +51,6 @@ class CypherSearchRetriever(BaseRetriever): if is_empty: logger.warning("Search attempt on an empty knowledge graph") return [] - from fastapi.encoders import jsonable_encoder result = jsonable_encoder(await graph_engine.query(query)) except Exception as e: From fa4c50f972e27190fb97d20b2e61726b52bb3f2a Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 5 Nov 2025 16:05:33 +0100 Subject: [PATCH 066/129] fix: Resolve issue with sync migration not working for postgresql --- .../211ab850ef3d_add_sync_operations_table.py | 32 +++++++++++++------ 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/alembic/versions/211ab850ef3d_add_sync_operations_table.py b/alembic/versions/211ab850ef3d_add_sync_operations_table.py index 370aab1a4..9c6e81f12 100644 --- a/alembic/versions/211ab850ef3d_add_sync_operations_table.py +++ b/alembic/versions/211ab850ef3d_add_sync_operations_table.py @@ -10,6 +10,7 @@ from typing import Sequence, Union from alembic import op import sqlalchemy as sa +from sqlalchemy.dialects import postgresql # revision 
identifiers, used by Alembic. @@ -27,6 +28,27 @@ def upgrade() -> None: inspector = sa.inspect(connection) if "sync_operations" not in inspector.get_table_names(): + if op.get_context().dialect.name == "postgresql": + syncstatus = postgresql.ENUM( + "STARTED", + "IN_PROGRESS", + "COMPLETED", + "FAILED", + "CANCELLED", + name="syncstatus", + create_type=False, + ) + else: + syncstatus = sa.Enum( + "STARTED", + "IN_PROGRESS", + "COMPLETED", + "FAILED", + "CANCELLED", + name="syncstatus", + create_type=False, + ) + # Table doesn't exist, create it normally op.create_table( "sync_operations", @@ -34,15 +56,7 @@ def upgrade() -> None: sa.Column("run_id", sa.Text(), nullable=True), sa.Column( "status", - sa.Enum( - "STARTED", - "IN_PROGRESS", - "COMPLETED", - "FAILED", - "CANCELLED", - name="syncstatus", - create_type=False, - ), + syncstatus, nullable=True, ), sa.Column("progress_percentage", sa.Integer(), nullable=True), From c4807a0c6751e05a4fb04439afa183fa7620c8f5 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 5 Nov 2025 16:14:37 +0100 Subject: [PATCH 067/129] refactor: Use user_tenants table to update --- alembic/versions/c946955da633_multi_tenant_support.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/alembic/versions/c946955da633_multi_tenant_support.py b/alembic/versions/c946955da633_multi_tenant_support.py index fc45644d0..3f7bde5a2 100644 --- a/alembic/versions/c946955da633_multi_tenant_support.py +++ b/alembic/versions/c946955da633_multi_tenant_support.py @@ -95,7 +95,9 @@ def upgrade() -> None: ) else: conn = op.get_bind() - conn.execute(dataset.update().values(tenant_id=tenant_id_from_user, user_id=user.c.id)) + conn.execute( + user_tenants.update().values(tenant_id=tenant_id_from_user, user_id=user.c.id) + ) tenant_id_column = _get_column(insp, "datasets", "tenant_id") if not tenant_id_column: From 9b6cbaf389b172fb86da42ac1c9c8fe544202aae Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 5 Nov 2025 17:24:11 +0100 Subject: 
[PATCH 068/129] chore: Add multi tenant migration --- .../211ab850ef3d_add_sync_operations_table.py | 6 ++- .../c946955da633_multi_tenant_support.py | 38 +++++++++++-------- 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/alembic/versions/211ab850ef3d_add_sync_operations_table.py b/alembic/versions/211ab850ef3d_add_sync_operations_table.py index 9c6e81f12..976439a32 100644 --- a/alembic/versions/211ab850ef3d_add_sync_operations_table.py +++ b/alembic/versions/211ab850ef3d_add_sync_operations_table.py @@ -36,7 +36,8 @@ def upgrade() -> None: "FAILED", "CANCELLED", name="syncstatus", - create_type=False, + create_type=True, + checkfirst=True, ) else: syncstatus = sa.Enum( @@ -46,7 +47,8 @@ def upgrade() -> None: "FAILED", "CANCELLED", name="syncstatus", - create_type=False, + create_type=True, + checkfirst=True, ) # Table doesn't exist, create it normally diff --git a/alembic/versions/c946955da633_multi_tenant_support.py b/alembic/versions/c946955da633_multi_tenant_support.py index 3f7bde5a2..6d21f8fc7 100644 --- a/alembic/versions/c946955da633_multi_tenant_support.py +++ b/alembic/versions/c946955da633_multi_tenant_support.py @@ -20,6 +20,10 @@ branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None +def _now(): + return datetime.now(timezone.utc) + + def _define_user_table() -> sa.Table: table = sa.Table( "users", @@ -76,27 +80,29 @@ def upgrade() -> None: user = _define_user_table() if "user_tenants" not in insp.get_table_names(): - tenant_id_from_user = sa.select(user.c.tenant_id).scalar_subquery() # Define table with all necessary columns including primary key user_tenants = op.create_table( "user_tenants", sa.Column("user_id", sa.UUID, sa.ForeignKey("users.id"), primary_key=True), sa.Column("tenant_id", sa.UUID, sa.ForeignKey("tenants.id"), primary_key=True), - sa.Column("created_at", sa.DateTime(), default=lambda: datetime.now(timezone.utc)), + sa.Column( + "created_at", 
sa.DateTime(timezone=True), default=lambda: datetime.now(timezone.utc) + ), ) - if op.get_context().dialect.name == "sqlite": - # If column doesn't exist create new original_extension column and update from values of extension column - with op.batch_alter_table("user_tenants") as batch_op: - batch_op.execute( - user_tenants.update().values( - tenant_id=tenant_id_from_user, - user_id=user.c.id, - ) - ) - else: - conn = op.get_bind() - conn.execute( - user_tenants.update().values(tenant_id=tenant_id_from_user, user_id=user.c.id) + + # Get all users with their tenant_id + user_data = conn.execute( + sa.select(user.c.id, user.c.tenant_id).where(user.c.tenant_id.isnot(None)) + ).fetchall() + + # Insert into user_tenants table + if user_data: + op.bulk_insert( + user_tenants, + [ + {"user_id": user_id, "tenant_id": tenant_id, "created_at": _now()} + for user_id, tenant_id in user_data + ], ) tenant_id_column = _get_column(insp, "datasets", "tenant_id") @@ -125,6 +131,6 @@ def upgrade() -> None: def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! 
### - + op.drop_table("user_tenants") op.drop_column("datasets", "tenant_id") # ### end Alembic commands ### From 215ef7f3c213ea0be6b0a295be8069ba0364878d Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Wed, 5 Nov 2025 17:29:40 +0100 Subject: [PATCH 069/129] test: add retriever tests --- .../entity_completion_retriever_test.py | 65 +++++++++++++++ ...letion_retriever_context_extension_test.py | 6 +- .../graph_completion_retriever_cot_test.py | 6 +- .../graph_completion_retriever_test.py | 57 +++++++++++++ .../rag_completion_retriever_test.py | 79 +++++++++++++++++++ .../retrieval/temporal_retriever_test.py | 64 +++++++++++++++ 6 files changed, 271 insertions(+), 6 deletions(-) create mode 100644 cognee/tests/unit/modules/retrieval/entity_completion_retriever_test.py diff --git a/cognee/tests/unit/modules/retrieval/entity_completion_retriever_test.py b/cognee/tests/unit/modules/retrieval/entity_completion_retriever_test.py new file mode 100644 index 000000000..064f4a31a --- /dev/null +++ b/cognee/tests/unit/modules/retrieval/entity_completion_retriever_test.py @@ -0,0 +1,65 @@ +import os +import pytest +import pathlib +from pydantic import BaseModel + +import cognee +from cognee.low_level import setup +from cognee.tasks.storage import add_data_points +from cognee.modules.engine.models import Entity, EntityType +from cognee.modules.retrieval.EntityCompletionRetriever import EntityCompletionRetriever +from cognee.modules.retrieval.entity_extractors.DummyEntityExtractor import DummyEntityExtractor +from cognee.modules.retrieval.context_providers.DummyContextProvider import DummyContextProvider + + +class TestAnswer(BaseModel): + answer: str + explanation: str + + +# TODO: Add more tests, similar to other retrievers. +# TODO: For the tests, one needs to define an Entity Extractor and a Context Provider. 
+class TestEntityCompletionRetriever: + @pytest.mark.asyncio + async def test_get_entity_structured_completion(self): + system_directory_path = os.path.join( + pathlib.Path(__file__).parent, ".cognee_system/test_get_entity_structured_completion" + ) + cognee.config.system_root_directory(system_directory_path) + data_directory_path = os.path.join( + pathlib.Path(__file__).parent, ".data_storage/test_get_entity_structured_completion" + ) + cognee.config.data_root_directory(data_directory_path) + + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + await setup() + + entity_type = EntityType(name="Person", description="A human individual") + entity = Entity(name="Albert Einstein", is_a=entity_type, description="A famous physicist") + + entities = [entity] + await add_data_points(entities) + + retriever = EntityCompletionRetriever(DummyEntityExtractor(), DummyContextProvider()) + + # Test with string response model (default) + string_answer = await retriever.get_completion("Who is Albert Einstein?") + assert isinstance(string_answer, list), f"Expected str, got {type(string_answer).__name__}" + assert all(isinstance(item, str) and item.strip() for item in string_answer), ( + "Answer should not be empty" + ) + + # Test with structured response model + structured_answer = await retriever.get_completion( + "Who is Albert Einstein?", response_model=TestAnswer + ) + assert isinstance(structured_answer, list), ( + f"Expected list, got {type(structured_answer).__name__}" + ) + assert all(isinstance(item, TestAnswer) for item in structured_answer), ( + f"Expected TestAnswer, got {type(structured_answer).__name__}" + ) + + assert structured_answer[0].answer.strip(), "Answer field should not be empty" + assert structured_answer[0].explanation.strip(), "Explanation field should not be empty" diff --git a/cognee/tests/unit/modules/retrieval/graph_completion_retriever_context_extension_test.py 
b/cognee/tests/unit/modules/retrieval/graph_completion_retriever_context_extension_test.py index 5335a3ca7..d15e55c23 100644 --- a/cognee/tests/unit/modules/retrieval/graph_completion_retriever_context_extension_test.py +++ b/cognee/tests/unit/modules/retrieval/graph_completion_retriever_context_extension_test.py @@ -183,15 +183,15 @@ class TestGraphCompletionWithContextExtensionRetriever: ) @pytest.mark.asyncio - async def test_get_structured_completion_extension_context(self): + async def test_get_graph_structured_completion_extension_context(self): system_directory_path = os.path.join( pathlib.Path(__file__).parent, - ".cognee_system/test_get_structured_completion_extension_context", + ".cognee_system/test_get_graph_structured_completion_extension_context", ) cognee.config.system_root_directory(system_directory_path) data_directory_path = os.path.join( pathlib.Path(__file__).parent, - ".data_storage/test_get_structured_completion_extension_context", + ".data_storage/test_get_graph_structured_completion_extension_context", ) cognee.config.data_root_directory(data_directory_path) diff --git a/cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py b/cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py index 731e9fccf..79e4bcec3 100644 --- a/cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py +++ b/cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py @@ -176,13 +176,13 @@ class TestGraphCompletionCoTRetriever: ) @pytest.mark.asyncio - async def test_get_structured_completion(self): + async def test_get_graph_structured_completion_cot(self): system_directory_path = os.path.join( - pathlib.Path(__file__).parent, ".cognee_system/test_get_structured_completion" + pathlib.Path(__file__).parent, ".cognee_system/test_get_graph_structured_completion_cot" ) cognee.config.system_root_directory(system_directory_path) data_directory_path = os.path.join( - pathlib.Path(__file__).parent, 
".data_storage/test_get_structured_completion" + pathlib.Path(__file__).parent, ".data_storage/test_get_graph_structured_completion_cot" ) cognee.config.data_root_directory(data_directory_path) diff --git a/cognee/tests/unit/modules/retrieval/graph_completion_retriever_test.py b/cognee/tests/unit/modules/retrieval/graph_completion_retriever_test.py index f462baced..e320fcef1 100644 --- a/cognee/tests/unit/modules/retrieval/graph_completion_retriever_test.py +++ b/cognee/tests/unit/modules/retrieval/graph_completion_retriever_test.py @@ -2,6 +2,7 @@ import os import pytest import pathlib from typing import Optional, Union +from pydantic import BaseModel import cognee from cognee.low_level import setup, DataPoint @@ -10,6 +11,11 @@ from cognee.tasks.storage import add_data_points from cognee.modules.retrieval.graph_completion_retriever import GraphCompletionRetriever +class TestAnswer(BaseModel): + answer: str + explanation: str + + class TestGraphCompletionRetriever: @pytest.mark.asyncio async def test_graph_completion_context_simple(self): @@ -221,3 +227,54 @@ class TestGraphCompletionRetriever: context = await retriever.get_context("Who works at Figma?") assert context == [], "Context should be empty on an empty graph" + + @pytest.mark.asyncio + async def test_get_graph_structured_completion(self): + system_directory_path = os.path.join( + pathlib.Path(__file__).parent, ".cognee_system/test_get_graph_structured_completion" + ) + cognee.config.system_root_directory(system_directory_path) + data_directory_path = os.path.join( + pathlib.Path(__file__).parent, ".data_storage/test_get_graph_structured_completion" + ) + cognee.config.data_root_directory(data_directory_path) + + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + await setup() + + class Company(DataPoint): + name: str + + class Person(DataPoint): + name: str + works_for: Company + + company1 = Company(name="Figma") + person1 = Person(name="Steve Rodger", works_for=company1) 
+ + entities = [company1, person1] + await add_data_points(entities) + + retriever = GraphCompletionRetriever() + + # Test with string response model (default) + string_answer = await retriever.get_completion("Who works at Figma?") + assert isinstance(string_answer, list), f"Expected str, got {type(string_answer).__name__}" + assert all(isinstance(item, str) and item.strip() for item in string_answer), ( + "Answer should not be empty" + ) + + # Test with structured response model + structured_answer = await retriever.get_completion( + "Who works at Figma?", response_model=TestAnswer + ) + assert isinstance(structured_answer, list), ( + f"Expected list, got {type(structured_answer).__name__}" + ) + assert all(isinstance(item, TestAnswer) for item in structured_answer), ( + f"Expected TestAnswer, got {type(structured_answer).__name__}" + ) + + assert structured_answer[0].answer.strip(), "Answer field should not be empty" + assert structured_answer[0].explanation.strip(), "Explanation field should not be empty" diff --git a/cognee/tests/unit/modules/retrieval/rag_completion_retriever_test.py b/cognee/tests/unit/modules/retrieval/rag_completion_retriever_test.py index 252af8352..248ecc047 100644 --- a/cognee/tests/unit/modules/retrieval/rag_completion_retriever_test.py +++ b/cognee/tests/unit/modules/retrieval/rag_completion_retriever_test.py @@ -3,6 +3,7 @@ from typing import List import pytest import pathlib import cognee +from pydantic import BaseModel from cognee.low_level import setup from cognee.tasks.storage import add_data_points from cognee.infrastructure.databases.vector import get_vector_engine @@ -26,6 +27,11 @@ class DocumentChunkWithEntities(DataPoint): metadata: dict = {"index_fields": ["text"]} +class TestAnswer(BaseModel): + answer: str + explanation: str + + class TestRAGCompletionRetriever: @pytest.mark.asyncio async def test_rag_completion_context_simple(self): @@ -202,3 +208,76 @@ class TestRAGCompletionRetriever: context = await 
retriever.get_context("Christina Mayer") assert context == "", "Returned context should be empty on an empty graph" + + @pytest.mark.asyncio + async def test_get_rag_structured_completion(self): + system_directory_path = os.path.join( + pathlib.Path(__file__).parent, ".cognee_system/test_get_rag_structured_completion" + ) + cognee.config.system_root_directory(system_directory_path) + data_directory_path = os.path.join( + pathlib.Path(__file__).parent, ".data_storage/test_get_rag_structured_completion" + ) + cognee.config.data_root_directory(data_directory_path) + + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + await setup() + + document = TextDocument( + name="Steve Rodger's career", + raw_data_location="somewhere", + external_metadata="", + mime_type="text/plain", + ) + + chunk1 = DocumentChunk( + text="Steve Rodger", + chunk_size=2, + chunk_index=0, + cut_type="sentence_end", + is_part_of=document, + contains=[], + ) + chunk2 = DocumentChunk( + text="Mike Broski", + chunk_size=2, + chunk_index=1, + cut_type="sentence_end", + is_part_of=document, + contains=[], + ) + chunk3 = DocumentChunk( + text="Christina Mayer", + chunk_size=2, + chunk_index=2, + cut_type="sentence_end", + is_part_of=document, + contains=[], + ) + + entities = [chunk1, chunk2, chunk3] + await add_data_points(entities) + + retriever = CompletionRetriever() + + # Test with string response model (default) + string_answer = await retriever.get_completion("Where does Steve work?") + assert isinstance(string_answer, list), f"Expected str, got {type(string_answer).__name__}" + assert all(isinstance(item, str) and item.strip() for item in string_answer), ( + "Answer should not be empty" + ) + + # Test with structured response model + structured_answer = await retriever.get_completion( + "Where does Steve work?", response_model=TestAnswer + ) + assert isinstance(structured_answer, list), ( + f"Expected list, got {type(structured_answer).__name__}" + ) + assert 
all(isinstance(item, TestAnswer) for item in structured_answer), ( + f"Expected TestAnswer, got {type(structured_answer).__name__}" + ) + + assert structured_answer[0].answer.strip(), "Answer field should not be empty" + assert structured_answer[0].explanation.strip(), "Explanation field should not be empty" diff --git a/cognee/tests/unit/modules/retrieval/temporal_retriever_test.py b/cognee/tests/unit/modules/retrieval/temporal_retriever_test.py index a322cb237..5b274c822 100644 --- a/cognee/tests/unit/modules/retrieval/temporal_retriever_test.py +++ b/cognee/tests/unit/modules/retrieval/temporal_retriever_test.py @@ -1,7 +1,13 @@ import asyncio +import os +import pathlib +import cognee from types import SimpleNamespace import pytest +from pydantic import BaseModel +from cognee.low_level import setup, DataPoint +from cognee.tasks.storage import add_data_points from cognee.modules.retrieval.temporal_retriever import TemporalRetriever @@ -141,6 +147,64 @@ async def test_filter_top_k_events_error_handling(): await tr.filter_top_k_events([{}], []) +class TestAnswer(BaseModel): + answer: str + explanation: str + + +@pytest.mark.asyncio +async def test_get_temporal_structured_completion(): + system_directory_path = os.path.join( + pathlib.Path(__file__).parent, ".cognee_system/test_get_temporal_structured_completion" + ) + cognee.config.system_root_directory(system_directory_path) + data_directory_path = os.path.join( + pathlib.Path(__file__).parent, ".data_storage/test_get_temporal_structured_completion" + ) + cognee.config.data_root_directory(data_directory_path) + + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + await setup() + + class Company(DataPoint): + name: str + + class Person(DataPoint): + name: str + works_for: Company + works_since: int + + company1 = Company(name="Figma") + person1 = Person(name="Steve Rodger", works_for=company1, works_since=2015) + + entities = [company1, person1] + await add_data_points(entities) + + 
retriever = TemporalRetriever() + + # Test with string response model (default) + string_answer = await retriever.get_completion("When did Steve start working at Figma?") + assert isinstance(string_answer, list), f"Expected str, got {type(string_answer).__name__}" + assert all(isinstance(item, str) and item.strip() for item in string_answer), ( + "Answer should not be empty" + ) + + # Test with structured response model + structured_answer = await retriever.get_completion( + "When did Steve start working at Figma??", response_model=TestAnswer + ) + assert isinstance(structured_answer, list), ( + f"Expected list, got {type(structured_answer).__name__}" + ) + assert all(isinstance(item, TestAnswer) for item in structured_answer), ( + f"Expected TestAnswer, got {type(structured_answer).__name__}" + ) + + assert structured_answer[0].answer.strip(), "Answer field should not be empty" + assert structured_answer[0].explanation.strip(), "Explanation field should not be empty" + + class _FakeRetriever(TemporalRetriever): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) From 1ef5805c5708ae82eed17335e891eddafd794cf4 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 5 Nov 2025 17:50:13 +0100 Subject: [PATCH 070/129] fix: Resolve issue with sync migration --- .../211ab850ef3d_add_sync_operations_table.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/alembic/versions/211ab850ef3d_add_sync_operations_table.py b/alembic/versions/211ab850ef3d_add_sync_operations_table.py index 976439a32..30049b44b 100644 --- a/alembic/versions/211ab850ef3d_add_sync_operations_table.py +++ b/alembic/versions/211ab850ef3d_add_sync_operations_table.py @@ -27,6 +27,12 @@ def upgrade() -> None: connection = op.get_bind() inspector = sa.inspect(connection) + if op.get_context().dialect.name == "postgresql": + syncstatus_enum = postgresql.ENUM( + "STARTED", "IN_PROGRESS", "COMPLETED", "FAILED", "CANCELLED", name="syncstatus" + ) + 
syncstatus_enum.create(op.get_bind(), checkfirst=True) + if "sync_operations" not in inspector.get_table_names(): if op.get_context().dialect.name == "postgresql": syncstatus = postgresql.ENUM( @@ -36,8 +42,7 @@ def upgrade() -> None: "FAILED", "CANCELLED", name="syncstatus", - create_type=True, - checkfirst=True, + create_type=False, ) else: syncstatus = sa.Enum( @@ -47,8 +52,7 @@ def upgrade() -> None: "FAILED", "CANCELLED", name="syncstatus", - create_type=True, - checkfirst=True, + create_type=False, ) # Table doesn't exist, create it normally From ce64f242b7a5480bbe9763bbb523fafb7b10e9fb Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Wed, 5 Nov 2025 18:04:05 +0100 Subject: [PATCH 071/129] refactor: add droping of index as well --- alembic/versions/c946955da633_multi_tenant_support.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/alembic/versions/c946955da633_multi_tenant_support.py b/alembic/versions/c946955da633_multi_tenant_support.py index 6d21f8fc7..ba451fc03 100644 --- a/alembic/versions/c946955da633_multi_tenant_support.py +++ b/alembic/versions/c946955da633_multi_tenant_support.py @@ -126,11 +126,10 @@ def upgrade() -> None: conn = op.get_bind() conn.execute(dataset.update().values(tenant_id=tenant_id_from_dataset_owner)) - op.create_index(op.f("ix_datasets_tenant_id"), "datasets", ["tenant_id"], unique=False) - def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! 
### - op.drop_table("user_tenants") + op.drop_index(op.f("ix_datasets_tenant_id"), table_name="datasets") op.drop_column("datasets", "tenant_id") + op.drop_table("user_tenants") # ### end Alembic commands ### From 79bd2b2576b913528feb92ca6832242133bf9822 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Thu, 6 Nov 2025 09:48:01 +0100 Subject: [PATCH 072/129] chore: fixes ruff formatting --- .../v1/cognify/routers/get_cognify_router.py | 15 ++++++++---- cognee/api/v1/ontologies/__init__.py | 2 +- cognee/api/v1/ontologies/ontologies.py | 22 ++++++++++------- .../ontologies/routers/get_ontology_router.py | 11 ++++----- cognee/tests/test_ontology_endpoint.py | 24 ++++++++++++------- 5 files changed, 44 insertions(+), 30 deletions(-) diff --git a/cognee/api/v1/cognify/routers/get_cognify_router.py b/cognee/api/v1/cognify/routers/get_cognify_router.py index 246cc6c56..252ffe7bf 100644 --- a/cognee/api/v1/cognify/routers/get_cognify_router.py +++ b/cognee/api/v1/cognify/routers/get_cognify_router.py @@ -42,8 +42,7 @@ class CognifyPayloadDTO(InDTO): default="", description="Custom prompt for entity extraction and graph generation" ) ontology_key: Optional[str] = Field( - default=None, - description="Reference to previously uploaded ontology" + default=None, description="Reference to previously uploaded ontology" ) @@ -123,16 +122,22 @@ def get_cognify_router() -> APIRouter: if payload.ontology_key: ontology_service = OntologyService() try: - ontology_content = ontology_service.get_ontology_content(payload.ontology_key, user) + ontology_content = ontology_service.get_ontology_content( + payload.ontology_key, user + ) from cognee.modules.ontology.ontology_config import Config - from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver + from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import ( + RDFLibOntologyResolver, + ) from io import StringIO ontology_stream = StringIO(ontology_content) 
config_to_use: Config = { "ontology_config": { - "ontology_resolver": RDFLibOntologyResolver(ontology_file=ontology_stream) + "ontology_resolver": RDFLibOntologyResolver( + ontology_file=ontology_stream + ) } } except ValueError as e: diff --git a/cognee/api/v1/ontologies/__init__.py b/cognee/api/v1/ontologies/__init__.py index c25064edc..b90d46c3d 100644 --- a/cognee/api/v1/ontologies/__init__.py +++ b/cognee/api/v1/ontologies/__init__.py @@ -1,4 +1,4 @@ from .ontologies import OntologyService from .routers.get_ontology_router import get_ontology_router -__all__ = ["OntologyService", "get_ontology_router"] \ No newline at end of file +__all__ = ["OntologyService", "get_ontology_router"] diff --git a/cognee/api/v1/ontologies/ontologies.py b/cognee/api/v1/ontologies/ontologies.py index fb7f3cd9a..6bfb7658e 100644 --- a/cognee/api/v1/ontologies/ontologies.py +++ b/cognee/api/v1/ontologies/ontologies.py @@ -6,6 +6,7 @@ from datetime import datetime, timezone from typing import Optional from dataclasses import dataclass + @dataclass class OntologyMetadata: ontology_key: str @@ -14,6 +15,7 @@ class OntologyMetadata: uploaded_at: str description: Optional[str] = None + class OntologyService: def __init__(self): pass @@ -33,18 +35,20 @@ class OntologyService: def _load_metadata(self, user_dir: Path) -> dict: metadata_path = self._get_metadata_path(user_dir) if metadata_path.exists(): - with open(metadata_path, 'r') as f: + with open(metadata_path, "r") as f: return json.load(f) return {} def _save_metadata(self, user_dir: Path, metadata: dict): metadata_path = self._get_metadata_path(user_dir) - with open(metadata_path, 'w') as f: + with open(metadata_path, "w") as f: json.dump(metadata, f, indent=2) - async def upload_ontology(self, ontology_key: str, file, user, description: Optional[str] = None) -> OntologyMetadata: + async def upload_ontology( + self, ontology_key: str, file, user, description: Optional[str] = None + ) -> OntologyMetadata: # Validate file format - if 
not file.filename.lower().endswith('.owl'): + if not file.filename.lower().endswith(".owl"): raise ValueError("File must be in .owl format") user_dir = self._get_user_dir(str(user.id)) @@ -61,7 +65,7 @@ class OntologyService: # Save file file_path = user_dir / f"{ontology_key}.owl" - with open(file_path, 'wb') as f: + with open(file_path, "wb") as f: f.write(content) # Update metadata @@ -69,7 +73,7 @@ class OntologyService: "filename": file.filename, "size_bytes": len(content), "uploaded_at": datetime.now(timezone.utc).isoformat(), - "description": description + "description": description, } metadata[ontology_key] = ontology_metadata self._save_metadata(user_dir, metadata) @@ -79,7 +83,7 @@ class OntologyService: filename=file.filename, size_bytes=len(content), uploaded_at=ontology_metadata["uploaded_at"], - description=description + description=description, ) def get_ontology_content(self, ontology_key: str, user) -> str: @@ -93,9 +97,9 @@ class OntologyService: if not file_path.exists(): raise ValueError(f"Ontology file for key '{ontology_key}' not found") - with open(file_path, 'r', encoding='utf-8') as f: + with open(file_path, "r", encoding="utf-8") as f: return f.read() def list_ontologies(self, user) -> dict: user_dir = self._get_user_dir(str(user.id)) - return self._load_metadata(user_dir) \ No newline at end of file + return self._load_metadata(user_dir) diff --git a/cognee/api/v1/ontologies/routers/get_ontology_router.py b/cognee/api/v1/ontologies/routers/get_ontology_router.py index c171fa7bb..f5c51ba21 100644 --- a/cognee/api/v1/ontologies/routers/get_ontology_router.py +++ b/cognee/api/v1/ontologies/routers/get_ontology_router.py @@ -8,6 +8,7 @@ from cognee.shared.utils import send_telemetry from cognee import __version__ as cognee_version from ..ontologies import OntologyService + def get_ontology_router() -> APIRouter: router = APIRouter() ontology_service = OntologyService() @@ -17,7 +18,7 @@ def get_ontology_router() -> APIRouter: ontology_key: 
str = Form(...), ontology_file: UploadFile = File(...), description: Optional[str] = Form(None), - user: User = Depends(get_authenticated_user) + user: User = Depends(get_authenticated_user), ): """ Upload an ontology file with a named key for later use in cognify operations. @@ -51,7 +52,7 @@ def get_ontology_router() -> APIRouter: "ontology_key": result.ontology_key, "filename": result.filename, "size_bytes": result.size_bytes, - "uploaded_at": result.uploaded_at + "uploaded_at": result.uploaded_at, } except ValueError as e: return JSONResponse(status_code=400, content={"error": str(e)}) @@ -59,9 +60,7 @@ def get_ontology_router() -> APIRouter: return JSONResponse(status_code=500, content={"error": str(e)}) @router.get("", response_model=dict) - async def list_ontologies( - user: User = Depends(get_authenticated_user) - ): + async def list_ontologies(user: User = Depends(get_authenticated_user)): """ List all uploaded ontologies for the authenticated user. @@ -86,4 +85,4 @@ def get_ontology_router() -> APIRouter: except Exception as e: return JSONResponse(status_code=500, content={"error": str(e)}) - return router \ No newline at end of file + return router diff --git a/cognee/tests/test_ontology_endpoint.py b/cognee/tests/test_ontology_endpoint.py index 4849f8649..b5cedfafe 100644 --- a/cognee/tests/test_ontology_endpoint.py +++ b/cognee/tests/test_ontology_endpoint.py @@ -8,37 +8,40 @@ from cognee.api.client import app gau_mod = importlib.import_module("cognee.modules.users.methods.get_authenticated_user") + @pytest.fixture def client(): return TestClient(app) + @pytest.fixture def mock_user(): user = Mock() user.id = "test-user-123" return user + @pytest.fixture def mock_default_user(): """Mock default user for testing.""" return SimpleNamespace( - id=uuid.uuid4(), - email="default@example.com", - is_active=True, - tenant_id=uuid.uuid4() + id=uuid.uuid4(), email="default@example.com", is_active=True, tenant_id=uuid.uuid4() ) + @patch.object(gau_mod, 
"get_default_user", new_callable=AsyncMock) def test_upload_ontology_success(mock_get_default_user, client, mock_default_user): """Test successful ontology upload""" mock_get_default_user.return_value = mock_default_user - ontology_content = b"" + ontology_content = ( + b"" + ) unique_key = f"test_ontology_{uuid.uuid4().hex[:8]}" response = client.post( "/api/v1/ontologies", files={"ontology_file": ("test.owl", ontology_content)}, - data={"ontology_key": unique_key, "description": "Test"} + data={"ontology_key": unique_key, "description": "Test"}, ) assert response.status_code == 200 @@ -46,6 +49,7 @@ def test_upload_ontology_success(mock_get_default_user, client, mock_default_use assert data["ontology_key"] == unique_key assert "uploaded_at" in data + @patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) def test_upload_ontology_invalid_file(mock_get_default_user, client, mock_default_user): """Test 400 response for non-.owl files""" @@ -54,10 +58,11 @@ def test_upload_ontology_invalid_file(mock_get_default_user, client, mock_defaul response = client.post( "/api/v1/ontologies", files={"ontology_file": ("test.txt", b"not xml")}, - data={"ontology_key": unique_key} + data={"ontology_key": unique_key}, ) assert response.status_code == 400 + @patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) def test_upload_ontology_missing_data(mock_get_default_user, client, mock_default_user): """Test 400 response for missing file or key""" @@ -70,6 +75,7 @@ def test_upload_ontology_missing_data(mock_get_default_user, client, mock_defaul response = client.post("/api/v1/ontologies", files={"ontology_file": ("test.owl", b"xml")}) assert response.status_code == 400 + @patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) def test_upload_ontology_unauthorized(mock_get_default_user, client, mock_default_user): """Test behavior when default user is provided (no explicit authentication)""" @@ -78,7 +84,7 @@ def 
test_upload_ontology_unauthorized(mock_get_default_user, client, mock_defaul response = client.post( "/api/v1/ontologies", files={"ontology_file": ("test.owl", b"")}, - data={"ontology_key": unique_key} + data={"ontology_key": unique_key}, ) # The current system provides a default user when no explicit authentication is given @@ -86,4 +92,4 @@ def test_upload_ontology_unauthorized(mock_get_default_user, client, mock_defaul assert response.status_code == 200 data = response.json() assert data["ontology_key"] == unique_key - assert "uploaded_at" in data \ No newline at end of file + assert "uploaded_at" in data From 5271ee49019f75196adc3f3fe3069efd7bc72e14 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 6 Nov 2025 11:12:12 +0100 Subject: [PATCH 073/129] fix: Resolve issue with empty node set --- cognee/api/v1/add/routers/get_add_router.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cognee/api/v1/add/routers/get_add_router.py b/cognee/api/v1/add/routers/get_add_router.py index b2e7068b0..39dc1a3e6 100644 --- a/cognee/api/v1/add/routers/get_add_router.py +++ b/cognee/api/v1/add/routers/get_add_router.py @@ -82,7 +82,9 @@ def get_add_router() -> APIRouter: datasetName, user=user, dataset_id=datasetId, - node_set=node_set if node_set else None, + node_set=node_set + if node_set != [""] + else None, # Transform default node_set endpoint value to None ) if isinstance(add_run, PipelineRunErrored): From cc60141c8dec285e48a05e8550782cb4708eb6ad Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Thu, 6 Nov 2025 11:46:34 +0100 Subject: [PATCH 074/129] test: change key for wighted edges example --- .github/workflows/weighted_edges_tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/weighted_edges_tests.yml b/.github/workflows/weighted_edges_tests.yml index 874ef6ea4..0425ac797 100644 --- a/.github/workflows/weighted_edges_tests.yml +++ b/.github/workflows/weighted_edges_tests.yml @@ -109,12 +109,12 @@ 
jobs: LLM_PROVIDER: openai LLM_MODEL: gpt-5-mini LLM_ENDPOINT: https://api.openai.com/v1/ - LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} LLM_API_VERSION: "2024-02-01" EMBEDDING_PROVIDER: openai EMBEDDING_MODEL: text-embedding-3-small EMBEDDING_ENDPOINT: https://api.openai.com/v1/ - EMBEDDING_API_KEY: ${{ secrets.LLM_API_KEY }} + EMBEDDING_API_KEY: ${{ secrets.OPENAI_API_KEY }} EMBEDDING_API_VERSION: "2024-02-01" steps: - name: Check out repository From cdb06d2157d2602bd0e3f9c696d38368e119a8e4 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Thu, 6 Nov 2025 11:50:58 +0100 Subject: [PATCH 075/129] test: add workflow_dispatch for test purposes --- .github/workflows/weighted_edges_tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/weighted_edges_tests.yml b/.github/workflows/weighted_edges_tests.yml index 0425ac797..10778d1a0 100644 --- a/.github/workflows/weighted_edges_tests.yml +++ b/.github/workflows/weighted_edges_tests.yml @@ -17,6 +17,7 @@ on: - 'cognee/tests/unit/interfaces/graph/test_weighted_edges.py' - 'examples/python/weighted_edges_example.py' - '.github/workflows/weighted_edges_tests.yml' + workflow_dispatch: env: RUNTIME__LOG_LEVEL: ERROR From 5dc8d4e8a1206e9ce4fbf4df157ac945b83283c7 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Thu, 6 Nov 2025 11:52:39 +0100 Subject: [PATCH 076/129] test: remove workflow_dispatch --- .github/workflows/weighted_edges_tests.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/weighted_edges_tests.yml b/.github/workflows/weighted_edges_tests.yml index 10778d1a0..0425ac797 100644 --- a/.github/workflows/weighted_edges_tests.yml +++ b/.github/workflows/weighted_edges_tests.yml @@ -17,7 +17,6 @@ on: - 'cognee/tests/unit/interfaces/graph/test_weighted_edges.py' - 'examples/python/weighted_edges_example.py' - '.github/workflows/weighted_edges_tests.yml' - workflow_dispatch: env: RUNTIME__LOG_LEVEL: ERROR From 
4567ffcfff6ca850252fcced54c9324698484f9b Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Thu, 6 Nov 2025 12:27:10 +0100 Subject: [PATCH 077/129] test: change endpoint --- .github/workflows/weighted_edges_tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/weighted_edges_tests.yml b/.github/workflows/weighted_edges_tests.yml index 0425ac797..d571d354f 100644 --- a/.github/workflows/weighted_edges_tests.yml +++ b/.github/workflows/weighted_edges_tests.yml @@ -108,12 +108,12 @@ jobs: env: LLM_PROVIDER: openai LLM_MODEL: gpt-5-mini - LLM_ENDPOINT: https://api.openai.com/v1/ + LLM_ENDPOINT: https://api.openai.com/v1 LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} LLM_API_VERSION: "2024-02-01" EMBEDDING_PROVIDER: openai EMBEDDING_MODEL: text-embedding-3-small - EMBEDDING_ENDPOINT: https://api.openai.com/v1/ + EMBEDDING_ENDPOINT: https://api.openai.com/v1/embeddings EMBEDDING_API_KEY: ${{ secrets.OPENAI_API_KEY }} EMBEDDING_API_VERSION: "2024-02-01" steps: From 286584eef0039686ac9ad62f0be0a0a7a9637edc Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Thu, 6 Nov 2025 12:35:37 +0100 Subject: [PATCH 078/129] test: change embedding config --- .github/workflows/weighted_edges_tests.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/weighted_edges_tests.yml b/.github/workflows/weighted_edges_tests.yml index d571d354f..b722c5e7c 100644 --- a/.github/workflows/weighted_edges_tests.yml +++ b/.github/workflows/weighted_edges_tests.yml @@ -111,11 +111,11 @@ jobs: LLM_ENDPOINT: https://api.openai.com/v1 LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} LLM_API_VERSION: "2024-02-01" - EMBEDDING_PROVIDER: openai - EMBEDDING_MODEL: text-embedding-3-small - EMBEDDING_ENDPOINT: https://api.openai.com/v1/embeddings - EMBEDDING_API_KEY: ${{ secrets.OPENAI_API_KEY }} - EMBEDDING_API_VERSION: "2024-02-01" + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ 
secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} + steps: - name: Check out repository uses: actions/checkout@v4 From a058250c95390df84f1c44419cff8adf3a4e8269 Mon Sep 17 00:00:00 2001 From: Boris Arzentar Date: Thu, 6 Nov 2025 13:03:11 +0100 Subject: [PATCH 079/129] fix: add cognee to the local run environment --- cognee/modules/notebooks/operations/run_in_local_sandbox.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cognee/modules/notebooks/operations/run_in_local_sandbox.py b/cognee/modules/notebooks/operations/run_in_local_sandbox.py index 071deafb7..46499186e 100644 --- a/cognee/modules/notebooks/operations/run_in_local_sandbox.py +++ b/cognee/modules/notebooks/operations/run_in_local_sandbox.py @@ -2,6 +2,8 @@ import io import sys import traceback +import cognee + def wrap_in_async_handler(user_code: str) -> str: return ( @@ -34,6 +36,7 @@ def run_in_local_sandbox(code, environment=None, loop=None): environment["print"] = customPrintFunction environment["running_loop"] = loop + environment["cognee"] = cognee try: exec(code, environment) From 6b81559bb65d8501dc698761ed2fb51ee7c0d839 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Thu, 6 Nov 2025 13:04:01 +0100 Subject: [PATCH 080/129] test: changed endpoints in other tests --- .github/workflows/weighted_edges_tests.yml | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/.github/workflows/weighted_edges_tests.yml b/.github/workflows/weighted_edges_tests.yml index b722c5e7c..0b263cdcf 100644 --- a/.github/workflows/weighted_edges_tests.yml +++ b/.github/workflows/weighted_edges_tests.yml @@ -32,7 +32,7 @@ jobs: env: LLM_PROVIDER: openai LLM_MODEL: gpt-5-mini - LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} steps: - name: Check out repository @@ -67,14 +67,13 @@ jobs: env: LLM_PROVIDER: openai LLM_MODEL: gpt-5-mini - LLM_ENDPOINT: 
https://api.openai.com/v1/ - LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_ENDPOINT: https://api.openai.com/v1 + LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} LLM_API_VERSION: "2024-02-01" - EMBEDDING_PROVIDER: openai - EMBEDDING_MODEL: text-embedding-3-small - EMBEDDING_ENDPOINT: https://api.openai.com/v1/ - EMBEDDING_API_KEY: ${{ secrets.LLM_API_KEY }} - EMBEDDING_API_VERSION: "2024-02-01" + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} steps: - name: Check out repository uses: actions/checkout@v4 @@ -115,7 +114,7 @@ jobs: EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} - + steps: - name: Check out repository uses: actions/checkout@v4 From 62c599a4994c4e78c33c66925fb879b72c4ff338 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Thu, 6 Nov 2025 13:13:35 +0100 Subject: [PATCH 081/129] test: add dev to CI config for weighted edges tests --- .github/workflows/weighted_edges_tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/weighted_edges_tests.yml b/.github/workflows/weighted_edges_tests.yml index 0b263cdcf..2b4a043bf 100644 --- a/.github/workflows/weighted_edges_tests.yml +++ b/.github/workflows/weighted_edges_tests.yml @@ -2,7 +2,7 @@ name: Weighted Edges Tests on: push: - branches: [ main, weighted_edges ] + branches: [ main, dev, weighted_edges ] paths: - 'cognee/modules/graph/utils/get_graph_from_model.py' - 'cognee/infrastructure/engine/models/Edge.py' @@ -10,7 +10,7 @@ on: - 'examples/python/weighted_edges_example.py' - '.github/workflows/weighted_edges_tests.yml' pull_request: - branches: [ main ] + branches: [ main, dev ] paths: - 'cognee/modules/graph/utils/get_graph_from_model.py' - 
'cognee/infrastructure/engine/models/Edge.py' From c0e5ce04cedbf3bc4511e73ed2e2276e21c2f978 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Thu, 6 Nov 2025 14:13:55 +0100 Subject: [PATCH 082/129] Fix: fixes session history test for multiuser mode (#1746) ## Description Fixes failing session history test ## Type of Change - [x] Bug fix (non-breaking change that fixes an issue) - [ ] New feature (non-breaking change that adds functionality) - [ ] Breaking change (fix or feature that would cause existing functionality to change) - [ ] Documentation update - [ ] Code refactoring - [ ] Performance improvement - [ ] Other (please specify): ## Screenshots/Videos (if applicable) ## Pre-submission Checklist - [x] **I have tested my changes thoroughly before submitting this PR** - [x] **This PR contains minimal changes necessary to address the issue/feature** - [x] My code follows the project's coding standards and style guidelines - [x] I have added tests that prove my fix is effective or that my feature works - [x] I have added necessary documentation (if applicable) - [x] All new and existing tests pass - [x] I have searched existing PRs to ensure this change hasn't been submitted already - [x] I have linked any relevant issues in the description - [x] My commits have clear and descriptive messages ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin. 
--- .../persist_sessions_in_knowledge_graph.py | 2 +- cognee/tasks/memify/cognify_session.py | 7 ++++--- cognee/tests/test_conversation_history.py | 6 +++--- .../modules/memify_tasks/test_cognify_session.py | 12 ++++++++---- 4 files changed, 16 insertions(+), 11 deletions(-) diff --git a/cognee/memify_pipelines/persist_sessions_in_knowledge_graph.py b/cognee/memify_pipelines/persist_sessions_in_knowledge_graph.py index c0ba0a4d9..92d64c156 100644 --- a/cognee/memify_pipelines/persist_sessions_in_knowledge_graph.py +++ b/cognee/memify_pipelines/persist_sessions_in_knowledge_graph.py @@ -40,7 +40,7 @@ async def persist_sessions_in_knowledge_graph_pipeline( extraction_tasks = [Task(extract_user_sessions, session_ids=session_ids)] enrichment_tasks = [ - Task(cognify_session), + Task(cognify_session, dataset_id=dataset_to_write[0].id), ] result = await memify( diff --git a/cognee/tasks/memify/cognify_session.py b/cognee/tasks/memify/cognify_session.py index 7c276169a..f53f9afb1 100644 --- a/cognee/tasks/memify/cognify_session.py +++ b/cognee/tasks/memify/cognify_session.py @@ -6,7 +6,7 @@ from cognee.shared.logging_utils import get_logger logger = get_logger("cognify_session") -async def cognify_session(data): +async def cognify_session(data, dataset_id=None): """ Process and cognify session data into the knowledge graph. @@ -16,6 +16,7 @@ async def cognify_session(data): Args: data: Session string containing Question, Context, and Answer information. + dataset_name: Name of dataset. Raises: CogneeValidationError: If data is None or empty. 
@@ -28,9 +29,9 @@ async def cognify_session(data): logger.info("Processing session data for cognification") - await cognee.add(data, node_set=["user_sessions_from_cache"]) + await cognee.add(data, dataset_id=dataset_id, node_set=["user_sessions_from_cache"]) logger.debug("Session data added to cognee with node_set: user_sessions") - await cognee.cognify() + await cognee.cognify(datasets=[dataset_id]) logger.info("Session data successfully cognified") except CogneeValidationError: diff --git a/cognee/tests/test_conversation_history.py b/cognee/tests/test_conversation_history.py index 6b5b737f1..783baf563 100644 --- a/cognee/tests/test_conversation_history.py +++ b/cognee/tests/test_conversation_history.py @@ -56,10 +56,10 @@ async def main(): """DataCo is a data analytics company. They help businesses make sense of their data.""" ) - await cognee.add(text_1, dataset_name) - await cognee.add(text_2, dataset_name) + await cognee.add(data=text_1, dataset_name=dataset_name) + await cognee.add(data=text_2, dataset_name=dataset_name) - await cognee.cognify([dataset_name]) + await cognee.cognify(datasets=[dataset_name]) user = await get_default_user() diff --git a/cognee/tests/unit/modules/memify_tasks/test_cognify_session.py b/cognee/tests/unit/modules/memify_tasks/test_cognify_session.py index c23640fbd..8c2448287 100644 --- a/cognee/tests/unit/modules/memify_tasks/test_cognify_session.py +++ b/cognee/tests/unit/modules/memify_tasks/test_cognify_session.py @@ -16,9 +16,11 @@ async def test_cognify_session_success(): patch("cognee.add", new_callable=AsyncMock) as mock_add, patch("cognee.cognify", new_callable=AsyncMock) as mock_cognify, ): - await cognify_session(session_data) + await cognify_session(session_data, dataset_id="123") - mock_add.assert_called_once_with(session_data, node_set=["user_sessions_from_cache"]) + mock_add.assert_called_once_with( + session_data, dataset_id="123", node_set=["user_sessions_from_cache"] + ) mock_cognify.assert_called_once() @@ -101,7 
+103,9 @@ async def test_cognify_session_with_special_characters(): patch("cognee.add", new_callable=AsyncMock) as mock_add, patch("cognee.cognify", new_callable=AsyncMock) as mock_cognify, ): - await cognify_session(session_data) + await cognify_session(session_data, dataset_id="123") - mock_add.assert_called_once_with(session_data, node_set=["user_sessions_from_cache"]) + mock_add.assert_called_once_with( + session_data, dataset_id="123", node_set=["user_sessions_from_cache"] + ) mock_cognify.assert_called_once() From ac6dd08855e30349b0666e6af48da3e829079948 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 6 Nov 2025 14:35:26 +0100 Subject: [PATCH 083/129] fix: Resolve issue with sqlite index creation --- alembic/versions/c946955da633_multi_tenant_support.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/alembic/versions/c946955da633_multi_tenant_support.py b/alembic/versions/c946955da633_multi_tenant_support.py index ba451fc03..c87500907 100644 --- a/alembic/versions/c946955da633_multi_tenant_support.py +++ b/alembic/versions/c946955da633_multi_tenant_support.py @@ -126,6 +126,8 @@ def upgrade() -> None: conn = op.get_bind() conn.execute(dataset.update().values(tenant_id=tenant_id_from_dataset_owner)) + op.create_index(op.f("ix_datasets_tenant_id"), "datasets", ["tenant_id"]) + def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! 
### From ac751bacf09e26b851b5829d46330a2f7ee7f25e Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 6 Nov 2025 14:51:25 +0100 Subject: [PATCH 084/129] fix: Resolve SQLite migration issue --- .../c946955da633_multi_tenant_support.py | 24 +++++++++++++------ 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/alembic/versions/c946955da633_multi_tenant_support.py b/alembic/versions/c946955da633_multi_tenant_support.py index c87500907..a87989d9b 100644 --- a/alembic/versions/c946955da633_multi_tenant_support.py +++ b/alembic/versions/c946955da633_multi_tenant_support.py @@ -97,13 +97,23 @@ def upgrade() -> None: # Insert into user_tenants table if user_data: - op.bulk_insert( - user_tenants, - [ - {"user_id": user_id, "tenant_id": tenant_id, "created_at": _now()} - for user_id, tenant_id in user_data - ], - ) + if op.get_context().dialect.name == "sqlite": + insert_stmt = user_tenants.insert().values( + [ + {"user_id": user_id, "tenant_id": tenant_id, "created_at": _now()} + for user_id, tenant_id in user_data + ] + ) + conn.execute(insert_stmt) + conn.commit() + else: + op.bulk_insert( + user_tenants, + [ + {"user_id": user_id, "tenant_id": tenant_id, "created_at": _now()} + for user_id, tenant_id in user_data + ], + ) tenant_id_column = _get_column(insp, "datasets", "tenant_id") if not tenant_id_column: From 5bc83968f81982ad90f37e41c205c90bb1b9b3d5 Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Thu, 6 Nov 2025 15:22:48 +0100 Subject: [PATCH 085/129] feature: text chunker with overlap (#1732) ## Description - Implements `TextChunkerWithOverlap` with configurable `chunk_overlap_ratio` - Abstracts chunk_data generation via `get_chunk_data` callable (defaults to `chunk_by_paragraph`) - Parametrized tests verify `TextChunker` and `TextChunkerWithOverlap` (0% overlap) produce identical output for all edge cases. 
- Overlap-specific tests validate `TextChunkerWithOverlap` behavior ## Type of Change - [ ] Bug fix (non-breaking change that fixes an issue) - [x] New feature (non-breaking change that adds functionality) - [ ] Breaking change (fix or feature that would cause existing functionality to change) - [ ] Documentation update - [ ] Code refactoring - [ ] Performance improvement - [ ] Other (please specify): ## Screenshots/Videos (if applicable) ## Pre-submission Checklist - [x] **I have tested my changes thoroughly before submitting this PR** - [x] **This PR contains minimal changes necessary to address the issue/feature** - [x] My code follows the project's coding standards and style guidelines - [x] I have added tests that prove my fix is effective or that my feature works - [ ] I have added necessary documentation (if applicable) - [x] All new and existing tests pass - [x] I have searched existing PRs to ensure this change hasn't been submitted already - [ ] I have linked any relevant issues in the description - [x] My commits have clear and descriptive messages ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin. 
--------- Co-authored-by: hajdul88 <52442977+hajdul88@users.noreply.github.com> --- .../chunking/text_chunker_with_overlap.py | 124 +++++++ .../modules/chunking/test_text_chunker.py | 248 ++++++++++++++ .../test_text_chunker_with_overlap.py | 324 ++++++++++++++++++ 3 files changed, 696 insertions(+) create mode 100644 cognee/modules/chunking/text_chunker_with_overlap.py create mode 100644 cognee/tests/unit/modules/chunking/test_text_chunker.py create mode 100644 cognee/tests/unit/modules/chunking/test_text_chunker_with_overlap.py diff --git a/cognee/modules/chunking/text_chunker_with_overlap.py b/cognee/modules/chunking/text_chunker_with_overlap.py new file mode 100644 index 000000000..4b9c23079 --- /dev/null +++ b/cognee/modules/chunking/text_chunker_with_overlap.py @@ -0,0 +1,124 @@ +from cognee.shared.logging_utils import get_logger +from uuid import NAMESPACE_OID, uuid5 + +from cognee.tasks.chunks import chunk_by_paragraph +from cognee.modules.chunking.Chunker import Chunker +from .models.DocumentChunk import DocumentChunk + +logger = get_logger() + + +class TextChunkerWithOverlap(Chunker): + def __init__( + self, + document, + get_text: callable, + max_chunk_size: int, + chunk_overlap_ratio: float = 0.0, + get_chunk_data: callable = None, + ): + super().__init__(document, get_text, max_chunk_size) + self._accumulated_chunk_data = [] + self._accumulated_size = 0 + self.chunk_overlap_ratio = chunk_overlap_ratio + self.chunk_overlap = int(max_chunk_size * chunk_overlap_ratio) + + if get_chunk_data is not None: + self.get_chunk_data = get_chunk_data + elif chunk_overlap_ratio > 0: + paragraph_max_size = int(0.5 * chunk_overlap_ratio * max_chunk_size) + self.get_chunk_data = lambda text: chunk_by_paragraph( + text, paragraph_max_size, batch_paragraphs=True + ) + else: + self.get_chunk_data = lambda text: chunk_by_paragraph( + text, self.max_chunk_size, batch_paragraphs=True + ) + + def _accumulation_overflows(self, chunk_data): + """Check if adding chunk_data would 
exceed max_chunk_size.""" + return self._accumulated_size + chunk_data["chunk_size"] > self.max_chunk_size + + def _accumulate_chunk_data(self, chunk_data): + """Add chunk_data to the current accumulation.""" + self._accumulated_chunk_data.append(chunk_data) + self._accumulated_size += chunk_data["chunk_size"] + + def _clear_accumulation(self): + """Reset accumulation, keeping overlap chunk_data based on chunk_overlap_ratio.""" + if self.chunk_overlap == 0: + self._accumulated_chunk_data = [] + self._accumulated_size = 0 + return + + # Keep chunk_data from the end that fit in overlap + overlap_chunk_data = [] + overlap_size = 0 + + for chunk_data in reversed(self._accumulated_chunk_data): + if overlap_size + chunk_data["chunk_size"] <= self.chunk_overlap: + overlap_chunk_data.insert(0, chunk_data) + overlap_size += chunk_data["chunk_size"] + else: + break + + self._accumulated_chunk_data = overlap_chunk_data + self._accumulated_size = overlap_size + + def _create_chunk(self, text, size, cut_type, chunk_id=None): + """Create a DocumentChunk with standard metadata.""" + try: + return DocumentChunk( + id=chunk_id or uuid5(NAMESPACE_OID, f"{str(self.document.id)}-{self.chunk_index}"), + text=text, + chunk_size=size, + is_part_of=self.document, + chunk_index=self.chunk_index, + cut_type=cut_type, + contains=[], + metadata={"index_fields": ["text"]}, + ) + except Exception as e: + logger.error(e) + raise e + + def _create_chunk_from_accumulation(self): + """Create a DocumentChunk from current accumulated chunk_data.""" + chunk_text = " ".join(chunk["text"] for chunk in self._accumulated_chunk_data) + return self._create_chunk( + text=chunk_text, + size=self._accumulated_size, + cut_type=self._accumulated_chunk_data[-1]["cut_type"], + ) + + def _emit_chunk(self, chunk_data): + """Emit a chunk when accumulation overflows.""" + if len(self._accumulated_chunk_data) > 0: + chunk = self._create_chunk_from_accumulation() + self._clear_accumulation() + 
self._accumulate_chunk_data(chunk_data) + else: + # Handle single chunk_data exceeding max_chunk_size + chunk = self._create_chunk( + text=chunk_data["text"], + size=chunk_data["chunk_size"], + cut_type=chunk_data["cut_type"], + chunk_id=chunk_data["chunk_id"], + ) + + self.chunk_index += 1 + return chunk + + async def read(self): + async for content_text in self.get_text(): + for chunk_data in self.get_chunk_data(content_text): + if not self._accumulation_overflows(chunk_data): + self._accumulate_chunk_data(chunk_data) + continue + + yield self._emit_chunk(chunk_data) + + if len(self._accumulated_chunk_data) == 0: + return + + yield self._create_chunk_from_accumulation() diff --git a/cognee/tests/unit/modules/chunking/test_text_chunker.py b/cognee/tests/unit/modules/chunking/test_text_chunker.py new file mode 100644 index 000000000..d535f74b0 --- /dev/null +++ b/cognee/tests/unit/modules/chunking/test_text_chunker.py @@ -0,0 +1,248 @@ +"""Unit tests for TextChunker and TextChunkerWithOverlap behavioral equivalence.""" + +import pytest +from uuid import uuid4 + +from cognee.modules.chunking.TextChunker import TextChunker +from cognee.modules.chunking.text_chunker_with_overlap import TextChunkerWithOverlap +from cognee.modules.data.processing.document_types import Document + + +@pytest.fixture(params=["TextChunker", "TextChunkerWithOverlap"]) +def chunker_class(request): + """Parametrize tests to run against both implementations.""" + return TextChunker if request.param == "TextChunker" else TextChunkerWithOverlap + + +@pytest.fixture +def make_text_generator(): + """Factory for async text generators.""" + + def _factory(*texts): + async def gen(): + for text in texts: + yield text + + return gen + + return _factory + + +async def collect_chunks(chunker): + """Consume async generator and return list of chunks.""" + chunks = [] + async for chunk in chunker.read(): + chunks.append(chunk) + return chunks + + +@pytest.mark.asyncio +async def 
test_empty_input_produces_no_chunks(chunker_class, make_text_generator): + """Empty input should yield no chunks.""" + document = Document( + id=uuid4(), + name="test_document", + raw_data_location="/test/path", + external_metadata=None, + mime_type="text/plain", + ) + get_text = make_text_generator("") + chunker = chunker_class(document, get_text, max_chunk_size=512) + chunks = await collect_chunks(chunker) + + assert len(chunks) == 0, "Empty input should produce no chunks" + + +@pytest.mark.asyncio +async def test_whitespace_only_input_emits_single_chunk(chunker_class, make_text_generator): + """Whitespace-only input should produce exactly one chunk with unchanged text.""" + whitespace_text = " \n\t \r\n " + document = Document( + id=uuid4(), + name="test_document", + raw_data_location="/test/path", + external_metadata=None, + mime_type="text/plain", + ) + get_text = make_text_generator(whitespace_text) + chunker = chunker_class(document, get_text, max_chunk_size=512) + chunks = await collect_chunks(chunker) + + assert len(chunks) == 1, "Whitespace-only input should produce exactly one chunk" + assert chunks[0].text == whitespace_text, "Chunk text should equal input (whitespace preserved)" + assert chunks[0].chunk_index == 0, "First chunk should have index 0" + + +@pytest.mark.asyncio +async def test_single_paragraph_below_limit_emits_one_chunk(chunker_class, make_text_generator): + """Single paragraph below limit should emit exactly one chunk.""" + text = "This is a short paragraph." 
+ document = Document( + id=uuid4(), + name="test_document", + raw_data_location="/test/path", + external_metadata=None, + mime_type="text/plain", + ) + get_text = make_text_generator(text) + chunker = chunker_class(document, get_text, max_chunk_size=512) + chunks = await collect_chunks(chunker) + + assert len(chunks) == 1, "Single short paragraph should produce exactly one chunk" + assert chunks[0].text == text, "Chunk text should match input" + assert chunks[0].chunk_index == 0, "First chunk should have index 0" + assert chunks[0].chunk_size > 0, "Chunk should have positive size" + + +@pytest.mark.asyncio +async def test_oversized_paragraph_gets_emitted_as_a_single_chunk( + chunker_class, make_text_generator +): + """Oversized paragraph from chunk_by_paragraph should be emitted as single chunk.""" + text = ("A" * 1500) + ". Next sentence." + document = Document( + id=uuid4(), + name="test_document", + raw_data_location="/test/path", + external_metadata=None, + mime_type="text/plain", + ) + get_text = make_text_generator(text) + chunker = chunker_class(document, get_text, max_chunk_size=50) + chunks = await collect_chunks(chunker) + + assert len(chunks) == 2, "Should produce 2 chunks (oversized paragraph + next sentence)" + assert chunks[0].chunk_size > 50, "First chunk should be oversized" + assert chunks[0].chunk_index == 0, "First chunk should have index 0" + assert chunks[1].chunk_index == 1, "Second chunk should have index 1" + + +@pytest.mark.asyncio +async def test_overflow_on_next_paragraph_emits_separate_chunk(chunker_class, make_text_generator): + """First paragraph near limit plus small paragraph should produce two separate chunks.""" + first_para = " ".join(["word"] * 5) + second_para = "Short text." 
+ text = first_para + " " + second_para + document = Document( + id=uuid4(), + name="test_document", + raw_data_location="/test/path", + external_metadata=None, + mime_type="text/plain", + ) + get_text = make_text_generator(text) + chunker = chunker_class(document, get_text, max_chunk_size=10) + chunks = await collect_chunks(chunker) + + assert len(chunks) == 2, "Should produce 2 chunks due to overflow" + assert chunks[0].text.strip() == first_para, "First chunk should contain only first paragraph" + assert chunks[1].text.strip() == second_para, ( + "Second chunk should contain only second paragraph" + ) + assert chunks[0].chunk_index == 0, "First chunk should have index 0" + assert chunks[1].chunk_index == 1, "Second chunk should have index 1" + + +@pytest.mark.asyncio +async def test_small_paragraphs_batch_correctly(chunker_class, make_text_generator): + """Multiple small paragraphs should batch together with joiner spaces counted.""" + paragraphs = [" ".join(["word"] * 12) for _ in range(40)] + text = " ".join(paragraphs) + document = Document( + id=uuid4(), + name="test_document", + raw_data_location="/test/path", + external_metadata=None, + mime_type="text/plain", + ) + get_text = make_text_generator(text) + chunker = chunker_class(document, get_text, max_chunk_size=49) + chunks = await collect_chunks(chunker) + + assert len(chunks) == 20, ( + "Should batch paragraphs (2 per chunk: 12 words × 2 tokens = 24, 24 + 1 joiner + 24 = 49)" + ) + assert all(c.chunk_index == i for i, c in enumerate(chunks)), ( + "Chunk indices should be sequential" + ) + all_text = " ".join(chunk.text.strip() for chunk in chunks) + expected_text = " ".join(paragraphs) + assert all_text == expected_text, "All paragraph text should be preserved with correct spacing" + + +@pytest.mark.asyncio +async def test_alternating_large_and_small_paragraphs_dont_batch( + chunker_class, make_text_generator +): + """Alternating near-max and small paragraphs should each become separate chunks.""" + 
large1 = "word" * 15 + "." + small1 = "Short." + large2 = "word" * 15 + "." + small2 = "Tiny." + text = large1 + " " + small1 + " " + large2 + " " + small2 + document = Document( + id=uuid4(), + name="test_document", + raw_data_location="/test/path", + external_metadata=None, + mime_type="text/plain", + ) + max_chunk_size = 10 + get_text = make_text_generator(text) + chunker = chunker_class(document, get_text, max_chunk_size=max_chunk_size) + chunks = await collect_chunks(chunker) + + assert len(chunks) == 4, "Should produce multiple chunks" + assert all(c.chunk_index == i for i, c in enumerate(chunks)), ( + "Chunk indices should be sequential" + ) + assert chunks[0].chunk_size > max_chunk_size, ( + "First chunk should be oversized (large paragraph)" + ) + assert chunks[1].chunk_size <= max_chunk_size, "Second chunk should be small (small paragraph)" + assert chunks[2].chunk_size > max_chunk_size, ( + "Third chunk should be oversized (large paragraph)" + ) + assert chunks[3].chunk_size <= max_chunk_size, "Fourth chunk should be small (small paragraph)" + + +@pytest.mark.asyncio +async def test_chunk_indices_and_ids_are_deterministic(chunker_class, make_text_generator): + """Running chunker twice on identical input should produce identical indices and IDs.""" + sentence1 = "one " * 4 + ". " + sentence2 = "two " * 4 + ". " + sentence3 = "one " * 4 + ". " + sentence4 = "two " * 4 + ". 
" + text = sentence1 + sentence2 + sentence3 + sentence4 + doc_id = uuid4() + max_chunk_size = 20 + + document1 = Document( + id=doc_id, + name="test_document", + raw_data_location="/test/path", + external_metadata=None, + mime_type="text/plain", + ) + get_text1 = make_text_generator(text) + chunker1 = chunker_class(document1, get_text1, max_chunk_size=max_chunk_size) + chunks1 = await collect_chunks(chunker1) + + document2 = Document( + id=doc_id, + name="test_document", + raw_data_location="/test/path", + external_metadata=None, + mime_type="text/plain", + ) + get_text2 = make_text_generator(text) + chunker2 = chunker_class(document2, get_text2, max_chunk_size=max_chunk_size) + chunks2 = await collect_chunks(chunker2) + + assert len(chunks1) == 2, "Should produce exactly 2 chunks (4 sentences, 2 per chunk)" + assert len(chunks2) == 2, "Should produce exactly 2 chunks (4 sentences, 2 per chunk)" + assert [c.chunk_index for c in chunks1] == [0, 1], "First run indices should be [0, 1]" + assert [c.chunk_index for c in chunks2] == [0, 1], "Second run indices should be [0, 1]" + assert chunks1[0].id == chunks2[0].id, "First chunk ID should be deterministic" + assert chunks1[1].id == chunks2[1].id, "Second chunk ID should be deterministic" + assert chunks1[0].id != chunks1[1].id, "Chunk IDs should be unique within a run" diff --git a/cognee/tests/unit/modules/chunking/test_text_chunker_with_overlap.py b/cognee/tests/unit/modules/chunking/test_text_chunker_with_overlap.py new file mode 100644 index 000000000..9d7be6936 --- /dev/null +++ b/cognee/tests/unit/modules/chunking/test_text_chunker_with_overlap.py @@ -0,0 +1,324 @@ +"""Unit tests for TextChunkerWithOverlap overlap behavior.""" + +import sys +import pytest +from uuid import uuid4 +from unittest.mock import patch + +from cognee.modules.chunking.text_chunker_with_overlap import TextChunkerWithOverlap +from cognee.modules.data.processing.document_types import Document +from cognee.tasks.chunks import 
chunk_by_paragraph + + +@pytest.fixture +def make_text_generator(): + """Factory for async text generators.""" + + def _factory(*texts): + async def gen(): + for text in texts: + yield text + + return gen + + return _factory + + +@pytest.fixture +def make_controlled_chunk_data(): + """Factory for controlled chunk_data generators.""" + + def _factory(*sentences, chunk_size_per_sentence=10): + def _chunk_data(text): + return [ + { + "text": sentence, + "chunk_size": chunk_size_per_sentence, + "cut_type": "sentence", + "chunk_id": uuid4(), + } + for sentence in sentences + ] + + return _chunk_data + + return _factory + + +@pytest.mark.asyncio +async def test_half_overlap_preserves_content_across_chunks( + make_text_generator, make_controlled_chunk_data +): + """With 50% overlap, consecutive chunks should share half their content.""" + s1 = "one" + s2 = "two" + s3 = "three" + s4 = "four" + text = "dummy" + document = Document( + id=uuid4(), + name="test_document", + raw_data_location="/test/path", + external_metadata=None, + mime_type="text/plain", + ) + get_text = make_text_generator(text) + get_chunk_data = make_controlled_chunk_data(s1, s2, s3, s4, chunk_size_per_sentence=10) + chunker = TextChunkerWithOverlap( + document, + get_text, + max_chunk_size=20, + chunk_overlap_ratio=0.5, + get_chunk_data=get_chunk_data, + ) + chunks = [chunk async for chunk in chunker.read()] + + assert len(chunks) == 3, "Should produce exactly 3 chunks (s1+s2, s2+s3, s3+s4)" + assert [c.chunk_index for c in chunks] == [0, 1, 2], "Chunk indices should be [0, 1, 2]" + assert "one" in chunks[0].text and "two" in chunks[0].text, "Chunk 0 should contain s1 and s2" + assert "two" in chunks[1].text and "three" in chunks[1].text, ( + "Chunk 1 should contain s2 (overlap) and s3" + ) + assert "three" in chunks[2].text and "four" in chunks[2].text, ( + "Chunk 2 should contain s3 (overlap) and s4" + ) + + +@pytest.mark.asyncio +async def test_zero_overlap_produces_no_duplicate_content( + 
make_text_generator, make_controlled_chunk_data +): + """With 0% overlap, no content should appear in multiple chunks.""" + s1 = "one" + s2 = "two" + s3 = "three" + s4 = "four" + text = "dummy" + document = Document( + id=uuid4(), + name="test_document", + raw_data_location="/test/path", + external_metadata=None, + mime_type="text/plain", + ) + get_text = make_text_generator(text) + get_chunk_data = make_controlled_chunk_data(s1, s2, s3, s4, chunk_size_per_sentence=10) + chunker = TextChunkerWithOverlap( + document, + get_text, + max_chunk_size=20, + chunk_overlap_ratio=0.0, + get_chunk_data=get_chunk_data, + ) + chunks = [chunk async for chunk in chunker.read()] + + assert len(chunks) == 2, "Should produce exactly 2 chunks (s1+s2, s3+s4)" + assert "one" in chunks[0].text and "two" in chunks[0].text, ( + "First chunk should contain s1 and s2" + ) + assert "three" in chunks[1].text and "four" in chunks[1].text, ( + "Second chunk should contain s3 and s4" + ) + assert "two" not in chunks[1].text and "three" not in chunks[0].text, ( + "No overlap: end of chunk 0 should not appear in chunk 1" + ) + + +@pytest.mark.asyncio +async def test_small_overlap_ratio_creates_minimal_overlap( + make_text_generator, make_controlled_chunk_data +): + """With 25% overlap ratio, chunks should have minimal overlap.""" + s1 = "alpha" + s2 = "beta" + s3 = "gamma" + s4 = "delta" + s5 = "epsilon" + text = "dummy" + document = Document( + id=uuid4(), + name="test_document", + raw_data_location="/test/path", + external_metadata=None, + mime_type="text/plain", + ) + get_text = make_text_generator(text) + get_chunk_data = make_controlled_chunk_data(s1, s2, s3, s4, s5, chunk_size_per_sentence=10) + chunker = TextChunkerWithOverlap( + document, + get_text, + max_chunk_size=40, + chunk_overlap_ratio=0.25, + get_chunk_data=get_chunk_data, + ) + chunks = [chunk async for chunk in chunker.read()] + + assert len(chunks) == 2, "Should produce exactly 2 chunks" + assert [c.chunk_index for c in chunks] 
== [0, 1], "Chunk indices should be [0, 1]" + assert all(token in chunks[0].text for token in [s1, s2, s3, s4]), ( + "Chunk 0 should contain s1 through s4" + ) + assert s4 in chunks[1].text and s5 in chunks[1].text, ( + "Chunk 1 should contain overlap s4 and new content s5" + ) + + +@pytest.mark.asyncio +async def test_high_overlap_ratio_creates_significant_overlap( + make_text_generator, make_controlled_chunk_data +): + """With 75% overlap ratio, consecutive chunks should share most content.""" + s1 = "red" + s2 = "blue" + s3 = "green" + s4 = "yellow" + s5 = "purple" + text = "dummy" + document = Document( + id=uuid4(), + name="test_document", + raw_data_location="/test/path", + external_metadata=None, + mime_type="text/plain", + ) + get_text = make_text_generator(text) + get_chunk_data = make_controlled_chunk_data(s1, s2, s3, s4, s5, chunk_size_per_sentence=5) + chunker = TextChunkerWithOverlap( + document, + get_text, + max_chunk_size=20, + chunk_overlap_ratio=0.75, + get_chunk_data=get_chunk_data, + ) + chunks = [chunk async for chunk in chunker.read()] + + assert len(chunks) == 2, "Should produce exactly 2 chunks with 75% overlap" + assert [c.chunk_index for c in chunks] == [0, 1], "Chunk indices should be [0, 1]" + assert all(token in chunks[0].text for token in [s1, s2, s3, s4]), ( + "Chunk 0 should contain s1, s2, s3, s4" + ) + assert all(token in chunks[1].text for token in [s2, s3, s4, s5]), ( + "Chunk 1 should contain s2, s3, s4 (overlap) and s5" + ) + + +@pytest.mark.asyncio +async def test_single_chunk_no_dangling_overlap(make_text_generator, make_controlled_chunk_data): + """Text that fits in one chunk should produce exactly one chunk, no overlap artifact.""" + s1 = "alpha" + s2 = "beta" + text = "dummy" + document = Document( + id=uuid4(), + name="test_document", + raw_data_location="/test/path", + external_metadata=None, + mime_type="text/plain", + ) + get_text = make_text_generator(text) + get_chunk_data = make_controlled_chunk_data(s1, s2, 
chunk_size_per_sentence=10) + chunker = TextChunkerWithOverlap( + document, + get_text, + max_chunk_size=20, + chunk_overlap_ratio=0.5, + get_chunk_data=get_chunk_data, + ) + chunks = [chunk async for chunk in chunker.read()] + + assert len(chunks) == 1, ( + "Should produce exactly 1 chunk when content fits within max_chunk_size" + ) + assert chunks[0].chunk_index == 0, "Single chunk should have index 0" + assert "alpha" in chunks[0].text and "beta" in chunks[0].text, ( + "Single chunk should contain all content" + ) + + +@pytest.mark.asyncio +async def test_paragraph_chunking_with_overlap(make_text_generator): + """Test that chunk_by_paragraph integration produces 25% overlap between chunks.""" + + def mock_get_embedding_engine(): + class MockEngine: + tokenizer = None + + return MockEngine() + + chunk_by_sentence_module = sys.modules.get("cognee.tasks.chunks.chunk_by_sentence") + + max_chunk_size = 20 + overlap_ratio = 0.25 # 5 token overlap + paragraph_max_size = int(0.5 * overlap_ratio * max_chunk_size) # = 2 + + text = ( + "A0 A1. A2 A3. A4 A5. A6 A7. A8 A9. " # 10 tokens (0-9) + "B0 B1. B2 B3. B4 B5. B6 B7. B8 B9. " # 10 tokens (10-19) + "C0 C1. C2 C3. C4 C5. C6 C7. C8 C9. " # 10 tokens (20-29) + "D0 D1. D2 D3. D4 D5. D6 D7. D8 D9. " # 10 tokens (30-39) + "E0 E1. E2 E3. E4 E5. E6 E7. E8 E9." 
# 10 tokens (40-49) + ) + + document = Document( + id=uuid4(), + name="test_document", + raw_data_location="/test/path", + external_metadata=None, + mime_type="text/plain", + ) + + get_text = make_text_generator(text) + + def get_chunk_data(text_input): + return chunk_by_paragraph( + text_input, max_chunk_size=paragraph_max_size, batch_paragraphs=True + ) + + with patch.object( + chunk_by_sentence_module, "get_embedding_engine", side_effect=mock_get_embedding_engine + ): + chunker = TextChunkerWithOverlap( + document, + get_text, + max_chunk_size=max_chunk_size, + chunk_overlap_ratio=overlap_ratio, + get_chunk_data=get_chunk_data, + ) + chunks = [chunk async for chunk in chunker.read()] + + assert len(chunks) == 3, f"Should produce exactly 3 chunks, got {len(chunks)}" + + assert chunks[0].chunk_index == 0, "First chunk should have index 0" + assert chunks[1].chunk_index == 1, "Second chunk should have index 1" + assert chunks[2].chunk_index == 2, "Third chunk should have index 2" + + assert "A0" in chunks[0].text, "Chunk 0 should start with A0" + assert "A9" in chunks[0].text, "Chunk 0 should contain A9" + assert "B0" in chunks[0].text, "Chunk 0 should contain B0" + assert "B9" in chunks[0].text, "Chunk 0 should contain up to B9 (20 tokens)" + + assert "B" in chunks[1].text, "Chunk 1 should have overlap from B section" + assert "C" in chunks[1].text, "Chunk 1 should contain C section" + assert "D" in chunks[1].text, "Chunk 1 should contain D section" + + assert "D" in chunks[2].text, "Chunk 2 should have overlap from D section" + assert "E0" in chunks[2].text, "Chunk 2 should contain E0" + assert "E9" in chunks[2].text, "Chunk 2 should end with E9" + + chunk_0_end_words = chunks[0].text.split()[-4:] + chunk_1_words = chunks[1].text.split() + overlap_0_1 = any(word in chunk_1_words for word in chunk_0_end_words) + assert overlap_0_1, ( + f"No overlap detected between chunks 0 and 1. 
" + f"Chunk 0 ends with: {chunk_0_end_words}, " + f"Chunk 1 starts with: {chunk_1_words[:6]}" + ) + + chunk_1_end_words = chunks[1].text.split()[-4:] + chunk_2_words = chunks[2].text.split() + overlap_1_2 = any(word in chunk_2_words for word in chunk_1_end_words) + assert overlap_1_2, ( + f"No overlap detected between chunks 1 and 2. " + f"Chunk 1 ends with: {chunk_1_end_words}, " + f"Chunk 2 starts with: {chunk_2_words[:6]}" + ) From d5caba144c603c59b2960e6c6c88028553db0b57 Mon Sep 17 00:00:00 2001 From: Pavel Zorin Date: Thu, 6 Nov 2025 15:57:52 +0100 Subject: [PATCH 086/129] CI: Initial Release test workflow --- .github/workflows/e2e_tests.yml | 41 ---------------- .github/workflows/load_tests.yml | 76 ++++++++++++++++++++++++++++++ .github/workflows/release_test.yml | 14 ++++++ 3 files changed, 90 insertions(+), 41 deletions(-) create mode 100644 .github/workflows/load_tests.yml create mode 100644 .github/workflows/release_test.yml diff --git a/.github/workflows/e2e_tests.yml b/.github/workflows/e2e_tests.yml index 0596f22d3..70a4b56e6 100644 --- a/.github/workflows/e2e_tests.yml +++ b/.github/workflows/e2e_tests.yml @@ -447,44 +447,3 @@ jobs: DB_USERNAME: cognee DB_PASSWORD: cognee run: uv run python ./cognee/tests/test_conversation_history.py - - test-load: - name: Test Load - runs-on: ubuntu-22.04 - steps: - - name: Check out repository - uses: actions/checkout@v4 - - - name: Cognee Setup - uses: ./.github/actions/cognee_setup - with: - python-version: '3.11.x' - extra-dependencies: "aws" - - - name: Set File Descriptor Limit - run: sudo prlimit --pid $$ --nofile=4096:4096 - - - name: Verify File Descriptor Limit - run: ulimit -n - - - name: Dependencies already installed - run: echo "Dependencies already installed in setup" - - - name: Run Load Test - env: - ENV: 'dev' - ENABLE_BACKEND_ACCESS_CONTROL: True - LLM_MODEL: ${{ secrets.LLM_MODEL }} - LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} - LLM_API_KEY: ${{ secrets.LLM_API_KEY }} - LLM_API_VERSION: ${{ 
secrets.LLM_API_VERSION }} - EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} - EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} - EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} - EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} - STORAGE_BACKEND: s3 - AWS_REGION: eu-west-1 - AWS_ENDPOINT_URL: https://s3-eu-west-1.amazonaws.com - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_S3_DEV_USER_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_S3_DEV_USER_SECRET_KEY }} - run: uv run python ./cognee/tests/test_load.py \ No newline at end of file diff --git a/.github/workflows/load_tests.yml b/.github/workflows/load_tests.yml new file mode 100644 index 000000000..ff11bb88b --- /dev/null +++ b/.github/workflows/load_tests.yml @@ -0,0 +1,76 @@ +name: Load tests + +permissions: + contents: read + +on: + workflow_dispatch: + workflow_call: + secrets: + LLM_MODEL: + required: true + LLM_ENDPOINT: + required: true + LLM_API_KEY: + required: true + LLM_API_VERSION: + required: true + EMBEDDING_MODEL: + required: true + EMBEDDING_ENDPOINT: + required: true + EMBEDDING_API_KEY: + required: true + EMBEDDING_API_VERSION: + required: true + OPENAI_API_KEY: + required: true + AWS_ACCESS_KEY_ID: + required: true + AWS_SECRET_ACCESS_KEY: + required: true + +jobs: + test-load: + name: Test Load + runs-on: ubuntu-22.04 + timeout-minutes: 60 + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Cognee Setup + uses: ./.github/actions/cognee_setup + with: + python-version: '3.11.x' + extra-dependencies: "aws" + + - name: Set File Descriptor Limit + run: sudo prlimit --pid $$ --nofile=4096:4096 + + - name: Verify File Descriptor Limit + run: ulimit -n + + - name: Dependencies already installed + run: echo "Dependencies already installed in setup" + + - name: Run Load Test + env: + ENV: 'dev' + ENABLE_BACKEND_ACCESS_CONTROL: True + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + 
LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} + STORAGE_BACKEND: s3 + AWS_REGION: eu-west-1 + AWS_ENDPOINT_URL: https://s3-eu-west-1.amazonaws.com + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_S3_DEV_USER_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_S3_DEV_USER_SECRET_KEY }} + run: uv run python ./cognee/tests/test_load.py + + diff --git a/.github/workflows/release_test.yml b/.github/workflows/release_test.yml new file mode 100644 index 000000000..23679bef4 --- /dev/null +++ b/.github/workflows/release_test.yml @@ -0,0 +1,14 @@ +# Long-running, heavy and resource-consuming tests for release validation +name: Release Test Workflow + +permissions: + contents: read + +on: + workflow_dispatch: + +jobs: + load-tests: + name: Load Tests + uses: ./.github/workflows/load_tests.yml + secrets: inherit \ No newline at end of file From ef3a3826698d89cfdf5bed62b6ea9f93576122d8 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 6 Nov 2025 16:23:54 +0100 Subject: [PATCH 087/129] refactor: use batch insert for SQLite as well --- .../c946955da633_multi_tenant_support.py | 26 ++++++------------- 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/alembic/versions/c946955da633_multi_tenant_support.py b/alembic/versions/c946955da633_multi_tenant_support.py index a87989d9b..d8fccdfbf 100644 --- a/alembic/versions/c946955da633_multi_tenant_support.py +++ b/alembic/versions/c946955da633_multi_tenant_support.py @@ -97,23 +97,13 @@ def upgrade() -> None: # Insert into user_tenants table if user_data: - if op.get_context().dialect.name == "sqlite": - insert_stmt = user_tenants.insert().values( - [ - {"user_id": user_id, "tenant_id": tenant_id, "created_at": _now()} - for user_id, tenant_id in user_data - ] - ) - conn.execute(insert_stmt) - conn.commit() - else: - 
op.bulk_insert( - user_tenants, - [ - {"user_id": user_id, "tenant_id": tenant_id, "created_at": _now()} - for user_id, tenant_id in user_data - ], - ) + op.bulk_insert( + user_tenants, + [ + {"user_id": user_id, "tenant_id": tenant_id, "created_at": _now()} + for user_id, tenant_id in user_data + ], + ) tenant_id_column = _get_column(insp, "datasets", "tenant_id") if not tenant_id_column: @@ -141,7 +131,7 @@ def upgrade() -> None: def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### + op.drop_table("user_tenants") op.drop_index(op.f("ix_datasets_tenant_id"), table_name="datasets") op.drop_column("datasets", "tenant_id") - op.drop_table("user_tenants") # ### end Alembic commands ### From c146de3a4d2f5327f4cffd347a8a782e39906da0 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 6 Nov 2025 16:41:00 +0100 Subject: [PATCH 088/129] fix: Remove creation of database and db tables from env.py --- alembic/env.py | 5 ----- alembic/versions/c946955da633_multi_tenant_support.py | 4 ++++ 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/alembic/env.py b/alembic/env.py index 1cbef65f7..8ca09968d 100644 --- a/alembic/env.py +++ b/alembic/env.py @@ -87,11 +87,6 @@ db_engine = get_relational_engine() print("Using database:", db_engine.db_uri) -if "sqlite" in db_engine.db_uri: - from cognee.infrastructure.utils.run_sync import run_sync - - run_sync(db_engine.create_database()) - config.set_section_option( config.config_ini_section, "SQLALCHEMY_DATABASE_URI", diff --git a/alembic/versions/c946955da633_multi_tenant_support.py b/alembic/versions/c946955da633_multi_tenant_support.py index d8fccdfbf..7806fdde8 100644 --- a/alembic/versions/c946955da633_multi_tenant_support.py +++ b/alembic/versions/c946955da633_multi_tenant_support.py @@ -79,6 +79,10 @@ def upgrade() -> None: dataset = _define_dataset_table() user = _define_user_table() + print(insp.get_table_names()) + + print(_get_column(insp, "user_tenants", "tenant_id")) + if 
"user_tenants" not in insp.get_table_names(): # Define table with all necessary columns including primary key user_tenants = op.create_table( From 49fbad9ec0cee49048c97ee6e80c78963a656042 Mon Sep 17 00:00:00 2001 From: Pavel Zorin Date: Thu, 6 Nov 2025 16:44:43 +0100 Subject: [PATCH 089/129] CI: Run release workflow on PR to main --- .github/workflows/release_test.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/release_test.yml b/.github/workflows/release_test.yml index 23679bef4..6ac3ca515 100644 --- a/.github/workflows/release_test.yml +++ b/.github/workflows/release_test.yml @@ -6,6 +6,9 @@ permissions: on: workflow_dispatch: + pull_request: + branches: + - main jobs: load-tests: From efb46c99f9d0d5ac426540b95aadd0a1bfd3e5de Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 6 Nov 2025 16:47:42 +0100 Subject: [PATCH 090/129] fix: resolve issue with sqlite migration --- alembic/versions/c946955da633_multi_tenant_support.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/alembic/versions/c946955da633_multi_tenant_support.py b/alembic/versions/c946955da633_multi_tenant_support.py index 7806fdde8..d8fccdfbf 100644 --- a/alembic/versions/c946955da633_multi_tenant_support.py +++ b/alembic/versions/c946955da633_multi_tenant_support.py @@ -79,10 +79,6 @@ def upgrade() -> None: dataset = _define_dataset_table() user = _define_user_table() - print(insp.get_table_names()) - - print(_get_column(insp, "user_tenants", "tenant_id")) - if "user_tenants" not in insp.get_table_names(): # Define table with all necessary columns including primary key user_tenants = op.create_table( From 0d68175167af5da9b69e4024f434cbf0bd64b2ae Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 6 Nov 2025 16:53:22 +0100 Subject: [PATCH 091/129] fix: remove database creation from migrations --- alembic/versions/8057ae7329c2_initial_migration.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/alembic/versions/8057ae7329c2_initial_migration.py 
b/alembic/versions/8057ae7329c2_initial_migration.py index aa0ecd4b8..42e9904a8 100644 --- a/alembic/versions/8057ae7329c2_initial_migration.py +++ b/alembic/versions/8057ae7329c2_initial_migration.py @@ -18,11 +18,8 @@ depends_on: Union[str, Sequence[str], None] = None def upgrade() -> None: - db_engine = get_relational_engine() - # we might want to delete this - await_only(db_engine.create_database()) + pass def downgrade() -> None: - db_engine = get_relational_engine() - await_only(db_engine.delete_database()) + pass From bcc59cf9a0b6f5f765269a7ea2725fbd27e971f5 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 6 Nov 2025 16:57:59 +0100 Subject: [PATCH 092/129] fix: Remove default user creation --- alembic/versions/482cd6517ce4_add_default_user.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/alembic/versions/482cd6517ce4_add_default_user.py b/alembic/versions/482cd6517ce4_add_default_user.py index d85f0f146..fafa111f9 100644 --- a/alembic/versions/482cd6517ce4_add_default_user.py +++ b/alembic/versions/482cd6517ce4_add_default_user.py @@ -23,11 +23,8 @@ depends_on: Union[str, Sequence[str], None] = "8057ae7329c2" def upgrade() -> None: - try: - await_only(create_default_user()) - except UserAlreadyExists: - pass # It's fine if the default user already exists + pass # It's fine if the default user already exists def downgrade() -> None: - await_only(delete_user("default_user@example.com")) + pass From 61e1c2903f5f7372e5281a3c7126cc2bcb71bde5 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 6 Nov 2025 17:00:46 +0100 Subject: [PATCH 093/129] fix: Remove issue with default user creation --- alembic/versions/482cd6517ce4_add_default_user.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alembic/versions/482cd6517ce4_add_default_user.py b/alembic/versions/482cd6517ce4_add_default_user.py index fafa111f9..c8a3dc5d5 100644 --- a/alembic/versions/482cd6517ce4_add_default_user.py +++ 
b/alembic/versions/482cd6517ce4_add_default_user.py @@ -23,7 +23,7 @@ depends_on: Union[str, Sequence[str], None] = "8057ae7329c2" def upgrade() -> None: - pass # It's fine if the default user already exists + pass def downgrade() -> None: From da5055a0a96ce9647fef7245ab312301e4237165 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Thu, 6 Nov 2025 17:11:15 +0100 Subject: [PATCH 094/129] test: add one test that covers all retrievers. delete others --- .../retrieval/EntityCompletionRetriever.py | 2 +- .../modules/retrieval/completion_retriever.py | 4 +- ..._completion_context_extension_retriever.py | 4 +- .../graph_completion_cot_retriever.py | 2 +- .../retrieval/graph_completion_retriever.py | 2 +- .../modules/retrieval/temporal_retriever.py | 2 +- cognee/modules/retrieval/utils/completion.py | 2 +- .../entity_completion_retriever_test.py | 65 ------ ...letion_retriever_context_extension_test.py | 60 ----- .../graph_completion_retriever_cot_test.py | 58 ----- .../graph_completion_retriever_test.py | 58 ----- .../rag_completion_retriever_test.py | 81 +------ .../retrieval/structured_output_tests.py | 206 ++++++++++++++++++ .../retrieval/temporal_retriever_test.py | 66 ------ 14 files changed, 216 insertions(+), 396 deletions(-) delete mode 100644 cognee/tests/unit/modules/retrieval/entity_completion_retriever_test.py create mode 100644 cognee/tests/unit/modules/retrieval/structured_output_tests.py diff --git a/cognee/modules/retrieval/EntityCompletionRetriever.py b/cognee/modules/retrieval/EntityCompletionRetriever.py index 1f1ddad0a..14996f902 100644 --- a/cognee/modules/retrieval/EntityCompletionRetriever.py +++ b/cognee/modules/retrieval/EntityCompletionRetriever.py @@ -90,7 +90,7 @@ class EntityCompletionRetriever(BaseRetriever): context: Optional[Any] = None, session_id: Optional[str] = None, response_model: Type = str, - ) -> List[str]: + ) -> List[Any]: """ Generate completion using provided context or fetch new context. 
diff --git a/cognee/modules/retrieval/completion_retriever.py b/cognee/modules/retrieval/completion_retriever.py index f071e41de..126ebcab8 100644 --- a/cognee/modules/retrieval/completion_retriever.py +++ b/cognee/modules/retrieval/completion_retriever.py @@ -1,5 +1,5 @@ import asyncio -from typing import Any, Optional, Type +from typing import Any, Optional, Type, List from cognee.shared.logging_utils import get_logger from cognee.infrastructure.databases.vector import get_vector_engine @@ -80,7 +80,7 @@ class CompletionRetriever(BaseRetriever): context: Optional[Any] = None, session_id: Optional[str] = None, response_model: Type = str, - ) -> str: + ) -> List[Any]: """ Generates an LLM completion using the context. diff --git a/cognee/modules/retrieval/graph_completion_context_extension_retriever.py b/cognee/modules/retrieval/graph_completion_context_extension_retriever.py index 6b2c6a9e6..b07d11fd2 100644 --- a/cognee/modules/retrieval/graph_completion_context_extension_retriever.py +++ b/cognee/modules/retrieval/graph_completion_context_extension_retriever.py @@ -1,5 +1,5 @@ import asyncio -from typing import Optional, List, Type +from typing import Optional, List, Type, Any from cognee.modules.graph.cognee_graph.CogneeGraphElements import Edge from cognee.shared.logging_utils import get_logger from cognee.modules.retrieval.graph_completion_retriever import GraphCompletionRetriever @@ -57,7 +57,7 @@ class GraphCompletionContextExtensionRetriever(GraphCompletionRetriever): session_id: Optional[str] = None, context_extension_rounds=4, response_model: Type = str, - ) -> List[str]: + ) -> List[Any]: """ Extends the context for a given query by retrieving related triplets and generating new completions based on them. 
diff --git a/cognee/modules/retrieval/graph_completion_cot_retriever.py b/cognee/modules/retrieval/graph_completion_cot_retriever.py index 39255fe68..eb8f502cb 100644 --- a/cognee/modules/retrieval/graph_completion_cot_retriever.py +++ b/cognee/modules/retrieval/graph_completion_cot_retriever.py @@ -171,7 +171,7 @@ class GraphCompletionCotRetriever(GraphCompletionRetriever): session_id: Optional[str] = None, max_iter=4, response_model: Type = str, - ) -> List[str]: + ) -> List[Any]: """ Generate completion responses based on a user query and contextual information. diff --git a/cognee/modules/retrieval/graph_completion_retriever.py b/cognee/modules/retrieval/graph_completion_retriever.py index b544e8ead..df77a11ac 100644 --- a/cognee/modules/retrieval/graph_completion_retriever.py +++ b/cognee/modules/retrieval/graph_completion_retriever.py @@ -147,7 +147,7 @@ class GraphCompletionRetriever(BaseGraphRetriever): context: Optional[List[Edge]] = None, session_id: Optional[str] = None, response_model: Type = str, - ) -> List[str]: + ) -> List[Any]: """ Generates a completion using graph connections context based on a query. diff --git a/cognee/modules/retrieval/temporal_retriever.py b/cognee/modules/retrieval/temporal_retriever.py index 38d69ec80..f3da02c15 100644 --- a/cognee/modules/retrieval/temporal_retriever.py +++ b/cognee/modules/retrieval/temporal_retriever.py @@ -151,7 +151,7 @@ class TemporalRetriever(GraphCompletionRetriever): context: Optional[str] = None, session_id: Optional[str] = None, response_model: Type = str, - ) -> List[str]: + ) -> List[Any]: """ Generates a response using the query and optional context. 
diff --git a/cognee/modules/retrieval/utils/completion.py b/cognee/modules/retrieval/utils/completion.py index b77d7ef90..c90ce77f4 100644 --- a/cognee/modules/retrieval/utils/completion.py +++ b/cognee/modules/retrieval/utils/completion.py @@ -11,7 +11,7 @@ async def generate_completion( system_prompt: Optional[str] = None, conversation_history: Optional[str] = None, response_model: Type = str, -) -> str: +) -> Any: """Generates a completion using LLM with given context and prompts.""" args = {"question": query, "context": context} user_prompt = render_prompt(user_prompt_path, args) diff --git a/cognee/tests/unit/modules/retrieval/entity_completion_retriever_test.py b/cognee/tests/unit/modules/retrieval/entity_completion_retriever_test.py deleted file mode 100644 index 064f4a31a..000000000 --- a/cognee/tests/unit/modules/retrieval/entity_completion_retriever_test.py +++ /dev/null @@ -1,65 +0,0 @@ -import os -import pytest -import pathlib -from pydantic import BaseModel - -import cognee -from cognee.low_level import setup -from cognee.tasks.storage import add_data_points -from cognee.modules.engine.models import Entity, EntityType -from cognee.modules.retrieval.EntityCompletionRetriever import EntityCompletionRetriever -from cognee.modules.retrieval.entity_extractors.DummyEntityExtractor import DummyEntityExtractor -from cognee.modules.retrieval.context_providers.DummyContextProvider import DummyContextProvider - - -class TestAnswer(BaseModel): - answer: str - explanation: str - - -# TODO: Add more tests, similar to other retrievers. -# TODO: For the tests, one needs to define an Entity Extractor and a Context Provider. 
-class TestEntityCompletionRetriever: - @pytest.mark.asyncio - async def test_get_entity_structured_completion(self): - system_directory_path = os.path.join( - pathlib.Path(__file__).parent, ".cognee_system/test_get_entity_structured_completion" - ) - cognee.config.system_root_directory(system_directory_path) - data_directory_path = os.path.join( - pathlib.Path(__file__).parent, ".data_storage/test_get_entity_structured_completion" - ) - cognee.config.data_root_directory(data_directory_path) - - await cognee.prune.prune_data() - await cognee.prune.prune_system(metadata=True) - await setup() - - entity_type = EntityType(name="Person", description="A human individual") - entity = Entity(name="Albert Einstein", is_a=entity_type, description="A famous physicist") - - entities = [entity] - await add_data_points(entities) - - retriever = EntityCompletionRetriever(DummyEntityExtractor(), DummyContextProvider()) - - # Test with string response model (default) - string_answer = await retriever.get_completion("Who is Albert Einstein?") - assert isinstance(string_answer, list), f"Expected str, got {type(string_answer).__name__}" - assert all(isinstance(item, str) and item.strip() for item in string_answer), ( - "Answer should not be empty" - ) - - # Test with structured response model - structured_answer = await retriever.get_completion( - "Who is Albert Einstein?", response_model=TestAnswer - ) - assert isinstance(structured_answer, list), ( - f"Expected list, got {type(structured_answer).__name__}" - ) - assert all(isinstance(item, TestAnswer) for item in structured_answer), ( - f"Expected TestAnswer, got {type(structured_answer).__name__}" - ) - - assert structured_answer[0].answer.strip(), "Answer field should not be empty" - assert structured_answer[0].explanation.strip(), "Explanation field should not be empty" diff --git a/cognee/tests/unit/modules/retrieval/graph_completion_retriever_context_extension_test.py 
b/cognee/tests/unit/modules/retrieval/graph_completion_retriever_context_extension_test.py index d15e55c23..29c8b7c95 100644 --- a/cognee/tests/unit/modules/retrieval/graph_completion_retriever_context_extension_test.py +++ b/cognee/tests/unit/modules/retrieval/graph_completion_retriever_context_extension_test.py @@ -2,7 +2,6 @@ import os import pytest import pathlib from typing import Optional, Union -from pydantic import BaseModel import cognee from cognee.low_level import setup, DataPoint @@ -12,12 +11,6 @@ from cognee.modules.retrieval.graph_completion_context_extension_retriever impor GraphCompletionContextExtensionRetriever, ) - -class TestAnswer(BaseModel): - answer: str - explanation: str - - class TestGraphCompletionWithContextExtensionRetriever: @pytest.mark.asyncio async def test_graph_completion_extension_context_simple(self): @@ -181,56 +174,3 @@ class TestGraphCompletionWithContextExtensionRetriever: assert all(isinstance(item, str) and item.strip() for item in answer), ( "Answer must contain only non-empty strings" ) - - @pytest.mark.asyncio - async def test_get_graph_structured_completion_extension_context(self): - system_directory_path = os.path.join( - pathlib.Path(__file__).parent, - ".cognee_system/test_get_graph_structured_completion_extension_context", - ) - cognee.config.system_root_directory(system_directory_path) - data_directory_path = os.path.join( - pathlib.Path(__file__).parent, - ".data_storage/test_get_graph_structured_completion_extension_context", - ) - cognee.config.data_root_directory(data_directory_path) - - await cognee.prune.prune_data() - await cognee.prune.prune_system(metadata=True) - await setup() - - class Company(DataPoint): - name: str - - class Person(DataPoint): - name: str - works_for: Company - - company1 = Company(name="Figma") - person1 = Person(name="Steve Rodger", works_for=company1) - - entities = [company1, person1] - await add_data_points(entities) - - retriever = GraphCompletionContextExtensionRetriever() - - 
# Test with string response model (default) - string_answer = await retriever.get_completion("Who works at Figma?") - assert isinstance(string_answer, list), f"Expected str, got {type(string_answer).__name__}" - assert all(isinstance(item, str) and item.strip() for item in string_answer), ( - "Answer should not be empty" - ) - - # Test with structured response model - structured_answer = await retriever.get_completion( - "Who works at Figma?", response_model=TestAnswer - ) - assert isinstance(structured_answer, list), ( - f"Expected list, got {type(structured_answer).__name__}" - ) - assert all(isinstance(item, TestAnswer) for item in structured_answer), ( - f"Expected TestAnswer, got {type(structured_answer).__name__}" - ) - - assert structured_answer[0].answer.strip(), "Answer field should not be empty" - assert structured_answer[0].explanation.strip(), "Explanation field should not be empty" diff --git a/cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py b/cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py index 79e4bcec3..ac58793be 100644 --- a/cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py +++ b/cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py @@ -2,7 +2,6 @@ import os import pytest import pathlib from typing import Optional, Union -from pydantic import BaseModel import cognee from cognee.low_level import setup, DataPoint @@ -10,12 +9,6 @@ from cognee.modules.graph.utils import resolve_edges_to_text from cognee.tasks.storage import add_data_points from cognee.modules.retrieval.graph_completion_cot_retriever import GraphCompletionCotRetriever - -class TestAnswer(BaseModel): - answer: str - explanation: str - - class TestGraphCompletionCoTRetriever: @pytest.mark.asyncio async def test_graph_completion_cot_context_simple(self): @@ -174,54 +167,3 @@ class TestGraphCompletionCoTRetriever: assert all(isinstance(item, str) and item.strip() for item in answer), ( 
"Answer must contain only non-empty strings" ) - - @pytest.mark.asyncio - async def test_get_graph_structured_completion_cot(self): - system_directory_path = os.path.join( - pathlib.Path(__file__).parent, ".cognee_system/test_get_graph_structured_completion_cot" - ) - cognee.config.system_root_directory(system_directory_path) - data_directory_path = os.path.join( - pathlib.Path(__file__).parent, ".data_storage/test_get_graph_structured_completion_cot" - ) - cognee.config.data_root_directory(data_directory_path) - - await cognee.prune.prune_data() - await cognee.prune.prune_system(metadata=True) - await setup() - - class Company(DataPoint): - name: str - - class Person(DataPoint): - name: str - works_for: Company - - company1 = Company(name="Figma") - person1 = Person(name="Steve Rodger", works_for=company1) - - entities = [company1, person1] - await add_data_points(entities) - - retriever = GraphCompletionCotRetriever() - - # Test with string response model (default) - string_answer = await retriever.get_completion("Who works at Figma?") - assert isinstance(string_answer, list), f"Expected str, got {type(string_answer).__name__}" - assert all(isinstance(item, str) and item.strip() for item in string_answer), ( - "Answer should not be empty" - ) - - # Test with structured response model - structured_answer = await retriever.get_completion( - "Who works at Figma?", response_model=TestAnswer - ) - assert isinstance(structured_answer, list), ( - f"Expected list, got {type(structured_answer).__name__}" - ) - assert all(isinstance(item, TestAnswer) for item in structured_answer), ( - f"Expected TestAnswer, got {type(structured_answer).__name__}" - ) - - assert structured_answer[0].answer.strip(), "Answer field should not be empty" - assert structured_answer[0].explanation.strip(), "Explanation field should not be empty" diff --git a/cognee/tests/unit/modules/retrieval/graph_completion_retriever_test.py 
b/cognee/tests/unit/modules/retrieval/graph_completion_retriever_test.py index e320fcef1..21e2af199 100644 --- a/cognee/tests/unit/modules/retrieval/graph_completion_retriever_test.py +++ b/cognee/tests/unit/modules/retrieval/graph_completion_retriever_test.py @@ -2,7 +2,6 @@ import os import pytest import pathlib from typing import Optional, Union -from pydantic import BaseModel import cognee from cognee.low_level import setup, DataPoint @@ -10,12 +9,6 @@ from cognee.modules.graph.utils import resolve_edges_to_text from cognee.tasks.storage import add_data_points from cognee.modules.retrieval.graph_completion_retriever import GraphCompletionRetriever - -class TestAnswer(BaseModel): - answer: str - explanation: str - - class TestGraphCompletionRetriever: @pytest.mark.asyncio async def test_graph_completion_context_simple(self): @@ -227,54 +220,3 @@ class TestGraphCompletionRetriever: context = await retriever.get_context("Who works at Figma?") assert context == [], "Context should be empty on an empty graph" - - @pytest.mark.asyncio - async def test_get_graph_structured_completion(self): - system_directory_path = os.path.join( - pathlib.Path(__file__).parent, ".cognee_system/test_get_graph_structured_completion" - ) - cognee.config.system_root_directory(system_directory_path) - data_directory_path = os.path.join( - pathlib.Path(__file__).parent, ".data_storage/test_get_graph_structured_completion" - ) - cognee.config.data_root_directory(data_directory_path) - - await cognee.prune.prune_data() - await cognee.prune.prune_system(metadata=True) - await setup() - - class Company(DataPoint): - name: str - - class Person(DataPoint): - name: str - works_for: Company - - company1 = Company(name="Figma") - person1 = Person(name="Steve Rodger", works_for=company1) - - entities = [company1, person1] - await add_data_points(entities) - - retriever = GraphCompletionRetriever() - - # Test with string response model (default) - string_answer = await retriever.get_completion("Who 
works at Figma?") - assert isinstance(string_answer, list), f"Expected str, got {type(string_answer).__name__}" - assert all(isinstance(item, str) and item.strip() for item in string_answer), ( - "Answer should not be empty" - ) - - # Test with structured response model - structured_answer = await retriever.get_completion( - "Who works at Figma?", response_model=TestAnswer - ) - assert isinstance(structured_answer, list), ( - f"Expected list, got {type(structured_answer).__name__}" - ) - assert all(isinstance(item, TestAnswer) for item in structured_answer), ( - f"Expected TestAnswer, got {type(structured_answer).__name__}" - ) - - assert structured_answer[0].answer.strip(), "Answer field should not be empty" - assert structured_answer[0].explanation.strip(), "Explanation field should not be empty" diff --git a/cognee/tests/unit/modules/retrieval/rag_completion_retriever_test.py b/cognee/tests/unit/modules/retrieval/rag_completion_retriever_test.py index 248ecc047..37876794f 100644 --- a/cognee/tests/unit/modules/retrieval/rag_completion_retriever_test.py +++ b/cognee/tests/unit/modules/retrieval/rag_completion_retriever_test.py @@ -3,7 +3,7 @@ from typing import List import pytest import pathlib import cognee -from pydantic import BaseModel + from cognee.low_level import setup from cognee.tasks.storage import add_data_points from cognee.infrastructure.databases.vector import get_vector_engine @@ -26,12 +26,6 @@ class DocumentChunkWithEntities(DataPoint): metadata: dict = {"index_fields": ["text"]} - -class TestAnswer(BaseModel): - answer: str - explanation: str - - class TestRAGCompletionRetriever: @pytest.mark.asyncio async def test_rag_completion_context_simple(self): @@ -208,76 +202,3 @@ class TestRAGCompletionRetriever: context = await retriever.get_context("Christina Mayer") assert context == "", "Returned context should be empty on an empty graph" - - @pytest.mark.asyncio - async def test_get_rag_structured_completion(self): - system_directory_path = 
os.path.join( - pathlib.Path(__file__).parent, ".cognee_system/test_get_rag_structured_completion" - ) - cognee.config.system_root_directory(system_directory_path) - data_directory_path = os.path.join( - pathlib.Path(__file__).parent, ".data_storage/test_get_rag_structured_completion" - ) - cognee.config.data_root_directory(data_directory_path) - - await cognee.prune.prune_data() - await cognee.prune.prune_system(metadata=True) - await setup() - - document = TextDocument( - name="Steve Rodger's career", - raw_data_location="somewhere", - external_metadata="", - mime_type="text/plain", - ) - - chunk1 = DocumentChunk( - text="Steve Rodger", - chunk_size=2, - chunk_index=0, - cut_type="sentence_end", - is_part_of=document, - contains=[], - ) - chunk2 = DocumentChunk( - text="Mike Broski", - chunk_size=2, - chunk_index=1, - cut_type="sentence_end", - is_part_of=document, - contains=[], - ) - chunk3 = DocumentChunk( - text="Christina Mayer", - chunk_size=2, - chunk_index=2, - cut_type="sentence_end", - is_part_of=document, - contains=[], - ) - - entities = [chunk1, chunk2, chunk3] - await add_data_points(entities) - - retriever = CompletionRetriever() - - # Test with string response model (default) - string_answer = await retriever.get_completion("Where does Steve work?") - assert isinstance(string_answer, list), f"Expected str, got {type(string_answer).__name__}" - assert all(isinstance(item, str) and item.strip() for item in string_answer), ( - "Answer should not be empty" - ) - - # Test with structured response model - structured_answer = await retriever.get_completion( - "Where does Steve work?", response_model=TestAnswer - ) - assert isinstance(structured_answer, list), ( - f"Expected list, got {type(structured_answer).__name__}" - ) - assert all(isinstance(item, TestAnswer) for item in structured_answer), ( - f"Expected TestAnswer, got {type(structured_answer).__name__}" - ) - - assert structured_answer[0].answer.strip(), "Answer field should not be empty" - 
assert structured_answer[0].explanation.strip(), "Explanation field should not be empty" diff --git a/cognee/tests/unit/modules/retrieval/structured_output_tests.py b/cognee/tests/unit/modules/retrieval/structured_output_tests.py new file mode 100644 index 000000000..95b4b9c20 --- /dev/null +++ b/cognee/tests/unit/modules/retrieval/structured_output_tests.py @@ -0,0 +1,206 @@ +import asyncio + +import pytest +import cognee +import pathlib +import os + +from pydantic import BaseModel +from cognee.low_level import setup, DataPoint +from cognee.tasks.storage import add_data_points +from cognee.modules.chunking.models import DocumentChunk +from cognee.modules.data.processing.document_types import TextDocument +from cognee.modules.engine.models import Entity, EntityType +from cognee.modules.retrieval.entity_extractors.DummyEntityExtractor import DummyEntityExtractor +from cognee.modules.retrieval.context_providers.DummyContextProvider import DummyContextProvider +from cognee.modules.retrieval.graph_completion_cot_retriever import GraphCompletionCotRetriever +from cognee.modules.retrieval.graph_completion_retriever import GraphCompletionRetriever +from cognee.modules.retrieval.graph_completion_context_extension_retriever import ( + GraphCompletionContextExtensionRetriever, +) +from cognee.modules.retrieval.EntityCompletionRetriever import EntityCompletionRetriever +from cognee.modules.retrieval.temporal_retriever import TemporalRetriever +from cognee.modules.retrieval.completion_retriever import CompletionRetriever + + +class TestAnswer(BaseModel): + answer: str + explanation: str + + +def _assert_string_answer(answer: list[str]): + assert isinstance(answer, list), f"Expected str, got {type(answer).__name__}" + assert all(isinstance(item, str) and item.strip() for item in answer), "Items should be strings" + assert all(item.strip() for item in answer), "Items should not be empty" + + +def _assert_structured_answer(answer: list[TestAnswer]): + assert isinstance(answer, 
list), f"Expected list, got {type(answer).__name__}" + assert all(isinstance(x, TestAnswer) for x in answer), "Items should be TestAnswer" + assert all(x.answer.strip() for x in answer), "Answer text should not be empty" + assert all(x.explanation.strip() for x in answer), "Explanation should not be empty" + + +async def _test_get_structured_graph_completion_cot(): + retriever = GraphCompletionCotRetriever() + + # Test with string response model (default) + string_answer = await retriever.get_completion("Who works at Figma?") + _assert_string_answer(string_answer) + + # Test with structured response model + structured_answer = await retriever.get_completion( + "Who works at Figma?", response_model=TestAnswer + ) + _assert_structured_answer(structured_answer) + + +async def _test_get_structured_graph_completion(): + retriever = GraphCompletionRetriever() + + # Test with string response model (default) + string_answer = await retriever.get_completion("Who works at Figma?") + _assert_string_answer(string_answer) + + # Test with structured response model + structured_answer = await retriever.get_completion( + "Who works at Figma?", response_model=TestAnswer + ) + _assert_structured_answer(structured_answer) + + +async def _test_get_structured_graph_completion_temporal(): + retriever = TemporalRetriever() + + # Test with string response model (default) + string_answer = await retriever.get_completion("When did Steve start working at Figma?") + _assert_string_answer(string_answer) + + # Test with structured response model + structured_answer = await retriever.get_completion( + "When did Steve start working at Figma??", response_model=TestAnswer + ) + _assert_structured_answer(structured_answer) + + +async def _test_get_structured_graph_completion_rag(): + retriever = CompletionRetriever() + + # Test with string response model (default) + string_answer = await retriever.get_completion("Where does Steve work?") + _assert_string_answer(string_answer) + + # Test with 
structured response model + structured_answer = await retriever.get_completion( + "Where does Steve work?", response_model=TestAnswer + ) + _assert_structured_answer(structured_answer) + + +async def _test_get_structured_graph_completion_context_extension(): + retriever = GraphCompletionContextExtensionRetriever() + + # Test with string response model (default) + string_answer = await retriever.get_completion("Who works at Figma?") + _assert_string_answer(string_answer) + + # Test with structured response model + structured_answer = await retriever.get_completion( + "Who works at Figma?", response_model=TestAnswer + ) + _assert_structured_answer(structured_answer) + + +async def _test_get_structured_entity_completion(): + retriever = EntityCompletionRetriever(DummyEntityExtractor(), DummyContextProvider()) + + # Test with string response model (default) + string_answer = await retriever.get_completion("Who is Albert Einstein?") + _assert_string_answer(string_answer) + + # Test with structured response model + structured_answer = await retriever.get_completion( + "Who is Albert Einstein?", response_model=TestAnswer + ) + _assert_structured_answer(structured_answer) + + +class TestStructuredOutputCompletion: + @pytest.mark.asyncio + async def test_get_structured_completion(self): + system_directory_path = os.path.join( + pathlib.Path(__file__).parent, ".cognee_system/test_get_structured_completion" + ) + cognee.config.system_root_directory(system_directory_path) + data_directory_path = os.path.join( + pathlib.Path(__file__).parent, ".data_storage/test_get_structured_completion" + ) + cognee.config.data_root_directory(data_directory_path) + + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + await setup() + + class Company(DataPoint): + name: str + + class Person(DataPoint): + name: str + works_for: Company + works_since: int + + company1 = Company(name="Figma") + person1 = Person(name="Steve Rodger", works_for=company1, 
works_since=2015) + + entities = [company1, person1] + await add_data_points(entities) + + document = TextDocument( + name="Steve Rodger's career", + raw_data_location="somewhere", + external_metadata="", + mime_type="text/plain", + ) + + chunk1 = DocumentChunk( + text="Steve Rodger", + chunk_size=2, + chunk_index=0, + cut_type="sentence_end", + is_part_of=document, + contains=[], + ) + chunk2 = DocumentChunk( + text="Mike Broski", + chunk_size=2, + chunk_index=1, + cut_type="sentence_end", + is_part_of=document, + contains=[], + ) + chunk3 = DocumentChunk( + text="Christina Mayer", + chunk_size=2, + chunk_index=2, + cut_type="sentence_end", + is_part_of=document, + contains=[], + ) + + entities = [chunk1, chunk2, chunk3] + await add_data_points(entities) + + entity_type = EntityType(name="Person", description="A human individual") + entity = Entity(name="Albert Einstein", is_a=entity_type, description="A famous physicist") + + entities = [entity] + await add_data_points(entities) + + await asyncio.gather( + _test_get_structured_graph_completion_cot(), + _test_get_structured_graph_completion(), + _test_get_structured_graph_completion_temporal(), + _test_get_structured_graph_completion_rag(), + _test_get_structured_graph_completion_context_extension(), + _test_get_structured_entity_completion(), + ) diff --git a/cognee/tests/unit/modules/retrieval/temporal_retriever_test.py b/cognee/tests/unit/modules/retrieval/temporal_retriever_test.py index 5b274c822..22b2d3fe9 100644 --- a/cognee/tests/unit/modules/retrieval/temporal_retriever_test.py +++ b/cognee/tests/unit/modules/retrieval/temporal_retriever_test.py @@ -1,13 +1,6 @@ -import asyncio -import os -import pathlib -import cognee from types import SimpleNamespace import pytest -from pydantic import BaseModel -from cognee.low_level import setup, DataPoint -from cognee.tasks.storage import add_data_points from cognee.modules.retrieval.temporal_retriever import TemporalRetriever @@ -146,65 +139,6 @@ async def 
test_filter_top_k_events_error_handling(): with pytest.raises((KeyError, TypeError)): await tr.filter_top_k_events([{}], []) - -class TestAnswer(BaseModel): - answer: str - explanation: str - - -@pytest.mark.asyncio -async def test_get_temporal_structured_completion(): - system_directory_path = os.path.join( - pathlib.Path(__file__).parent, ".cognee_system/test_get_temporal_structured_completion" - ) - cognee.config.system_root_directory(system_directory_path) - data_directory_path = os.path.join( - pathlib.Path(__file__).parent, ".data_storage/test_get_temporal_structured_completion" - ) - cognee.config.data_root_directory(data_directory_path) - - await cognee.prune.prune_data() - await cognee.prune.prune_system(metadata=True) - await setup() - - class Company(DataPoint): - name: str - - class Person(DataPoint): - name: str - works_for: Company - works_since: int - - company1 = Company(name="Figma") - person1 = Person(name="Steve Rodger", works_for=company1, works_since=2015) - - entities = [company1, person1] - await add_data_points(entities) - - retriever = TemporalRetriever() - - # Test with string response model (default) - string_answer = await retriever.get_completion("When did Steve start working at Figma?") - assert isinstance(string_answer, list), f"Expected str, got {type(string_answer).__name__}" - assert all(isinstance(item, str) and item.strip() for item in string_answer), ( - "Answer should not be empty" - ) - - # Test with structured response model - structured_answer = await retriever.get_completion( - "When did Steve start working at Figma??", response_model=TestAnswer - ) - assert isinstance(structured_answer, list), ( - f"Expected list, got {type(structured_answer).__name__}" - ) - assert all(isinstance(item, TestAnswer) for item in structured_answer), ( - f"Expected TestAnswer, got {type(structured_answer).__name__}" - ) - - assert structured_answer[0].answer.strip(), "Answer field should not be empty" - assert 
structured_answer[0].explanation.strip(), "Explanation field should not be empty" - - class _FakeRetriever(TemporalRetriever): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) From 72ba8d0dcb0306cfc5c618c854089bd68f4b9d3f Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Thu, 6 Nov 2025 17:12:33 +0100 Subject: [PATCH 095/129] chore: ruff format --- .../graph_completion_retriever_context_extension_test.py | 1 + .../modules/retrieval/graph_completion_retriever_cot_test.py | 1 + .../unit/modules/retrieval/graph_completion_retriever_test.py | 1 + .../unit/modules/retrieval/rag_completion_retriever_test.py | 1 + cognee/tests/unit/modules/retrieval/temporal_retriever_test.py | 1 + 5 files changed, 5 insertions(+) diff --git a/cognee/tests/unit/modules/retrieval/graph_completion_retriever_context_extension_test.py b/cognee/tests/unit/modules/retrieval/graph_completion_retriever_context_extension_test.py index 29c8b7c95..0e21fe351 100644 --- a/cognee/tests/unit/modules/retrieval/graph_completion_retriever_context_extension_test.py +++ b/cognee/tests/unit/modules/retrieval/graph_completion_retriever_context_extension_test.py @@ -11,6 +11,7 @@ from cognee.modules.retrieval.graph_completion_context_extension_retriever impor GraphCompletionContextExtensionRetriever, ) + class TestGraphCompletionWithContextExtensionRetriever: @pytest.mark.asyncio async def test_graph_completion_extension_context_simple(self): diff --git a/cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py b/cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py index ac58793be..206cfaf84 100644 --- a/cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py +++ b/cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py @@ -9,6 +9,7 @@ from cognee.modules.graph.utils import resolve_edges_to_text from cognee.tasks.storage import add_data_points from cognee.modules.retrieval.graph_completion_cot_retriever 
import GraphCompletionCotRetriever + class TestGraphCompletionCoTRetriever: @pytest.mark.asyncio async def test_graph_completion_cot_context_simple(self): diff --git a/cognee/tests/unit/modules/retrieval/graph_completion_retriever_test.py b/cognee/tests/unit/modules/retrieval/graph_completion_retriever_test.py index 21e2af199..f462baced 100644 --- a/cognee/tests/unit/modules/retrieval/graph_completion_retriever_test.py +++ b/cognee/tests/unit/modules/retrieval/graph_completion_retriever_test.py @@ -9,6 +9,7 @@ from cognee.modules.graph.utils import resolve_edges_to_text from cognee.tasks.storage import add_data_points from cognee.modules.retrieval.graph_completion_retriever import GraphCompletionRetriever + class TestGraphCompletionRetriever: @pytest.mark.asyncio async def test_graph_completion_context_simple(self): diff --git a/cognee/tests/unit/modules/retrieval/rag_completion_retriever_test.py b/cognee/tests/unit/modules/retrieval/rag_completion_retriever_test.py index 37876794f..9bfed68f3 100644 --- a/cognee/tests/unit/modules/retrieval/rag_completion_retriever_test.py +++ b/cognee/tests/unit/modules/retrieval/rag_completion_retriever_test.py @@ -26,6 +26,7 @@ class DocumentChunkWithEntities(DataPoint): metadata: dict = {"index_fields": ["text"]} + class TestRAGCompletionRetriever: @pytest.mark.asyncio async def test_rag_completion_context_simple(self): diff --git a/cognee/tests/unit/modules/retrieval/temporal_retriever_test.py b/cognee/tests/unit/modules/retrieval/temporal_retriever_test.py index 22b2d3fe9..c3c6a47f6 100644 --- a/cognee/tests/unit/modules/retrieval/temporal_retriever_test.py +++ b/cognee/tests/unit/modules/retrieval/temporal_retriever_test.py @@ -139,6 +139,7 @@ async def test_filter_top_k_events_error_handling(): with pytest.raises((KeyError, TypeError)): await tr.filter_top_k_events([{}], []) + class _FakeRetriever(TemporalRetriever): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) From 
7dec6bfdedf30149113a25aa66bf6c21980b605b Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 6 Nov 2025 18:10:04 +0100 Subject: [PATCH 096/129] refactor: Add migrations as part of python package --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 5f0aef1d8..8af35113c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -167,7 +167,6 @@ exclude = [ "/dist", "/.data", "/.github", - "/alembic", "/deployment", "/cognee-mcp", "/cognee-frontend", From 96c8bba5807e13cf376802da28817788ae4d6dbd Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 6 Nov 2025 19:12:09 +0100 Subject: [PATCH 097/129] refactor: Add db creation as step in MCP creation --- cognee-mcp/src/server.py | 4 ++++ cognee/modules/data/models/Dataset.py | 1 + 2 files changed, 5 insertions(+) diff --git a/cognee-mcp/src/server.py b/cognee-mcp/src/server.py index ce6dad88a..7c708638c 100755 --- a/cognee-mcp/src/server.py +++ b/cognee-mcp/src/server.py @@ -1096,6 +1096,10 @@ async def main(): # Skip migrations when in API mode (the API server handles its own database) if not args.no_migration and not args.api_url: + from cognee.modules.engine.operations.setup import setup + + await setup() + # Run Alembic migrations from the main cognee directory where alembic.ini is located logger.info("Running database migrations...") migration_result = subprocess.run( diff --git a/cognee/modules/data/models/Dataset.py b/cognee/modules/data/models/Dataset.py index 00ed4da96..fba065253 100644 --- a/cognee/modules/data/models/Dataset.py +++ b/cognee/modules/data/models/Dataset.py @@ -37,5 +37,6 @@ class Dataset(Base): "createdAt": self.created_at.isoformat(), "updatedAt": self.updated_at.isoformat() if self.updated_at else None, "ownerId": str(self.owner_id), + "tenantId": str(self.tenant_id), "data": [data.to_json() for data in self.data], } From 4ab53c9d64a1cde20c6b38e78eb2583bb43fbf65 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Fri, 7 Nov 2025 10:00:17 +0100 
Subject: [PATCH 098/129] changes based on PR comments --- cognee/modules/retrieval/base_graph_retriever.py | 10 +++++++--- cognee/modules/retrieval/base_retriever.py | 10 +++++++--- ...d_output_tests.py => structured_output_test.py} | 14 ++++++-------- .../modules/retrieval/summaries_retriever_test.py | 2 +- 4 files changed, 21 insertions(+), 15 deletions(-) rename cognee/tests/unit/modules/retrieval/{structured_output_tests.py => structured_output_test.py} (94%) diff --git a/cognee/modules/retrieval/base_graph_retriever.py b/cognee/modules/retrieval/base_graph_retriever.py index b0abc2991..b203309ba 100644 --- a/cognee/modules/retrieval/base_graph_retriever.py +++ b/cognee/modules/retrieval/base_graph_retriever.py @@ -1,4 +1,4 @@ -from typing import List, Optional +from typing import Any, List, Optional, Type from abc import ABC, abstractmethod from cognee.modules.graph.cognee_graph.CogneeGraphElements import Edge @@ -14,7 +14,11 @@ class BaseGraphRetriever(ABC): @abstractmethod async def get_completion( - self, query: str, context: Optional[List[Edge]] = None, session_id: Optional[str] = None - ) -> str: + self, + query: str, + context: Optional[List[Edge]] = None, + session_id: Optional[str] = None, + response_model: Type = str, + ) -> List[Any]: """Generates a response using the query and optional context (triplets).""" pass diff --git a/cognee/modules/retrieval/base_retriever.py b/cognee/modules/retrieval/base_retriever.py index 1533dd44f..b88c741b8 100644 --- a/cognee/modules/retrieval/base_retriever.py +++ b/cognee/modules/retrieval/base_retriever.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Any, Optional +from typing import Any, Optional, Type, List class BaseRetriever(ABC): @@ -12,7 +12,11 @@ class BaseRetriever(ABC): @abstractmethod async def get_completion( - self, query: str, context: Optional[Any] = None, session_id: Optional[str] = None - ) -> Any: + self, + query: str, + context: Optional[Any] = None, + session_id: 
Optional[str] = None, + response_model: Type = str, + ) -> List[Any]: """Generates a response using the query and optional context.""" pass diff --git a/cognee/tests/unit/modules/retrieval/structured_output_tests.py b/cognee/tests/unit/modules/retrieval/structured_output_test.py similarity index 94% rename from cognee/tests/unit/modules/retrieval/structured_output_tests.py rename to cognee/tests/unit/modules/retrieval/structured_output_test.py index 95b4b9c20..4ad3019ff 100644 --- a/cognee/tests/unit/modules/retrieval/structured_output_tests.py +++ b/cognee/tests/unit/modules/retrieval/structured_output_test.py @@ -196,11 +196,9 @@ class TestStructuredOutputCompletion: entities = [entity] await add_data_points(entities) - await asyncio.gather( - _test_get_structured_graph_completion_cot(), - _test_get_structured_graph_completion(), - _test_get_structured_graph_completion_temporal(), - _test_get_structured_graph_completion_rag(), - _test_get_structured_graph_completion_context_extension(), - _test_get_structured_entity_completion(), - ) + await _test_get_structured_graph_completion_cot() + await _test_get_structured_graph_completion() + await _test_get_structured_graph_completion_temporal() + await _test_get_structured_graph_completion_rag() + await _test_get_structured_graph_completion_context_extension() + await _test_get_structured_entity_completion() diff --git a/cognee/tests/unit/modules/retrieval/summaries_retriever_test.py b/cognee/tests/unit/modules/retrieval/summaries_retriever_test.py index fc96081bf..5f4b93425 100644 --- a/cognee/tests/unit/modules/retrieval/summaries_retriever_test.py +++ b/cognee/tests/unit/modules/retrieval/summaries_retriever_test.py @@ -13,7 +13,7 @@ from cognee.modules.retrieval.exceptions.exceptions import NoDataError from cognee.modules.retrieval.summaries_retriever import SummariesRetriever -class TextSummariesRetriever: +class TestSummariesRetriever: @pytest.mark.asyncio async def test_chunk_context(self): system_directory_path 
= os.path.join( From 7d5d19347a4ccd44eb3bbbe23c821c4c600b7989 Mon Sep 17 00:00:00 2001 From: Pavel Zorin Date: Fri, 7 Nov 2025 11:11:05 +0100 Subject: [PATCH 099/129] CI: removed unnecessary ulimit --- .github/workflows/load_tests.yml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/.github/workflows/load_tests.yml b/.github/workflows/load_tests.yml index ff11bb88b..f5b64d8ce 100644 --- a/.github/workflows/load_tests.yml +++ b/.github/workflows/load_tests.yml @@ -45,15 +45,9 @@ jobs: python-version: '3.11.x' extra-dependencies: "aws" - - name: Set File Descriptor Limit - run: sudo prlimit --pid $$ --nofile=4096:4096 - - name: Verify File Descriptor Limit run: ulimit -n - - name: Dependencies already installed - run: echo "Dependencies already installed in setup" - - name: Run Load Test env: ENV: 'dev' From 59f758d5c227b04f91e8086915fde078be3089db Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Fri, 7 Nov 2025 15:50:49 +0100 Subject: [PATCH 100/129] feat: Add test for multi tenancy, add ability to share name for dataset across tenants for one user --- .github/workflows/e2e_tests.yml | 29 ++- cognee/modules/data/methods/create_dataset.py | 1 + .../modules/data/methods/get_dataset_ids.py | 6 +- cognee/modules/search/methods/search.py | 2 + cognee/tests/test_multi_tenancy.py | 165 ++++++++++++++++++ 5 files changed, 200 insertions(+), 3 deletions(-) create mode 100644 cognee/tests/test_multi_tenancy.py diff --git a/.github/workflows/e2e_tests.yml b/.github/workflows/e2e_tests.yml index 0596f22d3..715487372 100644 --- a/.github/workflows/e2e_tests.yml +++ b/.github/workflows/e2e_tests.yml @@ -226,7 +226,7 @@ jobs: - name: Dependencies already installed run: echo "Dependencies already installed in setup" - - name: Run parallel databases test + - name: Run permissions test env: ENV: 'dev' LLM_MODEL: ${{ secrets.LLM_MODEL }} @@ -239,6 +239,31 @@ jobs: EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} run: uv run python ./cognee/tests/test_permissions.py + 
test-multi-tenancy: + name: Test multi tenancy with different situations in Cognee + runs-on: ubuntu-22.04 + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Cognee Setup + uses: ./.github/actions/cognee_setup + with: + python-version: '3.11.x' + + - name: Run multi tenancy test + env: + ENV: 'dev' + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} + run: uv run python ./cognee/tests/test_multi_tenancy.py + test-graph-edges: name: Test graph edge ingestion runs-on: ubuntu-22.04 @@ -487,4 +512,4 @@ jobs: AWS_ENDPOINT_URL: https://s3-eu-west-1.amazonaws.com AWS_ACCESS_KEY_ID: ${{ secrets.AWS_S3_DEV_USER_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_S3_DEV_USER_SECRET_KEY }} - run: uv run python ./cognee/tests/test_load.py \ No newline at end of file + run: uv run python ./cognee/tests/test_load.py diff --git a/cognee/modules/data/methods/create_dataset.py b/cognee/modules/data/methods/create_dataset.py index 280c9e105..7e28a8255 100644 --- a/cognee/modules/data/methods/create_dataset.py +++ b/cognee/modules/data/methods/create_dataset.py @@ -16,6 +16,7 @@ async def create_dataset(dataset_name: str, user: User, session: AsyncSession) - .options(joinedload(Dataset.data)) .filter(Dataset.name == dataset_name) .filter(Dataset.owner_id == owner_id) + .filter(Dataset.tenant_id == user.tenant_id) ) ).first() diff --git a/cognee/modules/data/methods/get_dataset_ids.py b/cognee/modules/data/methods/get_dataset_ids.py index d4402ff36..a61e85310 100644 --- a/cognee/modules/data/methods/get_dataset_ids.py +++ b/cognee/modules/data/methods/get_dataset_ids.py @@ -27,7 +27,11 @@ async def 
get_dataset_ids(datasets: Union[list[str], list[UUID]], user): # Get all user owned dataset objects (If a user wants to write to a dataset he is not the owner of it must be provided through UUID.) user_datasets = await get_datasets(user.id) # Filter out non name mentioned datasets - dataset_ids = [dataset.id for dataset in user_datasets if dataset.name in datasets] + dataset_ids = [dataset for dataset in user_datasets if dataset.name in datasets] + # Filter out non current tenant datasets + dataset_ids = [ + dataset.id for dataset in dataset_ids if dataset.tenant_id == user.tenant_id + ] else: raise DatasetTypeError( f"One or more of the provided dataset types is not handled: f{datasets}" diff --git a/cognee/modules/search/methods/search.py b/cognee/modules/search/methods/search.py index 5e465b239..b4278424b 100644 --- a/cognee/modules/search/methods/search.py +++ b/cognee/modules/search/methods/search.py @@ -172,6 +172,7 @@ async def search( "search_result": [context] if context else None, "dataset_id": datasets[0].id, "dataset_name": datasets[0].name, + "dataset_tenant_id": datasets[0].tenant_id, "graphs": graphs, } ) @@ -181,6 +182,7 @@ async def search( "search_result": [result] if result else None, "dataset_id": datasets[0].id, "dataset_name": datasets[0].name, + "dataset_tenant_id": datasets[0].tenant_id, "graphs": graphs, } ) diff --git a/cognee/tests/test_multi_tenancy.py b/cognee/tests/test_multi_tenancy.py new file mode 100644 index 000000000..7cdcda8d8 --- /dev/null +++ b/cognee/tests/test_multi_tenancy.py @@ -0,0 +1,165 @@ +import cognee +import pytest + +from cognee.modules.users.exceptions import PermissionDeniedError +from cognee.modules.users.tenants.methods import select_tenant +from cognee.modules.users.methods import get_user +from cognee.shared.logging_utils import get_logger +from cognee.modules.search.types import SearchType +from cognee.modules.users.methods import create_user +from cognee.modules.users.permissions.methods import 
authorized_give_permission_on_datasets +from cognee.modules.users.roles.methods import add_user_to_role +from cognee.modules.users.roles.methods import create_role +from cognee.modules.users.tenants.methods import create_tenant +from cognee.modules.users.tenants.methods import add_user_to_tenant +from cognee.modules.engine.operations.setup import setup +from cognee.shared.logging_utils import setup_logging, CRITICAL + +logger = get_logger() + + +async def main(): + # Create a clean slate for cognee -- reset data and system state + print("Resetting cognee data...") + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + print("Data reset complete.\n") + + # Set up the necessary databases and tables for user management. + await setup() + + # Add document for user_1, add it under dataset name AI + text = """A quantum computer is a computer that takes advantage of quantum mechanical phenomena. + At small scales, physical matter exhibits properties of both particles and waves, and quantum computing leverages + this behavior, specifically quantum superposition and entanglement, using specialized hardware that supports the + preparation and manipulation of quantum state""" + + print("Creating user_1: user_1@example.com") + user_1 = await create_user("user_1@example.com", "example") + await cognee.add([text], dataset_name="AI", user=user_1) + + print("\nCreating user_2: user_2@example.com") + user_2 = await create_user("user_2@example.com", "example") + + # Run cognify for both datasets as the appropriate user/owner + print("\nCreating different datasets for user_1 (AI dataset) and user_2 (QUANTUM dataset)") + ai_cognify_result = await cognee.cognify(["AI"], user=user_1) + + # Extract dataset_ids from cognify results + def extract_dataset_id_from_cognify(cognify_result): + """Extract dataset_id from cognify output dictionary""" + for dataset_id, pipeline_result in cognify_result.items(): + return dataset_id # Return the first dataset_id + 
return None + + # Get dataset IDs from cognify results + # Note: When we want to work with datasets from other users (search, add, cognify and etc.) we must supply dataset + # information through dataset_id using dataset name only looks for datasets owned by current user + ai_dataset_id = extract_dataset_id_from_cognify(ai_cognify_result) + + # We can see here that user_1 can read his own dataset (AI dataset) + search_results = await cognee.search( + query_type=SearchType.GRAPH_COMPLETION, + query_text="What is in the document?", + user=user_1, + datasets=[ai_dataset_id], + ) + + # Verify that user_2 cannot access user_1's dataset without permission + with pytest.raises(PermissionDeniedError): + search_results = await cognee.search( + query_type=SearchType.GRAPH_COMPLETION, + query_text="What is in the document?", + user=user_2, + datasets=[ai_dataset_id], + ) + + # Create new tenant and role, add user_2 to tenant and role + tenant_id = await create_tenant("CogneeLab", user_1.id) + await select_tenant(user_id=user_1.id, tenant_id=tenant_id) + role_id = await create_role(role_name="Researcher", owner_id=user_1.id) + await add_user_to_tenant( + user_id=user_2.id, tenant_id=tenant_id, owner_id=user_1.id, set_as_active_tenant=True + ) + await add_user_to_role(user_id=user_2.id, role_id=role_id, owner_id=user_1.id) + + # Assert that user_1 cannot give permissions on his dataset to role before switching to the correct tenant + # AI dataset was made with default tenant and not CogneeLab tenant + with pytest.raises(PermissionDeniedError): + await authorized_give_permission_on_datasets( + role_id, + [ai_dataset_id], + "read", + user_1.id, + ) + + # We need to refresh the user object with changes made when switching tenants + user_1 = await get_user(user_1.id) + await cognee.add([text], dataset_name="AI_COGNEE_LAB", user=user_1) + ai_cognee_lab_cognify_result = await cognee.cognify(["AI_COGNEE_LAB"], user=user_1) + + ai_cognee_lab_dataset_id = 
extract_dataset_id_from_cognify(ai_cognee_lab_cognify_result) + + await authorized_give_permission_on_datasets( + role_id, + [ai_cognee_lab_dataset_id], + "read", + user_1.id, + ) + + search_results = await cognee.search( + query_type=SearchType.GRAPH_COMPLETION, + query_text="What is in the document?", + user=user_2, + dataset_ids=[ai_cognee_lab_dataset_id], + ) + for result in search_results: + print(f"{result}\n") + + # Let's test changing tenants + tenant_id = await create_tenant("CogneeLab2", user_1.id) + await select_tenant(user_id=user_1.id, tenant_id=tenant_id) + + user_1 = await get_user(user_1.id) + await cognee.add([text], dataset_name="AI_COGNEE_LAB", user=user_1) + await cognee.cognify(["AI_COGNEE_LAB"], user=user_1) + + search_results = await cognee.search( + query_type=SearchType.GRAPH_COMPLETION, + query_text="What is in the document?", + user=user_1, + ) + + # Assert only AI_COGNEE_LAB dataset from CogneeLab2 tenant is visible as the currently selected tenant + assert len(search_results) == 1, ( + f"Search results must only contain one dataset from current tenant: {search_results}" + ) + assert search_results[0]["dataset_name"] == "AI_COGNEE_LAB", ( + f"Dict must contain dataset name 'AI_COGNEE_LAB': {search_results[0]}" + ) + assert search_results[0]["dataset_tenant_id"] == user_1.tenant_id, ( + f"Dataset tenant_id must be same as user_1 tenant_id: {search_results[0]}" + ) + + # Switch back to no tenant (default tenant) + await select_tenant(user_id=user_1.id, tenant_id=None) + # Refresh user_1 object + user_1 = await get_user(user_1.id) + search_results = await cognee.search( + query_type=SearchType.GRAPH_COMPLETION, + query_text="What is in the document?", + user=user_1, + ) + assert len(search_results) == 1, ( + f"Search results must only contain one dataset from default tenant: {search_results}" + ) + assert search_results[0]["dataset_name"] == "AI", ( + f"Dict must contain dataset name 'AI': {search_results[0]}" + ) + + +if __name__ == 
"__main__": + import asyncio + + logger = setup_logging(log_level=CRITICAL) + asyncio.run(main()) From d6e2bd132b85d9e038475ec4adf87140f69e53ce Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Fri, 7 Nov 2025 16:37:37 +0100 Subject: [PATCH 101/129] refactor: Remove testme comment --- cognee/modules/users/roles/methods/add_user_to_role.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cognee/modules/users/roles/methods/add_user_to_role.py b/cognee/modules/users/roles/methods/add_user_to_role.py index d764ac900..23bb947f0 100644 --- a/cognee/modules/users/roles/methods/add_user_to_role.py +++ b/cognee/modules/users/roles/methods/add_user_to_role.py @@ -48,7 +48,7 @@ async def add_user_to_role(user_id: UUID, role_id: UUID, owner_id: UUID): raise UserNotFoundError elif not role: raise RoleNotFoundError - elif role.tenant_id not in [tenant.id for tenant in user_tenants]: # TESTME + elif role.tenant_id not in [tenant.id for tenant in user_tenants]: raise TenantNotFoundError( message="User tenant does not match role tenant. User cannot be added to role." 
) From 234b13a8c9a62ecd7e85c5d250c37b13bd8dace3 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Fri, 7 Nov 2025 17:39:42 +0100 Subject: [PATCH 102/129] feat:add emptiness check to neptune adapter --- .../hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cognee/infrastructure/databases/hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py b/cognee/infrastructure/databases/hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py index 5357f3d7c..1e16642b5 100644 --- a/cognee/infrastructure/databases/hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py +++ b/cognee/infrastructure/databases/hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py @@ -416,6 +416,15 @@ class NeptuneAnalyticsAdapter(NeptuneGraphDB, VectorDBInterface): self._client.query(f"MATCH (n :{self._VECTOR_NODE_LABEL}) DETACH DELETE n") pass + async def is_empty(self) -> bool: + query = """ + MATCH (n) + RETURN true + LIMIT 1; + """ + query_result = await self._client.query(query) + return len(query_result) == 0 + @staticmethod def _get_scored_result( item: dict, with_vector: bool = False, with_score: bool = False From 3710eec94ff547fb9fb80f3b3b35223098269ffc Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Mon, 10 Nov 2025 16:23:34 +0100 Subject: [PATCH 103/129] refactor: update docstring message --- cognee/api/v1/permissions/routers/get_permissions_router.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cognee/api/v1/permissions/routers/get_permissions_router.py b/cognee/api/v1/permissions/routers/get_permissions_router.py index 20d35e748..63de97eaa 100644 --- a/cognee/api/v1/permissions/routers/get_permissions_router.py +++ b/cognee/api/v1/permissions/routers/get_permissions_router.py @@ -246,7 +246,7 @@ def get_permissions_router() -> APIRouter: - **tenant_id** (Union[UUID, None]): UUID of the tenant to select, If null/None is provided use the default single user tenant ## Response - Returns a success message indicating the 
tenant was created. + Returns a success message along with selected tenant id. """ send_telemetry( "Permissions API Endpoint Invoked", From b5f94c889d00e4043f9f7373b449d4dd165e2391 Mon Sep 17 00:00:00 2001 From: Igor Ilic <30923996+dexters1@users.noreply.github.com> Date: Tue, 11 Nov 2025 12:51:09 +0100 Subject: [PATCH 104/129] Update cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py Co-authored-by: Boris --- .../permissions/methods/get_all_user_permission_datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py b/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py index ee1de3c72..5eed992db 100644 --- a/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py +++ b/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py @@ -39,7 +39,7 @@ async def get_all_user_permission_datasets(user: User, permission_type: str) -> # If the dataset id key already exists, leave the dictionary unchanged. unique.setdefault(dataset.id, dataset) - # Filter out dataset that aren't part of the current user's tenant + # Filter out dataset that aren't part of the selected user's tenant filtered_datasets = [] for dataset in list(unique.values()): if dataset.tenant_id == user.tenant_id: From 8e8aecb76ff66a48e4b5fe18a7ffaac48434f89a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrej=20Mili=C4=87evi=C4=87?= <85933103+siillee@users.noreply.github.com> Date: Tue, 11 Nov 2025 17:03:48 +0100 Subject: [PATCH 105/129] feat: enable multi user for falkor (#1689) ## Description Added multi-user support for Falkor. Adding support for the rest of the graph dbs should be a bit easier after this first one, especially since Falkor is hybrid. There are a few things code quality wise that might need changing, I am open to suggestions. 
## Type of Change - [ ] Bug fix (non-breaking change that fixes an issue) - [x] New feature (non-breaking change that adds functionality) - [ ] Breaking change (fix or feature that would cause existing functionality to change) - [ ] Documentation update - [ ] Code refactoring - [ ] Performance improvement - [ ] Other (please specify): ## Screenshots/Videos (if applicable) ## Pre-submission Checklist - [x] **I have tested my changes thoroughly before submitting this PR** - [x] **This PR contains minimal changes necessary to address the issue/feature** - [ ] My code follows the project's coding standards and style guidelines - [ ] I have added tests that prove my fix is effective or that my feature works - [ ] I have added necessary documentation (if applicable) - [ ] All new and existing tests pass - [ ] I have searched existing PRs to ensure this change hasn't been submitted already - [ ] I have linked any relevant issues in the description - [ ] My commits have clear and descriptive messages ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin. 
--------- Co-authored-by: Andrej Milicevic Co-authored-by: Igor Ilic <30923996+dexters1@users.noreply.github.com> Co-authored-by: Igor Ilic --- ..._expand_dataset_database_for_multi_user.py | 98 +++++++++++++++++++ cognee/context_global_variables.py | 25 ++--- .../infrastructure/databases/graph/config.py | 4 + .../databases/graph/get_graph_engine.py | 2 + .../utils/get_or_create_dataset_database.py | 40 +++++++- .../infrastructure/databases/vector/config.py | 3 + .../databases/vector/create_vector_engine.py | 4 + .../modules/users/models/DatasetDatabase.py | 9 ++ cognee/tests/test_parallel_databases.py | 2 + 9 files changed, 172 insertions(+), 15 deletions(-) create mode 100644 alembic/versions/76625596c5c3_expand_dataset_database_for_multi_user.py diff --git a/alembic/versions/76625596c5c3_expand_dataset_database_for_multi_user.py b/alembic/versions/76625596c5c3_expand_dataset_database_for_multi_user.py new file mode 100644 index 000000000..7e13898ae --- /dev/null +++ b/alembic/versions/76625596c5c3_expand_dataset_database_for_multi_user.py @@ -0,0 +1,98 @@ +"""Expand dataset database for multi user + +Revision ID: 76625596c5c3 +Revises: 211ab850ef3d +Create Date: 2025-10-30 12:55:20.239562 + +""" + +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision: str = "76625596c5c3" +down_revision: Union[str, None] = "c946955da633" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def _get_column(inspector, table, name, schema=None): + for col in inspector.get_columns(table, schema=schema): + if col["name"] == name: + return col + return None + + +def upgrade() -> None: + conn = op.get_bind() + insp = sa.inspect(conn) + + vector_database_provider_column = _get_column( + insp, "dataset_database", "vector_database_provider" + ) + if not vector_database_provider_column: + op.add_column( + "dataset_database", + sa.Column( + "vector_database_provider", + sa.String(), + unique=False, + nullable=False, + server_default="lancedb", + ), + ) + + graph_database_provider_column = _get_column( + insp, "dataset_database", "graph_database_provider" + ) + if not graph_database_provider_column: + op.add_column( + "dataset_database", + sa.Column( + "graph_database_provider", + sa.String(), + unique=False, + nullable=False, + server_default="kuzu", + ), + ) + + vector_database_url_column = _get_column(insp, "dataset_database", "vector_database_url") + if not vector_database_url_column: + op.add_column( + "dataset_database", + sa.Column("vector_database_url", sa.String(), unique=False, nullable=True), + ) + + graph_database_url_column = _get_column(insp, "dataset_database", "graph_database_url") + if not graph_database_url_column: + op.add_column( + "dataset_database", + sa.Column("graph_database_url", sa.String(), unique=False, nullable=True), + ) + + vector_database_key_column = _get_column(insp, "dataset_database", "vector_database_key") + if not vector_database_key_column: + op.add_column( + "dataset_database", + sa.Column("vector_database_key", sa.String(), unique=False, nullable=True), + ) + + graph_database_key_column = _get_column(insp, "dataset_database", "graph_database_key") + if not graph_database_key_column: + op.add_column( + "dataset_database", + 
sa.Column("graph_database_key", sa.String(), unique=False, nullable=True), + ) + + +def downgrade() -> None: + op.drop_column("dataset_database", "vector_database_provider") + op.drop_column("dataset_database", "graph_database_provider") + op.drop_column("dataset_database", "vector_database_url") + op.drop_column("dataset_database", "graph_database_url") + op.drop_column("dataset_database", "vector_database_key") + op.drop_column("dataset_database", "graph_database_key") diff --git a/cognee/context_global_variables.py b/cognee/context_global_variables.py index f17c9187a..62e06fc64 100644 --- a/cognee/context_global_variables.py +++ b/cognee/context_global_variables.py @@ -16,8 +16,8 @@ vector_db_config = ContextVar("vector_db_config", default=None) graph_db_config = ContextVar("graph_db_config", default=None) session_user = ContextVar("session_user", default=None) -vector_dbs_with_multi_user_support = ["lancedb"] -graph_dbs_with_multi_user_support = ["kuzu"] +VECTOR_DBS_WITH_MULTI_USER_SUPPORT = ["lancedb", "falkor"] +GRAPH_DBS_WITH_MULTI_USER_SUPPORT = ["kuzu", "falkor"] async def set_session_user_context_variable(user): @@ -28,8 +28,8 @@ def multi_user_support_possible(): graph_db_config = get_graph_context_config() vector_db_config = get_vectordb_context_config() return ( - graph_db_config["graph_database_provider"] in graph_dbs_with_multi_user_support - and vector_db_config["vector_db_provider"] in vector_dbs_with_multi_user_support + graph_db_config["graph_database_provider"] in GRAPH_DBS_WITH_MULTI_USER_SUPPORT + and vector_db_config["vector_db_provider"] in VECTOR_DBS_WITH_MULTI_USER_SUPPORT ) @@ -69,8 +69,6 @@ async def set_database_global_context_variables(dataset: Union[str, UUID], user_ """ - base_config = get_base_config() - if not backend_access_control_enabled(): return @@ -79,6 +77,7 @@ async def set_database_global_context_variables(dataset: Union[str, UUID], user_ # To ensure permissions are enforced properly all datasets will have their own 
databases dataset_database = await get_or_create_dataset_database(dataset, user) + base_config = get_base_config() data_root_directory = os.path.join( base_config.data_root_directory, str(user.tenant_id or user.id) ) @@ -88,15 +87,17 @@ async def set_database_global_context_variables(dataset: Union[str, UUID], user_ # Set vector and graph database configuration based on dataset database information vector_config = { - "vector_db_url": os.path.join( - databases_directory_path, dataset_database.vector_database_name - ), - "vector_db_key": "", - "vector_db_provider": "lancedb", + "vector_db_provider": dataset_database.vector_database_provider, + "vector_db_url": dataset_database.vector_database_url, + "vector_db_key": dataset_database.vector_database_key, + "vector_db_name": dataset_database.vector_database_name, } graph_config = { - "graph_database_provider": "kuzu", + "graph_database_provider": dataset_database.graph_database_provider, + "graph_database_url": dataset_database.graph_database_url, + "graph_database_name": dataset_database.graph_database_name, + "graph_database_key": dataset_database.graph_database_key, "graph_file_path": os.path.join( databases_directory_path, dataset_database.graph_database_name ), diff --git a/cognee/infrastructure/databases/graph/config.py b/cognee/infrastructure/databases/graph/config.py index b7907313c..23687b359 100644 --- a/cognee/infrastructure/databases/graph/config.py +++ b/cognee/infrastructure/databases/graph/config.py @@ -26,6 +26,7 @@ class GraphConfig(BaseSettings): - graph_database_username - graph_database_password - graph_database_port + - graph_database_key - graph_file_path - graph_model - graph_topology @@ -41,6 +42,7 @@ class GraphConfig(BaseSettings): graph_database_username: str = "" graph_database_password: str = "" graph_database_port: int = 123 + graph_database_key: str = "" graph_file_path: str = "" graph_filename: str = "" graph_model: object = KnowledgeGraph @@ -90,6 +92,7 @@ class 
GraphConfig(BaseSettings): "graph_database_username": self.graph_database_username, "graph_database_password": self.graph_database_password, "graph_database_port": self.graph_database_port, + "graph_database_key": self.graph_database_key, "graph_file_path": self.graph_file_path, "graph_model": self.graph_model, "graph_topology": self.graph_topology, @@ -116,6 +119,7 @@ class GraphConfig(BaseSettings): "graph_database_username": self.graph_database_username, "graph_database_password": self.graph_database_password, "graph_database_port": self.graph_database_port, + "graph_database_key": self.graph_database_key, "graph_file_path": self.graph_file_path, } diff --git a/cognee/infrastructure/databases/graph/get_graph_engine.py b/cognee/infrastructure/databases/graph/get_graph_engine.py index 1ea61d29f..82e3cad6e 100644 --- a/cognee/infrastructure/databases/graph/get_graph_engine.py +++ b/cognee/infrastructure/databases/graph/get_graph_engine.py @@ -33,6 +33,7 @@ def create_graph_engine( graph_database_username="", graph_database_password="", graph_database_port="", + graph_database_key="", ): """ Create a graph engine based on the specified provider type. 
@@ -69,6 +70,7 @@ def create_graph_engine( graph_database_url=graph_database_url, graph_database_username=graph_database_username, graph_database_password=graph_database_password, + database_name=graph_database_name, ) if graph_database_provider == "neo4j": diff --git a/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py b/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py index 29156025d..3684bb100 100644 --- a/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py +++ b/cognee/infrastructure/databases/utils/get_or_create_dataset_database.py @@ -1,11 +1,15 @@ +import os from uuid import UUID from typing import Union from sqlalchemy import select from sqlalchemy.exc import IntegrityError -from cognee.modules.data.methods import create_dataset +from cognee.base_config import get_base_config +from cognee.modules.data.methods import create_dataset from cognee.infrastructure.databases.relational import get_relational_engine +from cognee.infrastructure.databases.vector import get_vectordb_config +from cognee.infrastructure.databases.graph.config import get_graph_config from cognee.modules.data.methods import get_unique_dataset_id from cognee.modules.users.models import DatasetDatabase from cognee.modules.users.models import User @@ -32,8 +36,32 @@ async def get_or_create_dataset_database( dataset_id = await get_unique_dataset_id(dataset, user) - vector_db_name = f"{dataset_id}.lance.db" - graph_db_name = f"{dataset_id}.pkl" + vector_config = get_vectordb_config() + graph_config = get_graph_config() + + # Note: for hybrid databases both graph and vector DB name have to be the same + if graph_config.graph_database_provider == "kuzu": + graph_db_name = f"{dataset_id}.pkl" + else: + graph_db_name = f"{dataset_id}" + + if vector_config.vector_db_provider == "lancedb": + vector_db_name = f"{dataset_id}.lance.db" + else: + vector_db_name = f"{dataset_id}" + + base_config = get_base_config() + databases_directory_path = 
os.path.join( + base_config.system_root_directory, "databases", str(user.id) + ) + + # Determine vector database URL + if vector_config.vector_db_provider == "lancedb": + vector_db_url = os.path.join(databases_directory_path, vector_config.vector_db_name) + else: + vector_db_url = vector_config.vector_database_url + + # Determine graph database URL async with db_engine.get_async_session() as session: # Create dataset if it doesn't exist @@ -55,6 +83,12 @@ async def get_or_create_dataset_database( dataset_id=dataset_id, vector_database_name=vector_db_name, graph_database_name=graph_db_name, + vector_database_provider=vector_config.vector_db_provider, + graph_database_provider=graph_config.graph_database_provider, + vector_database_url=vector_db_url, + graph_database_url=graph_config.graph_database_url, + vector_database_key=vector_config.vector_db_key, + graph_database_key=graph_config.graph_database_key, ) try: diff --git a/cognee/infrastructure/databases/vector/config.py b/cognee/infrastructure/databases/vector/config.py index b6d3ae644..7d28f1668 100644 --- a/cognee/infrastructure/databases/vector/config.py +++ b/cognee/infrastructure/databases/vector/config.py @@ -18,12 +18,14 @@ class VectorConfig(BaseSettings): Instance variables: - vector_db_url: The URL of the vector database. - vector_db_port: The port for the vector database. + - vector_db_name: The name of the vector database. - vector_db_key: The key for accessing the vector database. - vector_db_provider: The provider for the vector database. 
""" vector_db_url: str = "" vector_db_port: int = 1234 + vector_db_name: str = "" vector_db_key: str = "" vector_db_provider: str = "lancedb" @@ -58,6 +60,7 @@ class VectorConfig(BaseSettings): return { "vector_db_url": self.vector_db_url, "vector_db_port": self.vector_db_port, + "vector_db_name": self.vector_db_name, "vector_db_key": self.vector_db_key, "vector_db_provider": self.vector_db_provider, } diff --git a/cognee/infrastructure/databases/vector/create_vector_engine.py b/cognee/infrastructure/databases/vector/create_vector_engine.py index c54d94f6c..b182f084b 100644 --- a/cognee/infrastructure/databases/vector/create_vector_engine.py +++ b/cognee/infrastructure/databases/vector/create_vector_engine.py @@ -1,5 +1,6 @@ from .supported_databases import supported_databases from .embeddings import get_embedding_engine +from cognee.infrastructure.databases.graph.config import get_graph_context_config from functools import lru_cache @@ -8,6 +9,7 @@ from functools import lru_cache def create_vector_engine( vector_db_provider: str, vector_db_url: str, + vector_db_name: str, vector_db_port: str = "", vector_db_key: str = "", ): @@ -27,6 +29,7 @@ def create_vector_engine( - vector_db_url (str): The URL for the vector database instance. - vector_db_port (str): The port for the vector database instance. Required for some providers. + - vector_db_name (str): The name of the vector database instance. - vector_db_key (str): The API key or access token for the vector database instance. - vector_db_provider (str): The name of the vector database provider to use (e.g., 'pgvector'). 
@@ -45,6 +48,7 @@ def create_vector_engine( url=vector_db_url, api_key=vector_db_key, embedding_engine=embedding_engine, + database_name=vector_db_name, ) if vector_db_provider.lower() == "pgvector": diff --git a/cognee/modules/users/models/DatasetDatabase.py b/cognee/modules/users/models/DatasetDatabase.py index 0d71d8413..25d610ab9 100644 --- a/cognee/modules/users/models/DatasetDatabase.py +++ b/cognee/modules/users/models/DatasetDatabase.py @@ -15,5 +15,14 @@ class DatasetDatabase(Base): vector_database_name = Column(String, unique=True, nullable=False) graph_database_name = Column(String, unique=True, nullable=False) + vector_database_provider = Column(String, unique=False, nullable=False) + graph_database_provider = Column(String, unique=False, nullable=False) + + vector_database_url = Column(String, unique=False, nullable=True) + graph_database_url = Column(String, unique=False, nullable=True) + + vector_database_key = Column(String, unique=False, nullable=True) + graph_database_key = Column(String, unique=False, nullable=True) + created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc)) updated_at = Column(DateTime(timezone=True), onupdate=lambda: datetime.now(timezone.utc)) diff --git a/cognee/tests/test_parallel_databases.py b/cognee/tests/test_parallel_databases.py index 9a590921a..3164206ed 100755 --- a/cognee/tests/test_parallel_databases.py +++ b/cognee/tests/test_parallel_databases.py @@ -33,11 +33,13 @@ async def main(): "vector_db_url": "cognee1.test", "vector_db_key": "", "vector_db_provider": "lancedb", + "vector_db_name": "", } task_2_config = { "vector_db_url": "cognee2.test", "vector_db_key": "", "vector_db_provider": "lancedb", + "vector_db_name": "", } task_1_graph_config = { From cc0e1a83ab71f4bb47d1ef0d6308eca185294c86 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Tue, 11 Nov 2025 19:55:29 +0100 Subject: [PATCH 106/129] refactor: Disable telemetry for all non telemetry tests --- 
.github/actions/cognee_setup/action.yml | 4 ++++ .github/workflows/basic_tests.yml | 5 +++++ .github/workflows/cli_tests.yml | 3 +++ .github/workflows/db_examples_tests.yml | 6 +++--- .github/workflows/e2e_tests.yml | 2 +- .github/workflows/examples_tests.yml | 11 +++++++++++ 6 files changed, 27 insertions(+), 4 deletions(-) diff --git a/.github/actions/cognee_setup/action.yml b/.github/actions/cognee_setup/action.yml index 4017d524b..bdc0ae690 100644 --- a/.github/actions/cognee_setup/action.yml +++ b/.github/actions/cognee_setup/action.yml @@ -42,3 +42,7 @@ runs: done fi uv sync --extra api --extra docs --extra evals --extra codegraph --extra ollama --extra dev --extra neo4j --extra redis $EXTRA_ARGS + + - name: Add telemetry identifier for telemetry test and in case telemetry is enabled by accident + run: | + echo "test-machine" > .anon_id diff --git a/.github/workflows/basic_tests.yml b/.github/workflows/basic_tests.yml index b7f324310..98ced21dc 100644 --- a/.github/workflows/basic_tests.yml +++ b/.github/workflows/basic_tests.yml @@ -75,6 +75,7 @@ jobs: name: Run Unit Tests runs-on: ubuntu-22.04 env: + ENV: 'dev' LLM_PROVIDER: openai LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} @@ -104,6 +105,7 @@ jobs: name: Run Integration Tests runs-on: ubuntu-22.04 env: + ENV: 'dev' LLM_PROVIDER: openai LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} @@ -132,6 +134,7 @@ jobs: name: Run Simple Examples runs-on: ubuntu-22.04 env: + ENV: 'dev' LLM_PROVIDER: openai LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} @@ -161,6 +164,7 @@ jobs: name: Run Simple Examples BAML runs-on: ubuntu-22.04 env: + ENV: 'dev' STRUCTURED_OUTPUT_FRAMEWORK: "BAML" BAML_LLM_PROVIDER: openai BAML_LLM_MODEL: ${{ secrets.OPENAI_MODEL }} @@ -198,6 +202,7 @@ jobs: name: Run Basic Graph Tests runs-on: ubuntu-22.04 env: + ENV: 'dev' LLM_PROVIDER: openai LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ 
secrets.LLM_ENDPOINT }} diff --git a/.github/workflows/cli_tests.yml b/.github/workflows/cli_tests.yml index 958d341ae..d4f8e5ac0 100644 --- a/.github/workflows/cli_tests.yml +++ b/.github/workflows/cli_tests.yml @@ -39,6 +39,7 @@ jobs: name: CLI Unit Tests runs-on: ubuntu-22.04 env: + ENV: 'dev' LLM_PROVIDER: openai LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} @@ -66,6 +67,7 @@ jobs: name: CLI Integration Tests runs-on: ubuntu-22.04 env: + ENV: 'dev' LLM_PROVIDER: openai LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} @@ -93,6 +95,7 @@ jobs: name: CLI Functionality Tests runs-on: ubuntu-22.04 env: + ENV: 'dev' LLM_PROVIDER: openai LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} diff --git a/.github/workflows/db_examples_tests.yml b/.github/workflows/db_examples_tests.yml index 51ac9a82a..c58bc48ef 100644 --- a/.github/workflows/db_examples_tests.yml +++ b/.github/workflows/db_examples_tests.yml @@ -60,7 +60,7 @@ jobs: - name: Run Neo4j Example env: - ENV: dev + ENV: 'dev' LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} LLM_API_KEY: ${{ secrets.LLM_API_KEY }} @@ -95,7 +95,7 @@ jobs: - name: Run Kuzu Example env: - ENV: dev + ENV: 'dev' LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} LLM_API_KEY: ${{ secrets.LLM_API_KEY }} @@ -141,7 +141,7 @@ jobs: - name: Run PGVector Example env: - ENV: dev + ENV: 'dev' LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} LLM_API_KEY: ${{ secrets.LLM_API_KEY }} diff --git a/.github/workflows/e2e_tests.yml b/.github/workflows/e2e_tests.yml index bfa75f693..584225afe 100644 --- a/.github/workflows/e2e_tests.yml +++ b/.github/workflows/e2e_tests.yml @@ -454,7 +454,7 @@ jobs: - name: Run Conversation session tests env: - ENV: dev + ENV: 'dev' LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} LLM_API_KEY: ${{ secrets.LLM_API_KEY }} 
diff --git a/.github/workflows/examples_tests.yml b/.github/workflows/examples_tests.yml index 36953e259..f7cc278cb 100644 --- a/.github/workflows/examples_tests.yml +++ b/.github/workflows/examples_tests.yml @@ -21,6 +21,7 @@ jobs: - name: Run Multimedia Example env: + ENV: 'dev' LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} run: uv run python ./examples/python/multimedia_example.py @@ -40,6 +41,7 @@ jobs: - name: Run Evaluation Framework Example env: + ENV: 'dev' LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} LLM_API_KEY: ${{ secrets.LLM_API_KEY }} @@ -69,6 +71,7 @@ jobs: - name: Run Descriptive Graph Metrics Example env: + ENV: 'dev' LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} LLM_API_KEY: ${{ secrets.LLM_API_KEY }} @@ -99,6 +102,7 @@ jobs: - name: Run Dynamic Steps Tests env: + ENV: 'dev' OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} @@ -124,6 +128,7 @@ jobs: - name: Run Temporal Example env: + ENV: 'dev' OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} @@ -149,6 +154,7 @@ jobs: - name: Run Ontology Demo Example env: + ENV: 'dev' OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} @@ -174,6 +180,7 @@ jobs: - name: Run Agentic Reasoning Example env: + ENV: 'dev' OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} @@ -199,6 +206,7 @@ jobs: - name: Run Memify Tests env: + ENV: 'dev' OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} @@ -224,6 +232,7 @@ jobs: - name: Run Custom Pipeline Example env: + ENV: 'dev' OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} LLM_MODEL: ${{ secrets.LLM_MODEL }} 
LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} @@ -249,6 +258,7 @@ jobs: - name: Run Memify Tests env: + ENV: 'dev' OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} @@ -274,6 +284,7 @@ jobs: - name: Run Docling Test env: + ENV: 'dev' OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} From ba693b7ef46b617a5aa069ad7bb7bee00fb04586 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Tue, 11 Nov 2025 19:58:30 +0100 Subject: [PATCH 107/129] chore: add shell to setting of anon_id in gh action --- .github/actions/cognee_setup/action.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/actions/cognee_setup/action.yml b/.github/actions/cognee_setup/action.yml index bdc0ae690..3f5726015 100644 --- a/.github/actions/cognee_setup/action.yml +++ b/.github/actions/cognee_setup/action.yml @@ -44,5 +44,6 @@ runs: uv sync --extra api --extra docs --extra evals --extra codegraph --extra ollama --extra dev --extra neo4j --extra redis $EXTRA_ARGS - name: Add telemetry identifier for telemetry test and in case telemetry is enabled by accident + shell: bash run: | echo "test-machine" > .anon_id From b716c2e3c431ce10679a30150afb554d32557212 Mon Sep 17 00:00:00 2001 From: Pavel Zorin Date: Wed, 12 Nov 2025 13:35:04 +0100 Subject: [PATCH 108/129] Chore: Acceptance Criteria for PRs --- .github/pull_request_template.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 0e6f74188..be9d219c1 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -6,6 +6,14 @@ Please provide a clear, human-generated description of the changes in this PR. DO NOT use AI-generated descriptions. We want to understand your thought process and reasoning. 
--> +## Acceptance Criteria + + ## Type of Change - [ ] Bug fix (non-breaking change that fixes an issue) From 056424f244fe029cc3acdd0127f70796bac25377 Mon Sep 17 00:00:00 2001 From: Daulet Amirkhanov Date: Wed, 12 Nov 2025 14:34:30 +0000 Subject: [PATCH 109/129] feat: fs-cache (#1645) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Description Implement File-Based Version of the Redis Cache Adapter Description and acceptance criteria: This PR introduces a file-based cache adapter as an alternative to the existing Redis-based adapter. It provides the same core functionality for caching session data and maintaining context across multiple user interactions but stores data locally in files instead of Redis. Because the shared Kùzu lock mechanism relies on Redis, it is not supported in this implementation. If a lock is configured, the adapter will raise an error to prevent misconfiguration. You can test this adapter by enabling caching with the following settings: caching=True cache_backend="fs" When running multiple searches in a session, the system should correctly maintain conversational context. For example: - What is XY? - Are you sure? - What was my first question? In this case, the adapter should preserve previous user–Cognee interactions within the cache file so that follow-up queries remain context-aware. 
## Type of Change - [ ] Bug fix (non-breaking change that fixes an issue) - [x] New feature (non-breaking change that adds functionality) - [ ] Breaking change (fix or feature that would cause existing functionality to change) - [ ] Documentation update - [ ] Code refactoring - [ ] Performance improvement - [ ] Other (please specify): ## Screenshots/Videos (if applicable) ## Pre-submission Checklist - [x] **I have tested my changes thoroughly before submitting this PR** - [x] **This PR contains minimal changes necessary to address the issue/feature** - [x] My code follows the project's coding standards and style guidelines - [x] I have added tests that prove my fix is effective or that my feature works - [x] I have added necessary documentation (if applicable) - [x] All new and existing tests pass - [x] I have searched existing PRs to ensure this change hasn't been submitted already - [x] I have linked any relevant issues in the description - [x] My commits have clear and descriptive messages ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin. 
--------- Co-authored-by: Vasilije <8619304+Vasilije1990@users.noreply.github.com> Co-authored-by: hajdul88 <52442977+hajdul88@users.noreply.github.com> --- .github/workflows/e2e_tests.yml | 63 +++++- .../infrastructure/databases/cache/config.py | 4 +- .../databases/cache/fscache/FsCacheAdapter.py | 151 +++++++++++++ .../databases/cache/get_cache_engine.py | 30 ++- .../databases/exceptions/exceptions.py | 16 ++ cognee/shared/logging_utils.py | 2 + .../databases/cache/test_cache_config.py | 5 + poetry.lock | 206 +++++++++++++++--- pyproject.toml | 7 + uv.lock | 104 +++++++++ 10 files changed, 543 insertions(+), 45 deletions(-) create mode 100644 cognee/infrastructure/databases/cache/fscache/FsCacheAdapter.py diff --git a/.github/workflows/e2e_tests.yml b/.github/workflows/e2e_tests.yml index 584225afe..3dea2548c 100644 --- a/.github/workflows/e2e_tests.yml +++ b/.github/workflows/e2e_tests.yml @@ -333,7 +333,7 @@ jobs: python-version: '3.11.x' extra-dependencies: "postgres redis" - - name: Run Concurrent subprocess access test (Kuzu/Lancedb/Postgres) + - name: Run Concurrent subprocess access test (Kuzu/Lancedb/Postgres/Redis) env: ENV: dev LLM_MODEL: ${{ secrets.LLM_MODEL }} @@ -346,6 +346,7 @@ jobs: EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} GRAPH_DATABASE_PROVIDER: 'kuzu' CACHING: true + CACHE_BACKEND: 'redis' SHARED_KUZU_LOCK: true DB_PROVIDER: 'postgres' DB_NAME: 'cognee_db' @@ -411,8 +412,8 @@ jobs: EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} run: uv run python ./cognee/tests/test_feedback_enrichment.py - run_conversation_sessions_test: - name: Conversation sessions test + run_conversation_sessions_test_redis: + name: Conversation sessions test (Redis) runs-on: ubuntu-latest defaults: run: @@ -452,7 +453,7 @@ jobs: python-version: '3.11.x' extra-dependencies: "postgres redis" - - name: Run Conversation session tests + - name: Run Conversation session tests (Redis) env: ENV: 'dev' LLM_MODEL: ${{ secrets.LLM_MODEL }} @@ -465,6 
+466,60 @@ jobs: EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} GRAPH_DATABASE_PROVIDER: 'kuzu' CACHING: true + CACHE_BACKEND: 'redis' + DB_PROVIDER: 'postgres' + DB_NAME: 'cognee_db' + DB_HOST: '127.0.0.1' + DB_PORT: 5432 + DB_USERNAME: cognee + DB_PASSWORD: cognee + run: uv run python ./cognee/tests/test_conversation_history.py + + run_conversation_sessions_test_fs: + name: Conversation sessions test (FS) + runs-on: ubuntu-latest + defaults: + run: + shell: bash + services: + postgres: + image: pgvector/pgvector:pg17 + env: + POSTGRES_USER: cognee + POSTGRES_PASSWORD: cognee + POSTGRES_DB: cognee_db + options: >- + --health-cmd pg_isready + --health-interval 10s + --health-timeout 5s + --health-retries 5 + ports: + - 5432:5432 + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Cognee Setup + uses: ./.github/actions/cognee_setup + with: + python-version: '3.11.x' + extra-dependencies: "postgres" + + - name: Run Conversation session tests (FS) + env: + ENV: dev + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} + GRAPH_DATABASE_PROVIDER: 'kuzu' + CACHING: true + CACHE_BACKEND: 'fs' DB_PROVIDER: 'postgres' DB_NAME: 'cognee_db' DB_HOST: '127.0.0.1' diff --git a/cognee/infrastructure/databases/cache/config.py b/cognee/infrastructure/databases/cache/config.py index 3a28827fe..88ac05885 100644 --- a/cognee/infrastructure/databases/cache/config.py +++ b/cognee/infrastructure/databases/cache/config.py @@ -1,6 +1,6 @@ from pydantic_settings import BaseSettings, SettingsConfigDict from functools import lru_cache -from typing import Optional +from typing import Optional, Literal class 
CacheConfig(BaseSettings): @@ -15,6 +15,7 @@ class CacheConfig(BaseSettings): - agentic_lock_timeout: Maximum time (in seconds) to wait for the lock release. """ + cache_backend: Literal["redis", "fs"] = "fs" caching: bool = False shared_kuzu_lock: bool = False cache_host: str = "localhost" @@ -28,6 +29,7 @@ class CacheConfig(BaseSettings): def to_dict(self) -> dict: return { + "cache_backend": self.cache_backend, "caching": self.caching, "shared_kuzu_lock": self.shared_kuzu_lock, "cache_host": self.cache_host, diff --git a/cognee/infrastructure/databases/cache/fscache/FsCacheAdapter.py b/cognee/infrastructure/databases/cache/fscache/FsCacheAdapter.py new file mode 100644 index 000000000..497e6afec --- /dev/null +++ b/cognee/infrastructure/databases/cache/fscache/FsCacheAdapter.py @@ -0,0 +1,151 @@ +import asyncio +import json +import os +from datetime import datetime +import time +import threading +import diskcache as dc + +from cognee.infrastructure.databases.cache.cache_db_interface import CacheDBInterface +from cognee.infrastructure.databases.exceptions.exceptions import ( + CacheConnectionError, + SharedKuzuLockRequiresRedisError, +) +from cognee.infrastructure.files.storage.get_storage_config import get_storage_config +from cognee.shared.logging_utils import get_logger + +logger = get_logger("FSCacheAdapter") + + +class FSCacheAdapter(CacheDBInterface): + def __init__(self): + default_key = "sessions_db" + + storage_config = get_storage_config() + data_root_directory = storage_config["data_root_directory"] + cache_directory = os.path.join(data_root_directory, ".cognee_fs_cache", default_key) + os.makedirs(cache_directory, exist_ok=True) + self.cache = dc.Cache(directory=cache_directory) + self.cache.expire() + + logger.debug(f"FSCacheAdapter initialized with cache directory: {cache_directory}") + + def acquire_lock(self): + """Lock acquisition is not available for filesystem cache backend.""" + message = "Shared Kuzu lock requires Redis cache backend." 
+ logger.error(message) + raise SharedKuzuLockRequiresRedisError() + + def release_lock(self): + """Lock release is not available for filesystem cache backend.""" + message = "Shared Kuzu lock requires Redis cache backend." + logger.error(message) + raise SharedKuzuLockRequiresRedisError() + + async def add_qa( + self, + user_id: str, + session_id: str, + question: str, + context: str, + answer: str, + ttl: int | None = 86400, + ): + try: + session_key = f"agent_sessions:{user_id}:{session_id}" + + qa_entry = { + "time": datetime.utcnow().isoformat(), + "question": question, + "context": context, + "answer": answer, + } + + existing_value = self.cache.get(session_key) + if existing_value is not None: + value: list = json.loads(existing_value) + value.append(qa_entry) + else: + value = [qa_entry] + + self.cache.set(session_key, json.dumps(value), expire=ttl) + except Exception as e: + error_msg = f"Unexpected error while adding Q&A to diskcache: {str(e)}" + logger.error(error_msg) + raise CacheConnectionError(error_msg) from e + + async def get_latest_qa(self, user_id: str, session_id: str, last_n: int = 5): + session_key = f"agent_sessions:{user_id}:{session_id}" + value = self.cache.get(session_key) + if value is None: + return None + entries = json.loads(value) + return entries[-last_n:] if len(entries) > last_n else entries + + async def get_all_qas(self, user_id: str, session_id: str): + session_key = f"agent_sessions:{user_id}:{session_id}" + value = self.cache.get(session_key) + if value is None: + return None + return json.loads(value) + + async def close(self): + if self.cache is not None: + self.cache.expire() + self.cache.close() + + +async def main(): + adapter = FSCacheAdapter() + session_id = "demo_session" + user_id = "demo_user_id" + + print("\nAdding sample Q/A pairs...") + await adapter.add_qa( + user_id, + session_id, + "What is Redis?", + "Basic DB context", + "Redis is an in-memory data store.", + ) + await adapter.add_qa( + user_id, + 
session_id, + "Who created Redis?", + "Historical context", + "Salvatore Sanfilippo (antirez).", + ) + + print("\nLatest QA:") + latest = await adapter.get_latest_qa(user_id, session_id) + print(json.dumps(latest, indent=2)) + + print("\nLast 2 QAs:") + last_two = await adapter.get_latest_qa(user_id, session_id, last_n=2) + print(json.dumps(last_two, indent=2)) + + session_id = "session_expire_demo" + + await adapter.add_qa( + user_id, + session_id, + "What is Redis?", + "Database context", + "Redis is an in-memory data store.", + ) + + await adapter.add_qa( + user_id, + session_id, + "Who created Redis?", + "History context", + "Salvatore Sanfilippo (antirez).", + ) + + print(await adapter.get_all_qas(user_id, session_id)) + + await adapter.close() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/cognee/infrastructure/databases/cache/get_cache_engine.py b/cognee/infrastructure/databases/cache/get_cache_engine.py index c1fa3311c..f70358607 100644 --- a/cognee/infrastructure/databases/cache/get_cache_engine.py +++ b/cognee/infrastructure/databases/cache/get_cache_engine.py @@ -1,9 +1,11 @@ """Factory to get the appropriate cache coordination engine (e.g., Redis).""" from functools import lru_cache +import os from typing import Optional from cognee.infrastructure.databases.cache.config import get_cache_config from cognee.infrastructure.databases.cache.cache_db_interface import CacheDBInterface +from cognee.infrastructure.databases.cache.fscache.FsCacheAdapter import FSCacheAdapter config = get_cache_config() @@ -33,20 +35,28 @@ def create_cache_engine( Returns: -------- - - CacheDBInterface: An instance of the appropriate cache adapter. :TODO: Now we support only Redis. later if we add more here we can split the logic + - CacheDBInterface: An instance of the appropriate cache adapter. 
""" if config.caching: from cognee.infrastructure.databases.cache.redis.RedisAdapter import RedisAdapter - return RedisAdapter( - host=cache_host, - port=cache_port, - username=cache_username, - password=cache_password, - lock_name=lock_key, - timeout=agentic_lock_expire, - blocking_timeout=agentic_lock_timeout, - ) + if config.cache_backend == "redis": + return RedisAdapter( + host=cache_host, + port=cache_port, + username=cache_username, + password=cache_password, + lock_name=lock_key, + timeout=agentic_lock_expire, + blocking_timeout=agentic_lock_timeout, + ) + elif config.cache_backend == "fs": + return FSCacheAdapter() + else: + raise ValueError( + f"Unsupported cache backend: '{config.cache_backend}'. " + f"Supported backends are: 'redis', 'fs'" + ) else: return None diff --git a/cognee/infrastructure/databases/exceptions/exceptions.py b/cognee/infrastructure/databases/exceptions/exceptions.py index 72b13e3a2..d8dd99c17 100644 --- a/cognee/infrastructure/databases/exceptions/exceptions.py +++ b/cognee/infrastructure/databases/exceptions/exceptions.py @@ -148,3 +148,19 @@ class CacheConnectionError(CogneeConfigurationError): status_code: int = status.HTTP_503_SERVICE_UNAVAILABLE, ): super().__init__(message, name, status_code) + + +class SharedKuzuLockRequiresRedisError(CogneeConfigurationError): + """ + Raised when shared Kuzu locking is requested without configuring the Redis backend. + """ + + def __init__( + self, + message: str = ( + "Shared Kuzu lock requires Redis cache backend. Configure Redis to enable shared Kuzu locking." 
+ ), + name: str = "SharedKuzuLockRequiresRedisError", + status_code: int = status.HTTP_400_BAD_REQUEST, + ): + super().__init__(message, name, status_code) diff --git a/cognee/shared/logging_utils.py b/cognee/shared/logging_utils.py index 0e5120b1d..e8efde72c 100644 --- a/cognee/shared/logging_utils.py +++ b/cognee/shared/logging_utils.py @@ -450,6 +450,8 @@ def setup_logging(log_level=None, name=None): try: msg = self.format(record) stream = self.stream + if hasattr(stream, "closed") and stream.closed: + return stream.write("\n" + msg + self.terminator) self.flush() except Exception: diff --git a/cognee/tests/unit/infrastructure/databases/cache/test_cache_config.py b/cognee/tests/unit/infrastructure/databases/cache/test_cache_config.py index a8d3bda82..837a9955c 100644 --- a/cognee/tests/unit/infrastructure/databases/cache/test_cache_config.py +++ b/cognee/tests/unit/infrastructure/databases/cache/test_cache_config.py @@ -8,6 +8,7 @@ def test_cache_config_defaults(): """Test that CacheConfig has the correct default values.""" config = CacheConfig() + assert config.cache_backend == "fs" assert config.caching is False assert config.shared_kuzu_lock is False assert config.cache_host == "localhost" @@ -19,6 +20,7 @@ def test_cache_config_defaults(): def test_cache_config_custom_values(): """Test that CacheConfig accepts custom values.""" config = CacheConfig( + cache_backend="redis", caching=True, shared_kuzu_lock=True, cache_host="redis.example.com", @@ -27,6 +29,7 @@ def test_cache_config_custom_values(): agentic_lock_timeout=180, ) + assert config.cache_backend == "redis" assert config.caching is True assert config.shared_kuzu_lock is True assert config.cache_host == "redis.example.com" @@ -38,6 +41,7 @@ def test_cache_config_custom_values(): def test_cache_config_to_dict(): """Test the to_dict method returns all configuration values.""" config = CacheConfig( + cache_backend="fs", caching=True, shared_kuzu_lock=True, cache_host="test-host", @@ -49,6 +53,7 @@ def 
test_cache_config_to_dict(): config_dict = config.to_dict() assert config_dict == { + "cache_backend": "fs", "caching": True, "shared_kuzu_lock": True, "cache_host": "test-host", diff --git a/poetry.lock b/poetry.lock index 08fd42660..67de51633 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.2.1 and should not be changed by hand. [[package]] name = "accelerate" @@ -539,7 +539,7 @@ description = "Timeout context manager for asyncio programs" optional = false python-versions = ">=3.8" groups = ["main"] -markers = "extra == \"redis\" and python_full_version < \"3.11.3\" or python_version == \"3.10\"" +markers = "python_full_version < \"3.11.3\"" files = [ {file = "async_timeout-5.0.1-py3-none-any.whl", hash = "sha256:39e3809566ff85354557ec2398b55e096c8364bacac9405a7a1fa429e77fe76c"}, {file = "async_timeout-5.0.1.tar.gz", hash = "sha256:d9321a7a3d5a6a5e187e824d2fa0793ce379a202935782d555d6e9d2735677d3"}, @@ -1231,12 +1231,12 @@ version = "0.4.6" description = "Cross-platform colored terminal text." 
optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" -groups = ["main"] -markers = "(platform_system == \"Windows\" or extra == \"llama-index\" or extra == \"dev\" or extra == \"chromadb\" or sys_platform == \"win32\") and (platform_system == \"Windows\" or os_name == \"nt\" or extra == \"llama-index\" or extra == \"dev\" or sys_platform == \"win32\")" +groups = ["main", "dev"] files = [ {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, ] +markers = {main = "(platform_system == \"Windows\" or extra == \"llama-index\" or extra == \"dev\" or extra == \"chromadb\" or sys_platform == \"win32\") and (platform_system == \"Windows\" or os_name == \"nt\" or extra == \"llama-index\" or extra == \"dev\" or sys_platform == \"win32\")", dev = "sys_platform == \"win32\""} [[package]] name = "coloredlogs" @@ -2347,7 +2347,7 @@ version = "1.3.0" description = "Backport of PEP 654 (exception groups)" optional = false python-versions = ">=3.7" -groups = ["main"] +groups = ["main", "dev"] markers = "python_version == \"3.10\"" files = [ {file = "exceptiongroup-1.3.0-py3-none-any.whl", hash = "sha256:4d111e6e0c13d0644cad6ddaa7ed0261a0b36971f6d23e7ec9b4b9097da78a10"}, @@ -2408,6 +2408,32 @@ files = [ [package.dependencies] tzdata = "*" +[[package]] +name = "fakeredis" +version = "2.32.0" +description = "Python implementation of redis API, can be used for testing purposes." 
+optional = false +python-versions = ">=3.7" +groups = ["main"] +files = [ + {file = "fakeredis-2.32.0-py3-none-any.whl", hash = "sha256:c9da8228de84060cfdb72c3cf4555c18c59ba7a5ae4d273f75e4822d6f01ecf8"}, + {file = "fakeredis-2.32.0.tar.gz", hash = "sha256:63d745b40eb6c8be4899cf2a53187c097ccca3afbca04fdbc5edc8b936cd1d59"}, +] + +[package.dependencies] +lupa = {version = ">=2.1,<3.0", optional = true, markers = "extra == \"lua\""} +redis = {version = ">=4.3", markers = "python_version > \"3.8\""} +sortedcontainers = ">=2,<3" +typing-extensions = {version = ">=4.7,<5.0", markers = "python_version < \"3.11\""} + +[package.extras] +bf = ["pyprobables (>=0.6)"] +cf = ["pyprobables (>=0.6)"] +json = ["jsonpath-ng (>=1.6,<2.0)"] +lua = ["lupa (>=2.1,<3.0)"] +probabilistic = ["pyprobables (>=0.6)"] +valkey = ["valkey (>=6) ; python_version >= \"3.8\""] + [[package]] name = "fastapi" version = "0.117.1" @@ -2543,6 +2569,7 @@ files = [ {file = "fastuuid-0.12.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b9b31dd488d0778c36f8279b306dc92a42f16904cba54acca71e107d65b60b0c"}, {file = "fastuuid-0.12.0-cp313-cp313-manylinux_2_34_x86_64.whl", hash = "sha256:b19361ee649365eefc717ec08005972d3d1eb9ee39908022d98e3bfa9da59e37"}, {file = "fastuuid-0.12.0-cp313-cp313-win_amd64.whl", hash = "sha256:8fc66b11423e6f3e1937385f655bedd67aebe56a3dcec0cb835351cfe7d358c9"}, + {file = "fastuuid-0.12.0-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:2925f67b88d47cb16aa3eb1ab20fdcf21b94d74490e0818c91ea41434b987493"}, {file = "fastuuid-0.12.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7b15c54d300279ab20a9cc0579ada9c9f80d1bc92997fc61fb7bf3103d7cb26b"}, {file = "fastuuid-0.12.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:458f1bc3ebbd76fdb89ad83e6b81ccd3b2a99fa6707cd3650b27606745cfb170"}, {file = "fastuuid-0.12.0-cp38-cp38-manylinux_2_34_x86_64.whl", hash = "sha256:a8f0f83fbba6dc44271a11b22e15838641b8c45612cdf541b4822a5930f6893c"}, @@ 
-3705,14 +3732,14 @@ type = ["pytest-mypy"] name = "iniconfig" version = "2.1.0" description = "brain-dead simple config-ini parsing" -optional = true +optional = false python-versions = ">=3.8" -groups = ["main"] -markers = "extra == \"deepeval\" or extra == \"dev\"" +groups = ["main", "dev"] files = [ {file = "iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760"}, {file = "iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7"}, ] +markers = {main = "extra == \"deepeval\" or extra == \"dev\""} [[package]] name = "instructor" @@ -4169,6 +4196,8 @@ groups = ["main"] markers = "extra == \"dlt\"" files = [ {file = "jsonpath-ng-1.7.0.tar.gz", hash = "sha256:f6f5f7fd4e5ff79c785f1573b394043b39849fb2bb47bcead935d12b00beab3c"}, + {file = "jsonpath_ng-1.7.0-py2-none-any.whl", hash = "sha256:898c93fc173f0c336784a3fa63d7434297544b7198124a68f9a3ef9597b0ae6e"}, + {file = "jsonpath_ng-1.7.0-py3-none-any.whl", hash = "sha256:f3d7f9e848cba1b6da28c55b1c26ff915dc9e0b1ba7e752a53d6da8d5cbd00b6"}, ] [package.dependencies] @@ -5082,6 +5111,104 @@ win32-setctime = {version = ">=1.0.0", markers = "sys_platform == \"win32\""} [package.extras] dev = ["Sphinx (==8.1.3) ; python_version >= \"3.11\"", "build (==1.2.2) ; python_version >= \"3.11\"", "colorama (==0.4.5) ; python_version < \"3.8\"", "colorama (==0.4.6) ; python_version >= \"3.8\"", "exceptiongroup (==1.1.3) ; python_version >= \"3.7\" and python_version < \"3.11\"", "freezegun (==1.1.0) ; python_version < \"3.8\"", "freezegun (==1.5.0) ; python_version >= \"3.8\"", "mypy (==v0.910) ; python_version < \"3.6\"", "mypy (==v0.971) ; python_version == \"3.6\"", "mypy (==v1.13.0) ; python_version >= \"3.8\"", "mypy (==v1.4.1) ; python_version == \"3.7\"", "myst-parser (==4.0.0) ; python_version >= \"3.11\"", "pre-commit (==4.0.1) ; python_version >= \"3.9\"", "pytest (==6.1.2) ; python_version < \"3.8\"", "pytest (==8.3.2) 
; python_version >= \"3.8\"", "pytest-cov (==2.12.1) ; python_version < \"3.8\"", "pytest-cov (==5.0.0) ; python_version == \"3.8\"", "pytest-cov (==6.0.0) ; python_version >= \"3.9\"", "pytest-mypy-plugins (==1.9.3) ; python_version >= \"3.6\" and python_version < \"3.8\"", "pytest-mypy-plugins (==3.1.0) ; python_version >= \"3.8\"", "sphinx-rtd-theme (==3.0.2) ; python_version >= \"3.11\"", "tox (==3.27.1) ; python_version < \"3.8\"", "tox (==4.23.2) ; python_version >= \"3.8\"", "twine (==6.0.1) ; python_version >= \"3.11\""] +[[package]] +name = "lupa" +version = "2.6" +description = "Python wrapper around Lua and LuaJIT" +optional = false +python-versions = "*" +groups = ["main"] +files = [ + {file = "lupa-2.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6b3dabda836317e63c5ad052826e156610f356a04b3003dfa0dbe66b5d54d671"}, + {file = "lupa-2.6-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:8726d1c123bbe9fbb974ce29825e94121824e66003038ff4532c14cc2ed0c51c"}, + {file = "lupa-2.6-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:f4e159e7d814171199b246f9235ca8961f6461ea8c1165ab428afa13c9289a94"}, + {file = "lupa-2.6-cp310-cp310-manylinux2010_i686.manylinux_2_12_i686.manylinux_2_28_i686.whl", hash = "sha256:202160e80dbfddfb79316692a563d843b767e0f6787bbd1c455f9d54052efa6c"}, + {file = "lupa-2.6-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5deede7c5b36ab64f869dae4831720428b67955b0bb186c8349cf6ea121c852b"}, + {file = "lupa-2.6-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:86f04901f920bbf7c0cac56807dc9597e42347123e6f1f3ca920f15f54188ce5"}, + {file = "lupa-2.6-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:6deef8f851d6afb965c84849aa5b8c38856942df54597a811ce0369ced678610"}, + {file = "lupa-2.6-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:21f2b5549681c2a13b1170a26159d30875d367d28f0247b81ca347222c755038"}, + {file = 
"lupa-2.6-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:66eea57630eab5e6f49fdc5d7811c0a2a41f2011be4ea56a087ea76112011eb7"}, + {file = "lupa-2.6-cp310-cp310-win32.whl", hash = "sha256:60a403de8cab262a4fe813085dd77010effa6e2eb1886db2181df803140533b1"}, + {file = "lupa-2.6-cp310-cp310-win_amd64.whl", hash = "sha256:e4656a39d93dfa947cf3db56dc16c7916cb0cc8024acd3a952071263f675df64"}, + {file = "lupa-2.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6d988c0f9331b9f2a5a55186701a25444ab10a1432a1021ee58011499ecbbdd5"}, + {file = "lupa-2.6-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:ebe1bbf48259382c72a6fe363dea61a0fd6fe19eab95e2ae881e20f3654587bf"}, + {file = "lupa-2.6-cp311-cp311-macosx_11_0_x86_64.whl", hash = "sha256:a8fcee258487cf77cdd41560046843bb38c2e18989cd19671dd1e2596f798306"}, + {file = "lupa-2.6-cp311-cp311-manylinux2010_i686.manylinux_2_12_i686.manylinux_2_28_i686.whl", hash = "sha256:561a8e3be800827884e767a694727ed8482d066e0d6edfcbf423b05e63b05535"}, + {file = "lupa-2.6-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:af880a62d47991cae78b8e9905c008cbfdc4a3a9723a66310c2634fc7644578c"}, + {file = "lupa-2.6-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:80b22923aa4023c86c0097b235615f89d469a0c4eee0489699c494d3367c4c85"}, + {file = "lupa-2.6-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:153d2cc6b643f7efb9cfc0c6bb55ec784d5bac1a3660cfc5b958a7b8f38f4a75"}, + {file = "lupa-2.6-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:3fa8777e16f3ded50b72967dc17e23f5a08e4f1e2c9456aff2ebdb57f5b2869f"}, + {file = "lupa-2.6-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:8dbdcbe818c02a2f56f5ab5ce2de374dab03e84b25266cfbaef237829bc09b3f"}, + {file = "lupa-2.6-cp311-cp311-win32.whl", hash = "sha256:defaf188fde8f7a1e5ce3a5e6d945e533b8b8d547c11e43b96c9b7fe527f56dc"}, + {file = "lupa-2.6-cp311-cp311-win_amd64.whl", hash = 
"sha256:9505ae600b5c14f3e17e70f87f88d333717f60411faca1ddc6f3e61dce85fa9e"}, + {file = "lupa-2.6-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:47ce718817ef1cc0c40d87c3d5ae56a800d61af00fbc0fad1ca9be12df2f3b56"}, + {file = "lupa-2.6-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:7aba985b15b101495aa4b07112cdc08baa0c545390d560ad5cfde2e9e34f4d58"}, + {file = "lupa-2.6-cp312-cp312-macosx_11_0_x86_64.whl", hash = "sha256:b766f62f95b2739f2248977d29b0722e589dcf4f0ccfa827ccbd29f0148bd2e5"}, + {file = "lupa-2.6-cp312-cp312-manylinux2010_i686.manylinux_2_12_i686.manylinux_2_28_i686.whl", hash = "sha256:00a934c23331f94cb51760097ebfab14b005d55a6b30a2b480e3c53dd2fa290d"}, + {file = "lupa-2.6-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:21de9f38bd475303e34a042b7081aabdf50bd9bafd36ce4faea2f90fd9f15c31"}, + {file = "lupa-2.6-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:cf3bda96d3fc41237e964a69c23647d50d4e28421111360274d4799832c560e9"}, + {file = "lupa-2.6-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:5a76ead245da54801a81053794aa3975f213221f6542d14ec4b859ee2e7e0323"}, + {file = "lupa-2.6-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:8dd0861741caa20886ddbda0a121d8e52fb9b5bb153d82fa9bba796962bf30e8"}, + {file = "lupa-2.6-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:239e63948b0b23023f81d9a19a395e768ed3da6a299f84e7963b8f813f6e3f9c"}, + {file = "lupa-2.6-cp312-cp312-win32.whl", hash = "sha256:325894e1099499e7a6f9c351147661a2011887603c71086d36fe0f964d52d1ce"}, + {file = "lupa-2.6-cp312-cp312-win_amd64.whl", hash = "sha256:c735a1ce8ee60edb0fe71d665f1e6b7c55c6021f1d340eb8c865952c602cd36f"}, + {file = "lupa-2.6-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:663a6e58a0f60e7d212017d6678639ac8df0119bc13c2145029dcba084391310"}, + {file = "lupa-2.6-cp313-cp313-macosx_11_0_universal2.whl", hash = 
"sha256:d1f5afda5c20b1f3217a80e9bc1b77037f8a6eb11612fd3ada19065303c8f380"}, + {file = "lupa-2.6-cp313-cp313-macosx_11_0_x86_64.whl", hash = "sha256:26f2b3c085fe76e9119e48c1013c1cccdc1f51585d456858290475aa38e7089e"}, + {file = "lupa-2.6-cp313-cp313-manylinux2010_i686.manylinux_2_12_i686.manylinux_2_28_i686.whl", hash = "sha256:60d2f902c7b96fb8ab98493dcff315e7bb4d0b44dc9dd76eb37de575025d5685"}, + {file = "lupa-2.6-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a02d25dee3a3250967c36590128d9220ae02f2eda166a24279da0b481519cbff"}, + {file = "lupa-2.6-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6eae1ee16b886b8914ff292dbefbf2f48abfbdee94b33a88d1d5475e02423203"}, + {file = "lupa-2.6-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:b0edd5073a4ee74ab36f74fe61450148e6044f3952b8d21248581f3c5d1a58be"}, + {file = "lupa-2.6-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:0c53ee9f22a8a17e7d4266ad48e86f43771951797042dd51d1494aaa4f5f3f0a"}, + {file = "lupa-2.6-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:de7c0f157a9064a400d828789191a96da7f4ce889969a588b87ec80de9b14772"}, + {file = "lupa-2.6-cp313-cp313-win32.whl", hash = "sha256:ee9523941ae0a87b5b703417720c5d78f72d2f5bc23883a2ea80a949a3ed9e75"}, + {file = "lupa-2.6-cp313-cp313-win_amd64.whl", hash = "sha256:b1335a5835b0a25ebdbc75cf0bda195e54d133e4d994877ef025e218c2e59db9"}, + {file = "lupa-2.6-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:dcb6d0a3264873e1653bc188499f48c1fb4b41a779e315eba45256cfe7bc33c1"}, + {file = "lupa-2.6-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:a37e01f2128f8c36106726cb9d360bac087d58c54b4522b033cc5691c584db18"}, + {file = "lupa-2.6-cp314-cp314-macosx_11_0_x86_64.whl", hash = "sha256:458bd7e9ff3c150b245b0fcfbb9bd2593d1152ea7f0a7b91c1d185846da033fe"}, + {file = "lupa-2.6-cp314-cp314-manylinux2010_i686.manylinux_2_12_i686.manylinux_2_28_i686.whl", hash = 
"sha256:052ee82cac5206a02df77119c325339acbc09f5ce66967f66a2e12a0f3211cad"}, + {file = "lupa-2.6-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:96594eca3c87dd07938009e95e591e43d554c1dbd0385be03c100367141db5a8"}, + {file = "lupa-2.6-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e8faddd9d198688c8884091173a088a8e920ecc96cda2ffed576a23574c4b3f6"}, + {file = "lupa-2.6-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:daebb3a6b58095c917e76ba727ab37b27477fb926957c825205fbda431552134"}, + {file = "lupa-2.6-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:f3154e68972befe0f81564e37d8142b5d5d79931a18309226a04ec92487d4ea3"}, + {file = "lupa-2.6-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:e4dadf77b9fedc0bfa53417cc28dc2278a26d4cbd95c29f8927ad4d8fe0a7ef9"}, + {file = "lupa-2.6-cp314-cp314-win32.whl", hash = "sha256:cb34169c6fa3bab3e8ac58ca21b8a7102f6a94b6a5d08d3636312f3f02fafd8f"}, + {file = "lupa-2.6-cp314-cp314-win_amd64.whl", hash = "sha256:b74f944fe46c421e25d0f8692aef1e842192f6f7f68034201382ac440ef9ea67"}, + {file = "lupa-2.6-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:0e21b716408a21ab65723f8841cf7f2f37a844b7a965eeabb785e27fca4099cf"}, + {file = "lupa-2.6-cp314-cp314t-macosx_11_0_universal2.whl", hash = "sha256:589db872a141bfff828340079bbdf3e9a31f2689f4ca0d88f97d9e8c2eae6142"}, + {file = "lupa-2.6-cp314-cp314t-macosx_11_0_x86_64.whl", hash = "sha256:cd852a91a4a9d4dcbb9a58100f820a75a425703ec3e3f049055f60b8533b7953"}, + {file = "lupa-2.6-cp314-cp314t-manylinux2010_i686.manylinux_2_12_i686.manylinux_2_28_i686.whl", hash = "sha256:0334753be028358922415ca97a64a3048e4ed155413fc4eaf87dd0a7e2752983"}, + {file = "lupa-2.6-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:661d895cd38c87658a34780fac54a690ec036ead743e41b74c3fb81a9e65a6aa"}, + {file = 
"lupa-2.6-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6aa58454ccc13878cc177c62529a2056be734da16369e451987ff92784994ca7"}, + {file = "lupa-2.6-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:1425017264e470c98022bba8cff5bd46d054a827f5df6b80274f9cc71dafd24f"}, + {file = "lupa-2.6-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:224af0532d216e3105f0a127410f12320f7c5f1aa0300bdf9646b8d9afb0048c"}, + {file = "lupa-2.6-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:9abb98d5a8fd27c8285302e82199f0e56e463066f88f619d6594a450bf269d80"}, + {file = "lupa-2.6-cp314-cp314t-win32.whl", hash = "sha256:1849efeba7a8f6fb8aa2c13790bee988fd242ae404bd459509640eeea3d1e291"}, + {file = "lupa-2.6-cp314-cp314t-win_amd64.whl", hash = "sha256:fc1498d1a4fc028bc521c26d0fad4ca00ed63b952e32fb95949bda76a04bad52"}, + {file = "lupa-2.6-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:9591700991e333b70dd92b48f152eb4731b8b24af671a9f6f721b74d68ed4499"}, + {file = "lupa-2.6-cp38-cp38-macosx_11_0_x86_64.whl", hash = "sha256:ef8dfa7fe08bc3f4591411b8945bbeb15af8512c3e7ad5e9b1e3a9036cdbbce7"}, + {file = "lupa-2.6-cp38-cp38-manylinux2010_i686.manylinux_2_12_i686.manylinux_2_28_i686.whl", hash = "sha256:728c466e91174dad238f8a9c1cbdb8e69ffe559df85f87ee76edac3395300949"}, + {file = "lupa-2.6-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c781170bc7134704ae317a66204d30688b41d3e471e17e659987ea4947e11f20"}, + {file = "lupa-2.6-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:241f4ddab33b9a686fc76667241bebc39a06b74ec40d79ec222f5add9000fe57"}, + {file = "lupa-2.6-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:c17f6b6193ced33cc7ca0b2b08b319a1b3501b014a3a3f9999c01cafc04c40f5"}, + {file = "lupa-2.6-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:fa6c1379e83d4104065c151736250a09f3a99e368423c7a20f9c59b15945e9fc"}, + {file = 
"lupa-2.6-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:aef1a8bc10c50695e1a33a07dbef803b93eb97fc150fdb19858d704a603a67dd"}, + {file = "lupa-2.6-cp38-cp38-win32.whl", hash = "sha256:10c191bc1d5565e4360d884bea58320975ddb33270cdf9a9f55d1a1efe79aa03"}, + {file = "lupa-2.6-cp38-cp38-win_amd64.whl", hash = "sha256:05681f8ffb41f0c7fbb9ca859cc3a7e4006e9c6350d25358b535c5295c6a9928"}, + {file = "lupa-2.6-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8897dc6c3249786b2cdf2f83324febb436193d4581b6a71dea49f77bf8b19bb0"}, + {file = "lupa-2.6-cp39-cp39-macosx_11_0_universal2.whl", hash = "sha256:4446396ca3830be0c106c70db4b4f622c37b2d447874c07952cafb9c57949a4a"}, + {file = "lupa-2.6-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:5826e687c89995a6eaafeae242071ba16448eec1a9ee8e17ed48551b5d1e21c2"}, + {file = "lupa-2.6-cp39-cp39-manylinux2010_i686.manylinux_2_12_i686.manylinux_2_28_i686.whl", hash = "sha256:5871935cb36d1d22f9c04ac0db75c06751bd95edcfa0d9309f732de908e297a9"}, + {file = "lupa-2.6-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:43eb6e43ea8512d0d65b995d36dd9d77aa02598035e25b84c23a1b58700c9fb2"}, + {file = "lupa-2.6-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:559714053018d9885cc8c36a33c5b7eb9aad30fb6357719cac3ce4dc6b39157e"}, + {file = "lupa-2.6-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:57ac88a00ce59bd9d4ddcd4fca8e02564765725f5068786b011c9d1be3de20c5"}, + {file = "lupa-2.6-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:b683fbd867c2e54c44a686361b75eee7e7a790da55afdbe89f1f23b106de0274"}, + {file = "lupa-2.6-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:d2f656903a2ed2e074bf2b7d300968028dfa327a45b055be8e3b51ef0b82f9bf"}, + {file = "lupa-2.6-cp39-cp39-win32.whl", hash = "sha256:bf28f68ae231b72008523ab5ac23835ba0f76e0e99ec38b59766080a84eb596a"}, + {file = "lupa-2.6-cp39-cp39-win_amd64.whl", hash = 
"sha256:b4b2e9b3795a9897cf6cfcc58d08210fdc0d13ab47c9a0e13858c68932d8353c"}, + {file = "lupa-2.6.tar.gz", hash = "sha256:9a770a6e89576be3447668d7ced312cd6fd41d3c13c2462c9dc2c2ab570e45d9"}, +] + [[package]] name = "lxml" version = "4.9.4" @@ -7507,7 +7634,7 @@ version = "24.2" description = "Core utilities for Python packages" optional = false python-versions = ">=3.8" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759"}, {file = "packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f"}, @@ -8162,14 +8289,14 @@ kaleido = ["kaleido (>=1.0.0)"] name = "pluggy" version = "1.6.0" description = "plugin and hook calling mechanisms for python" -optional = true +optional = false python-versions = ">=3.9" -groups = ["main"] -markers = "extra == \"deepeval\" or extra == \"dev\" or extra == \"dlt\" or extra == \"docling\"" +groups = ["main", "dev"] files = [ {file = "pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746"}, {file = "pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3"}, ] +markers = {main = "extra == \"deepeval\" or extra == \"dev\" or extra == \"dlt\" or extra == \"docling\""} [package.extras] dev = ["pre-commit", "tox"] @@ -8529,6 +8656,7 @@ files = [ {file = "psycopg2-2.9.10-cp311-cp311-win_amd64.whl", hash = "sha256:0435034157049f6846e95103bd8f5a668788dd913a7c30162ca9503fdf542cb4"}, {file = "psycopg2-2.9.10-cp312-cp312-win32.whl", hash = "sha256:65a63d7ab0e067e2cdb3cf266de39663203d38d6a8ed97f5ca0cb315c73fe067"}, {file = "psycopg2-2.9.10-cp312-cp312-win_amd64.whl", hash = "sha256:4a579d6243da40a7b3182e0430493dbd55950c493d8c68f4eec0b302f6bbf20e"}, + {file = "psycopg2-2.9.10-cp313-cp313-win_amd64.whl", hash = 
"sha256:91fd603a2155da8d0cfcdbf8ab24a2d54bca72795b90d2a3ed2b6da8d979dee2"}, {file = "psycopg2-2.9.10-cp39-cp39-win32.whl", hash = "sha256:9d5b3b94b79a844a986d029eee38998232451119ad653aea42bb9220a8c5066b"}, {file = "psycopg2-2.9.10-cp39-cp39-win_amd64.whl", hash = "sha256:88138c8dedcbfa96408023ea2b0c369eda40fe5d75002c0964c78f46f11fa442"}, {file = "psycopg2-2.9.10.tar.gz", hash = "sha256:12ec0b40b0273f95296233e8750441339298e6a572f7039da5b260e3c8b60e11"}, @@ -8590,6 +8718,7 @@ files = [ {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:bb89f0a835bcfc1d42ccd5f41f04870c1b936d8507c6df12b7737febc40f0909"}, {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:f0c2d907a1e102526dd2986df638343388b94c33860ff3bbe1384130828714b1"}, {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f8157bed2f51db683f31306aa497311b560f2265998122abe1dce6428bd86567"}, + {file = "psycopg2_binary-2.9.10-cp313-cp313-win_amd64.whl", hash = "sha256:27422aa5f11fbcd9b18da48373eb67081243662f9b46e6fd07c3eb46e4535142"}, {file = "psycopg2_binary-2.9.10-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:eb09aa7f9cecb45027683bb55aebaaf45a0df8bf6de68801a6afdc7947bb09d4"}, {file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b73d6d7f0ccdad7bc43e6d34273f70d587ef62f824d7261c4ae9b8b1b6af90e8"}, {file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ce5ab4bf46a211a8e924d307c1b1fcda82368586a19d0a24f8ae166f5c784864"}, @@ -9569,14 +9698,14 @@ files = [ name = "pytest" version = "7.4.4" description = "pytest: simple powerful testing with Python" -optional = true +optional = false python-versions = ">=3.7" -groups = ["main"] -markers = "extra == \"deepeval\" or extra == \"dev\"" +groups = ["main", "dev"] files = [ {file = "pytest-7.4.4-py3-none-any.whl", hash = 
"sha256:b090cdf5ed60bf4c45261be03239c2c1c22df034fbffe691abe93cd80cea01d8"}, {file = "pytest-7.4.4.tar.gz", hash = "sha256:2cf0005922c6ace4a3e2ec8b4080eb0d9753fdc93107415332f50ce9e7994280"}, ] +markers = {main = "extra == \"deepeval\" or extra == \"dev\""} [package.dependencies] colorama = {version = "*", markers = "sys_platform == \"win32\""} @@ -9663,6 +9792,21 @@ files = [ packaging = ">=17.1" pytest = ">=6.2" +[[package]] +name = "pytest-timeout" +version = "2.4.0" +description = "pytest plugin to abort hanging tests" +optional = false +python-versions = ">=3.7" +groups = ["dev"] +files = [ + {file = "pytest_timeout-2.4.0-py3-none-any.whl", hash = "sha256:c42667e5cdadb151aeb5b26d114aff6bdf5a907f176a007a30b940d3d865b5c2"}, + {file = "pytest_timeout-2.4.0.tar.gz", hash = "sha256:7e68e90b01f9eff71332b25001f85c75495fc4e3a836701876183c4bcfd0540a"}, +] + +[package.dependencies] +pytest = ">=7.0.0" + [[package]] name = "pytest-xdist" version = "3.8.0" @@ -10245,10 +10389,9 @@ orjson = ["orjson (>=3.9.14,<4)"] name = "redis" version = "5.3.1" description = "Python client for Redis database and key-value store" -optional = true +optional = false python-versions = ">=3.8" groups = ["main"] -markers = "extra == \"redis\"" files = [ {file = "redis-5.3.1-py3-none-any.whl", hash = "sha256:dc1909bd24669cc31b5f67a039700b16ec30571096c5f1f0d9d2324bff31af97"}, {file = "redis-5.3.1.tar.gz", hash = "sha256:ca49577a531ea64039b5a36db3d6cd1a0c7a60c34124d46924a45b956e8cf14c"}, @@ -11478,6 +11621,18 @@ files = [ {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"}, ] +[[package]] +name = "sortedcontainers" +version = "2.4.0" +description = "Sorted Containers -- Sorted List, Sorted Dict, Sorted Set" +optional = false +python-versions = "*" +groups = ["main"] +files = [ + {file = "sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0"}, + {file = 
"sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88"}, +] + [[package]] name = "soupsieve" version = "2.8" @@ -11501,9 +11656,7 @@ groups = ["main"] files = [ {file = "SQLAlchemy-2.0.43-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:21ba7a08a4253c5825d1db389d4299f64a100ef9800e4624c8bf70d8f136e6ed"}, {file = "SQLAlchemy-2.0.43-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:11b9503fa6f8721bef9b8567730f664c5a5153d25e247aadc69247c4bc605227"}, - {file = "SQLAlchemy-2.0.43-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:07097c0a1886c150ef2adba2ff7437e84d40c0f7dcb44a2c2b9c905ccfc6361c"}, {file = "SQLAlchemy-2.0.43-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:cdeff998cb294896a34e5b2f00e383e7c5c4ef3b4bfa375d9104723f15186443"}, - {file = "SQLAlchemy-2.0.43-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:bcf0724a62a5670e5718957e05c56ec2d6850267ea859f8ad2481838f889b42c"}, {file = "SQLAlchemy-2.0.43-cp37-cp37m-win32.whl", hash = "sha256:c697575d0e2b0a5f0433f679bda22f63873821d991e95a90e9e52aae517b2e32"}, {file = "SQLAlchemy-2.0.43-cp37-cp37m-win_amd64.whl", hash = "sha256:d34c0f6dbefd2e816e8f341d0df7d4763d382e3f452423e752ffd1e213da2512"}, {file = "sqlalchemy-2.0.43-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:70322986c0c699dca241418fcf18e637a4369e0ec50540a2b907b184c8bca069"}, @@ -11538,20 +11691,12 @@ files = [ {file = "sqlalchemy-2.0.43-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:9df7126fd9db49e3a5a3999442cc67e9ee8971f3cb9644250107d7296cb2a164"}, {file = "sqlalchemy-2.0.43-cp313-cp313-win32.whl", hash = "sha256:7f1ac7828857fcedb0361b48b9ac4821469f7694089d15550bbcf9ab22564a1d"}, {file = "sqlalchemy-2.0.43-cp313-cp313-win_amd64.whl", hash = "sha256:971ba928fcde01869361f504fcff3b7143b47d30de188b11c6357c0505824197"}, - {file = "sqlalchemy-2.0.43-cp38-cp38-macosx_10_9_x86_64.whl", hash = 
"sha256:4e6aeb2e0932f32950cf56a8b4813cb15ff792fc0c9b3752eaf067cfe298496a"}, - {file = "sqlalchemy-2.0.43-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:61f964a05356f4bca4112e6334ed7c208174511bd56e6b8fc86dad4d024d4185"}, {file = "sqlalchemy-2.0.43-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:46293c39252f93ea0910aababa8752ad628bcce3a10d3f260648dd472256983f"}, - {file = "sqlalchemy-2.0.43-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:136063a68644eca9339d02e6693932116f6a8591ac013b0014479a1de664e40a"}, {file = "sqlalchemy-2.0.43-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:6e2bf13d9256398d037fef09fd8bf9b0bf77876e22647d10761d35593b9ac547"}, - {file = "sqlalchemy-2.0.43-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:44337823462291f17f994d64282a71c51d738fc9ef561bf265f1d0fd9116a782"}, {file = "sqlalchemy-2.0.43-cp38-cp38-win32.whl", hash = "sha256:13194276e69bb2af56198fef7909d48fd34820de01d9c92711a5fa45497cc7ed"}, {file = "sqlalchemy-2.0.43-cp38-cp38-win_amd64.whl", hash = "sha256:334f41fa28de9f9be4b78445e68530da3c5fa054c907176460c81494f4ae1f5e"}, - {file = "sqlalchemy-2.0.43-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ceb5c832cc30663aeaf5e39657712f4c4241ad1f638d487ef7216258f6d41fe7"}, - {file = "sqlalchemy-2.0.43-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:11f43c39b4b2ec755573952bbcc58d976779d482f6f832d7f33a8d869ae891bf"}, {file = "sqlalchemy-2.0.43-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:413391b2239db55be14fa4223034d7e13325a1812c8396ecd4f2c08696d5ccad"}, - {file = "sqlalchemy-2.0.43-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c379e37b08c6c527181a397212346be39319fb64323741d23e46abd97a400d34"}, {file = "sqlalchemy-2.0.43-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:03d73ab2a37d9e40dec4984d1813d7878e01dbdc742448d44a7341b7a9f408c7"}, - {file = "sqlalchemy-2.0.43-cp39-cp39-musllinux_1_2_x86_64.whl", hash = 
"sha256:8cee08f15d9e238ede42e9bbc1d6e7158d0ca4f176e4eab21f88ac819ae3bd7b"}, {file = "sqlalchemy-2.0.43-cp39-cp39-win32.whl", hash = "sha256:b3edaec7e8b6dc5cd94523c6df4f294014df67097c8217a89929c99975811414"}, {file = "sqlalchemy-2.0.43-cp39-cp39-win_amd64.whl", hash = "sha256:227119ce0a89e762ecd882dc661e0aa677a690c914e358f0dd8932a2e8b2765b"}, {file = "sqlalchemy-2.0.43-py3-none-any.whl", hash = "sha256:1681c21dd2ccee222c2fe0bef671d1aef7c504087c9c4e800371cfcc8ac966fc"}, @@ -11920,7 +12065,7 @@ version = "2.2.1" description = "A lil' TOML parser" optional = false python-versions = ">=3.8" -groups = ["main"] +groups = ["main", "dev"] markers = "python_version == \"3.10\"" files = [ {file = "tomli-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:678e4fa69e4575eb77d103de3df8a895e1591b48e740211bd1067378c69e8249"}, @@ -12392,11 +12537,12 @@ version = "4.15.0" description = "Backported and Experimental Type Hints for Python 3.9+" optional = false python-versions = ">=3.9" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548"}, {file = "typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466"}, ] +markers = {dev = "python_version == \"3.10\""} [[package]] name = "typing-inspect" @@ -13527,4 +13673,4 @@ scraping = ["APScheduler", "beautifulsoup4", "lxml", "playwright", "protego", "t [metadata] lock-version = "2.1" python-versions = ">=3.10,<3.14" -content-hash = "9490de8c950400c004a87333eda35311109bc1708a98e053bc2f66d883f4f702" +content-hash = "b6ede4c196d086f7159f84142c16d16fcc19bc73fcb9ab274a3b6351e6fcbb7e" diff --git a/pyproject.toml b/pyproject.toml index 8af35113c..13266f83e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,6 +57,8 @@ dependencies = [ "websockets>=15.0.1,<16.0.0", "mistralai>=1.9.10", "tenacity>=9.0.0", + "fakeredis[lua]>=2.32.0", + 
"diskcache>=5.6.3", ] [project.optional-dependencies] @@ -198,3 +200,8 @@ exclude = [ [tool.ruff.lint] ignore = ["F401"] + +[dependency-groups] +dev = [ + "pytest-timeout>=2.4.0", +] diff --git a/uv.lock b/uv.lock index e2fc1df83..8c35a3366 100644 --- a/uv.lock +++ b/uv.lock @@ -936,6 +936,8 @@ dependencies = [ { name = "aiohttp" }, { name = "aiosqlite" }, { name = "alembic" }, + { name = "diskcache" }, + { name = "fakeredis", extra = ["lua"] }, { name = "fastapi" }, { name = "fastapi-users", extra = ["sqlalchemy"] }, { name = "fastembed" }, @@ -1097,6 +1099,11 @@ scraping = [ { name = "tavily-python" }, ] +[package.dev-dependencies] +dev = [ + { name = "pytest-timeout" }, +] + [package.metadata] requires-dist = [ { name = "aiofiles", specifier = ">=23.2.1,<24.0.0" }, @@ -1114,8 +1121,10 @@ requires-dist = [ { name = "debugpy", marker = "extra == 'debug'", specifier = ">=1.8.9,<2.0.0" }, { name = "deepeval", marker = "extra == 'deepeval'", specifier = ">=3.0.1,<4" }, { name = "deptry", marker = "extra == 'dev'", specifier = ">=0.20.0,<0.21" }, + { name = "diskcache", specifier = ">=5.6.3" }, { name = "dlt", extras = ["sqlalchemy"], marker = "extra == 'dlt'", specifier = ">=1.9.0,<2" }, { name = "docling", marker = "extra == 'docling'", specifier = ">=2.54" }, + { name = "fakeredis", extras = ["lua"], specifier = ">=2.32.0" }, { name = "fastapi", specifier = ">=0.116.2,<1.0.0" }, { name = "fastapi-users", extras = ["sqlalchemy"], specifier = ">=14.0.1,<15.0.0" }, { name = "fastembed", specifier = "<=0.6.0" }, @@ -1203,6 +1212,9 @@ requires-dist = [ ] provides-extras = ["api", "distributed", "scraping", "neo4j", "neptune", "postgres", "postgres-binary", "notebook", "langchain", "llama-index", "huggingface", "ollama", "mistral", "anthropic", "deepeval", "posthog", "groq", "chromadb", "docs", "codegraph", "evals", "graphiti", "aws", "dlt", "baml", "dev", "debug", "redis", "monitoring", "docling"] +[package.metadata.requires-dev] +dev = [{ name = "pytest-timeout", 
specifier = ">=2.4.0" }] + [[package]] name = "colorama" version = "0.4.6" @@ -2047,6 +2059,25 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a3/46/8f4097b55e43af39e8e71e1f7aec59ff7398bca54d975c30889bc844719d/faker-37.11.0-py3-none-any.whl", hash = "sha256:1508d2da94dfd1e0087b36f386126d84f8583b3de19ac18e392a2831a6676c57", size = 1975525, upload-time = "2025-10-07T14:48:58.29Z" }, ] +[[package]] +name = "fakeredis" +version = "2.32.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "redis" }, + { name = "sortedcontainers" }, + { name = "typing-extensions", marker = "python_full_version < '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/e1/2e/94ca3f2ff35f086d7d3eeb924054e328b2ac851f0a20302d942c8d29726c/fakeredis-2.32.0.tar.gz", hash = "sha256:63d745b40eb6c8be4899cf2a53187c097ccca3afbca04fdbc5edc8b936cd1d59", size = 171097, upload-time = "2025-10-07T10:46:58.876Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0e/1b/84ab7fd197eba5243b6625c78fbcffaa4cf6ac7dda42f95d22165f52187e/fakeredis-2.32.0-py3-none-any.whl", hash = "sha256:c9da8228de84060cfdb72c3cf4555c18c59ba7a5ae4d273f75e4822d6f01ecf8", size = 118422, upload-time = "2025-10-07T10:46:57.643Z" }, +] + +[package.optional-dependencies] +lua = [ + { name = "lupa" }, +] + [[package]] name = "fastapi" version = "0.119.0" @@ -3880,6 +3911,58 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/0c/29/0348de65b8cc732daa3e33e67806420b2ae89bdce2b04af740289c5c6c8c/loguru-0.7.3-py3-none-any.whl", hash = "sha256:31a33c10c8e1e10422bfd431aeb5d351c7cf7fa671e3c4df004162264b28220c", size = 61595, upload-time = "2024-12-06T11:20:54.538Z" }, ] +[[package]] +name = "lupa" +version = "2.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b8/1c/191c3e6ec6502e3dbe25a53e27f69a5daeac3e56de1f73c0138224171ead/lupa-2.6.tar.gz", hash = 
"sha256:9a770a6e89576be3447668d7ced312cd6fd41d3c13c2462c9dc2c2ab570e45d9", size = 7240282, upload-time = "2025-10-24T07:20:29.738Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a1/15/713cab5d0dfa4858f83b99b3e0329072df33dc14fc3ebbaa017e0f9755c4/lupa-2.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6b3dabda836317e63c5ad052826e156610f356a04b3003dfa0dbe66b5d54d671", size = 954828, upload-time = "2025-10-24T07:17:15.726Z" }, + { url = "https://files.pythonhosted.org/packages/2e/71/704740cbc6e587dd6cc8dabf2f04820ac6a671784e57cc3c29db795476db/lupa-2.6-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:8726d1c123bbe9fbb974ce29825e94121824e66003038ff4532c14cc2ed0c51c", size = 1919259, upload-time = "2025-10-24T07:17:18.586Z" }, + { url = "https://files.pythonhosted.org/packages/eb/18/f248341c423c5d48837e35584c6c3eb4acab7e722b6057d7b3e28e42dae8/lupa-2.6-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:f4e159e7d814171199b246f9235ca8961f6461ea8c1165ab428afa13c9289a94", size = 984998, upload-time = "2025-10-24T07:17:20.428Z" }, + { url = "https://files.pythonhosted.org/packages/44/1e/8a4bd471e018aad76bcb9455d298c2c96d82eced20f2ae8fcec8cd800948/lupa-2.6-cp310-cp310-manylinux2010_i686.manylinux_2_12_i686.manylinux_2_28_i686.whl", hash = "sha256:202160e80dbfddfb79316692a563d843b767e0f6787bbd1c455f9d54052efa6c", size = 1174871, upload-time = "2025-10-24T07:17:22.755Z" }, + { url = "https://files.pythonhosted.org/packages/2a/5c/3a3f23fd6a91b0986eea1ceaf82ad3f9b958fe3515a9981fb9c4eb046c8b/lupa-2.6-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5deede7c5b36ab64f869dae4831720428b67955b0bb186c8349cf6ea121c852b", size = 1057471, upload-time = "2025-10-24T07:17:24.908Z" }, + { url = "https://files.pythonhosted.org/packages/45/ac/01be1fed778fb0c8f46ee8cbe344e4d782f6806fac12717f08af87aa4355/lupa-2.6-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:86f04901f920bbf7c0cac56807dc9597e42347123e6f1f3ca920f15f54188ce5", size = 2100592, upload-time = "2025-10-24T07:17:27.089Z" }, + { url = "https://files.pythonhosted.org/packages/3f/6c/1a05bb873e30830f8574e10cd0b4cdbc72e9dbad2a09e25810b5e3b1f75d/lupa-2.6-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:6deef8f851d6afb965c84849aa5b8c38856942df54597a811ce0369ced678610", size = 1081396, upload-time = "2025-10-24T07:17:29.064Z" }, + { url = "https://files.pythonhosted.org/packages/a2/c2/a19dd80d6dc98b39bbf8135b8198e38aa7ca3360b720eac68d1d7e9286b5/lupa-2.6-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:21f2b5549681c2a13b1170a26159d30875d367d28f0247b81ca347222c755038", size = 1192007, upload-time = "2025-10-24T07:17:31.362Z" }, + { url = "https://files.pythonhosted.org/packages/4f/43/e1b297225c827f55752e46fdbfb021c8982081b0f24490e42776ea69ae3b/lupa-2.6-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:66eea57630eab5e6f49fdc5d7811c0a2a41f2011be4ea56a087ea76112011eb7", size = 2196661, upload-time = "2025-10-24T07:17:33.484Z" }, + { url = "https://files.pythonhosted.org/packages/2e/8f/2272d429a7fa9dc8dbd6e9c5c9073a03af6007eb22a4c78829fec6a34b80/lupa-2.6-cp310-cp310-win32.whl", hash = "sha256:60a403de8cab262a4fe813085dd77010effa6e2eb1886db2181df803140533b1", size = 1412738, upload-time = "2025-10-24T07:17:35.11Z" }, + { url = "https://files.pythonhosted.org/packages/35/2a/1708911271dd49ad87b4b373b5a4b0e0a0516d3d2af7b76355946c7ee171/lupa-2.6-cp310-cp310-win_amd64.whl", hash = "sha256:e4656a39d93dfa947cf3db56dc16c7916cb0cc8024acd3a952071263f675df64", size = 1656898, upload-time = "2025-10-24T07:17:36.949Z" }, + { url = "https://files.pythonhosted.org/packages/ca/29/1f66907c1ebf1881735afa695e646762c674f00738ebf66d795d59fc0665/lupa-2.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6d988c0f9331b9f2a5a55186701a25444ab10a1432a1021ee58011499ecbbdd5", size = 962875, upload-time = "2025-10-24T07:17:39.107Z" }, + { url = 
"https://files.pythonhosted.org/packages/e6/67/4a748604be360eb9c1c215f6a0da921cd1a2b44b2c5951aae6fb83019d3a/lupa-2.6-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:ebe1bbf48259382c72a6fe363dea61a0fd6fe19eab95e2ae881e20f3654587bf", size = 1935390, upload-time = "2025-10-24T07:17:41.427Z" }, + { url = "https://files.pythonhosted.org/packages/ac/0c/8ef9ee933a350428b7bdb8335a37ef170ab0bb008bbf9ca8f4f4310116b6/lupa-2.6-cp311-cp311-macosx_11_0_x86_64.whl", hash = "sha256:a8fcee258487cf77cdd41560046843bb38c2e18989cd19671dd1e2596f798306", size = 992193, upload-time = "2025-10-24T07:17:43.231Z" }, + { url = "https://files.pythonhosted.org/packages/65/46/e6c7facebdb438db8a65ed247e56908818389c1a5abbf6a36aab14f1057d/lupa-2.6-cp311-cp311-manylinux2010_i686.manylinux_2_12_i686.manylinux_2_28_i686.whl", hash = "sha256:561a8e3be800827884e767a694727ed8482d066e0d6edfcbf423b05e63b05535", size = 1165844, upload-time = "2025-10-24T07:17:45.437Z" }, + { url = "https://files.pythonhosted.org/packages/1c/26/9f1154c6c95f175ccbf96aa96c8f569c87f64f463b32473e839137601a8b/lupa-2.6-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:af880a62d47991cae78b8e9905c008cbfdc4a3a9723a66310c2634fc7644578c", size = 1048069, upload-time = "2025-10-24T07:17:47.181Z" }, + { url = "https://files.pythonhosted.org/packages/68/67/2cc52ab73d6af81612b2ea24c870d3fa398443af8e2875e5befe142398b1/lupa-2.6-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:80b22923aa4023c86c0097b235615f89d469a0c4eee0489699c494d3367c4c85", size = 2079079, upload-time = "2025-10-24T07:17:49.755Z" }, + { url = "https://files.pythonhosted.org/packages/2e/dc/f843f09bbf325f6e5ee61730cf6c3409fc78c010d968c7c78acba3019ca7/lupa-2.6-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:153d2cc6b643f7efb9cfc0c6bb55ec784d5bac1a3660cfc5b958a7b8f38f4a75", size = 1071428, upload-time = "2025-10-24T07:17:51.991Z" }, + { url = 
"https://files.pythonhosted.org/packages/2e/60/37533a8d85bf004697449acb97ecdacea851acad28f2ad3803662487dd2a/lupa-2.6-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:3fa8777e16f3ded50b72967dc17e23f5a08e4f1e2c9456aff2ebdb57f5b2869f", size = 1181756, upload-time = "2025-10-24T07:17:53.752Z" }, + { url = "https://files.pythonhosted.org/packages/e4/f2/cf29b20dbb4927b6a3d27c339ac5d73e74306ecc28c8e2c900b2794142ba/lupa-2.6-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:8dbdcbe818c02a2f56f5ab5ce2de374dab03e84b25266cfbaef237829bc09b3f", size = 2175687, upload-time = "2025-10-24T07:17:56.228Z" }, + { url = "https://files.pythonhosted.org/packages/94/7c/050e02f80c7131b63db1474bff511e63c545b5a8636a24cbef3fc4da20b6/lupa-2.6-cp311-cp311-win32.whl", hash = "sha256:defaf188fde8f7a1e5ce3a5e6d945e533b8b8d547c11e43b96c9b7fe527f56dc", size = 1412592, upload-time = "2025-10-24T07:17:59.062Z" }, + { url = "https://files.pythonhosted.org/packages/6f/9a/6f2af98aa5d771cea661f66c8eb8f53772ec1ab1dfbce24126cfcd189436/lupa-2.6-cp311-cp311-win_amd64.whl", hash = "sha256:9505ae600b5c14f3e17e70f87f88d333717f60411faca1ddc6f3e61dce85fa9e", size = 1669194, upload-time = "2025-10-24T07:18:01.647Z" }, + { url = "https://files.pythonhosted.org/packages/94/86/ce243390535c39d53ea17ccf0240815e6e457e413e40428a658ea4ee4b8d/lupa-2.6-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:47ce718817ef1cc0c40d87c3d5ae56a800d61af00fbc0fad1ca9be12df2f3b56", size = 951707, upload-time = "2025-10-24T07:18:03.884Z" }, + { url = "https://files.pythonhosted.org/packages/86/85/cedea5e6cbeb54396fdcc55f6b741696f3f036d23cfaf986d50d680446da/lupa-2.6-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:7aba985b15b101495aa4b07112cdc08baa0c545390d560ad5cfde2e9e34f4d58", size = 1916703, upload-time = "2025-10-24T07:18:05.6Z" }, + { url = "https://files.pythonhosted.org/packages/24/be/3d6b5f9a8588c01a4d88129284c726017b2089f3a3fd3ba8bd977292fea0/lupa-2.6-cp312-cp312-macosx_11_0_x86_64.whl", hash = 
"sha256:b766f62f95b2739f2248977d29b0722e589dcf4f0ccfa827ccbd29f0148bd2e5", size = 985152, upload-time = "2025-10-24T07:18:08.561Z" }, + { url = "https://files.pythonhosted.org/packages/eb/23/9f9a05beee5d5dce9deca4cb07c91c40a90541fc0a8e09db4ee670da550f/lupa-2.6-cp312-cp312-manylinux2010_i686.manylinux_2_12_i686.manylinux_2_28_i686.whl", hash = "sha256:00a934c23331f94cb51760097ebfab14b005d55a6b30a2b480e3c53dd2fa290d", size = 1159599, upload-time = "2025-10-24T07:18:10.346Z" }, + { url = "https://files.pythonhosted.org/packages/40/4e/e7c0583083db9d7f1fd023800a9767d8e4391e8330d56c2373d890ac971b/lupa-2.6-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:21de9f38bd475303e34a042b7081aabdf50bd9bafd36ce4faea2f90fd9f15c31", size = 1038686, upload-time = "2025-10-24T07:18:12.112Z" }, + { url = "https://files.pythonhosted.org/packages/1c/9f/5a4f7d959d4feba5e203ff0c31889e74d1ca3153122be4a46dca7d92bf7c/lupa-2.6-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:cf3bda96d3fc41237e964a69c23647d50d4e28421111360274d4799832c560e9", size = 2071956, upload-time = "2025-10-24T07:18:14.572Z" }, + { url = "https://files.pythonhosted.org/packages/92/34/2f4f13ca65d01169b1720176aedc4af17bc19ee834598c7292db232cb6dc/lupa-2.6-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:5a76ead245da54801a81053794aa3975f213221f6542d14ec4b859ee2e7e0323", size = 1057199, upload-time = "2025-10-24T07:18:16.379Z" }, + { url = "https://files.pythonhosted.org/packages/35/2a/5f7d2eebec6993b0dcd428e0184ad71afb06a45ba13e717f6501bfed1da3/lupa-2.6-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:8dd0861741caa20886ddbda0a121d8e52fb9b5bb153d82fa9bba796962bf30e8", size = 1173693, upload-time = "2025-10-24T07:18:18.153Z" }, + { url = "https://files.pythonhosted.org/packages/e4/29/089b4d2f8e34417349af3904bb40bec40b65c8731f45e3fd8d497ca573e5/lupa-2.6-cp312-cp312-musllinux_1_2_x86_64.whl", hash = 
"sha256:239e63948b0b23023f81d9a19a395e768ed3da6a299f84e7963b8f813f6e3f9c", size = 2164394, upload-time = "2025-10-24T07:18:20.403Z" }, + { url = "https://files.pythonhosted.org/packages/f3/1b/79c17b23c921f81468a111cad843b076a17ef4b684c4a8dff32a7969c3f0/lupa-2.6-cp312-cp312-win32.whl", hash = "sha256:325894e1099499e7a6f9c351147661a2011887603c71086d36fe0f964d52d1ce", size = 1420647, upload-time = "2025-10-24T07:18:23.368Z" }, + { url = "https://files.pythonhosted.org/packages/b8/15/5121e68aad3584e26e1425a5c9a79cd898f8a152292059e128c206ee817c/lupa-2.6-cp312-cp312-win_amd64.whl", hash = "sha256:c735a1ce8ee60edb0fe71d665f1e6b7c55c6021f1d340eb8c865952c602cd36f", size = 1688529, upload-time = "2025-10-24T07:18:25.523Z" }, + { url = "https://files.pythonhosted.org/packages/28/1d/21176b682ca5469001199d8b95fa1737e29957a3d185186e7a8b55345f2e/lupa-2.6-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:663a6e58a0f60e7d212017d6678639ac8df0119bc13c2145029dcba084391310", size = 947232, upload-time = "2025-10-24T07:18:27.878Z" }, + { url = "https://files.pythonhosted.org/packages/ce/4c/d327befb684660ca13cf79cd1f1d604331808f9f1b6fb6bf57832f8edf80/lupa-2.6-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:d1f5afda5c20b1f3217a80e9bc1b77037f8a6eb11612fd3ada19065303c8f380", size = 1908625, upload-time = "2025-10-24T07:18:29.944Z" }, + { url = "https://files.pythonhosted.org/packages/66/8e/ad22b0a19454dfd08662237a84c792d6d420d36b061f239e084f29d1a4f3/lupa-2.6-cp313-cp313-macosx_11_0_x86_64.whl", hash = "sha256:26f2b3c085fe76e9119e48c1013c1cccdc1f51585d456858290475aa38e7089e", size = 981057, upload-time = "2025-10-24T07:18:31.553Z" }, + { url = "https://files.pythonhosted.org/packages/5c/48/74859073ab276bd0566c719f9ca0108b0cfc1956ca0d68678d117d47d155/lupa-2.6-cp313-cp313-manylinux2010_i686.manylinux_2_12_i686.manylinux_2_28_i686.whl", hash = "sha256:60d2f902c7b96fb8ab98493dcff315e7bb4d0b44dc9dd76eb37de575025d5685", size = 1156227, upload-time = "2025-10-24T07:18:33.981Z" }, + { url = 
"https://files.pythonhosted.org/packages/09/6c/0e9ded061916877253c2266074060eb71ed99fb21d73c8c114a76725bce2/lupa-2.6-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a02d25dee3a3250967c36590128d9220ae02f2eda166a24279da0b481519cbff", size = 1035752, upload-time = "2025-10-24T07:18:36.32Z" }, + { url = "https://files.pythonhosted.org/packages/dd/ef/f8c32e454ef9f3fe909f6c7d57a39f950996c37a3deb7b391fec7903dab7/lupa-2.6-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6eae1ee16b886b8914ff292dbefbf2f48abfbdee94b33a88d1d5475e02423203", size = 2069009, upload-time = "2025-10-24T07:18:38.072Z" }, + { url = "https://files.pythonhosted.org/packages/53/dc/15b80c226a5225815a890ee1c11f07968e0aba7a852df41e8ae6fe285063/lupa-2.6-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:b0edd5073a4ee74ab36f74fe61450148e6044f3952b8d21248581f3c5d1a58be", size = 1056301, upload-time = "2025-10-24T07:18:40.165Z" }, + { url = "https://files.pythonhosted.org/packages/31/14/2086c1425c985acfb30997a67e90c39457122df41324d3c179d6ee2292c6/lupa-2.6-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:0c53ee9f22a8a17e7d4266ad48e86f43771951797042dd51d1494aaa4f5f3f0a", size = 1170673, upload-time = "2025-10-24T07:18:42.426Z" }, + { url = "https://files.pythonhosted.org/packages/10/e5/b216c054cf86576c0191bf9a9f05de6f7e8e07164897d95eea0078dca9b2/lupa-2.6-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:de7c0f157a9064a400d828789191a96da7f4ce889969a588b87ec80de9b14772", size = 2162227, upload-time = "2025-10-24T07:18:46.112Z" }, + { url = "https://files.pythonhosted.org/packages/59/2f/33ecb5bedf4f3bc297ceacb7f016ff951331d352f58e7e791589609ea306/lupa-2.6-cp313-cp313-win32.whl", hash = "sha256:ee9523941ae0a87b5b703417720c5d78f72d2f5bc23883a2ea80a949a3ed9e75", size = 1419558, upload-time = "2025-10-24T07:18:48.371Z" }, + { url = 
"https://files.pythonhosted.org/packages/f9/b4/55e885834c847ea610e111d87b9ed4768f0afdaeebc00cd46810f25029f6/lupa-2.6-cp313-cp313-win_amd64.whl", hash = "sha256:b1335a5835b0a25ebdbc75cf0bda195e54d133e4d994877ef025e218c2e59db9", size = 1683424, upload-time = "2025-10-24T07:18:50.976Z" }, +] + [[package]] name = "lxml" version = "4.9.4" @@ -6996,6 +7079,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/25/14/e02206388902a828cc26894996dfc68eec50f7583bcddc4b5605d0c18b51/pytest_rerunfailures-12.0-py3-none-any.whl", hash = "sha256:9a1afd04e21b8177faf08a9bbbf44de7a0fe3fc29f8ddbe83b9684bd5f8f92a9", size = 12977, upload-time = "2023-07-05T05:53:43.909Z" }, ] +[[package]] +name = "pytest-timeout" +version = "2.4.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pytest" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ac/82/4c9ecabab13363e72d880f2fb504c5f750433b2b6f16e99f4ec21ada284c/pytest_timeout-2.4.0.tar.gz", hash = "sha256:7e68e90b01f9eff71332b25001f85c75495fc4e3a836701876183c4bcfd0540a", size = 17973, upload-time = "2025-05-05T19:44:34.99Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fa/b6/3127540ecdf1464a00e5a01ee60a1b09175f6913f0644ac748494d9c4b21/pytest_timeout-2.4.0-py3-none-any.whl", hash = "sha256:c42667e5cdadb151aeb5b26d114aff6bdf5a907f176a007a30b940d3d865b5c2", size = 14382, upload-time = "2025-05-05T19:44:33.502Z" }, +] + [[package]] name = "pytest-xdist" version = "3.8.0" @@ -8234,6 +8329,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" }, ] +[[package]] +name = "sortedcontainers" +version = "2.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/e8/c4/ba2f8066cceb6f23394729afe52f3bf7adec04bf9ed2c820b39e19299111/sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88", size = 30594, upload-time = "2021-05-16T22:03:42.897Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/32/46/9cb0e58b2deb7f82b84065f37f3bffeb12413f947f9388e4cac22c4621ce/sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0", size = 29575, upload-time = "2021-05-16T22:03:41.177Z" }, +] + [[package]] name = "soupsieve" version = "2.8" From a5bd504daa688efcbf7358ba2fce74a4da359ce4 Mon Sep 17 00:00:00 2001 From: Igor Ilic <30923996+dexters1@users.noreply.github.com> Date: Wed, 12 Nov 2025 21:32:22 +0100 Subject: [PATCH 110/129] Relational DB migration test search (#1752) ## Description Add deterministic Cognee search test after rel DB migration. Test gathers all relevant relationships regarding Customers and their Invoices from relational DB that was migrated and then tries to get the same results with Cognee search. 
## Type of Change - [ ] Bug fix (non-breaking change that fixes an issue) - [ ] New feature (non-breaking change that adds functionality) - [ ] Breaking change (fix or feature that would cause existing functionality to change) - [ ] Documentation update - [ ] Code refactoring - [ ] Performance improvement - [ ] Other (please specify): ## Screenshots/Videos (if applicable) ## Pre-submission Checklist - [ ] **I have tested my changes thoroughly before submitting this PR** - [ ] **This PR contains minimal changes necessary to address the issue/feature** - [ ] My code follows the project's coding standards and style guidelines - [ ] I have added tests that prove my fix is effective or that my feature works - [ ] I have added necessary documentation (if applicable) - [ ] All new and existing tests pass - [ ] I have searched existing PRs to ensure this change hasn't been submitted already - [ ] I have linked any relevant issues in the description - [ ] My commits have clear and descriptive messages ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin. 
--- cognee/tests/test_relational_db_migration.py | 53 +++++++++++++++++++- 1 file changed, 51 insertions(+), 2 deletions(-) diff --git a/cognee/tests/test_relational_db_migration.py b/cognee/tests/test_relational_db_migration.py index 4557e9e2f..ae06e7c5d 100644 --- a/cognee/tests/test_relational_db_migration.py +++ b/cognee/tests/test_relational_db_migration.py @@ -1,6 +1,5 @@ import pathlib import os -from typing import List from cognee.infrastructure.databases.graph import get_graph_engine from cognee.infrastructure.databases.relational import ( get_migration_relational_engine, @@ -10,7 +9,7 @@ from cognee.infrastructure.databases.vector.pgvector import ( create_db_and_tables as create_pgvector_db_and_tables, ) from cognee.tasks.ingestion import migrate_relational_database -from cognee.modules.search.types import SearchResult, SearchType +from cognee.modules.search.types import SearchType import cognee @@ -274,6 +273,55 @@ async def test_schema_only_migration(): print(f"Edge counts: {edge_counts}") +async def test_search_result_quality(): + from cognee.infrastructure.databases.relational import ( + get_migration_relational_engine, + ) + + # Get relational database with original data + migration_engine = get_migration_relational_engine() + from sqlalchemy import text + + async with migration_engine.engine.connect() as conn: + result = await conn.execute( + text(""" + SELECT + c.CustomerId, + c.FirstName, + c.LastName, + GROUP_CONCAT(i.InvoiceId, ',') AS invoice_ids + FROM Customer AS c + LEFT JOIN Invoice AS i ON c.CustomerId = i.CustomerId + GROUP BY c.CustomerId, c.FirstName, c.LastName + """) + ) + + for row in result: + # Get expected invoice IDs from relational DB for each Customer + customer_id = row.CustomerId + invoice_ids = row.invoice_ids.split(",") if row.invoice_ids else [] + print(f"Relational DB Customer {customer_id}: {invoice_ids}") + + # Use Cognee search to get invoice IDs for the same Customer but by providing Customer name + search_results = 
await cognee.search( + query_type=SearchType.GRAPH_COMPLETION, + query_text=f"List me all the invoices of Customer:{row.FirstName} {row.LastName}.", + top_k=50, + system_prompt="Just return me the invoiceID as a number without any text. This is an example output: ['1', '2', '3']. Where 1, 2, 3 are invoiceIDs of an invoice", + ) + print(f"Cognee search result: {search_results}") + + import ast + + lst = ast.literal_eval(search_results[0]) # converts string -> Python list + # Transform both lists to int for comparison, sorting and type consistency + lst = sorted([int(x) for x in lst]) + invoice_ids = sorted([int(x) for x in invoice_ids]) + assert lst == invoice_ids, ( + f"Search results {lst} do not match expected invoice IDs {invoice_ids} for Customer:{customer_id}" + ) + + async def test_migration_sqlite(): database_to_migrate_path = os.path.join(pathlib.Path(__file__).parent, "test_data/") @@ -286,6 +334,7 @@ async def test_migration_sqlite(): ) await relational_db_migration() + await test_search_result_quality() await test_schema_only_migration() From f9cde2f375be2accf6c9bd7fb5f5c681971f692a Mon Sep 17 00:00:00 2001 From: Pavel Zorin Date: Thu, 13 Nov 2025 13:35:07 +0100 Subject: [PATCH 111/129] Fix: Remove cognee script from pyproject.toml --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 13266f83e..2436911e8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -156,7 +156,6 @@ Homepage = "https://www.cognee.ai" Repository = "https://github.com/topoteretes/cognee" [project.scripts] -cognee = "cognee.cli._cognee:main" cognee-cli = "cognee.cli._cognee:main" [build-system] From 3b7d030817cea67f08af121d936a9e31312ae38c Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Thu, 13 Nov 2025 16:06:07 +0100 Subject: [PATCH 112/129] fix: remove duplicate mistral adapter creation --- .../litellm_instructor/llm/get_llm_client.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git
a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py index c7dcecc56..bbdfe49e9 100644 --- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py @@ -162,20 +162,5 @@ def get_llm_client(raise_api_key_error: bool = True): endpoint=llm_config.llm_endpoint, ) - elif provider == LLMProvider.MISTRAL: - if llm_config.llm_api_key is None: - raise LLMAPIKeyNotSetError() - - from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.mistral.adapter import ( - MistralAdapter, - ) - - return MistralAdapter( - api_key=llm_config.llm_api_key, - model=llm_config.llm_model, - max_completion_tokens=max_completion_tokens, - endpoint=llm_config.llm_endpoint, - ) - else: raise UnsupportedLLMProviderError(provider) From c6454338f9374c0e871938523eca237d6e5a1d16 Mon Sep 17 00:00:00 2001 From: Pavel Zorin Date: Thu, 13 Nov 2025 17:35:16 +0100 Subject: [PATCH 113/129] Fix: MCP remove cognee.add() preprequisite from the doc --- cognee-mcp/src/server.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cognee-mcp/src/server.py b/cognee-mcp/src/server.py index 7c708638c..4131be988 100755 --- a/cognee-mcp/src/server.py +++ b/cognee-mcp/src/server.py @@ -194,7 +194,6 @@ async def cognify( Prerequisites: - **LLM_API_KEY**: Must be configured (required for entity extraction and graph generation) - - **Data Added**: Must have data previously added via `cognee.add()` - **Vector Database**: Must be accessible for embeddings storage - **Graph Database**: Must be accessible for relationship storage From 2337d36f7b3968cfeff06b00613f7464c8d0ca93 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Thu, 13 Nov 2025 18:25:07 +0100 Subject: [PATCH 114/129] feat: add variable to control instructor mode 
--- cognee/infrastructure/llm/config.py | 2 ++ .../litellm_instructor/llm/anthropic/adapter.py | 8 +++++++- .../litellm_instructor/llm/gemini/adapter.py | 12 +++++++++++- .../llm/generic_llm_api/adapter.py | 12 +++++++++++- .../litellm_instructor/llm/mistral/adapter.py | 8 +++++++- .../litellm_instructor/llm/ollama/adapter.py | 12 +++++++++++- .../litellm_instructor/llm/openai/adapter.py | 11 +++++++++-- 7 files changed, 58 insertions(+), 7 deletions(-) diff --git a/cognee/infrastructure/llm/config.py b/cognee/infrastructure/llm/config.py index 8fd196eaf..c87054ff6 100644 --- a/cognee/infrastructure/llm/config.py +++ b/cognee/infrastructure/llm/config.py @@ -38,6 +38,7 @@ class LLMConfig(BaseSettings): """ structured_output_framework: str = "instructor" + llm_instructor_mode: Optional[str] = None llm_provider: str = "openai" llm_model: str = "openai/gpt-5-mini" llm_endpoint: str = "" @@ -181,6 +182,7 @@ class LLMConfig(BaseSettings): instance. """ return { + "llm_instructor_mode": self.llm_instructor_mode, "provider": self.llm_provider, "model": self.llm_model, "endpoint": self.llm_endpoint, diff --git a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py index bf19d6e86..6fb78718e 100644 --- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py @@ -28,13 +28,19 @@ class AnthropicAdapter(LLMInterface): name = "Anthropic" model: str + default_instructor_mode = "anthropic_tools" def __init__(self, max_completion_tokens: int, model: str = None): import anthropic + config_instructor_mode = get_llm_config().llm_instructor_mode + instructor_mode = ( + config_instructor_mode if config_instructor_mode else self.default_instructor_mode + ) + self.aclient = instructor.patch( 
create=anthropic.AsyncAnthropic(api_key=get_llm_config().llm_api_key).messages.create, - mode=instructor.Mode.ANTHROPIC_TOOLS, + mode=instructor.Mode(instructor_mode), ) self.model = model diff --git a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py index 1187e0cad..68dddc7b7 100644 --- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py @@ -41,6 +41,7 @@ class GeminiAdapter(LLMInterface): name: str model: str api_key: str + default_instructor_mode = "json_mode" def __init__( self, @@ -63,7 +64,16 @@ class GeminiAdapter(LLMInterface): self.fallback_api_key = fallback_api_key self.fallback_endpoint = fallback_endpoint - self.aclient = instructor.from_litellm(litellm.acompletion, mode=instructor.Mode.JSON) + from cognee.infrastructure.llm.config import get_llm_config + + config_instructor_mode = get_llm_config().llm_instructor_mode + instructor_mode = ( + config_instructor_mode if config_instructor_mode else self.default_instructor_mode + ) + + self.aclient = instructor.from_litellm( + litellm.acompletion, mode=instructor.Mode(instructor_mode) + ) @retry( stop=stop_after_delay(128), diff --git a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py index 8bbbaa2cc..ea32dced1 100644 --- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py @@ -41,6 +41,7 @@ class GenericAPIAdapter(LLMInterface): name: str model: str api_key: str + default_instructor_mode = "json_mode" def 
__init__( self, @@ -63,7 +64,16 @@ class GenericAPIAdapter(LLMInterface): self.fallback_api_key = fallback_api_key self.fallback_endpoint = fallback_endpoint - self.aclient = instructor.from_litellm(litellm.acompletion, mode=instructor.Mode.JSON) + from cognee.infrastructure.llm.config import get_llm_config + + config_instructor_mode = get_llm_config().llm_instructor_mode + instructor_mode = ( + config_instructor_mode if config_instructor_mode else self.default_instructor_mode + ) + + self.aclient = instructor.from_litellm( + litellm.acompletion, mode=instructor.Mode(instructor_mode) + ) @retry( stop=stop_after_delay(128), diff --git a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py index 78a3cbff5..bed88ce3c 100644 --- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py +++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py @@ -37,6 +37,7 @@ class MistralAdapter(LLMInterface): model: str api_key: str max_completion_tokens: int + default_instructor_mode = "mistral_tools" def __init__(self, api_key: str, model: str, max_completion_tokens: int, endpoint: str = None): from mistralai import Mistral @@ -44,9 +45,14 @@ class MistralAdapter(LLMInterface): self.model = model self.max_completion_tokens = max_completion_tokens + config_instructor_mode = get_llm_config().llm_instructor_mode + instructor_mode = ( + config_instructor_mode if config_instructor_mode else self.default_instructor_mode + ) + self.aclient = instructor.from_litellm( litellm.acompletion, - mode=instructor.Mode.MISTRAL_TOOLS, + mode=instructor.Mode(instructor_mode), api_key=get_llm_config().llm_api_key, ) diff --git a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py 
b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py index 9c3d185aa..aa24a7911 100644 --- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py @@ -42,6 +42,8 @@ class OllamaAPIAdapter(LLMInterface): - aclient """ + default_instructor_mode = "json_mode" + def __init__( self, endpoint: str, api_key: str, model: str, name: str, max_completion_tokens: int ): @@ -51,8 +53,16 @@ class OllamaAPIAdapter(LLMInterface): self.endpoint = endpoint self.max_completion_tokens = max_completion_tokens + from cognee.infrastructure.llm.config import get_llm_config + + config_instructor_mode = get_llm_config().llm_instructor_mode + instructor_mode = ( + config_instructor_mode if config_instructor_mode else self.default_instructor_mode + ) + self.aclient = instructor.from_openai( - OpenAI(base_url=self.endpoint, api_key=self.api_key), mode=instructor.Mode.JSON + OpenAI(base_url=self.endpoint, api_key=self.api_key), + mode=instructor.Mode(instructor_mode), ) @retry( diff --git a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py index 305b426b8..69367602d 100644 --- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py @@ -56,6 +56,7 @@ class OpenAIAdapter(LLMInterface): model: str api_key: str api_version: str + default_instructor_mode = "json_schema_mode" MAX_RETRIES = 5 @@ -74,14 +75,20 @@ class OpenAIAdapter(LLMInterface): fallback_api_key: str = None, fallback_endpoint: str = None, ): + from cognee.infrastructure.llm.config import get_llm_config + + config_instructor_mode = get_llm_config().llm_instructor_mode + 
instructor_mode = ( + config_instructor_mode if config_instructor_mode else self.default_instructor_mode + ) # TODO: With gpt5 series models OpenAI expects JSON_SCHEMA as a mode for structured outputs. # Make sure all new gpt models will work with this mode as well. if "gpt-5" in model: self.aclient = instructor.from_litellm( - litellm.acompletion, mode=instructor.Mode.JSON_SCHEMA + litellm.acompletion, mode=instructor.Mode(instructor_mode) ) self.client = instructor.from_litellm( - litellm.completion, mode=instructor.Mode.JSON_SCHEMA + litellm.completion, mode=instructor.Mode(instructor_mode) ) else: self.aclient = instructor.from_litellm(litellm.acompletion) From 661c194f97df5053f70a52d1638c77c23e0d50e3 Mon Sep 17 00:00:00 2001 From: EricXiao Date: Fri, 14 Nov 2025 15:21:47 +0800 Subject: [PATCH 115/129] fix: Resolve issue with csv suffix classification Signed-off-by: EricXiao --- cognee/infrastructure/files/utils/guess_file_type.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cognee/infrastructure/files/utils/guess_file_type.py b/cognee/infrastructure/files/utils/guess_file_type.py index 78b20c93d..4bc96fe80 100644 --- a/cognee/infrastructure/files/utils/guess_file_type.py +++ b/cognee/infrastructure/files/utils/guess_file_type.py @@ -55,6 +55,10 @@ def guess_file_type(file: BinaryIO, name: Optional[str] = None) -> filetype.Type file_type = Type("text/plain", "txt") return file_type + if ext in [".csv"]: + file_type = Type("text/csv", "csv") + return file_type + file_type = filetype.guess(file) # If file type could not be determined consider it a plain text file as they don't have magic number encoding From 205f5a9e0c6d0fb72cc94fa20ce2ba814ebef0d5 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Fri, 14 Nov 2025 11:05:39 +0100 Subject: [PATCH 116/129] fix: Fix based on PR comments --- .../litellm_instructor/llm/anthropic/adapter.py | 9 +++------ .../litellm_instructor/llm/gemini/adapter.py | 10 +++------- .../llm/generic_llm_api/adapter.py | 10 
+++------- .../litellm_instructor/llm/get_llm_client.py | 9 ++++++++- .../litellm_instructor/llm/mistral/adapter.py | 16 ++++++++++------ .../litellm_instructor/llm/ollama/adapter.py | 17 +++++++++-------- .../litellm_instructor/llm/openai/adapter.py | 12 ++++-------- 7 files changed, 40 insertions(+), 43 deletions(-) diff --git a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py index 6fb78718e..dbf0dfbea 100644 --- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py @@ -30,17 +30,14 @@ class AnthropicAdapter(LLMInterface): model: str default_instructor_mode = "anthropic_tools" - def __init__(self, max_completion_tokens: int, model: str = None): + def __init__(self, max_completion_tokens: int, model: str = None, instructor_mode: str = None): import anthropic - config_instructor_mode = get_llm_config().llm_instructor_mode - instructor_mode = ( - config_instructor_mode if config_instructor_mode else self.default_instructor_mode - ) + self.instructor_mode = instructor_mode if instructor_mode else self.default_instructor_mode self.aclient = instructor.patch( create=anthropic.AsyncAnthropic(api_key=get_llm_config().llm_api_key).messages.create, - mode=instructor.Mode(instructor_mode), + mode=instructor.Mode(self.instructor_mode), ) self.model = model diff --git a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py index 68dddc7b7..226f291d7 100644 --- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +++ 
b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py @@ -50,6 +50,7 @@ class GeminiAdapter(LLMInterface): model: str, api_version: str, max_completion_tokens: int, + instructor_mode: str = None, fallback_model: str = None, fallback_api_key: str = None, fallback_endpoint: str = None, @@ -64,15 +65,10 @@ class GeminiAdapter(LLMInterface): self.fallback_api_key = fallback_api_key self.fallback_endpoint = fallback_endpoint - from cognee.infrastructure.llm.config import get_llm_config - - config_instructor_mode = get_llm_config().llm_instructor_mode - instructor_mode = ( - config_instructor_mode if config_instructor_mode else self.default_instructor_mode - ) + self.instructor_mode = instructor_mode if instructor_mode else self.default_instructor_mode self.aclient = instructor.from_litellm( - litellm.acompletion, mode=instructor.Mode(instructor_mode) + litellm.acompletion, mode=instructor.Mode(self.instructor_mode) ) @retry( diff --git a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py index ea32dced1..9d7f25fc5 100644 --- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py @@ -50,6 +50,7 @@ class GenericAPIAdapter(LLMInterface): model: str, name: str, max_completion_tokens: int, + instructor_mode: str = None, fallback_model: str = None, fallback_api_key: str = None, fallback_endpoint: str = None, @@ -64,15 +65,10 @@ class GenericAPIAdapter(LLMInterface): self.fallback_api_key = fallback_api_key self.fallback_endpoint = fallback_endpoint - from cognee.infrastructure.llm.config import get_llm_config - - config_instructor_mode = get_llm_config().llm_instructor_mode - instructor_mode = ( - config_instructor_mode 
if config_instructor_mode else self.default_instructor_mode - ) + self.instructor_mode = instructor_mode if instructor_mode else self.default_instructor_mode self.aclient = instructor.from_litellm( - litellm.acompletion, mode=instructor.Mode(instructor_mode) + litellm.acompletion, mode=instructor.Mode(self.instructor_mode) ) @retry( diff --git a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py index c7dcecc56..537eda1b2 100644 --- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py @@ -81,6 +81,7 @@ def get_llm_client(raise_api_key_error: bool = True): model=llm_config.llm_model, transcription_model=llm_config.transcription_model, max_completion_tokens=max_completion_tokens, + instructor_mode=llm_config.llm_instructor_mode, streaming=llm_config.llm_streaming, fallback_api_key=llm_config.fallback_api_key, fallback_endpoint=llm_config.fallback_endpoint, @@ -101,6 +102,7 @@ def get_llm_client(raise_api_key_error: bool = True): llm_config.llm_model, "Ollama", max_completion_tokens=max_completion_tokens, + instructor_mode=llm_config.llm_instructor_mode, ) elif provider == LLMProvider.ANTHROPIC: @@ -109,7 +111,9 @@ def get_llm_client(raise_api_key_error: bool = True): ) return AnthropicAdapter( - max_completion_tokens=max_completion_tokens, model=llm_config.llm_model + max_completion_tokens=max_completion_tokens, + model=llm_config.llm_model, + instructor_mode=llm_config.llm_instructor_mode, ) elif provider == LLMProvider.CUSTOM: @@ -126,6 +130,7 @@ def get_llm_client(raise_api_key_error: bool = True): llm_config.llm_model, "Custom", max_completion_tokens=max_completion_tokens, + instructor_mode=llm_config.llm_instructor_mode, fallback_api_key=llm_config.fallback_api_key, 
fallback_endpoint=llm_config.fallback_endpoint, fallback_model=llm_config.fallback_model, @@ -145,6 +150,7 @@ def get_llm_client(raise_api_key_error: bool = True): max_completion_tokens=max_completion_tokens, endpoint=llm_config.llm_endpoint, api_version=llm_config.llm_api_version, + instructor_mode=llm_config.llm_instructor_mode, ) elif provider == LLMProvider.MISTRAL: @@ -160,6 +166,7 @@ def get_llm_client(raise_api_key_error: bool = True): model=llm_config.llm_model, max_completion_tokens=max_completion_tokens, endpoint=llm_config.llm_endpoint, + instructor_mode=llm_config.llm_instructor_mode, ) elif provider == LLMProvider.MISTRAL: diff --git a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py index bed88ce3c..355cdae0b 100644 --- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py +++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py @@ -39,20 +39,24 @@ class MistralAdapter(LLMInterface): max_completion_tokens: int default_instructor_mode = "mistral_tools" - def __init__(self, api_key: str, model: str, max_completion_tokens: int, endpoint: str = None): + def __init__( + self, + api_key: str, + model: str, + max_completion_tokens: int, + endpoint: str = None, + instructor_mode: str = None, + ): from mistralai import Mistral self.model = model self.max_completion_tokens = max_completion_tokens - config_instructor_mode = get_llm_config().llm_instructor_mode - instructor_mode = ( - config_instructor_mode if config_instructor_mode else self.default_instructor_mode - ) + self.instructor_mode = instructor_mode if instructor_mode else self.default_instructor_mode self.aclient = instructor.from_litellm( litellm.acompletion, - mode=instructor.Mode(instructor_mode), + mode=instructor.Mode(self.instructor_mode), 
api_key=get_llm_config().llm_api_key, ) diff --git a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py index aa24a7911..aabd19867 100644 --- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py @@ -45,7 +45,13 @@ class OllamaAPIAdapter(LLMInterface): default_instructor_mode = "json_mode" def __init__( - self, endpoint: str, api_key: str, model: str, name: str, max_completion_tokens: int + self, + endpoint: str, + api_key: str, + model: str, + name: str, + max_completion_tokens: int, + instructor_mode: str = None, ): self.name = name self.model = model @@ -53,16 +59,11 @@ class OllamaAPIAdapter(LLMInterface): self.endpoint = endpoint self.max_completion_tokens = max_completion_tokens - from cognee.infrastructure.llm.config import get_llm_config - - config_instructor_mode = get_llm_config().llm_instructor_mode - instructor_mode = ( - config_instructor_mode if config_instructor_mode else self.default_instructor_mode - ) + self.instructor_mode = instructor_mode if instructor_mode else self.default_instructor_mode self.aclient = instructor.from_openai( OpenAI(base_url=self.endpoint, api_key=self.api_key), - mode=instructor.Mode(instructor_mode), + mode=instructor.Mode(self.instructor_mode), ) @retry( diff --git a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py index 69367602d..778c8eec7 100644 --- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py @@ -70,25 +70,21 @@ class OpenAIAdapter(LLMInterface): 
model: str, transcription_model: str, max_completion_tokens: int, + instructor_mode: str = None, streaming: bool = False, fallback_model: str = None, fallback_api_key: str = None, fallback_endpoint: str = None, ): - from cognee.infrastructure.llm.config import get_llm_config - - config_instructor_mode = get_llm_config().llm_instructor_mode - instructor_mode = ( - config_instructor_mode if config_instructor_mode else self.default_instructor_mode - ) + self.instructor_mode = instructor_mode if instructor_mode else self.default_instructor_mode # TODO: With gpt5 series models OpenAI expects JSON_SCHEMA as a mode for structured outputs. # Make sure all new gpt models will work with this mode as well. if "gpt-5" in model: self.aclient = instructor.from_litellm( - litellm.acompletion, mode=instructor.Mode(instructor_mode) + litellm.acompletion, mode=instructor.Mode(self.instructor_mode) ) self.client = instructor.from_litellm( - litellm.completion, mode=instructor.Mode(instructor_mode) + litellm.completion, mode=instructor.Mode(self.instructor_mode) ) else: self.aclient = instructor.from_litellm(litellm.acompletion) From 844b8d635a7646750dc63dd4be13de07f2996940 Mon Sep 17 00:00:00 2001 From: Fahad Shoaib Date: Fri, 14 Nov 2025 22:13:00 +0500 Subject: [PATCH 117/129] feat: enhance ontology handling to support multiple uploads and retrievals --- .../v1/cognify/routers/get_cognify_router.py | 41 +++---- cognee/api/v1/ontologies/ontologies.py | 108 +++++++++++++++--- .../ontologies/routers/get_ontology_router.py | 51 ++++++--- .../rdf_xml/RDFLibOntologyResolver.py | 93 +++++++++------ 4 files changed, 202 insertions(+), 91 deletions(-) diff --git a/cognee/api/v1/cognify/routers/get_cognify_router.py b/cognee/api/v1/cognify/routers/get_cognify_router.py index 252ffe7bf..4f1497e3c 100644 --- a/cognee/api/v1/cognify/routers/get_cognify_router.py +++ b/cognee/api/v1/cognify/routers/get_cognify_router.py @@ -41,8 +41,8 @@ class CognifyPayloadDTO(InDTO): custom_prompt: 
Optional[str] = Field( default="", description="Custom prompt for entity extraction and graph generation" ) - ontology_key: Optional[str] = Field( - default=None, description="Reference to previously uploaded ontology" + ontology_key: Optional[List[str]] = Field( + default=None, description="Reference to one or more previously uploaded ontologies" ) @@ -71,7 +71,7 @@ def get_cognify_router() -> APIRouter: - **dataset_ids** (Optional[List[UUID]]): List of existing dataset UUIDs to process. UUIDs allow processing of datasets not owned by the user (if permitted). - **run_in_background** (Optional[bool]): Whether to execute processing asynchronously. Defaults to False (blocking). - **custom_prompt** (Optional[str]): Custom prompt for entity extraction and graph generation. If provided, this prompt will be used instead of the default prompts for knowledge graph extraction. - - **ontology_key** (Optional[str]): Reference to a previously uploaded ontology file to use for knowledge graph construction. + - **ontology_key** (Optional[List[str]]): Reference to one or more previously uploaded ontology files to use for knowledge graph construction. ## Response - **Blocking execution**: Complete pipeline run information with entity counts, processing duration, and success/failure status @@ -87,7 +87,7 @@ def get_cognify_router() -> APIRouter: "datasets": ["research_papers", "documentation"], "run_in_background": false, "custom_prompt": "Extract entities focusing on technical concepts and their relationships. 
Identify key technologies, methodologies, and their interconnections.", - "ontology_key": "medical_ontology_v1" + "ontology_key": ["medical_ontology_v1"] } ``` @@ -121,29 +121,22 @@ def get_cognify_router() -> APIRouter: if payload.ontology_key: ontology_service = OntologyService() - try: - ontology_content = ontology_service.get_ontology_content( - payload.ontology_key, user - ) + ontology_contents = ontology_service.get_ontology_contents( + payload.ontology_key, user + ) - from cognee.modules.ontology.ontology_config import Config - from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import ( - RDFLibOntologyResolver, - ) - from io import StringIO + from cognee.modules.ontology.ontology_config import Config + from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import ( + RDFLibOntologyResolver, + ) + from io import StringIO - ontology_stream = StringIO(ontology_content) - config_to_use: Config = { - "ontology_config": { - "ontology_resolver": RDFLibOntologyResolver( - ontology_file=ontology_stream - ) - } + ontology_streams = [StringIO(content) for content in ontology_contents] + config_to_use: Config = { + "ontology_config": { + "ontology_resolver": RDFLibOntologyResolver(ontology_file=ontology_streams) } - except ValueError as e: - return JSONResponse( - status_code=400, content={"error": f"Ontology error: {str(e)}"} - ) + } cognify_run = await cognee_cognify( datasets, diff --git a/cognee/api/v1/ontologies/ontologies.py b/cognee/api/v1/ontologies/ontologies.py index 6bfb7658e..130b4a862 100644 --- a/cognee/api/v1/ontologies/ontologies.py +++ b/cognee/api/v1/ontologies/ontologies.py @@ -3,7 +3,7 @@ import json import tempfile from pathlib import Path from datetime import datetime, timezone -from typing import Optional +from typing import Optional, List from dataclasses import dataclass @@ -47,28 +47,23 @@ class OntologyService: async def upload_ontology( self, ontology_key: str, file, user, description: Optional[str] = None ) -> 
OntologyMetadata: - # Validate file format if not file.filename.lower().endswith(".owl"): raise ValueError("File must be in .owl format") user_dir = self._get_user_dir(str(user.id)) metadata = self._load_metadata(user_dir) - # Check for duplicate key if ontology_key in metadata: raise ValueError(f"Ontology key '{ontology_key}' already exists") - # Read file content content = await file.read() - if len(content) > 10 * 1024 * 1024: # 10MB limit + if len(content) > 10 * 1024 * 1024: raise ValueError("File size exceeds 10MB limit") - # Save file file_path = user_dir / f"{ontology_key}.owl" with open(file_path, "wb") as f: f.write(content) - # Update metadata ontology_metadata = { "filename": file.filename, "size_bytes": len(content), @@ -86,19 +81,102 @@ class OntologyService: description=description, ) - def get_ontology_content(self, ontology_key: str, user) -> str: + async def upload_ontologies( + self, ontology_key: List[str], files: List, user, descriptions: Optional[List[str]] = None + ) -> List[OntologyMetadata]: + """ + Upload ontology files with their respective keys. 
+ + Args: + ontology_key: List of unique keys for each ontology + files: List of UploadFile objects (same length as keys) + user: Authenticated user + descriptions: Optional list of descriptions for each file + + Returns: + List of OntologyMetadata objects for uploaded files + + Raises: + ValueError: If keys duplicate, file format invalid, or array lengths don't match + """ + if len(ontology_key) != len(files): + raise ValueError("Number of keys must match number of files") + + if len(set(ontology_key)) != len(ontology_key): + raise ValueError("Duplicate ontology keys not allowed") + + if descriptions and len(descriptions) != len(files): + raise ValueError("Number of descriptions must match number of files") + + results = [] user_dir = self._get_user_dir(str(user.id)) metadata = self._load_metadata(user_dir) - if ontology_key not in metadata: - raise ValueError(f"Ontology key '{ontology_key}' not found") + for i, (key, file) in enumerate(zip(ontology_key, files)): + if key in metadata: + raise ValueError(f"Ontology key '{key}' already exists") - file_path = user_dir / f"{ontology_key}.owl" - if not file_path.exists(): - raise ValueError(f"Ontology file for key '{ontology_key}' not found") + if not file.filename.lower().endswith(".owl"): + raise ValueError(f"File '{file.filename}' must be in .owl format") - with open(file_path, "r", encoding="utf-8") as f: - return f.read() + content = await file.read() + if len(content) > 10 * 1024 * 1024: + raise ValueError(f"File '{file.filename}' exceeds 10MB limit") + + file_path = user_dir / f"{key}.owl" + with open(file_path, "wb") as f: + f.write(content) + + ontology_metadata = { + "filename": file.filename, + "size_bytes": len(content), + "uploaded_at": datetime.now(timezone.utc).isoformat(), + "description": descriptions[i] if descriptions else None, + } + metadata[key] = ontology_metadata + + results.append( + OntologyMetadata( + ontology_key=key, + filename=file.filename, + size_bytes=len(content), + 
uploaded_at=ontology_metadata["uploaded_at"], + description=descriptions[i] if descriptions else None, + ) + ) + + self._save_metadata(user_dir, metadata) + return results + + def get_ontology_contents(self, ontology_key: List[str], user) -> List[str]: + """ + Retrieve ontology content for one or more keys. + + Args: + ontology_key: List of ontology keys to retrieve (can contain single item) + user: Authenticated user + + Returns: + List of ontology content strings + + Raises: + ValueError: If any ontology key not found + """ + user_dir = self._get_user_dir(str(user.id)) + metadata = self._load_metadata(user_dir) + + contents = [] + for key in ontology_key: + if key not in metadata: + raise ValueError(f"Ontology key '{key}' not found") + + file_path = user_dir / f"{key}.owl" + if not file_path.exists(): + raise ValueError(f"Ontology file for key '{key}' not found") + + with open(file_path, "r", encoding="utf-8") as f: + contents.append(f.read()) + return contents def list_ontologies(self, user) -> dict: user_dir = self._get_user_dir(str(user.id)) diff --git a/cognee/api/v1/ontologies/routers/get_ontology_router.py b/cognee/api/v1/ontologies/routers/get_ontology_router.py index f5c51ba21..ee31c683f 100644 --- a/cognee/api/v1/ontologies/routers/get_ontology_router.py +++ b/cognee/api/v1/ontologies/routers/get_ontology_router.py @@ -1,6 +1,6 @@ from fastapi import APIRouter, File, Form, UploadFile, Depends, HTTPException from fastapi.responses import JSONResponse -from typing import Optional +from typing import Optional, List from cognee.modules.users.models import User from cognee.modules.users.methods import get_authenticated_user @@ -16,23 +16,27 @@ def get_ontology_router() -> APIRouter: @router.post("", response_model=dict) async def upload_ontology( ontology_key: str = Form(...), - ontology_file: UploadFile = File(...), - description: Optional[str] = Form(None), + ontology_file: List[UploadFile] = File(...), + descriptions: Optional[str] = Form(None), user: User 
= Depends(get_authenticated_user), ): """ - Upload an ontology file with a named key for later use in cognify operations. + Upload ontology files with their respective keys for later use in cognify operations. + + Supports both single and multiple file uploads: + - Single file: ontology_key=["key"], ontology_file=[file] + - Multiple files: ontology_key=["key1", "key2"], ontology_file=[file1, file2] ## Request Parameters - - **ontology_key** (str): User-defined identifier for the ontology - - **ontology_file** (UploadFile): OWL format ontology file - - **description** (Optional[str]): Optional description of the ontology + - **ontology_key** (str): JSON array string of user-defined identifiers for the ontologies + - **ontology_file** (List[UploadFile]): OWL format ontology files + - **descriptions** (Optional[str]): JSON array string of optional descriptions ## Response - Returns metadata about the uploaded ontology including key, filename, size, and upload timestamp. + Returns metadata about uploaded ontologies including keys, filenames, sizes, and upload timestamps. 
## Error Codes - - **400 Bad Request**: Invalid file format, duplicate key, file size exceeded + - **400 Bad Request**: Invalid file format, duplicate keys, array length mismatches, file size exceeded - **500 Internal Server Error**: File system or processing errors """ send_telemetry( @@ -45,16 +49,31 @@ def get_ontology_router() -> APIRouter: ) try: - result = await ontology_service.upload_ontology( - ontology_key, ontology_file, user, description + import json + + ontology_keys = json.loads(ontology_key) + description_list = json.loads(descriptions) if descriptions else None + + if not isinstance(ontology_keys, list): + raise ValueError("ontology_key must be a JSON array") + + results = await ontology_service.upload_ontologies( + ontology_keys, ontology_file, user, description_list ) + return { - "ontology_key": result.ontology_key, - "filename": result.filename, - "size_bytes": result.size_bytes, - "uploaded_at": result.uploaded_at, + "uploaded_ontologies": [ + { + "ontology_key": result.ontology_key, + "filename": result.filename, + "size_bytes": result.size_bytes, + "uploaded_at": result.uploaded_at, + "description": result.description, + } + for result in results + ] } - except ValueError as e: + except (json.JSONDecodeError, ValueError) as e: return JSONResponse(status_code=400, content={"error": str(e)}) except Exception as e: return JSONResponse(status_code=500, content={"error": str(e)}) diff --git a/cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py b/cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py index 4acc8861b..34d7a946a 100644 --- a/cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py +++ b/cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py @@ -26,7 +26,7 @@ class RDFLibOntologyResolver(BaseOntologyResolver): def __init__( self, - ontology_file: Optional[Union[str, List[str], IO]] = None, + ontology_file: Optional[Union[str, List[str], IO, List[IO]]] = None, matching_strategy: Optional[MatchingStrategy] = None, ) -> 
None: super().__init__(matching_strategy) @@ -34,47 +34,68 @@ class RDFLibOntologyResolver(BaseOntologyResolver): try: self.graph = None if ontology_file is not None: + files_to_load = [] + file_objects = [] + if hasattr(ontology_file, "read"): - self.graph = Graph() - content = ontology_file.read() - self.graph.parse(data=content, format="xml") - logger.info("Ontology loaded successfully from file object") - else: - files_to_load = [] - if isinstance(ontology_file, str): - files_to_load = [ontology_file] - elif isinstance(ontology_file, list): + file_objects = [ontology_file] + elif isinstance(ontology_file, str): + files_to_load = [ontology_file] + elif isinstance(ontology_file, list): + if all(hasattr(item, "read") for item in ontology_file): + file_objects = ontology_file + else: files_to_load = ontology_file - else: - raise ValueError( - f"ontology_file must be a string, list of strings, file-like object, or None. Got: {type(ontology_file)}" - ) + else: + raise ValueError( + f"ontology_file must be a string, list of strings, file-like object, list of file-like objects, or None. Got: {type(ontology_file)}" + ) - if files_to_load: - self.graph = Graph() - loaded_files = [] - for file_path in files_to_load: - if os.path.exists(file_path): - self.graph.parse(file_path) - loaded_files.append(file_path) - logger.info("Ontology loaded successfully from file: %s", file_path) - else: - logger.warning( - "Ontology file '%s' not found. Skipping this file.", - file_path, - ) + if file_objects: + self.graph = Graph() + loaded_objects = [] + for file_obj in file_objects: + try: + content = file_obj.read() + self.graph.parse(data=content, format="xml") + loaded_objects.append(file_obj) + logger.info("Ontology loaded successfully from file object") + except Exception as e: + logger.warning("Failed to parse ontology file object: %s", str(e)) - if not loaded_files: - logger.info( - "No valid ontology files found. No owl ontology will be attached to the graph." 
- ) - self.graph = None - else: - logger.info("Total ontology files loaded: %d", len(loaded_files)) - else: + if not loaded_objects: logger.info( - "No ontology file provided. No owl ontology will be attached to the graph." + "No valid ontology file objects found. No owl ontology will be attached to the graph." ) + self.graph = None + else: + logger.info("Total ontology file objects loaded: %d", len(loaded_objects)) + + elif files_to_load: + self.graph = Graph() + loaded_files = [] + for file_path in files_to_load: + if os.path.exists(file_path): + self.graph.parse(file_path) + loaded_files.append(file_path) + logger.info("Ontology loaded successfully from file: %s", file_path) + else: + logger.warning( + "Ontology file '%s' not found. Skipping this file.", + file_path, + ) + + if not loaded_files: + logger.info( + "No valid ontology files found. No owl ontology will be attached to the graph." + ) + self.graph = None + else: + logger.info("Total ontology files loaded: %d", len(loaded_files)) + else: + logger.info( + "No ontology file provided. No owl ontology will be attached to the graph." + ) else: logger.info( "No ontology file provided. No owl ontology will be attached to the graph." 
From 01f1c099cc972e2222f1174c515e1baf87fbb9d6 Mon Sep 17 00:00:00 2001 From: Fahad Shoaib Date: Fri, 14 Nov 2025 22:20:54 +0500 Subject: [PATCH 118/129] test: enhance server start test with ontology upload verification - Extend test_cognee_server_start to upload ontology and verify integration - Move test_ontology_endpoint from tests/ to tests/unit/api/ --- cognee/tests/test_cognee_server_start.py | 45 ++++++++++++++++++- .../{ => unit/api}/test_ontology_endpoint.py | 0 2 files changed, 44 insertions(+), 1 deletion(-) rename cognee/tests/{ => unit/api}/test_ontology_endpoint.py (100%) diff --git a/cognee/tests/test_cognee_server_start.py b/cognee/tests/test_cognee_server_start.py index ab68a8ef1..d6aa55a98 100644 --- a/cognee/tests/test_cognee_server_start.py +++ b/cognee/tests/test_cognee_server_start.py @@ -7,6 +7,7 @@ import requests from pathlib import Path import sys import uuid +import json class TestCogneeServerStart(unittest.TestCase): @@ -90,12 +91,31 @@ class TestCogneeServerStart(unittest.TestCase): ) } - payload = {"datasets": [dataset_name]} + ontology_key = f"test_ontology_{uuid.uuid4().hex[:8]}" + payload = {"datasets": [dataset_name], "ontology_key": [ontology_key]} add_response = requests.post(url, headers=headers, data=form_data, files=file, timeout=50) if add_response.status_code not in [200, 201]: add_response.raise_for_status() + ontology_content = b""" + + + + + """ + + ontology_response = requests.post( + "http://127.0.0.1:8000/api/v1/ontologies", + files=[("ontology_file", ("test.owl", ontology_content, "application/xml"))], + data={ + "ontology_key": json.dumps([ontology_key]), + "description": json.dumps(["Test ontology"]), + }, + ) + self.assertEqual(ontology_response.status_code, 200) + # Cognify request url = "http://127.0.0.1:8000/api/v1/cognify" headers = { @@ -107,6 +127,29 @@ class TestCogneeServerStart(unittest.TestCase): if cognify_response.status_code not in [200, 201]: cognify_response.raise_for_status() + datasets_response = 
requests.get("http://127.0.0.1:8000/api/v1/datasets", headers=headers) + + datasets = datasets_response.json() + dataset_id = None + for dataset in datasets: + if dataset["name"] == dataset_name: + dataset_id = dataset["id"] + break + + graph_response = requests.get( + f"http://127.0.0.1:8000/api/v1/datasets/{dataset_id}/graph", headers=headers + ) + self.assertEqual(graph_response.status_code, 200) + + graph_data = graph_response.json() + ontology_nodes = [ + node for node in graph_data.get("nodes") if node.get("properties").get("ontology_valid") + ] + + self.assertGreater( + len(ontology_nodes), 0, "No ontology nodes found - ontology was not integrated" + ) + # TODO: Add test to verify cognify pipeline is complete before testing search # Search request diff --git a/cognee/tests/test_ontology_endpoint.py b/cognee/tests/unit/api/test_ontology_endpoint.py similarity index 100% rename from cognee/tests/test_ontology_endpoint.py rename to cognee/tests/unit/api/test_ontology_endpoint.py From 1ded09d0f995fa57c9eaa3feafbe64089525a92f Mon Sep 17 00:00:00 2001 From: Fahad Shoaib Date: Sat, 15 Nov 2025 00:06:55 +0500 Subject: [PATCH 119/129] fix: fixed test ontology file content. Added tests to support multiple files and improved validation. 
--- cognee/tests/test_cognee_server_start.py | 13 +- .../tests/unit/api/test_ontology_endpoint.py | 189 +++++++++++++++++- 2 files changed, 185 insertions(+), 17 deletions(-) diff --git a/cognee/tests/test_cognee_server_start.py b/cognee/tests/test_cognee_server_start.py index d6aa55a98..b266fc7bf 100644 --- a/cognee/tests/test_cognee_server_start.py +++ b/cognee/tests/test_cognee_server_start.py @@ -98,13 +98,12 @@ class TestCogneeServerStart(unittest.TestCase): if add_response.status_code not in [200, 201]: add_response.raise_for_status() - ontology_content = b""" - - - - - """ + ontology_content = b""" + + + + + """ ontology_response = requests.post( "http://127.0.0.1:8000/api/v1/ontologies", diff --git a/cognee/tests/unit/api/test_ontology_endpoint.py b/cognee/tests/unit/api/test_ontology_endpoint.py index b5cedfafe..c04959998 100644 --- a/cognee/tests/unit/api/test_ontology_endpoint.py +++ b/cognee/tests/unit/api/test_ontology_endpoint.py @@ -32,6 +32,8 @@ def mock_default_user(): @patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) def test_upload_ontology_success(mock_get_default_user, client, mock_default_user): """Test successful ontology upload""" + import json + mock_get_default_user.return_value = mock_default_user ontology_content = ( b"" @@ -40,14 +42,14 @@ def test_upload_ontology_success(mock_get_default_user, client, mock_default_use response = client.post( "/api/v1/ontologies", - files={"ontology_file": ("test.owl", ontology_content)}, - data={"ontology_key": unique_key, "description": "Test"}, + files=[("ontology_file", ("test.owl", ontology_content, "application/xml"))], + data={"ontology_key": json.dumps([unique_key]), "description": json.dumps(["Test"])}, ) assert response.status_code == 200 data = response.json() - assert data["ontology_key"] == unique_key - assert "uploaded_at" in data + assert data["uploaded_ontologies"][0]["ontology_key"] == unique_key + assert "uploaded_at" in data["uploaded_ontologies"][0] 
@patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) @@ -66,30 +68,197 @@ def test_upload_ontology_invalid_file(mock_get_default_user, client, mock_defaul @patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) def test_upload_ontology_missing_data(mock_get_default_user, client, mock_default_user): """Test 400 response for missing file or key""" + import json + mock_get_default_user.return_value = mock_default_user # Missing file - response = client.post("/api/v1/ontologies", data={"ontology_key": "test"}) + response = client.post("/api/v1/ontologies", data={"ontology_key": json.dumps(["test"])}) assert response.status_code == 400 # Missing key - response = client.post("/api/v1/ontologies", files={"ontology_file": ("test.owl", b"xml")}) + response = client.post( + "/api/v1/ontologies", files=[("ontology_file", ("test.owl", b"xml", "application/xml"))] + ) assert response.status_code == 400 @patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) def test_upload_ontology_unauthorized(mock_get_default_user, client, mock_default_user): """Test behavior when default user is provided (no explicit authentication)""" + import json + unique_key = f"test_ontology_{uuid.uuid4().hex[:8]}" mock_get_default_user.return_value = mock_default_user response = client.post( "/api/v1/ontologies", - files={"ontology_file": ("test.owl", b"")}, - data={"ontology_key": unique_key}, + files=[("ontology_file", ("test.owl", b"", "application/xml"))], + data={"ontology_key": json.dumps([unique_key])}, ) # The current system provides a default user when no explicit authentication is given # This test verifies the system works with conditional authentication assert response.status_code == 200 data = response.json() - assert data["ontology_key"] == unique_key - assert "uploaded_at" in data + assert data["uploaded_ontologies"][0]["ontology_key"] == unique_key + assert "uploaded_at" in data["uploaded_ontologies"][0] + + +@patch.object(gau_mod, "get_default_user", 
new_callable=AsyncMock) +async def test_upload_multiple_ontologies(mock_get_default_user, client, mock_default_user): + """Test uploading multiple ontology files in single request""" + import io + + # Create mock files + file1_content = b"" + file2_content = b"" + + files = [ + ("ontology_file", ("vehicles.owl", io.BytesIO(file1_content), "application/xml")), + ("ontology_file", ("manufacturers.owl", io.BytesIO(file2_content), "application/xml")), + ] + data = { + "ontology_key": '["vehicles", "manufacturers"]', + "descriptions": '["Base vehicles", "Car manufacturers"]', + } + + response = client.post("/api/v1/ontologies", files=files, data=data) + + assert response.status_code == 200 + result = response.json() + assert "uploaded_ontologies" in result + assert len(result["uploaded_ontologies"]) == 2 + assert result["uploaded_ontologies"][0]["ontology_key"] == "vehicles" + assert result["uploaded_ontologies"][1]["ontology_key"] == "manufacturers" + + +@patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) +async def test_upload_endpoint_accepts_arrays(mock_get_default_user, client, mock_default_user): + """Test that upload endpoint accepts array parameters""" + import io + import json + + file_content = b"" + + files = [("ontology_file", ("single.owl", io.BytesIO(file_content), "application/xml"))] + data = { + "ontology_key": json.dumps(["single_key"]), + "descriptions": json.dumps(["Single ontology"]), + } + + response = client.post("/api/v1/ontologies", files=files, data=data) + + assert response.status_code == 200 + result = response.json() + assert result["uploaded_ontologies"][0]["ontology_key"] == "single_key" + + +@patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) +async def test_cognify_with_multiple_ontologies(mock_get_default_user, client, mock_default_user): + """Test cognify endpoint accepts multiple ontology keys""" + payload = { + "datasets": ["test_dataset"], + "ontology_key": ["ontology1", "ontology2"], # Array instead of 
string + "run_in_background": False, + } + + response = client.post("/api/v1/cognify", json=payload) + + # Should not fail due to ontology_key type + assert response.status_code in [200, 400, 409] # May fail for other reasons, not type + + +@patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) +async def test_complete_multifile_workflow(mock_get_default_user, client, mock_default_user): + """Test complete workflow: upload multiple ontologies → cognify with multiple keys""" + import io + import json + + # Step 1: Upload multiple ontologies + file1_content = b""" + + + """ + + file2_content = b""" + + + """ + + files = [ + ("ontology_file", ("vehicles.owl", io.BytesIO(file1_content), "application/xml")), + ("ontology_file", ("manufacturers.owl", io.BytesIO(file2_content), "application/xml")), + ] + data = { + "ontology_key": json.dumps(["vehicles", "manufacturers"]), + "descriptions": json.dumps(["Vehicle ontology", "Manufacturer ontology"]), + } + + upload_response = client.post("/api/v1/ontologies", files=files, data=data) + assert upload_response.status_code == 200 + + # Step 2: Verify ontologies are listed + list_response = client.get("/api/v1/ontologies") + assert list_response.status_code == 200 + ontologies = list_response.json() + assert "vehicles" in ontologies + assert "manufacturers" in ontologies + + # Step 3: Test cognify with multiple ontologies + cognify_payload = { + "datasets": ["test_dataset"], + "ontology_key": ["vehicles", "manufacturers"], + "run_in_background": False, + } + + cognify_response = client.post("/api/v1/cognify", json=cognify_payload) + # Should not fail due to ontology handling (may fail for dataset reasons) + assert cognify_response.status_code != 400 # Not a validation error + + +@patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) +async def test_multifile_error_handling(mock_get_default_user, client, mock_default_user): + """Test error handling for invalid multifile uploads""" + import io + import json 
+ + # Test mismatched array lengths + file_content = b"" + files = [("ontology_file", ("test.owl", io.BytesIO(file_content), "application/xml"))] + data = { + "ontology_key": json.dumps(["key1", "key2"]), # 2 keys, 1 file + "descriptions": json.dumps(["desc1"]), + } + + response = client.post("/api/v1/ontologies", files=files, data=data) + assert response.status_code == 400 + assert "Number of keys must match number of files" in response.json()["error"] + + # Test duplicate keys + files = [ + ("ontology_file", ("test1.owl", io.BytesIO(file_content), "application/xml")), + ("ontology_file", ("test2.owl", io.BytesIO(file_content), "application/xml")), + ] + data = { + "ontology_key": json.dumps(["duplicate", "duplicate"]), + "descriptions": json.dumps(["desc1", "desc2"]), + } + + response = client.post("/api/v1/ontologies", files=files, data=data) + assert response.status_code == 400 + assert "Duplicate ontology keys not allowed" in response.json()["error"] + + +@patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) +async def test_cognify_missing_ontology_key(mock_get_default_user, client, mock_default_user): + """Test cognify with non-existent ontology key""" + payload = { + "datasets": ["test_dataset"], + "ontology_key": ["nonexistent_key"], + "run_in_background": False, + } + + response = client.post("/api/v1/cognify", json=payload) + assert response.status_code == 409 + assert "Ontology key 'nonexistent_key' not found" in response.json()["error"] From 983bfae4fcc9046fd520f0c24733e491679d54cf Mon Sep 17 00:00:00 2001 From: EricXiao Date: Mon, 17 Nov 2025 14:41:55 +0800 Subject: [PATCH 120/129] chore: remove unnecessary csv file type Signed-off-by: EricXiao --- .../files/utils/is_csv_content.py | 181 ------------------ 1 file changed, 181 deletions(-) delete mode 100644 cognee/infrastructure/files/utils/is_csv_content.py diff --git a/cognee/infrastructure/files/utils/is_csv_content.py b/cognee/infrastructure/files/utils/is_csv_content.py deleted file 
mode 100644 index 07b7ea69b..000000000 --- a/cognee/infrastructure/files/utils/is_csv_content.py +++ /dev/null @@ -1,181 +0,0 @@ -import csv -from collections import Counter - - -def is_csv_content(content): - """ - Heuristically determine whether a bytes-like object is CSV text. - - Strategy (fail-fast and cheap to expensive): - 1) Decode: Try a small ordered list of common encodings with strict errors. - 2) Line sampling: require >= 2 non-empty lines; sample up to 50 lines. - 3) Delimiter detection: - - Prefer csv.Sniffer() with common delimiters. - - Fallback to a lightweight consistency heuristic. - 4) Lightweight parse check: - - Parse a few lines with the delimiter. - - Ensure at least 2 valid rows and relatively stable column counts. - - Returns: - bool: True if the buffer looks like CSV; False otherwise. - """ - try: - encoding_list = [ - "utf-8", - "utf-8-sig", - "utf-32-le", - "utf-32-be", - "utf-16-le", - "utf-16-be", - "gb18030", - "shift_jis", - "cp949", - "cp1252", - "iso-8859-1", - ] - - # Try to decode strictly—if decoding fails for all encodings, it's not text/CSV. - text = None - for enc in encoding_list: - try: - text = content.decode(enc, errors="strict") - break - except UnicodeDecodeError: - continue - if text is None: - return False - - # Reject empty/whitespace-only payloads. - stripped = text.strip() - if not stripped: - return False - - # Split into logical lines and drop empty ones. Require at least two lines. - lines = [ln for ln in text.splitlines() if ln.strip()] - if len(lines) < 2: - return False - - # Take a small sample to keep sniffing cheap and predictable. - sample_lines = lines[:50] - - # Detect delimiter using csv.Sniffer first; if that fails, use our heuristic. - delimiter = _sniff_delimiter(sample_lines) or _heuristic_delimiter(sample_lines) - if not delimiter: - return False - - # Finally, do a lightweight parse sanity check with the chosen delimiter. 
- return _lightweight_parse_check(sample_lines, delimiter) - except Exception: - return False - - -def _sniff_delimiter(lines): - """ - Try Python's built-in csv.Sniffer on a sample. - - Args: - lines (list[str]): Sample lines (already decoded). - - Returns: - str | None: The detected delimiter if sniffing succeeds; otherwise None. - """ - # Join up to 50 lines to form the sample string Sniffer will inspect. - sample = "\n".join(lines[:50]) - try: - dialect = csv.Sniffer().sniff(sample, delimiters=",\t;|") - return dialect.delimiter - except Exception: - # Sniffer is known to be brittle on small/dirty samples—silently fallback. - return None - - -def _heuristic_delimiter(lines): - """ - Fallback delimiter detection based on count consistency per line. - - Heuristic: - - For each candidate delimiter, count occurrences per line. - - Keep only lines with count > 0 (line must contain the delimiter). - - Require at least half of lines to contain the delimiter (min 2). - - Compute the mode (most common count). If the proportion of lines that - exhibit the modal count is >= 80%, accept that delimiter. - - Args: - lines (list[str]): Sample lines. - - Returns: - str | None: Best delimiter if one meets the consistency threshold; else None. - """ - candidates = [",", "\t", ";", "|"] - best = None - best_score = 0.0 - - for d in candidates: - # Count how many times the delimiter appears in each line. - counts = [ln.count(d) for ln in lines] - # Consider only lines that actually contain the delimiter at least once. - nonzero = [c for c in counts if c > 0] - - # Require that more than half of lines (and at least 2) contain the delimiter. - if len(nonzero) < max(2, int(0.5 * len(lines))): - continue - - # Find the modal count and its frequency. - cnt = Counter(nonzero) - pairs = cnt.most_common(1) - if not pairs: - continue - - mode, mode_freq = pairs[0] - # Consistency ratio: lines with the modal count / total lines in the sample. 
- consistency = mode_freq / len(lines) - # Accept if consistent enough and better than any previous candidate. - if mode >= 1 and consistency >= 0.80 and consistency > best_score: - best = d - best_score = consistency - - return best - - -def _lightweight_parse_check(lines, delimiter): - """ - Parse a few lines with csv.reader and check structural stability. - - Heuristic: - - Parse up to 5 lines with the given delimiter. - - Count column widths per parsed row. - - Require at least 2 non-empty rows. - - Allow at most 1 row whose width deviates by >2 columns from the first row. - - Args: - lines (list[str]): Sample lines (decoded). - delimiter (str): Delimiter chosen by sniffing/heuristics. - - Returns: - bool: True if parsing looks stable; False otherwise. - """ - try: - # csv.reader accepts any iterable of strings; feeding the first 10 lines is fine. - reader = csv.reader(lines[:10], delimiter=delimiter) - widths = [] - valid_rows = 0 - for row in reader: - if not row: - continue - - widths.append(len(row)) - valid_rows += 1 - - # Need at least two meaningful rows to make a judgment. - if valid_rows < 2: - return False - - if widths: - first = widths[0] - # Count rows whose width deviates significantly (>2) from the first row. - unstable = sum(1 for w in widths if abs(w - first) > 2) - # Permit at most 1 unstable row among the parsed sample. 
- return unstable <= 1 - return False - except Exception: - return False From fe55071849c06ecd3fe70602d56bb1f2904538a4 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Wed, 19 Nov 2025 15:33:50 +0100 Subject: [PATCH 121/129] Feature/cog 3407 fixing integration test in ci (#1810) ## Description This PR should fix the web crawler integration test issue in our CI ## Type of Change - [x] Bug fix (non-breaking change that fixes an issue) - [ ] New feature (non-breaking change that adds functionality) - [ ] Breaking change (fix or feature that would cause existing functionality to change) - [ ] Documentation update - [ ] Code refactoring - [ ] Performance improvement - [ ] Other (please specify): ## Screenshots/Videos (if applicable) ## Pre-submission Checklist - [ ] **I have tested my changes thoroughly before submitting this PR** - [ ] **This PR contains minimal changes necessary to address the issue/feature** - [ ] My code follows the project's coding standards and style guidelines - [ ] I have added tests that prove my fix is effective or that my feature works - [ ] I have added necessary documentation (if applicable) - [ ] All new and existing tests pass - [ ] I have searched existing PRs to ensure this change hasn't been submitted already - [ ] I have linked any relevant issues in the description - [ ] My commits have clear and descriptive messages ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin. 
--- .../test_default_url_crawler.py | 2 +- .../web_url_crawler/test_tavily_crawler.py | 2 +- .../web_url_crawler/test_url_adding_e2e.py | 40 ++++++------------- 3 files changed, 15 insertions(+), 29 deletions(-) diff --git a/cognee/tests/integration/web_url_crawler/test_default_url_crawler.py b/cognee/tests/integration/web_url_crawler/test_default_url_crawler.py index 156cc87a4..f48c1cedc 100644 --- a/cognee/tests/integration/web_url_crawler/test_default_url_crawler.py +++ b/cognee/tests/integration/web_url_crawler/test_default_url_crawler.py @@ -5,7 +5,7 @@ from cognee.tasks.web_scraper import DefaultUrlCrawler @pytest.mark.asyncio async def test_fetch(): crawler = DefaultUrlCrawler() - url = "https://en.wikipedia.org/wiki/Large_language_model" + url = "https://httpbin.org/html" results = await crawler.fetch_urls(url) assert len(results) == 1 assert isinstance(results, dict) diff --git a/cognee/tests/integration/web_url_crawler/test_tavily_crawler.py b/cognee/tests/integration/web_url_crawler/test_tavily_crawler.py index 946ce8378..19ffdc4ea 100644 --- a/cognee/tests/integration/web_url_crawler/test_tavily_crawler.py +++ b/cognee/tests/integration/web_url_crawler/test_tavily_crawler.py @@ -11,7 +11,7 @@ skip_in_ci = pytest.mark.skipif( @skip_in_ci @pytest.mark.asyncio async def test_fetch(): - url = "https://en.wikipedia.org/wiki/Large_language_model" + url = "https://httpbin.org/html" results = await fetch_with_tavily(url) assert isinstance(results, dict) assert len(results) == 1 diff --git a/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py b/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py index d91b075aa..cc8ae24d0 100644 --- a/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py +++ b/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py @@ -14,9 +14,7 @@ async def test_url_saves_as_html_file(): await cognee.prune.prune_system(metadata=True) try: - original_file_path = await save_data_item_to_storage( - 
"https://en.wikipedia.org/wiki/Large_language_model" - ) + original_file_path = await save_data_item_to_storage("https://httpbin.org/html") file_path = get_data_file_path(original_file_path) assert file_path.endswith(".html") file = Path(file_path) @@ -44,9 +42,7 @@ async def test_saved_html_is_valid(): await cognee.prune.prune_system(metadata=True) try: - original_file_path = await save_data_item_to_storage( - "https://en.wikipedia.org/wiki/Large_language_model" - ) + original_file_path = await save_data_item_to_storage("https://httpbin.org/html") file_path = get_data_file_path(original_file_path) content = Path(file_path).read_text() @@ -72,7 +68,7 @@ async def test_add_url(): await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) - await cognee.add("https://en.wikipedia.org/wiki/Large_language_model") + await cognee.add("https://httpbin.org/html") skip_in_ci = pytest.mark.skipif( @@ -88,7 +84,7 @@ async def test_add_url_with_tavily(): await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) - await cognee.add("https://en.wikipedia.org/wiki/Large_language_model") + await cognee.add("https://httpbin.org/html") @pytest.mark.asyncio @@ -98,7 +94,7 @@ async def test_add_url_without_incremental_loading(): try: await cognee.add( - "https://en.wikipedia.org/wiki/Large_language_model", + "https://httpbin.org/html", incremental_loading=False, ) except Exception as e: @@ -112,7 +108,7 @@ async def test_add_url_with_incremental_loading(): try: await cognee.add( - "https://en.wikipedia.org/wiki/Large_language_model", + "https://httpbin.org/html", incremental_loading=True, ) except Exception as e: @@ -125,7 +121,7 @@ async def test_add_url_can_define_preferred_loader_as_list_of_str(): await cognee.prune.prune_system(metadata=True) await cognee.add( - "https://en.wikipedia.org/wiki/Large_language_model", + "https://httpbin.org/html", preferred_loaders=["beautiful_soup_loader"], ) @@ -144,7 +140,7 @@ async def 
test_add_url_with_extraction_rules(): try: await cognee.add( - "https://en.wikipedia.org/wiki/Large_language_model", + "https://httpbin.org/html", preferred_loaders={"beautiful_soup_loader": {"extraction_rules": extraction_rules}}, ) except Exception as e: @@ -163,9 +159,7 @@ async def test_loader_is_none_by_default(): } try: - original_file_path = await save_data_item_to_storage( - "https://en.wikipedia.org/wiki/Large_language_model" - ) + original_file_path = await save_data_item_to_storage("https://httpbin.org/html") file_path = get_data_file_path(original_file_path) assert file_path.endswith(".html") file = Path(file_path) @@ -196,9 +190,7 @@ async def test_beautiful_soup_loader_is_selected_loader_if_preferred_loader_prov } try: - original_file_path = await save_data_item_to_storage( - "https://en.wikipedia.org/wiki/Large_language_model" - ) + original_file_path = await save_data_item_to_storage("https://httpbin.org/html") file_path = get_data_file_path(original_file_path) assert file_path.endswith(".html") file = Path(file_path) @@ -225,9 +217,7 @@ async def test_beautiful_soup_loader_works_with_and_without_arguments(): await cognee.prune.prune_system(metadata=True) try: - original_file_path = await save_data_item_to_storage( - "https://en.wikipedia.org/wiki/Large_language_model" - ) + original_file_path = await save_data_item_to_storage("https://httpbin.org/html") file_path = get_data_file_path(original_file_path) assert file_path.endswith(".html") file = Path(file_path) @@ -263,9 +253,7 @@ async def test_beautiful_soup_loader_successfully_loads_file_if_required_args_pr await cognee.prune.prune_system(metadata=True) try: - original_file_path = await save_data_item_to_storage( - "https://en.wikipedia.org/wiki/Large_language_model" - ) + original_file_path = await save_data_item_to_storage("https://httpbin.org/html") file_path = get_data_file_path(original_file_path) assert file_path.endswith(".html") file = Path(file_path) @@ -302,9 +290,7 @@ async def 
test_beautiful_soup_loads_file_successfully(): } try: - original_file_path = await save_data_item_to_storage( - "https://en.wikipedia.org/wiki/Large_language_model" - ) + original_file_path = await save_data_item_to_storage("https://httpbin.org/html") file_path = get_data_file_path(original_file_path) assert file_path.endswith(".html") original_file = Path(file_path) From 30e3971d44816db50d9e83eee39bf6d69b98a328 Mon Sep 17 00:00:00 2001 From: Fahad Shoaib Date: Thu, 20 Nov 2025 15:36:15 +0500 Subject: [PATCH 122/129] fix: add auth headers to ontology upload request and enhance ontology content --- cognee/tests/test_cognee_server_start.py | 53 +++++++++++++++++++++--- 1 file changed, 47 insertions(+), 6 deletions(-) diff --git a/cognee/tests/test_cognee_server_start.py b/cognee/tests/test_cognee_server_start.py index b266fc7bf..ddffe53a4 100644 --- a/cognee/tests/test_cognee_server_start.py +++ b/cognee/tests/test_cognee_server_start.py @@ -98,15 +98,56 @@ class TestCogneeServerStart(unittest.TestCase): if add_response.status_code not in [200, 201]: add_response.raise_for_status() - ontology_content = b""" - - - - - """ + ontology_content = b""" + + + + + + + + + + + + + + + + A failure caused by physical components. + + + + + An error caused by software logic or configuration. + + + + A human being or individual. 
+ + + + + Programmers + + + + Light Bulb + + + + Hardware Problem + + + """ ontology_response = requests.post( "http://127.0.0.1:8000/api/v1/ontologies", + headers=headers, files=[("ontology_file", ("test.owl", ontology_content, "application/xml"))], data={ "ontology_key": json.dumps([ontology_key]), From 8cfb6c41eeca3b2ad0e34fb4b6043f4b5f6a8c00 Mon Sep 17 00:00:00 2001 From: Fahad Shoaib Date: Thu, 20 Nov 2025 15:54:09 +0500 Subject: [PATCH 123/129] fix: remove async from ontology endpoint test functions --- cognee/tests/unit/api/test_ontology_endpoint.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cognee/tests/unit/api/test_ontology_endpoint.py b/cognee/tests/unit/api/test_ontology_endpoint.py index c04959998..d53c5ab44 100644 --- a/cognee/tests/unit/api/test_ontology_endpoint.py +++ b/cognee/tests/unit/api/test_ontology_endpoint.py @@ -104,7 +104,7 @@ def test_upload_ontology_unauthorized(mock_get_default_user, client, mock_defaul @patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) -async def test_upload_multiple_ontologies(mock_get_default_user, client, mock_default_user): +def test_upload_multiple_ontologies(mock_get_default_user, client, mock_default_user): """Test uploading multiple ontology files in single request""" import io @@ -132,7 +132,7 @@ async def test_upload_multiple_ontologies(mock_get_default_user, client, mock_de @patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) -async def test_upload_endpoint_accepts_arrays(mock_get_default_user, client, mock_default_user): +def test_upload_endpoint_accepts_arrays(mock_get_default_user, client, mock_default_user): """Test that upload endpoint accepts array parameters""" import io import json @@ -153,7 +153,7 @@ async def test_upload_endpoint_accepts_arrays(mock_get_default_user, client, moc @patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) -async def test_cognify_with_multiple_ontologies(mock_get_default_user, client, 
mock_default_user): +def test_cognify_with_multiple_ontologies(mock_get_default_user, client, mock_default_user): """Test cognify endpoint accepts multiple ontology keys""" payload = { "datasets": ["test_dataset"], @@ -168,7 +168,7 @@ async def test_cognify_with_multiple_ontologies(mock_get_default_user, client, m @patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) -async def test_complete_multifile_workflow(mock_get_default_user, client, mock_default_user): +def test_complete_multifile_workflow(mock_get_default_user, client, mock_default_user): """Test complete workflow: upload multiple ontologies → cognify with multiple keys""" import io import json @@ -218,7 +218,7 @@ async def test_complete_multifile_workflow(mock_get_default_user, client, mock_d @patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) -async def test_multifile_error_handling(mock_get_default_user, client, mock_default_user): +def test_multifile_error_handling(mock_get_default_user, client, mock_default_user): """Test error handling for invalid multifile uploads""" import io import json @@ -251,7 +251,7 @@ async def test_multifile_error_handling(mock_get_default_user, client, mock_defa @patch.object(gau_mod, "get_default_user", new_callable=AsyncMock) -async def test_cognify_missing_ontology_key(mock_get_default_user, client, mock_default_user): +def test_cognify_missing_ontology_key(mock_get_default_user, client, mock_default_user): """Test cognify with non-existent ontology key""" payload = { "datasets": ["test_dataset"], From 4e880eca8422872b26a568b18b9f1339ce362c18 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Thu, 20 Nov 2025 15:47:22 +0100 Subject: [PATCH 124/129] chore: update env template --- .env.template | 1 + 1 file changed, 1 insertion(+) diff --git a/.env.template b/.env.template index ae2cb1338..376233b1f 100644 --- a/.env.template +++ b/.env.template @@ -21,6 +21,7 @@ LLM_PROVIDER="openai" LLM_ENDPOINT="" LLM_API_VERSION="" LLM_MAX_TOKENS="16384" 
+LLM_INSTRUCTOR_MODE="json_schema_mode" # this mode is used for gpt-5 models EMBEDDING_PROVIDER="openai" EMBEDDING_MODEL="openai/text-embedding-3-large" From 2176ec16b8e440087f96410fed979528e8159ca2 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Thu, 20 Nov 2025 17:03:36 +0100 Subject: [PATCH 125/129] chore: changes url for crawler tests (#1816) Updates crawler test url to avoid blocking and unavailable sites in CI. ## Description ## Type of Change - [ ] Bug fix (non-breaking change that fixes an issue) - [ ] New feature (non-breaking change that adds functionality) - [ ] Breaking change (fix or feature that would cause existing functionality to change) - [ ] Documentation update - [ ] Code refactoring - [ ] Performance improvement - [ ] Other (please specify): ## Screenshots/Videos (if applicable) ## Pre-submission Checklist - [ ] **I have tested my changes thoroughly before submitting this PR** - [ ] **This PR contains minimal changes necessary to address the issue/feature** - [ ] My code follows the project's coding standards and style guidelines - [ ] I have added tests that prove my fix is effective or that my feature works - [ ] I have added necessary documentation (if applicable) - [ ] All new and existing tests pass - [ ] I have searched existing PRs to ensure this change hasn't been submitted already - [ ] I have linked any relevant issues in the description - [ ] My commits have clear and descriptive messages ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin. 
--- .../test_default_url_crawler.py | 2 +- .../web_url_crawler/test_tavily_crawler.py | 2 +- .../web_url_crawler/test_url_adding_e2e.py | 26 +++++++++---------- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/cognee/tests/integration/web_url_crawler/test_default_url_crawler.py b/cognee/tests/integration/web_url_crawler/test_default_url_crawler.py index f48c1cedc..af2595b14 100644 --- a/cognee/tests/integration/web_url_crawler/test_default_url_crawler.py +++ b/cognee/tests/integration/web_url_crawler/test_default_url_crawler.py @@ -5,7 +5,7 @@ from cognee.tasks.web_scraper import DefaultUrlCrawler @pytest.mark.asyncio async def test_fetch(): crawler = DefaultUrlCrawler() - url = "https://httpbin.org/html" + url = "http://example.com/" results = await crawler.fetch_urls(url) assert len(results) == 1 assert isinstance(results, dict) diff --git a/cognee/tests/integration/web_url_crawler/test_tavily_crawler.py b/cognee/tests/integration/web_url_crawler/test_tavily_crawler.py index 19ffdc4ea..5db9b58ce 100644 --- a/cognee/tests/integration/web_url_crawler/test_tavily_crawler.py +++ b/cognee/tests/integration/web_url_crawler/test_tavily_crawler.py @@ -11,7 +11,7 @@ skip_in_ci = pytest.mark.skipif( @skip_in_ci @pytest.mark.asyncio async def test_fetch(): - url = "https://httpbin.org/html" + url = "http://example.com/" results = await fetch_with_tavily(url) assert isinstance(results, dict) assert len(results) == 1 diff --git a/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py b/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py index cc8ae24d0..200f40a94 100644 --- a/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py +++ b/cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py @@ -14,7 +14,7 @@ async def test_url_saves_as_html_file(): await cognee.prune.prune_system(metadata=True) try: - original_file_path = await save_data_item_to_storage("https://httpbin.org/html") + original_file_path = await 
save_data_item_to_storage("http://example.com/") file_path = get_data_file_path(original_file_path) assert file_path.endswith(".html") file = Path(file_path) @@ -42,7 +42,7 @@ async def test_saved_html_is_valid(): await cognee.prune.prune_system(metadata=True) try: - original_file_path = await save_data_item_to_storage("https://httpbin.org/html") + original_file_path = await save_data_item_to_storage("http://example.com/") file_path = get_data_file_path(original_file_path) content = Path(file_path).read_text() @@ -68,7 +68,7 @@ async def test_add_url(): await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) - await cognee.add("https://httpbin.org/html") + await cognee.add("http://example.com/") skip_in_ci = pytest.mark.skipif( @@ -84,7 +84,7 @@ async def test_add_url_with_tavily(): await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) - await cognee.add("https://httpbin.org/html") + await cognee.add("http://example.com/") @pytest.mark.asyncio @@ -94,7 +94,7 @@ async def test_add_url_without_incremental_loading(): try: await cognee.add( - "https://httpbin.org/html", + "http://example.com/", incremental_loading=False, ) except Exception as e: @@ -108,7 +108,7 @@ async def test_add_url_with_incremental_loading(): try: await cognee.add( - "https://httpbin.org/html", + "http://example.com/", incremental_loading=True, ) except Exception as e: @@ -121,7 +121,7 @@ async def test_add_url_can_define_preferred_loader_as_list_of_str(): await cognee.prune.prune_system(metadata=True) await cognee.add( - "https://httpbin.org/html", + "http://example.com/", preferred_loaders=["beautiful_soup_loader"], ) @@ -140,7 +140,7 @@ async def test_add_url_with_extraction_rules(): try: await cognee.add( - "https://httpbin.org/html", + "http://example.com/", preferred_loaders={"beautiful_soup_loader": {"extraction_rules": extraction_rules}}, ) except Exception as e: @@ -159,7 +159,7 @@ async def test_loader_is_none_by_default(): } try: - 
original_file_path = await save_data_item_to_storage("https://httpbin.org/html") + original_file_path = await save_data_item_to_storage("http://example.com/") file_path = get_data_file_path(original_file_path) assert file_path.endswith(".html") file = Path(file_path) @@ -190,7 +190,7 @@ async def test_beautiful_soup_loader_is_selected_loader_if_preferred_loader_prov } try: - original_file_path = await save_data_item_to_storage("https://httpbin.org/html") + original_file_path = await save_data_item_to_storage("http://example.com/") file_path = get_data_file_path(original_file_path) assert file_path.endswith(".html") file = Path(file_path) @@ -217,7 +217,7 @@ async def test_beautiful_soup_loader_works_with_and_without_arguments(): await cognee.prune.prune_system(metadata=True) try: - original_file_path = await save_data_item_to_storage("https://httpbin.org/html") + original_file_path = await save_data_item_to_storage("http://example.com/") file_path = get_data_file_path(original_file_path) assert file_path.endswith(".html") file = Path(file_path) @@ -253,7 +253,7 @@ async def test_beautiful_soup_loader_successfully_loads_file_if_required_args_pr await cognee.prune.prune_system(metadata=True) try: - original_file_path = await save_data_item_to_storage("https://httpbin.org/html") + original_file_path = await save_data_item_to_storage("http://example.com/") file_path = get_data_file_path(original_file_path) assert file_path.endswith(".html") file = Path(file_path) @@ -290,7 +290,7 @@ async def test_beautiful_soup_loads_file_successfully(): } try: - original_file_path = await save_data_item_to_storage("https://httpbin.org/html") + original_file_path = await save_data_item_to_storage("http://example.com/") file_path = get_data_file_path(original_file_path) assert file_path.endswith(".html") original_file = Path(file_path) From 204f9c2e4ad6dc706c03deed68adfbc4744ae6df Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Fri, 21 Nov 2025 16:20:19 +0100 Subject: [PATCH 
126/129] fix: PR comment changes --- .env.template | 5 ++++- cognee/infrastructure/llm/config.py | 4 ++-- .../litellm_instructor/llm/get_llm_client.py | 12 ++++++------ 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/.env.template b/.env.template index 376233b1f..61853b983 100644 --- a/.env.template +++ b/.env.template @@ -21,7 +21,10 @@ LLM_PROVIDER="openai" LLM_ENDPOINT="" LLM_API_VERSION="" LLM_MAX_TOKENS="16384" -LLM_INSTRUCTOR_MODE="json_schema_mode" # this mode is used for gpt-5 models +# Instructor's modes determine how structured data is requested from and extracted from LLM responses +# You can change this type (i.e. mode) via this env variable +# Each LLM has its own default value, e.g. gpt-5 models have "json_schema_mode" +LLM_INSTRUCTOR_MODE="" EMBEDDING_PROVIDER="openai" EMBEDDING_MODEL="openai/text-embedding-3-large" diff --git a/cognee/infrastructure/llm/config.py b/cognee/infrastructure/llm/config.py index c87054ff6..2e300dc0c 100644 --- a/cognee/infrastructure/llm/config.py +++ b/cognee/infrastructure/llm/config.py @@ -38,7 +38,7 @@ class LLMConfig(BaseSettings): """ structured_output_framework: str = "instructor" - llm_instructor_mode: Optional[str] = None + llm_instructor_mode: str = "" llm_provider: str = "openai" llm_model: str = "openai/gpt-5-mini" llm_endpoint: str = "" @@ -182,7 +182,7 @@ class LLMConfig(BaseSettings): instance. 
""" return { - "llm_instructor_mode": self.llm_instructor_mode, + "llm_instructor_mode": self.llm_instructor_mode.lower(), "provider": self.llm_provider, "model": self.llm_model, "endpoint": self.llm_endpoint, diff --git a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py index 537eda1b2..6ab3b91ad 100644 --- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py @@ -81,7 +81,7 @@ def get_llm_client(raise_api_key_error: bool = True): model=llm_config.llm_model, transcription_model=llm_config.transcription_model, max_completion_tokens=max_completion_tokens, - instructor_mode=llm_config.llm_instructor_mode, + instructor_mode=llm_config.llm_instructor_mode.lower(), streaming=llm_config.llm_streaming, fallback_api_key=llm_config.fallback_api_key, fallback_endpoint=llm_config.fallback_endpoint, @@ -102,7 +102,7 @@ def get_llm_client(raise_api_key_error: bool = True): llm_config.llm_model, "Ollama", max_completion_tokens=max_completion_tokens, - instructor_mode=llm_config.llm_instructor_mode, + instructor_mode=llm_config.llm_instructor_mode.lower(), ) elif provider == LLMProvider.ANTHROPIC: @@ -113,7 +113,7 @@ def get_llm_client(raise_api_key_error: bool = True): return AnthropicAdapter( max_completion_tokens=max_completion_tokens, model=llm_config.llm_model, - instructor_mode=llm_config.llm_instructor_mode, + instructor_mode=llm_config.llm_instructor_mode.lower(), ) elif provider == LLMProvider.CUSTOM: @@ -130,7 +130,7 @@ def get_llm_client(raise_api_key_error: bool = True): llm_config.llm_model, "Custom", max_completion_tokens=max_completion_tokens, - instructor_mode=llm_config.llm_instructor_mode, + instructor_mode=llm_config.llm_instructor_mode.lower(), 
fallback_api_key=llm_config.fallback_api_key, fallback_endpoint=llm_config.fallback_endpoint, fallback_model=llm_config.fallback_model, @@ -150,7 +150,7 @@ def get_llm_client(raise_api_key_error: bool = True): max_completion_tokens=max_completion_tokens, endpoint=llm_config.llm_endpoint, api_version=llm_config.llm_api_version, - instructor_mode=llm_config.llm_instructor_mode, + instructor_mode=llm_config.llm_instructor_mode.lower(), ) elif provider == LLMProvider.MISTRAL: @@ -166,7 +166,7 @@ def get_llm_client(raise_api_key_error: bool = True): model=llm_config.llm_model, max_completion_tokens=max_completion_tokens, endpoint=llm_config.llm_endpoint, - instructor_mode=llm_config.llm_instructor_mode, + instructor_mode=llm_config.llm_instructor_mode.lower(), ) elif provider == LLMProvider.MISTRAL: From af8c55e82bd5dcefb526ffc106f3dcbc1a881a24 Mon Sep 17 00:00:00 2001 From: Boris Arzentar Date: Mon, 24 Nov 2025 16:16:47 +0100 Subject: [PATCH 127/129] version: 0.5.0.dev0 --- poetry.lock | 62 +++++++++++++++++++++----------------------------- pyproject.toml | 2 +- uv.lock | 10 +++++++- 3 files changed, 36 insertions(+), 38 deletions(-) diff --git a/poetry.lock b/poetry.lock index 67de51633..0736a7bb7 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.2.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand. [[package]] name = "accelerate" @@ -1231,12 +1231,12 @@ version = "0.4.6" description = "Cross-platform colored terminal text." 
optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" -groups = ["main", "dev"] +groups = ["main"] +markers = "(platform_system == \"Windows\" or extra == \"llama-index\" or extra == \"dev\" or extra == \"chromadb\" or sys_platform == \"win32\") and (platform_system == \"Windows\" or os_name == \"nt\" or extra == \"llama-index\" or extra == \"dev\" or sys_platform == \"win32\")" files = [ {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, ] -markers = {main = "(platform_system == \"Windows\" or extra == \"llama-index\" or extra == \"dev\" or extra == \"chromadb\" or sys_platform == \"win32\") and (platform_system == \"Windows\" or os_name == \"nt\" or extra == \"llama-index\" or extra == \"dev\" or sys_platform == \"win32\")", dev = "sys_platform == \"win32\""} [[package]] name = "coloredlogs" @@ -2347,7 +2347,7 @@ version = "1.3.0" description = "Backport of PEP 654 (exception groups)" optional = false python-versions = ">=3.7" -groups = ["main", "dev"] +groups = ["main"] markers = "python_version == \"3.10\"" files = [ {file = "exceptiongroup-1.3.0-py3-none-any.whl", hash = "sha256:4d111e6e0c13d0644cad6ddaa7ed0261a0b36971f6d23e7ec9b4b9097da78a10"}, @@ -3732,14 +3732,14 @@ type = ["pytest-mypy"] name = "iniconfig" version = "2.1.0" description = "brain-dead simple config-ini parsing" -optional = false +optional = true python-versions = ">=3.8" -groups = ["main", "dev"] +groups = ["main"] +markers = "extra == \"deepeval\" or extra == \"dev\"" files = [ {file = "iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760"}, {file = "iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7"}, ] -markers = {main = "extra == 
\"deepeval\" or extra == \"dev\""} [[package]] name = "instructor" @@ -4196,8 +4196,6 @@ groups = ["main"] markers = "extra == \"dlt\"" files = [ {file = "jsonpath-ng-1.7.0.tar.gz", hash = "sha256:f6f5f7fd4e5ff79c785f1573b394043b39849fb2bb47bcead935d12b00beab3c"}, - {file = "jsonpath_ng-1.7.0-py2-none-any.whl", hash = "sha256:898c93fc173f0c336784a3fa63d7434297544b7198124a68f9a3ef9597b0ae6e"}, - {file = "jsonpath_ng-1.7.0-py3-none-any.whl", hash = "sha256:f3d7f9e848cba1b6da28c55b1c26ff915dc9e0b1ba7e752a53d6da8d5cbd00b6"}, ] [package.dependencies] @@ -7634,7 +7632,7 @@ version = "24.2" description = "Core utilities for Python packages" optional = false python-versions = ">=3.8" -groups = ["main", "dev"] +groups = ["main"] files = [ {file = "packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759"}, {file = "packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f"}, @@ -8289,14 +8287,14 @@ kaleido = ["kaleido (>=1.0.0)"] name = "pluggy" version = "1.6.0" description = "plugin and hook calling mechanisms for python" -optional = false +optional = true python-versions = ">=3.9" -groups = ["main", "dev"] +groups = ["main"] +markers = "extra == \"deepeval\" or extra == \"dev\" or extra == \"dlt\" or extra == \"docling\"" files = [ {file = "pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746"}, {file = "pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3"}, ] -markers = {main = "extra == \"deepeval\" or extra == \"dev\" or extra == \"dlt\" or extra == \"docling\""} [package.extras] dev = ["pre-commit", "tox"] @@ -8656,7 +8654,6 @@ files = [ {file = "psycopg2-2.9.10-cp311-cp311-win_amd64.whl", hash = "sha256:0435034157049f6846e95103bd8f5a668788dd913a7c30162ca9503fdf542cb4"}, {file = "psycopg2-2.9.10-cp312-cp312-win32.whl", hash = 
"sha256:65a63d7ab0e067e2cdb3cf266de39663203d38d6a8ed97f5ca0cb315c73fe067"}, {file = "psycopg2-2.9.10-cp312-cp312-win_amd64.whl", hash = "sha256:4a579d6243da40a7b3182e0430493dbd55950c493d8c68f4eec0b302f6bbf20e"}, - {file = "psycopg2-2.9.10-cp313-cp313-win_amd64.whl", hash = "sha256:91fd603a2155da8d0cfcdbf8ab24a2d54bca72795b90d2a3ed2b6da8d979dee2"}, {file = "psycopg2-2.9.10-cp39-cp39-win32.whl", hash = "sha256:9d5b3b94b79a844a986d029eee38998232451119ad653aea42bb9220a8c5066b"}, {file = "psycopg2-2.9.10-cp39-cp39-win_amd64.whl", hash = "sha256:88138c8dedcbfa96408023ea2b0c369eda40fe5d75002c0964c78f46f11fa442"}, {file = "psycopg2-2.9.10.tar.gz", hash = "sha256:12ec0b40b0273f95296233e8750441339298e6a572f7039da5b260e3c8b60e11"}, @@ -8718,7 +8715,6 @@ files = [ {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:bb89f0a835bcfc1d42ccd5f41f04870c1b936d8507c6df12b7737febc40f0909"}, {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:f0c2d907a1e102526dd2986df638343388b94c33860ff3bbe1384130828714b1"}, {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f8157bed2f51db683f31306aa497311b560f2265998122abe1dce6428bd86567"}, - {file = "psycopg2_binary-2.9.10-cp313-cp313-win_amd64.whl", hash = "sha256:27422aa5f11fbcd9b18da48373eb67081243662f9b46e6fd07c3eb46e4535142"}, {file = "psycopg2_binary-2.9.10-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:eb09aa7f9cecb45027683bb55aebaaf45a0df8bf6de68801a6afdc7947bb09d4"}, {file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b73d6d7f0ccdad7bc43e6d34273f70d587ef62f824d7261c4ae9b8b1b6af90e8"}, {file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ce5ab4bf46a211a8e924d307c1b1fcda82368586a19d0a24f8ae166f5c784864"}, @@ -9698,14 +9694,14 @@ files = [ name = "pytest" version = "7.4.4" description = "pytest: simple powerful testing with Python" 
-optional = false +optional = true python-versions = ">=3.7" -groups = ["main", "dev"] +groups = ["main"] +markers = "extra == \"deepeval\" or extra == \"dev\"" files = [ {file = "pytest-7.4.4-py3-none-any.whl", hash = "sha256:b090cdf5ed60bf4c45261be03239c2c1c22df034fbffe691abe93cd80cea01d8"}, {file = "pytest-7.4.4.tar.gz", hash = "sha256:2cf0005922c6ace4a3e2ec8b4080eb0d9753fdc93107415332f50ce9e7994280"}, ] -markers = {main = "extra == \"deepeval\" or extra == \"dev\""} [package.dependencies] colorama = {version = "*", markers = "sys_platform == \"win32\""} @@ -9792,21 +9788,6 @@ files = [ packaging = ">=17.1" pytest = ">=6.2" -[[package]] -name = "pytest-timeout" -version = "2.4.0" -description = "pytest plugin to abort hanging tests" -optional = false -python-versions = ">=3.7" -groups = ["dev"] -files = [ - {file = "pytest_timeout-2.4.0-py3-none-any.whl", hash = "sha256:c42667e5cdadb151aeb5b26d114aff6bdf5a907f176a007a30b940d3d865b5c2"}, - {file = "pytest_timeout-2.4.0.tar.gz", hash = "sha256:7e68e90b01f9eff71332b25001f85c75495fc4e3a836701876183c4bcfd0540a"}, -] - -[package.dependencies] -pytest = ">=7.0.0" - [[package]] name = "pytest-xdist" version = "3.8.0" @@ -11656,7 +11637,9 @@ groups = ["main"] files = [ {file = "SQLAlchemy-2.0.43-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:21ba7a08a4253c5825d1db389d4299f64a100ef9800e4624c8bf70d8f136e6ed"}, {file = "SQLAlchemy-2.0.43-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:11b9503fa6f8721bef9b8567730f664c5a5153d25e247aadc69247c4bc605227"}, + {file = "SQLAlchemy-2.0.43-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:07097c0a1886c150ef2adba2ff7437e84d40c0f7dcb44a2c2b9c905ccfc6361c"}, {file = "SQLAlchemy-2.0.43-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:cdeff998cb294896a34e5b2f00e383e7c5c4ef3b4bfa375d9104723f15186443"}, + {file = "SQLAlchemy-2.0.43-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = 
"sha256:bcf0724a62a5670e5718957e05c56ec2d6850267ea859f8ad2481838f889b42c"}, {file = "SQLAlchemy-2.0.43-cp37-cp37m-win32.whl", hash = "sha256:c697575d0e2b0a5f0433f679bda22f63873821d991e95a90e9e52aae517b2e32"}, {file = "SQLAlchemy-2.0.43-cp37-cp37m-win_amd64.whl", hash = "sha256:d34c0f6dbefd2e816e8f341d0df7d4763d382e3f452423e752ffd1e213da2512"}, {file = "sqlalchemy-2.0.43-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:70322986c0c699dca241418fcf18e637a4369e0ec50540a2b907b184c8bca069"}, @@ -11691,12 +11674,20 @@ files = [ {file = "sqlalchemy-2.0.43-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:9df7126fd9db49e3a5a3999442cc67e9ee8971f3cb9644250107d7296cb2a164"}, {file = "sqlalchemy-2.0.43-cp313-cp313-win32.whl", hash = "sha256:7f1ac7828857fcedb0361b48b9ac4821469f7694089d15550bbcf9ab22564a1d"}, {file = "sqlalchemy-2.0.43-cp313-cp313-win_amd64.whl", hash = "sha256:971ba928fcde01869361f504fcff3b7143b47d30de188b11c6357c0505824197"}, + {file = "sqlalchemy-2.0.43-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:4e6aeb2e0932f32950cf56a8b4813cb15ff792fc0c9b3752eaf067cfe298496a"}, + {file = "sqlalchemy-2.0.43-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:61f964a05356f4bca4112e6334ed7c208174511bd56e6b8fc86dad4d024d4185"}, {file = "sqlalchemy-2.0.43-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:46293c39252f93ea0910aababa8752ad628bcce3a10d3f260648dd472256983f"}, + {file = "sqlalchemy-2.0.43-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:136063a68644eca9339d02e6693932116f6a8591ac013b0014479a1de664e40a"}, {file = "sqlalchemy-2.0.43-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:6e2bf13d9256398d037fef09fd8bf9b0bf77876e22647d10761d35593b9ac547"}, + {file = "sqlalchemy-2.0.43-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:44337823462291f17f994d64282a71c51d738fc9ef561bf265f1d0fd9116a782"}, {file = "sqlalchemy-2.0.43-cp38-cp38-win32.whl", hash = 
"sha256:13194276e69bb2af56198fef7909d48fd34820de01d9c92711a5fa45497cc7ed"}, {file = "sqlalchemy-2.0.43-cp38-cp38-win_amd64.whl", hash = "sha256:334f41fa28de9f9be4b78445e68530da3c5fa054c907176460c81494f4ae1f5e"}, + {file = "sqlalchemy-2.0.43-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ceb5c832cc30663aeaf5e39657712f4c4241ad1f638d487ef7216258f6d41fe7"}, + {file = "sqlalchemy-2.0.43-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:11f43c39b4b2ec755573952bbcc58d976779d482f6f832d7f33a8d869ae891bf"}, {file = "sqlalchemy-2.0.43-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:413391b2239db55be14fa4223034d7e13325a1812c8396ecd4f2c08696d5ccad"}, + {file = "sqlalchemy-2.0.43-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c379e37b08c6c527181a397212346be39319fb64323741d23e46abd97a400d34"}, {file = "sqlalchemy-2.0.43-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:03d73ab2a37d9e40dec4984d1813d7878e01dbdc742448d44a7341b7a9f408c7"}, + {file = "sqlalchemy-2.0.43-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:8cee08f15d9e238ede42e9bbc1d6e7158d0ca4f176e4eab21f88ac819ae3bd7b"}, {file = "sqlalchemy-2.0.43-cp39-cp39-win32.whl", hash = "sha256:b3edaec7e8b6dc5cd94523c6df4f294014df67097c8217a89929c99975811414"}, {file = "sqlalchemy-2.0.43-cp39-cp39-win_amd64.whl", hash = "sha256:227119ce0a89e762ecd882dc661e0aa677a690c914e358f0dd8932a2e8b2765b"}, {file = "sqlalchemy-2.0.43-py3-none-any.whl", hash = "sha256:1681c21dd2ccee222c2fe0bef671d1aef7c504087c9c4e800371cfcc8ac966fc"}, @@ -12065,7 +12056,7 @@ version = "2.2.1" description = "A lil' TOML parser" optional = false python-versions = ">=3.8" -groups = ["main", "dev"] +groups = ["main"] markers = "python_version == \"3.10\"" files = [ {file = "tomli-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:678e4fa69e4575eb77d103de3df8a895e1591b48e740211bd1067378c69e8249"}, @@ -12537,12 +12528,11 @@ version = "4.15.0" description = "Backported and Experimental Type Hints for 
Python 3.9+" optional = false python-versions = ">=3.9" -groups = ["main", "dev"] +groups = ["main"] files = [ {file = "typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548"}, {file = "typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466"}, ] -markers = {dev = "python_version == \"3.10\""} [[package]] name = "typing-inspect" diff --git a/pyproject.toml b/pyproject.toml index 2436911e8..a9b895dfb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "cognee" -version = "0.3.9" +version = "0.5.0.dev0" description = "Cognee - is a library for enriching LLM context with a semantic layer for better understanding and reasoning." authors = [ { name = "Vasilije Markovic" }, diff --git a/uv.lock b/uv.lock index 8c35a3366..cc66c3d7e 100644 --- a/uv.lock +++ b/uv.lock @@ -929,7 +929,7 @@ wheels = [ [[package]] name = "cognee" -version = "0.3.9" +version = "0.5.0.dev0" source = { editable = "." 
} dependencies = [ { name = "aiofiles" }, @@ -2560,6 +2560,8 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7f/91/ae2eb6b7979e2f9b035a9f612cf70f1bf54aad4e1d125129bef1eae96f19/greenlet-3.2.4-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c2ca18a03a8cfb5b25bc1cbe20f3d9a4c80d8c3b13ba3df49ac3961af0b1018d", size = 584358, upload-time = "2025-08-07T13:18:23.708Z" }, { url = "https://files.pythonhosted.org/packages/f7/85/433de0c9c0252b22b16d413c9407e6cb3b41df7389afc366ca204dbc1393/greenlet-3.2.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:9fe0a28a7b952a21e2c062cd5756d34354117796c6d9215a87f55e38d15402c5", size = 1113550, upload-time = "2025-08-07T13:42:37.467Z" }, { url = "https://files.pythonhosted.org/packages/a1/8d/88f3ebd2bc96bf7747093696f4335a0a8a4c5acfcf1b757717c0d2474ba3/greenlet-3.2.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8854167e06950ca75b898b104b63cc646573aa5fef1353d4508ecdd1ee76254f", size = 1137126, upload-time = "2025-08-07T13:18:20.239Z" }, + { url = "https://files.pythonhosted.org/packages/f1/29/74242b7d72385e29bcc5563fba67dad94943d7cd03552bac320d597f29b2/greenlet-3.2.4-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f47617f698838ba98f4ff4189aef02e7343952df3a615f847bb575c3feb177a7", size = 1544904, upload-time = "2025-11-04T12:42:04.763Z" }, + { url = "https://files.pythonhosted.org/packages/c8/e2/1572b8eeab0f77df5f6729d6ab6b141e4a84ee8eb9bc8c1e7918f94eda6d/greenlet-3.2.4-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:af41be48a4f60429d5cad9d22175217805098a9ef7c40bfef44f7669fb9d74d8", size = 1611228, upload-time = "2025-11-04T12:42:08.423Z" }, { url = "https://files.pythonhosted.org/packages/d6/6f/b60b0291d9623c496638c582297ead61f43c4b72eef5e9c926ef4565ec13/greenlet-3.2.4-cp310-cp310-win_amd64.whl", hash = "sha256:73f49b5368b5359d04e18d15828eecc1806033db5233397748f4ca813ff1056c", size = 298654, upload-time = "2025-08-07T13:50:00.469Z" }, { url = 
"https://files.pythonhosted.org/packages/a4/de/f28ced0a67749cac23fecb02b694f6473f47686dff6afaa211d186e2ef9c/greenlet-3.2.4-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:96378df1de302bc38e99c3a9aa311967b7dc80ced1dcc6f171e99842987882a2", size = 272305, upload-time = "2025-08-07T13:15:41.288Z" }, { url = "https://files.pythonhosted.org/packages/09/16/2c3792cba130000bf2a31c5272999113f4764fd9d874fb257ff588ac779a/greenlet-3.2.4-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1ee8fae0519a337f2329cb78bd7a8e128ec0f881073d43f023c7b8d4831d5246", size = 632472, upload-time = "2025-08-07T13:42:55.044Z" }, @@ -2569,6 +2571,8 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1f/8e/abdd3f14d735b2929290a018ecf133c901be4874b858dd1c604b9319f064/greenlet-3.2.4-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2523e5246274f54fdadbce8494458a2ebdcdbc7b802318466ac5606d3cded1f8", size = 587684, upload-time = "2025-08-07T13:18:25.164Z" }, { url = "https://files.pythonhosted.org/packages/5d/65/deb2a69c3e5996439b0176f6651e0052542bb6c8f8ec2e3fba97c9768805/greenlet-3.2.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:1987de92fec508535687fb807a5cea1560f6196285a4cde35c100b8cd632cc52", size = 1116647, upload-time = "2025-08-07T13:42:38.655Z" }, { url = "https://files.pythonhosted.org/packages/3f/cc/b07000438a29ac5cfb2194bfc128151d52f333cee74dd7dfe3fb733fc16c/greenlet-3.2.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:55e9c5affaa6775e2c6b67659f3a71684de4c549b3dd9afca3bc773533d284fa", size = 1142073, upload-time = "2025-08-07T13:18:21.737Z" }, + { url = "https://files.pythonhosted.org/packages/67/24/28a5b2fa42d12b3d7e5614145f0bd89714c34c08be6aabe39c14dd52db34/greenlet-3.2.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c9c6de1940a7d828635fbd254d69db79e54619f165ee7ce32fda763a9cb6a58c", size = 1548385, upload-time = "2025-11-04T12:42:11.067Z" }, + { url = 
"https://files.pythonhosted.org/packages/6a/05/03f2f0bdd0b0ff9a4f7b99333d57b53a7709c27723ec8123056b084e69cd/greenlet-3.2.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:03c5136e7be905045160b1b9fdca93dd6727b180feeafda6818e6496434ed8c5", size = 1613329, upload-time = "2025-11-04T12:42:12.928Z" }, { url = "https://files.pythonhosted.org/packages/d8/0f/30aef242fcab550b0b3520b8e3561156857c94288f0332a79928c31a52cf/greenlet-3.2.4-cp311-cp311-win_amd64.whl", hash = "sha256:9c40adce87eaa9ddb593ccb0fa6a07caf34015a29bf8d344811665b573138db9", size = 299100, upload-time = "2025-08-07T13:44:12.287Z" }, { url = "https://files.pythonhosted.org/packages/44/69/9b804adb5fd0671f367781560eb5eb586c4d495277c93bde4307b9e28068/greenlet-3.2.4-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:3b67ca49f54cede0186854a008109d6ee71f66bd57bb36abd6d0a0267b540cdd", size = 274079, upload-time = "2025-08-07T13:15:45.033Z" }, { url = "https://files.pythonhosted.org/packages/46/e9/d2a80c99f19a153eff70bc451ab78615583b8dac0754cfb942223d2c1a0d/greenlet-3.2.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ddf9164e7a5b08e9d22511526865780a576f19ddd00d62f8a665949327fde8bb", size = 640997, upload-time = "2025-08-07T13:42:56.234Z" }, @@ -2578,6 +2582,8 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/19/0d/6660d55f7373b2ff8152401a83e02084956da23ae58cddbfb0b330978fe9/greenlet-3.2.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3b3812d8d0c9579967815af437d96623f45c0f2ae5f04e366de62a12d83a8fb0", size = 607586, upload-time = "2025-08-07T13:18:28.544Z" }, { url = "https://files.pythonhosted.org/packages/8e/1a/c953fdedd22d81ee4629afbb38d2f9d71e37d23caace44775a3a969147d4/greenlet-3.2.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:abbf57b5a870d30c4675928c37278493044d7c14378350b3aa5d484fa65575f0", size = 1123281, upload-time = "2025-08-07T13:42:39.858Z" }, { url = 
"https://files.pythonhosted.org/packages/3f/c7/12381b18e21aef2c6bd3a636da1088b888b97b7a0362fac2e4de92405f97/greenlet-3.2.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:20fb936b4652b6e307b8f347665e2c615540d4b42b3b4c8a321d8286da7e520f", size = 1151142, upload-time = "2025-08-07T13:18:22.981Z" }, + { url = "https://files.pythonhosted.org/packages/27/45/80935968b53cfd3f33cf99ea5f08227f2646e044568c9b1555b58ffd61c2/greenlet-3.2.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ee7a6ec486883397d70eec05059353b8e83eca9168b9f3f9a361971e77e0bcd0", size = 1564846, upload-time = "2025-11-04T12:42:15.191Z" }, + { url = "https://files.pythonhosted.org/packages/69/02/b7c30e5e04752cb4db6202a3858b149c0710e5453b71a3b2aec5d78a1aab/greenlet-3.2.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:326d234cbf337c9c3def0676412eb7040a35a768efc92504b947b3e9cfc7543d", size = 1633814, upload-time = "2025-11-04T12:42:17.175Z" }, { url = "https://files.pythonhosted.org/packages/e9/08/b0814846b79399e585f974bbeebf5580fbe59e258ea7be64d9dfb253c84f/greenlet-3.2.4-cp312-cp312-win_amd64.whl", hash = "sha256:a7d4e128405eea3814a12cc2605e0e6aedb4035bf32697f72deca74de4105e02", size = 299899, upload-time = "2025-08-07T13:38:53.448Z" }, { url = "https://files.pythonhosted.org/packages/49/e8/58c7f85958bda41dafea50497cbd59738c5c43dbbea5ee83d651234398f4/greenlet-3.2.4-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:1a921e542453fe531144e91e1feedf12e07351b1cf6c9e8a3325ea600a715a31", size = 272814, upload-time = "2025-08-07T13:15:50.011Z" }, { url = "https://files.pythonhosted.org/packages/62/dd/b9f59862e9e257a16e4e610480cfffd29e3fae018a68c2332090b53aac3d/greenlet-3.2.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd3c8e693bff0fff6ba55f140bf390fa92c994083f838fece0f63be121334945", size = 641073, upload-time = "2025-08-07T13:42:57.23Z" }, @@ -2587,6 +2593,8 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/ee/43/3cecdc0349359e1a527cbf2e3e28e5f8f06d3343aaf82ca13437a9aa290f/greenlet-3.2.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:23768528f2911bcd7e475210822ffb5254ed10d71f4028387e5a99b4c6699671", size = 610497, upload-time = "2025-08-07T13:18:31.636Z" }, { url = "https://files.pythonhosted.org/packages/b8/19/06b6cf5d604e2c382a6f31cafafd6f33d5dea706f4db7bdab184bad2b21d/greenlet-3.2.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:00fadb3fedccc447f517ee0d3fd8fe49eae949e1cd0f6a611818f4f6fb7dc83b", size = 1121662, upload-time = "2025-08-07T13:42:41.117Z" }, { url = "https://files.pythonhosted.org/packages/a2/15/0d5e4e1a66fab130d98168fe984c509249c833c1a3c16806b90f253ce7b9/greenlet-3.2.4-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:d25c5091190f2dc0eaa3f950252122edbbadbb682aa7b1ef2f8af0f8c0afefae", size = 1149210, upload-time = "2025-08-07T13:18:24.072Z" }, + { url = "https://files.pythonhosted.org/packages/1c/53/f9c440463b3057485b8594d7a638bed53ba531165ef0ca0e6c364b5cc807/greenlet-3.2.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6e343822feb58ac4d0a1211bd9399de2b3a04963ddeec21530fc426cc121f19b", size = 1564759, upload-time = "2025-11-04T12:42:19.395Z" }, + { url = "https://files.pythonhosted.org/packages/47/e4/3bb4240abdd0a8d23f4f88adec746a3099f0d86bfedb623f063b2e3b4df0/greenlet-3.2.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ca7f6f1f2649b89ce02f6f229d7c19f680a6238af656f61e0115b24857917929", size = 1634288, upload-time = "2025-11-04T12:42:21.174Z" }, { url = "https://files.pythonhosted.org/packages/0b/55/2321e43595e6801e105fcfdee02b34c0f996eb71e6ddffca6b10b7e1d771/greenlet-3.2.4-cp313-cp313-win_amd64.whl", hash = "sha256:554b03b6e73aaabec3745364d6239e9e012d64c68ccd0b8430c64ccc14939a8b", size = 299685, upload-time = "2025-08-07T13:24:38.824Z" }, ] From c2c64a417c4805c5d5aeb59b4e5f9519b729ee85 Mon Sep 17 00:00:00 2001 From: hajdul88 
<52442977+hajdul88@users.noreply.github.com> Date: Mon, 24 Nov 2025 17:44:51 +0100 Subject: [PATCH 128/129] fix: fixes ontology api endpoint tests + poetry lock(#1824) ## Description This PR fixes the failing CI tests related to the new ontology api endpoint. ## Type of Change - [x] Bug fix (non-breaking change that fixes an issue) - [ ] New feature (non-breaking change that adds functionality) - [ ] Breaking change (fix or feature that would cause existing functionality to change) - [ ] Documentation update - [ ] Code refactoring - [ ] Performance improvement - [ ] Other (please specify): ## Screenshots/Videos (if applicable) ## Pre-submission Checklist - [x] **I have tested my changes thoroughly before submitting this PR** - [x] **This PR contains minimal changes necessary to address the issue/feature** - [x] My code follows the project's coding standards and style guidelines - [x] I have added tests that prove my fix is effective or that my feature works - [x] I have added necessary documentation (if applicable) - [x] All new and existing tests pass - [x] I have searched existing PRs to ensure this change hasn't been submitted already - [x] I have linked any relevant issues in the description - [x] My commits have clear and descriptive messages ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin. 
--- .../tests/unit/api/test_ontology_endpoint.py | 10 +++- poetry.lock | 52 +++++++++++++------ 2 files changed, 45 insertions(+), 17 deletions(-) diff --git a/cognee/tests/unit/api/test_ontology_endpoint.py b/cognee/tests/unit/api/test_ontology_endpoint.py index d53c5ab44..af3a4d90e 100644 --- a/cognee/tests/unit/api/test_ontology_endpoint.py +++ b/cognee/tests/unit/api/test_ontology_endpoint.py @@ -25,7 +25,10 @@ def mock_user(): def mock_default_user(): """Mock default user for testing.""" return SimpleNamespace( - id=uuid.uuid4(), email="default@example.com", is_active=True, tenant_id=uuid.uuid4() + id=str(uuid.uuid4()), + email="default@example.com", + is_active=True, + tenant_id=str(uuid.uuid4()), ) @@ -108,6 +111,7 @@ def test_upload_multiple_ontologies(mock_get_default_user, client, mock_default_ """Test uploading multiple ontology files in single request""" import io + mock_get_default_user.return_value = mock_default_user # Create mock files file1_content = b"" file2_content = b"" @@ -137,6 +141,7 @@ def test_upload_endpoint_accepts_arrays(mock_get_default_user, client, mock_defa import io import json + mock_get_default_user.return_value = mock_default_user file_content = b"" files = [("ontology_file", ("single.owl", io.BytesIO(file_content), "application/xml"))] @@ -173,6 +178,7 @@ def test_complete_multifile_workflow(mock_get_default_user, client, mock_default import io import json + mock_get_default_user.return_value = mock_default_user # Step 1: Upload multiple ontologies file1_content = b""" =1.0.0)"] name = "pluggy" version = "1.6.0" description = "plugin and hook calling mechanisms for python" -optional = true +optional = false python-versions = ">=3.9" -groups = ["main"] -markers = "extra == \"deepeval\" or extra == \"dev\" or extra == \"dlt\" or extra == \"docling\"" +groups = ["main", "dev"] files = [ {file = "pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746"}, {file = 
"pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3"}, ] +markers = {main = "extra == \"deepeval\" or extra == \"dev\" or extra == \"dlt\" or extra == \"docling\""} [package.extras] dev = ["pre-commit", "tox"] @@ -8654,6 +8656,7 @@ files = [ {file = "psycopg2-2.9.10-cp311-cp311-win_amd64.whl", hash = "sha256:0435034157049f6846e95103bd8f5a668788dd913a7c30162ca9503fdf542cb4"}, {file = "psycopg2-2.9.10-cp312-cp312-win32.whl", hash = "sha256:65a63d7ab0e067e2cdb3cf266de39663203d38d6a8ed97f5ca0cb315c73fe067"}, {file = "psycopg2-2.9.10-cp312-cp312-win_amd64.whl", hash = "sha256:4a579d6243da40a7b3182e0430493dbd55950c493d8c68f4eec0b302f6bbf20e"}, + {file = "psycopg2-2.9.10-cp313-cp313-win_amd64.whl", hash = "sha256:91fd603a2155da8d0cfcdbf8ab24a2d54bca72795b90d2a3ed2b6da8d979dee2"}, {file = "psycopg2-2.9.10-cp39-cp39-win32.whl", hash = "sha256:9d5b3b94b79a844a986d029eee38998232451119ad653aea42bb9220a8c5066b"}, {file = "psycopg2-2.9.10-cp39-cp39-win_amd64.whl", hash = "sha256:88138c8dedcbfa96408023ea2b0c369eda40fe5d75002c0964c78f46f11fa442"}, {file = "psycopg2-2.9.10.tar.gz", hash = "sha256:12ec0b40b0273f95296233e8750441339298e6a572f7039da5b260e3c8b60e11"}, @@ -8715,6 +8718,7 @@ files = [ {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:bb89f0a835bcfc1d42ccd5f41f04870c1b936d8507c6df12b7737febc40f0909"}, {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:f0c2d907a1e102526dd2986df638343388b94c33860ff3bbe1384130828714b1"}, {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f8157bed2f51db683f31306aa497311b560f2265998122abe1dce6428bd86567"}, + {file = "psycopg2_binary-2.9.10-cp313-cp313-win_amd64.whl", hash = "sha256:27422aa5f11fbcd9b18da48373eb67081243662f9b46e6fd07c3eb46e4535142"}, {file = "psycopg2_binary-2.9.10-cp38-cp38-macosx_12_0_x86_64.whl", hash = 
"sha256:eb09aa7f9cecb45027683bb55aebaaf45a0df8bf6de68801a6afdc7947bb09d4"}, {file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b73d6d7f0ccdad7bc43e6d34273f70d587ef62f824d7261c4ae9b8b1b6af90e8"}, {file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ce5ab4bf46a211a8e924d307c1b1fcda82368586a19d0a24f8ae166f5c784864"}, @@ -9694,14 +9698,14 @@ files = [ name = "pytest" version = "7.4.4" description = "pytest: simple powerful testing with Python" -optional = true +optional = false python-versions = ">=3.7" -groups = ["main"] -markers = "extra == \"deepeval\" or extra == \"dev\"" +groups = ["main", "dev"] files = [ {file = "pytest-7.4.4-py3-none-any.whl", hash = "sha256:b090cdf5ed60bf4c45261be03239c2c1c22df034fbffe691abe93cd80cea01d8"}, {file = "pytest-7.4.4.tar.gz", hash = "sha256:2cf0005922c6ace4a3e2ec8b4080eb0d9753fdc93107415332f50ce9e7994280"}, ] +markers = {main = "extra == \"deepeval\" or extra == \"dev\""} [package.dependencies] colorama = {version = "*", markers = "sys_platform == \"win32\""} @@ -9788,6 +9792,21 @@ files = [ packaging = ">=17.1" pytest = ">=6.2" +[[package]] +name = "pytest-timeout" +version = "2.4.0" +description = "pytest plugin to abort hanging tests" +optional = false +python-versions = ">=3.7" +groups = ["dev"] +files = [ + {file = "pytest_timeout-2.4.0-py3-none-any.whl", hash = "sha256:c42667e5cdadb151aeb5b26d114aff6bdf5a907f176a007a30b940d3d865b5c2"}, + {file = "pytest_timeout-2.4.0.tar.gz", hash = "sha256:7e68e90b01f9eff71332b25001f85c75495fc4e3a836701876183c4bcfd0540a"}, +] + +[package.dependencies] +pytest = ">=7.0.0" + [[package]] name = "pytest-xdist" version = "3.8.0" @@ -12056,7 +12075,7 @@ version = "2.2.1" description = "A lil' TOML parser" optional = false python-versions = ">=3.8" -groups = ["main"] +groups = ["main", "dev"] markers = "python_version == \"3.10\"" files = [ {file = 
"tomli-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:678e4fa69e4575eb77d103de3df8a895e1591b48e740211bd1067378c69e8249"}, @@ -12528,11 +12547,12 @@ version = "4.15.0" description = "Backported and Experimental Type Hints for Python 3.9+" optional = false python-versions = ">=3.9" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548"}, {file = "typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466"}, ] +markers = {dev = "python_version == \"3.10\""} [[package]] name = "typing-inspect" From 508165e883aa9dc3232d8021daa94765439aded1 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Wed, 26 Nov 2025 15:18:53 +0100 Subject: [PATCH 129/129] feature: Introduces wide subgraph search in graph completion and improves QA speed (#1736) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR introduces wide vector and graph structure filtering capabilities. With these changes, the graph completion retriever and all retrievers that inherit from it will now filter relevant vector elements and subgraphs based on the query. This improvement significantly increases search speed for large graphs while maintaining—and in some cases slightly improving—accuracy. Changes in This PR: -Introduced new wide_search_top_k parameter: Controls the initial search space size -Added graph adapter level filtering method: Enables relevant subgraph filtering while maintaining backward compatibility. For community or custom graph adapters that don't implement this method, the system gracefully falls back to the original search behavior. -Updated modal dashboard and evaluation framework: Fixed compatibility issues. 
Added comprehensive unit tests: Introduced unit tests for brute_force_triplet_search (previously untested) and expanded the CogneeGraph test suite. Integration tests: Existing integration tests verify end-to-end search functionality (no changes required). Acceptance Criteria and Testing To verify the new search behavior, run search queries with different wide_search_top_k parameters while logging is enabled: None: Triggers a full graph search (default behavior) 1: Projects a minimal subgraph (demonstrates maximum filtering) Custom values: Test intermediate levels of filtering Internal Testing and results: Performance and accuracy benchmarks are available upon request. The implementation demonstrates measurable improvements in query latency for large graphs without sacrificing result quality. ## Type of Change - [ ] Bug fix (non-breaking change that fixes an issue) - [ ] New feature (non-breaking change that adds functionality) - [ ] Breaking change (fix or feature that would cause existing functionality to change) - [ ] Documentation update - [x] Code refactoring - [x] Performance improvement - [ ] Other (please specify): ## Screenshots/Videos (if applicable) None ## Pre-submission Checklist - [x] **I have tested my changes thoroughly before submitting this PR** - [x] **This PR contains minimal changes necessary to address the issue/feature** - [x] My code follows the project's coding standards and style guidelines - [x] I have added tests that prove my fix is effective or that my feature works - [x] I have added necessary documentation (if applicable) - [x] All new and existing tests pass - [x] I have searched existing PRs to ensure this change hasn't been submitted already - [x] I have linked any relevant issues in the description - [x] My commits have clear and descriptive messages ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin. 
--------- Co-authored-by: Pavel Zorin --- cognee/api/v1/search/search.py | 4 + cognee/eval_framework/Dockerfile | 29 + .../answer_generation_executor.py | 10 + .../run_question_answering_module.py | 2 +- cognee/eval_framework/eval_config.py | 4 +- cognee/eval_framework/modal_run_eval.py | 46 +- .../databases/graph/graph_db_interface.py | 15 + .../databases/graph/kuzu/adapter.py | 97 +++ .../databases/graph/neo4j_driver/adapter.py | 57 ++ .../modules/graph/cognee_graph/CogneeGraph.py | 110 +++- .../graph/cognee_graph/CogneeGraphElements.py | 11 +- ..._completion_context_extension_retriever.py | 4 + .../graph_completion_cot_retriever.py | 4 + .../retrieval/graph_completion_retriever.py | 10 + .../graph_summary_completion_retriever.py | 4 + .../modules/retrieval/temporal_retriever.py | 4 + .../utils/brute_force_triplet_search.py | 45 +- .../search/methods/get_search_type_tools.py | 30 +- .../methods/no_access_control_search.py | 4 + cognee/modules/search/methods/search.py | 21 + .../graph/cognee_graph_elements_test.py | 4 +- .../unit/modules/graph/cognee_graph_test.py | 455 ++++++++++++++ .../test_brute_force_triplet_search.py | 582 ++++++++++++++++++ 23 files changed, 1482 insertions(+), 70 deletions(-) create mode 100644 cognee/eval_framework/Dockerfile create mode 100644 cognee/tests/unit/modules/retrieval/test_brute_force_triplet_search.py diff --git a/cognee/api/v1/search/search.py b/cognee/api/v1/search/search.py index d4e5fbbe6..354331c57 100644 --- a/cognee/api/v1/search/search.py +++ b/cognee/api/v1/search/search.py @@ -31,6 +31,8 @@ async def search( only_context: bool = False, use_combined_context: bool = False, session_id: Optional[str] = None, + wide_search_top_k: Optional[int] = 100, + triplet_distance_penalty: Optional[float] = 3.5, ) -> Union[List[SearchResult], CombinedSearchResult]: """ Search and query the knowledge graph for insights, information, and connections. 
@@ -200,6 +202,8 @@ async def search( only_context=only_context, use_combined_context=use_combined_context, session_id=session_id, + wide_search_top_k=wide_search_top_k, + triplet_distance_penalty=triplet_distance_penalty, ) return filtered_search_results diff --git a/cognee/eval_framework/Dockerfile b/cognee/eval_framework/Dockerfile new file mode 100644 index 000000000..e83be3da4 --- /dev/null +++ b/cognee/eval_framework/Dockerfile @@ -0,0 +1,29 @@ +FROM python:3.11-slim + +# Set environment variables +ENV PIP_NO_CACHE_DIR=true +ENV PATH="${PATH}:/root/.poetry/bin" +ENV PYTHONPATH=/app +ENV SKIP_MIGRATIONS=true + +# System dependencies +RUN apt-get update && apt-get install -y \ + gcc \ + libpq-dev \ + git \ + curl \ + build-essential \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +COPY pyproject.toml poetry.lock README.md /app/ + +RUN pip install poetry + +RUN poetry config virtualenvs.create false + +RUN poetry install --extras distributed --extras evals --extras deepeval --no-root + +COPY cognee/ /app/cognee +COPY distributed/ /app/distributed diff --git a/cognee/eval_framework/answer_generation/answer_generation_executor.py b/cognee/eval_framework/answer_generation/answer_generation_executor.py index 6f166657e..29b3ede68 100644 --- a/cognee/eval_framework/answer_generation/answer_generation_executor.py +++ b/cognee/eval_framework/answer_generation/answer_generation_executor.py @@ -35,6 +35,16 @@ class AnswerGeneratorExecutor: retrieval_context = await retriever.get_context(query_text) search_results = await retriever.get_completion(query_text, retrieval_context) + ############ + #:TODO This is a quick fix until we don't structure retriever results properly but lets not leave it like this...this is needed now due to the changed combined retriever structure.. 
+ if isinstance(retrieval_context, list): + retrieval_context = await retriever.convert_retrieved_objects_to_context( + triplets=retrieval_context + ) + + if isinstance(search_results, str): + search_results = [search_results] + ############# answer = { "question": query_text, "answer": search_results[0], diff --git a/cognee/eval_framework/answer_generation/run_question_answering_module.py b/cognee/eval_framework/answer_generation/run_question_answering_module.py index d0a2ebe1e..6b55d84b2 100644 --- a/cognee/eval_framework/answer_generation/run_question_answering_module.py +++ b/cognee/eval_framework/answer_generation/run_question_answering_module.py @@ -35,7 +35,7 @@ async def create_and_insert_answers_table(questions_payload): async def run_question_answering( - params: dict, system_prompt="answer_simple_question.txt", top_k: Optional[int] = None + params: dict, system_prompt="answer_simple_question_benchmark.txt", top_k: Optional[int] = None ) -> List[dict]: if params.get("answering_questions"): logger.info("Question answering started...") diff --git a/cognee/eval_framework/eval_config.py b/cognee/eval_framework/eval_config.py index 6edcc0454..9e6f26688 100644 --- a/cognee/eval_framework/eval_config.py +++ b/cognee/eval_framework/eval_config.py @@ -14,7 +14,7 @@ class EvalConfig(BaseSettings): # Question answering params answering_questions: bool = True - qa_engine: str = "cognee_completion" # Options: 'cognee_completion' or 'cognee_graph_completion' or 'cognee_graph_completion_cot' or 'cognee_graph_completion_context_extension' + qa_engine: str = "cognee_graph_completion" # Options: 'cognee_completion' or 'cognee_graph_completion' or 'cognee_graph_completion_cot' or 'cognee_graph_completion_context_extension' # Evaluation params evaluating_answers: bool = True @@ -25,7 +25,7 @@ class EvalConfig(BaseSettings): "EM", "f1", ] # Use only 'correctness' for DirectLLM - deepeval_model: str = "gpt-5-mini" + deepeval_model: str = "gpt-4o-mini" # Metrics params 
calculate_metrics: bool = True diff --git a/cognee/eval_framework/modal_run_eval.py b/cognee/eval_framework/modal_run_eval.py index aca2686a5..bc2ff77c5 100644 --- a/cognee/eval_framework/modal_run_eval.py +++ b/cognee/eval_framework/modal_run_eval.py @@ -2,7 +2,6 @@ import modal import os import asyncio import datetime -import hashlib import json from cognee.shared.logging_utils import get_logger from cognee.eval_framework.eval_config import EvalConfig @@ -10,6 +9,9 @@ from cognee.eval_framework.corpus_builder.run_corpus_builder import run_corpus_b from cognee.eval_framework.answer_generation.run_question_answering_module import ( run_question_answering, ) +import pathlib +from os import path +from modal import Image from cognee.eval_framework.evaluation.run_evaluation_module import run_evaluation from cognee.eval_framework.metrics_dashboard import create_dashboard @@ -38,22 +40,19 @@ def read_and_combine_metrics(eval_params: dict) -> dict: app = modal.App("modal-run-eval") -image = ( - modal.Image.from_dockerfile(path="Dockerfile_modal", force_build=False) - .copy_local_file("pyproject.toml", "pyproject.toml") - .copy_local_file("poetry.lock", "poetry.lock") - .env( - { - "ENV": os.getenv("ENV"), - "LLM_API_KEY": os.getenv("LLM_API_KEY"), - "OPENAI_API_KEY": os.getenv("OPENAI_API_KEY"), - } - ) - .pip_install("protobuf", "h2", "deepeval", "gdown", "plotly") +image = Image.from_dockerfile( + path=pathlib.Path(path.join(path.dirname(__file__), "Dockerfile")).resolve(), + force_build=False, +).add_local_python_source("cognee") + + +@app.function( + image=image, + max_containers=10, + timeout=86400, + volumes={"/data": vol}, + secrets=[modal.Secret.from_name("eval_secrets")], ) - - -@app.function(image=image, concurrency_limit=10, timeout=86400, volumes={"/data": vol}) async def modal_run_eval(eval_params=None): """Runs evaluation pipeline and returns combined metrics results.""" if eval_params is None: @@ -105,18 +104,7 @@ async def main(): configs = [ EvalConfig( 
task_getter_type="Default", - number_of_samples_in_corpus=10, - benchmark="HotPotQA", - qa_engine="cognee_graph_completion", - building_corpus_from_scratch=True, - answering_questions=True, - evaluating_answers=True, - calculate_metrics=True, - dashboard=True, - ), - EvalConfig( - task_getter_type="Default", - number_of_samples_in_corpus=10, + number_of_samples_in_corpus=25, benchmark="TwoWikiMultiHop", qa_engine="cognee_graph_completion", building_corpus_from_scratch=True, @@ -127,7 +115,7 @@ async def main(): ), EvalConfig( task_getter_type="Default", - number_of_samples_in_corpus=10, + number_of_samples_in_corpus=25, benchmark="Musique", qa_engine="cognee_graph_completion", building_corpus_from_scratch=True, diff --git a/cognee/infrastructure/databases/graph/graph_db_interface.py b/cognee/infrastructure/databases/graph/graph_db_interface.py index 67df1a27c..8f8c96e79 100644 --- a/cognee/infrastructure/databases/graph/graph_db_interface.py +++ b/cognee/infrastructure/databases/graph/graph_db_interface.py @@ -398,3 +398,18 @@ class GraphDBInterface(ABC): - node_id (Union[str, UUID]): Unique identifier of the node for which to retrieve connections. """ raise NotImplementedError + + @abstractmethod + async def get_filtered_graph_data( + self, attribute_filters: List[Dict[str, List[Union[str, int]]]] + ) -> Tuple[List[Node], List[EdgeData]]: + """ + Retrieve nodes and edges filtered by the provided attribute criteria. + + Parameters: + ----------- + + - attribute_filters: A list of dictionaries where keys are attribute names and values + are lists of attribute values to filter by. 
+ """ + raise NotImplementedError diff --git a/cognee/infrastructure/databases/graph/kuzu/adapter.py b/cognee/infrastructure/databases/graph/kuzu/adapter.py index 8dd160665..9dbc9c1bc 100644 --- a/cognee/infrastructure/databases/graph/kuzu/adapter.py +++ b/cognee/infrastructure/databases/graph/kuzu/adapter.py @@ -12,6 +12,7 @@ from contextlib import asynccontextmanager from concurrent.futures import ThreadPoolExecutor from typing import Dict, Any, List, Union, Optional, Tuple, Type +from cognee.exceptions import CogneeValidationError from cognee.shared.logging_utils import get_logger from cognee.infrastructure.utils.run_sync import run_sync from cognee.infrastructure.files.storage import get_file_storage @@ -1186,6 +1187,11 @@ class KuzuAdapter(GraphDBInterface): A tuple with two elements: a list of tuples of (node_id, properties) and a list of tuples of (source_id, target_id, relationship_name, properties). """ + + import time + + start_time = time.time() + try: nodes_query = """ MATCH (n:Node) @@ -1249,6 +1255,11 @@ class KuzuAdapter(GraphDBInterface): }, ) ) + + retrieval_time = time.time() - start_time + logger.info( + f"Retrieved {len(nodes)} nodes and {len(edges)} edges in {retrieval_time:.2f} seconds" + ) return formatted_nodes, formatted_edges except Exception as e: logger.error(f"Failed to get graph data: {e}") @@ -1417,6 +1428,92 @@ class KuzuAdapter(GraphDBInterface): formatted_edges.append((source_id, target_id, rel_type, props)) return formatted_nodes, formatted_edges + async def get_id_filtered_graph_data(self, target_ids: list[str]): + """ + Retrieve graph data filtered by specific node IDs, including their direct neighbors + and only edges where one endpoint matches those IDs. 
+ + Returns: + nodes: List[dict] -> Each dict includes "id" and all node properties + edges: List[dict] -> Each dict includes "source", "target", "type", "properties" + """ + import time + + start_time = time.time() + + try: + if not target_ids: + logger.warning("No target IDs provided for ID-filtered graph retrieval.") + return [], [] + + if not all(isinstance(x, str) for x in target_ids): + raise CogneeValidationError("target_ids must be a list of strings") + + query = """ + MATCH (n:Node)-[r]->(m:Node) + WHERE n.id IN $target_ids OR m.id IN $target_ids + RETURN n.id, { + name: n.name, + type: n.type, + properties: n.properties + }, m.id, { + name: m.name, + type: m.type, + properties: m.properties + }, r.relationship_name, r.properties + """ + + result = await self.query(query, {"target_ids": target_ids}) + + if not result: + logger.info("No data returned for the supplied IDs") + return [], [] + + nodes_dict = {} + edges = [] + + for n_id, n_props, m_id, m_props, r_type, r_props_raw in result: + if n_props.get("properties"): + try: + additional_props = json.loads(n_props["properties"]) + n_props.update(additional_props) + del n_props["properties"] + except json.JSONDecodeError: + logger.warning(f"Failed to parse properties JSON for node {n_id}") + + if m_props.get("properties"): + try: + additional_props = json.loads(m_props["properties"]) + m_props.update(additional_props) + del m_props["properties"] + except json.JSONDecodeError: + logger.warning(f"Failed to parse properties JSON for node {m_id}") + + nodes_dict[n_id] = (n_id, n_props) + nodes_dict[m_id] = (m_id, m_props) + + edge_props = {} + if r_props_raw: + try: + edge_props = json.loads(r_props_raw) + except (json.JSONDecodeError, TypeError): + logger.warning(f"Failed to parse edge properties for {n_id}->{m_id}") + + source_id = edge_props.get("source_node_id", n_id) + target_id = edge_props.get("target_node_id", m_id) + edges.append((source_id, target_id, r_type, edge_props)) + + retrieval_time = 
time.time() - start_time + logger.info( + f"ID-filtered retrieval: {len(nodes_dict)} nodes and {len(edges)} edges in {retrieval_time:.2f}s" + ) + + return list(nodes_dict.values()), edges + + except Exception as e: + logger.error(f"Error during ID-filtered graph data retrieval: {str(e)}") + raise + async def get_graph_metrics(self, include_optional=False) -> Dict[str, Any]: """ Get metrics on graph structure and connectivity. diff --git a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py index 6216e107e..f3bb8e173 100644 --- a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +++ b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py @@ -964,6 +964,63 @@ class Neo4jAdapter(GraphDBInterface): logger.error(f"Error during graph data retrieval: {str(e)}") raise + async def get_id_filtered_graph_data(self, target_ids: list[str]): + """ + Retrieve graph data filtered by specific node IDs, including their direct neighbors + and only edges where one endpoint matches those IDs. + + This version uses a single Cypher query for efficiency. 
+ """ + import time + + start_time = time.time() + + try: + if not target_ids: + logger.warning("No target IDs provided for ID-filtered graph retrieval.") + return [], [] + + query = """ + MATCH ()-[r]-() + WHERE startNode(r).id IN $target_ids + OR endNode(r).id IN $target_ids + WITH DISTINCT r, startNode(r) AS a, endNode(r) AS b + RETURN + properties(a) AS n_properties, + properties(b) AS m_properties, + type(r) AS type, + properties(r) AS properties + """ + + result = await self.query(query, {"target_ids": target_ids}) + + nodes_dict = {} + edges = [] + + for record in result: + n_props = record["n_properties"] + m_props = record["m_properties"] + r_props = record["properties"] + r_type = record["type"] + + nodes_dict[n_props["id"]] = (n_props["id"], n_props) + nodes_dict[m_props["id"]] = (m_props["id"], m_props) + + source_id = r_props.get("source_node_id", n_props["id"]) + target_id = r_props.get("target_node_id", m_props["id"]) + edges.append((source_id, target_id, r_type, r_props)) + + retrieval_time = time.time() - start_time + logger.info( + f"ID-filtered retrieval: {len(nodes_dict)} nodes and {len(edges)} edges in {retrieval_time:.2f}s" + ) + + return list(nodes_dict.values()), edges + + except Exception as e: + logger.error(f"Error during ID-filtered graph data retrieval: {str(e)}") + raise + async def get_nodeset_subgraph( self, node_type: Type[Any], node_name: List[str] ) -> Tuple[List[Tuple[int, dict]], List[Tuple[int, int, str, dict]]]: diff --git a/cognee/modules/graph/cognee_graph/CogneeGraph.py b/cognee/modules/graph/cognee_graph/CogneeGraph.py index cb7562422..2e0b82e8d 100644 --- a/cognee/modules/graph/cognee_graph/CogneeGraph.py +++ b/cognee/modules/graph/cognee_graph/CogneeGraph.py @@ -56,6 +56,68 @@ class CogneeGraph(CogneeAbstractGraph): def get_edges(self) -> List[Edge]: return self.edges + async def _get_nodeset_subgraph( + self, + adapter, + node_type, + node_name, + ): + """Retrieve subgraph based on node type and name.""" + 
logger.info("Retrieving graph filtered by node type and node name (NodeSet).") + nodes_data, edges_data = await adapter.get_nodeset_subgraph( + node_type=node_type, node_name=node_name + ) + if not nodes_data or not edges_data: + raise EntityNotFoundError( + message="Nodeset does not exist, or empty nodeset projected from the database." + ) + return nodes_data, edges_data + + async def _get_full_or_id_filtered_graph( + self, + adapter, + relevant_ids_to_filter, + ): + """Retrieve full or ID-filtered graph with fallback.""" + if relevant_ids_to_filter is None: + logger.info("Retrieving full graph.") + nodes_data, edges_data = await adapter.get_graph_data() + if not nodes_data or not edges_data: + raise EntityNotFoundError(message="Empty graph projected from the database.") + return nodes_data, edges_data + + get_graph_data_fn = getattr(adapter, "get_id_filtered_graph_data", adapter.get_graph_data) + if getattr(adapter.__class__, "get_id_filtered_graph_data", None): + logger.info("Retrieving ID-filtered graph from database.") + nodes_data, edges_data = await get_graph_data_fn(target_ids=relevant_ids_to_filter) + else: + logger.info("Retrieving full graph from database.") + nodes_data, edges_data = await get_graph_data_fn() + if hasattr(adapter, "get_id_filtered_graph_data") and (not nodes_data or not edges_data): + logger.warning( + "Id filtered graph returned empty, falling back to full graph retrieval." 
+ ) + logger.info("Retrieving full graph") + nodes_data, edges_data = await adapter.get_graph_data() + + if not nodes_data or not edges_data: + raise EntityNotFoundError("Empty graph projected from the database.") + return nodes_data, edges_data + + async def _get_filtered_graph( + self, + adapter, + memory_fragment_filter, + ): + """Retrieve graph filtered by attributes.""" + logger.info("Retrieving graph filtered by memory fragment") + nodes_data, edges_data = await adapter.get_filtered_graph_data( + attribute_filters=memory_fragment_filter + ) + if not nodes_data or not edges_data: + raise EntityNotFoundError(message="Empty filtered graph projected from the database.") + return nodes_data, edges_data + async def project_graph_from_db( self, adapter: Union[GraphDBInterface], @@ -67,40 +129,39 @@ class CogneeGraph(CogneeAbstractGraph): memory_fragment_filter=[], node_type: Optional[Type] = None, node_name: Optional[List[str]] = None, + relevant_ids_to_filter: Optional[List[str]] = None, + triplet_distance_penalty: float = 3.5, ) -> None: if node_dimension < 1 or edge_dimension < 1: raise InvalidDimensionsError() try: + if node_type is not None and node_name not in [None, [], ""]: + nodes_data, edges_data = await self._get_nodeset_subgraph( + adapter, node_type, node_name + ) + elif len(memory_fragment_filter) == 0: + nodes_data, edges_data = await self._get_full_or_id_filtered_graph( + adapter, relevant_ids_to_filter + ) + else: + nodes_data, edges_data = await self._get_filtered_graph( + adapter, memory_fragment_filter + ) + import time start_time = time.time() - - # Determine projection strategy - if node_type is not None and node_name not in [None, [], ""]: - nodes_data, edges_data = await adapter.get_nodeset_subgraph( - node_type=node_type, node_name=node_name - ) - if not nodes_data or not edges_data: - raise EntityNotFoundError( - message="Nodeset does not exist, or empty nodetes projected from the database." 
- ) - elif len(memory_fragment_filter) == 0: - nodes_data, edges_data = await adapter.get_graph_data() - if not nodes_data or not edges_data: - raise EntityNotFoundError(message="Empty graph projected from the database.") - else: - nodes_data, edges_data = await adapter.get_filtered_graph_data( - attribute_filters=memory_fragment_filter - ) - if not nodes_data or not edges_data: - raise EntityNotFoundError( - message="Empty filtered graph projected from the database." - ) - # Process nodes for node_id, properties in nodes_data: node_attributes = {key: properties.get(key) for key in node_properties_to_project} - self.add_node(Node(str(node_id), node_attributes, dimension=node_dimension)) + self.add_node( + Node( + str(node_id), + node_attributes, + dimension=node_dimension, + node_penalty=triplet_distance_penalty, + ) + ) # Process edges for source_id, target_id, relationship_type, properties in edges_data: @@ -118,6 +179,7 @@ class CogneeGraph(CogneeAbstractGraph): attributes=edge_attributes, directed=directed, dimension=edge_dimension, + edge_penalty=triplet_distance_penalty, ) self.add_edge(edge) diff --git a/cognee/modules/graph/cognee_graph/CogneeGraphElements.py b/cognee/modules/graph/cognee_graph/CogneeGraphElements.py index 0ca9c4fb9..62ef8d9fd 100644 --- a/cognee/modules/graph/cognee_graph/CogneeGraphElements.py +++ b/cognee/modules/graph/cognee_graph/CogneeGraphElements.py @@ -20,13 +20,17 @@ class Node: status: np.ndarray def __init__( - self, node_id: str, attributes: Optional[Dict[str, Any]] = None, dimension: int = 1 + self, + node_id: str, + attributes: Optional[Dict[str, Any]] = None, + dimension: int = 1, + node_penalty: float = 3.5, ): if dimension <= 0: raise InvalidDimensionsError() self.id = node_id self.attributes = attributes if attributes is not None else {} - self.attributes["vector_distance"] = float("inf") + self.attributes["vector_distance"] = node_penalty self.skeleton_neighbours = [] self.skeleton_edges = [] self.status = 
np.ones(dimension, dtype=int) @@ -105,13 +109,14 @@ class Edge: attributes: Optional[Dict[str, Any]] = None, directed: bool = True, dimension: int = 1, + edge_penalty: float = 3.5, ): if dimension <= 0: raise InvalidDimensionsError() self.node1 = node1 self.node2 = node2 self.attributes = attributes if attributes is not None else {} - self.attributes["vector_distance"] = float("inf") + self.attributes["vector_distance"] = edge_penalty self.directed = directed self.status = np.ones(dimension, dtype=int) diff --git a/cognee/modules/retrieval/graph_completion_context_extension_retriever.py b/cognee/modules/retrieval/graph_completion_context_extension_retriever.py index b07d11fd2..fc49a139b 100644 --- a/cognee/modules/retrieval/graph_completion_context_extension_retriever.py +++ b/cognee/modules/retrieval/graph_completion_context_extension_retriever.py @@ -39,6 +39,8 @@ class GraphCompletionContextExtensionRetriever(GraphCompletionRetriever): node_type: Optional[Type] = None, node_name: Optional[List[str]] = None, save_interaction: bool = False, + wide_search_top_k: Optional[int] = 100, + triplet_distance_penalty: Optional[float] = 3.5, ): super().__init__( user_prompt_path=user_prompt_path, @@ -48,6 +50,8 @@ class GraphCompletionContextExtensionRetriever(GraphCompletionRetriever): node_name=node_name, save_interaction=save_interaction, system_prompt=system_prompt, + wide_search_top_k=wide_search_top_k, + triplet_distance_penalty=triplet_distance_penalty, ) async def get_completion( diff --git a/cognee/modules/retrieval/graph_completion_cot_retriever.py b/cognee/modules/retrieval/graph_completion_cot_retriever.py index eb8f502cb..70fcb6cdb 100644 --- a/cognee/modules/retrieval/graph_completion_cot_retriever.py +++ b/cognee/modules/retrieval/graph_completion_cot_retriever.py @@ -65,6 +65,8 @@ class GraphCompletionCotRetriever(GraphCompletionRetriever): node_type: Optional[Type] = None, node_name: Optional[List[str]] = None, save_interaction: bool = False, + 
wide_search_top_k: Optional[int] = 100, + triplet_distance_penalty: Optional[float] = 3.5, ): super().__init__( user_prompt_path=user_prompt_path, @@ -74,6 +76,8 @@ class GraphCompletionCotRetriever(GraphCompletionRetriever): node_type=node_type, node_name=node_name, save_interaction=save_interaction, + wide_search_top_k=wide_search_top_k, + triplet_distance_penalty=triplet_distance_penalty, ) self.validation_system_prompt_path = validation_system_prompt_path self.validation_user_prompt_path = validation_user_prompt_path diff --git a/cognee/modules/retrieval/graph_completion_retriever.py b/cognee/modules/retrieval/graph_completion_retriever.py index df77a11ac..89e9e47ce 100644 --- a/cognee/modules/retrieval/graph_completion_retriever.py +++ b/cognee/modules/retrieval/graph_completion_retriever.py @@ -47,6 +47,8 @@ class GraphCompletionRetriever(BaseGraphRetriever): node_type: Optional[Type] = None, node_name: Optional[List[str]] = None, save_interaction: bool = False, + wide_search_top_k: Optional[int] = 100, + triplet_distance_penalty: Optional[float] = 3.5, ): """Initialize retriever with prompt paths and search parameters.""" self.save_interaction = save_interaction @@ -54,8 +56,10 @@ class GraphCompletionRetriever(BaseGraphRetriever): self.system_prompt_path = system_prompt_path self.system_prompt = system_prompt self.top_k = top_k if top_k is not None else 5 + self.wide_search_top_k = wide_search_top_k self.node_type = node_type self.node_name = node_name + self.triplet_distance_penalty = triplet_distance_penalty async def resolve_edges_to_text(self, retrieved_edges: list) -> str: """ @@ -105,6 +109,8 @@ class GraphCompletionRetriever(BaseGraphRetriever): collections=vector_index_collections or None, node_type=self.node_type, node_name=self.node_name, + wide_search_top_k=self.wide_search_top_k, + triplet_distance_penalty=self.triplet_distance_penalty, ) return found_triplets @@ -141,6 +147,10 @@ class GraphCompletionRetriever(BaseGraphRetriever): return 
triplets + async def convert_retrieved_objects_to_context(self, triplets: List[Edge]): + context = await self.resolve_edges_to_text(triplets) + return context + async def get_completion( self, query: str, diff --git a/cognee/modules/retrieval/graph_summary_completion_retriever.py b/cognee/modules/retrieval/graph_summary_completion_retriever.py index 051f39b22..e31ad126e 100644 --- a/cognee/modules/retrieval/graph_summary_completion_retriever.py +++ b/cognee/modules/retrieval/graph_summary_completion_retriever.py @@ -26,6 +26,8 @@ class GraphSummaryCompletionRetriever(GraphCompletionRetriever): node_type: Optional[Type] = None, node_name: Optional[List[str]] = None, save_interaction: bool = False, + wide_search_top_k: Optional[int] = 100, + triplet_distance_penalty: Optional[float] = 3.5, ): """Initialize retriever with default prompt paths and search parameters.""" super().__init__( @@ -36,6 +38,8 @@ class GraphSummaryCompletionRetriever(GraphCompletionRetriever): node_name=node_name, save_interaction=save_interaction, system_prompt=system_prompt, + wide_search_top_k=wide_search_top_k, + triplet_distance_penalty=triplet_distance_penalty, ) self.summarize_prompt_path = summarize_prompt_path diff --git a/cognee/modules/retrieval/temporal_retriever.py b/cognee/modules/retrieval/temporal_retriever.py index f3da02c15..87d2ab009 100644 --- a/cognee/modules/retrieval/temporal_retriever.py +++ b/cognee/modules/retrieval/temporal_retriever.py @@ -47,6 +47,8 @@ class TemporalRetriever(GraphCompletionRetriever): top_k: Optional[int] = 5, node_type: Optional[Type] = None, node_name: Optional[List[str]] = None, + wide_search_top_k: Optional[int] = 100, + triplet_distance_penalty: Optional[float] = 3.5, ): super().__init__( user_prompt_path=user_prompt_path, @@ -54,6 +56,8 @@ class TemporalRetriever(GraphCompletionRetriever): top_k=top_k, node_type=node_type, node_name=node_name, + wide_search_top_k=wide_search_top_k, + triplet_distance_penalty=triplet_distance_penalty, ) 
self.user_prompt_path = user_prompt_path self.system_prompt_path = system_prompt_path diff --git a/cognee/modules/retrieval/utils/brute_force_triplet_search.py b/cognee/modules/retrieval/utils/brute_force_triplet_search.py index f8bdbb97d..2f8a545f7 100644 --- a/cognee/modules/retrieval/utils/brute_force_triplet_search.py +++ b/cognee/modules/retrieval/utils/brute_force_triplet_search.py @@ -58,6 +58,8 @@ async def get_memory_fragment( properties_to_project: Optional[List[str]] = None, node_type: Optional[Type] = None, node_name: Optional[List[str]] = None, + relevant_ids_to_filter: Optional[List[str]] = None, + triplet_distance_penalty: Optional[float] = 3.5, ) -> CogneeGraph: """Creates and initializes a CogneeGraph memory fragment with optional property projections.""" if properties_to_project is None: @@ -74,6 +76,8 @@ async def get_memory_fragment( edge_properties_to_project=["relationship_name", "edge_text"], node_type=node_type, node_name=node_name, + relevant_ids_to_filter=relevant_ids_to_filter, + triplet_distance_penalty=triplet_distance_penalty, ) except EntityNotFoundError: @@ -95,6 +99,8 @@ async def brute_force_triplet_search( memory_fragment: Optional[CogneeGraph] = None, node_type: Optional[Type] = None, node_name: Optional[List[str]] = None, + wide_search_top_k: Optional[int] = 100, + triplet_distance_penalty: Optional[float] = 3.5, ) -> List[Edge]: """ Performs a brute force search to retrieve the top triplets from the graph. @@ -107,6 +113,8 @@ async def brute_force_triplet_search( memory_fragment (Optional[CogneeGraph]): Existing memory fragment to reuse. node_type: node type to filter node_name: node name to filter + wide_search_top_k (Optional[int]): Number of initial elements to retrieve from collections + triplet_distance_penalty (Optional[float]): Default distance penalty in graph projection Returns: list: The top triplet results. 
@@ -116,10 +124,10 @@ async def brute_force_triplet_search( if top_k <= 0: raise ValueError("top_k must be a positive integer.") - if memory_fragment is None: - memory_fragment = await get_memory_fragment( - properties_to_project, node_type=node_type, node_name=node_name - ) + # Setting wide search limit based on the parameters + non_global_search = node_name is None + + wide_search_limit = wide_search_top_k if non_global_search else None if collections is None: collections = [ @@ -140,7 +148,7 @@ async def brute_force_triplet_search( async def search_in_collection(collection_name: str): try: return await vector_engine.search( - collection_name=collection_name, query_vector=query_vector, limit=None + collection_name=collection_name, query_vector=query_vector, limit=wide_search_limit ) except CollectionNotFoundError: return [] @@ -156,15 +164,38 @@ async def brute_force_triplet_search( return [] # Final statistics - projection_time = time.time() - start_time + vector_collection_search_time = time.time() - start_time logger.info( - f"Vector collection retrieval completed: Retrieved distances from {sum(1 for res in results if res)} collections in {projection_time:.2f}s" + f"Vector collection retrieval completed: Retrieved distances from {sum(1 for res in results if res)} collections in {vector_collection_search_time:.2f}s" ) node_distances = {collection: result for collection, result in zip(collections, results)} edge_distances = node_distances.get("EdgeType_relationship_name", None) + if wide_search_limit is not None: + relevant_ids_to_filter = list( + { + str(getattr(scored_node, "id")) + for collection_name, score_collection in node_distances.items() + if collection_name != "EdgeType_relationship_name" + and isinstance(score_collection, (list, tuple)) + for scored_node in score_collection + if getattr(scored_node, "id", None) + } + ) + else: + relevant_ids_to_filter = None + + if memory_fragment is None: + memory_fragment = await get_memory_fragment( + 
properties_to_project=properties_to_project, + node_type=node_type, + node_name=node_name, + relevant_ids_to_filter=relevant_ids_to_filter, + triplet_distance_penalty=triplet_distance_penalty, + ) + await memory_fragment.map_vector_distances_to_graph_nodes(node_distances=node_distances) await memory_fragment.map_vector_distances_to_graph_edges( vector_engine=vector_engine, query_vector=query_vector, edge_distances=edge_distances diff --git a/cognee/modules/search/methods/get_search_type_tools.py b/cognee/modules/search/methods/get_search_type_tools.py index 72e2db89a..165ec379b 100644 --- a/cognee/modules/search/methods/get_search_type_tools.py +++ b/cognee/modules/search/methods/get_search_type_tools.py @@ -37,6 +37,8 @@ async def get_search_type_tools( node_name: Optional[List[str]] = None, save_interaction: bool = False, last_k: Optional[int] = None, + wide_search_top_k: Optional[int] = 100, + triplet_distance_penalty: Optional[float] = 3.5, ) -> list: search_tasks: dict[SearchType, List[Callable]] = { SearchType.SUMMARIES: [ @@ -67,6 +69,8 @@ async def get_search_type_tools( node_name=node_name, save_interaction=save_interaction, system_prompt=system_prompt, + wide_search_top_k=wide_search_top_k, + triplet_distance_penalty=triplet_distance_penalty, ).get_completion, GraphCompletionRetriever( system_prompt_path=system_prompt_path, @@ -75,6 +79,8 @@ async def get_search_type_tools( node_name=node_name, save_interaction=save_interaction, system_prompt=system_prompt, + wide_search_top_k=wide_search_top_k, + triplet_distance_penalty=triplet_distance_penalty, ).get_context, ], SearchType.GRAPH_COMPLETION_COT: [ @@ -85,6 +91,8 @@ async def get_search_type_tools( node_name=node_name, save_interaction=save_interaction, system_prompt=system_prompt, + wide_search_top_k=wide_search_top_k, + triplet_distance_penalty=triplet_distance_penalty, ).get_completion, GraphCompletionCotRetriever( system_prompt_path=system_prompt_path, @@ -93,6 +101,8 @@ async def 
get_search_type_tools( node_name=node_name, save_interaction=save_interaction, system_prompt=system_prompt, + wide_search_top_k=wide_search_top_k, + triplet_distance_penalty=triplet_distance_penalty, ).get_context, ], SearchType.GRAPH_COMPLETION_CONTEXT_EXTENSION: [ @@ -103,6 +113,8 @@ async def get_search_type_tools( node_name=node_name, save_interaction=save_interaction, system_prompt=system_prompt, + wide_search_top_k=wide_search_top_k, + triplet_distance_penalty=triplet_distance_penalty, ).get_completion, GraphCompletionContextExtensionRetriever( system_prompt_path=system_prompt_path, @@ -111,6 +123,8 @@ async def get_search_type_tools( node_name=node_name, save_interaction=save_interaction, system_prompt=system_prompt, + wide_search_top_k=wide_search_top_k, + triplet_distance_penalty=triplet_distance_penalty, ).get_context, ], SearchType.GRAPH_SUMMARY_COMPLETION: [ @@ -121,6 +135,8 @@ async def get_search_type_tools( node_name=node_name, save_interaction=save_interaction, system_prompt=system_prompt, + wide_search_top_k=wide_search_top_k, + triplet_distance_penalty=triplet_distance_penalty, ).get_completion, GraphSummaryCompletionRetriever( system_prompt_path=system_prompt_path, @@ -129,6 +145,8 @@ async def get_search_type_tools( node_name=node_name, save_interaction=save_interaction, system_prompt=system_prompt, + wide_search_top_k=wide_search_top_k, + triplet_distance_penalty=triplet_distance_penalty, ).get_context, ], SearchType.CODE: [ @@ -145,8 +163,16 @@ async def get_search_type_tools( ], SearchType.FEEDBACK: [UserQAFeedback(last_k=last_k).add_feedback], SearchType.TEMPORAL: [ - TemporalRetriever(top_k=top_k).get_completion, - TemporalRetriever(top_k=top_k).get_context, + TemporalRetriever( + top_k=top_k, + wide_search_top_k=wide_search_top_k, + triplet_distance_penalty=triplet_distance_penalty, + ).get_completion, + TemporalRetriever( + top_k=top_k, + wide_search_top_k=wide_search_top_k, + triplet_distance_penalty=triplet_distance_penalty, + 
).get_context, ], SearchType.CHUNKS_LEXICAL: ( lambda _r=JaccardChunksRetriever(top_k=top_k): [ diff --git a/cognee/modules/search/methods/no_access_control_search.py b/cognee/modules/search/methods/no_access_control_search.py index fcb02da46..3a703bbc9 100644 --- a/cognee/modules/search/methods/no_access_control_search.py +++ b/cognee/modules/search/methods/no_access_control_search.py @@ -24,6 +24,8 @@ async def no_access_control_search( last_k: Optional[int] = None, only_context: bool = False, session_id: Optional[str] = None, + wide_search_top_k: Optional[int] = 100, + triplet_distance_penalty: Optional[float] = 3.5, ) -> Tuple[Any, Union[str, List[Edge]], List[Dataset]]: search_tools = await get_search_type_tools( query_type=query_type, @@ -35,6 +37,8 @@ async def no_access_control_search( node_name=node_name, save_interaction=save_interaction, last_k=last_k, + wide_search_top_k=wide_search_top_k, + triplet_distance_penalty=triplet_distance_penalty, ) graph_engine = await get_graph_engine() is_empty = await graph_engine.is_empty() diff --git a/cognee/modules/search/methods/search.py b/cognee/modules/search/methods/search.py index b4278424b..9f180d607 100644 --- a/cognee/modules/search/methods/search.py +++ b/cognee/modules/search/methods/search.py @@ -47,6 +47,8 @@ async def search( only_context: bool = False, use_combined_context: bool = False, session_id: Optional[str] = None, + wide_search_top_k: Optional[int] = 100, + triplet_distance_penalty: Optional[float] = 3.5, ) -> Union[CombinedSearchResult, List[SearchResult]]: """ @@ -90,6 +92,8 @@ async def search( only_context=only_context, use_combined_context=use_combined_context, session_id=session_id, + wide_search_top_k=wide_search_top_k, + triplet_distance_penalty=triplet_distance_penalty, ) else: search_results = [ @@ -105,6 +109,8 @@ async def search( last_k=last_k, only_context=only_context, session_id=session_id, + wide_search_top_k=wide_search_top_k, + triplet_distance_penalty=triplet_distance_penalty, 
) ] @@ -219,6 +225,8 @@ async def authorized_search( only_context: bool = False, use_combined_context: bool = False, session_id: Optional[str] = None, + wide_search_top_k: Optional[int] = 100, + triplet_distance_penalty: Optional[float] = 3.5, ) -> Union[ Tuple[Any, Union[List[Edge], str], List[Dataset]], List[Tuple[Any, Union[List[Edge], str], List[Dataset]]], @@ -246,6 +254,8 @@ async def authorized_search( last_k=last_k, only_context=True, session_id=session_id, + wide_search_top_k=wide_search_top_k, + triplet_distance_penalty=triplet_distance_penalty, ) context = {} @@ -267,6 +277,8 @@ async def authorized_search( node_name=node_name, save_interaction=save_interaction, last_k=last_k, + wide_search_top_k=wide_search_top_k, + triplet_distance_penalty=triplet_distance_penalty, ) search_tools = specific_search_tools if len(search_tools) == 2: @@ -306,6 +318,7 @@ async def authorized_search( last_k=last_k, only_context=only_context, session_id=session_id, + wide_search_top_k=wide_search_top_k, ) return search_results @@ -325,6 +338,8 @@ async def search_in_datasets_context( only_context: bool = False, context: Optional[Any] = None, session_id: Optional[str] = None, + wide_search_top_k: Optional[int] = 100, + triplet_distance_penalty: Optional[float] = 3.5, ) -> List[Tuple[Any, Union[str, List[Edge]], List[Dataset]]]: """ Searches all provided datasets and handles setting up of appropriate database context based on permissions. 
@@ -345,6 +360,8 @@ async def search_in_datasets_context( only_context: bool = False, context: Optional[Any] = None, session_id: Optional[str] = None, + wide_search_top_k: Optional[int] = 100, + triplet_distance_penalty: Optional[float] = 3.5, ) -> Tuple[Any, Union[str, List[Edge]], List[Dataset]]: # Set database configuration in async context for each dataset user has access for await set_database_global_context_variables(dataset.id, dataset.owner_id) @@ -378,6 +395,8 @@ async def search_in_datasets_context( node_name=node_name, save_interaction=save_interaction, last_k=last_k, + wide_search_top_k=wide_search_top_k, + triplet_distance_penalty=triplet_distance_penalty, ) search_tools = specific_search_tools if len(search_tools) == 2: @@ -413,6 +432,8 @@ async def search_in_datasets_context( only_context=only_context, context=context, session_id=session_id, + wide_search_top_k=wide_search_top_k, + triplet_distance_penalty=triplet_distance_penalty, ) ) diff --git a/cognee/tests/unit/modules/graph/cognee_graph_elements_test.py b/cognee/tests/unit/modules/graph/cognee_graph_elements_test.py index 37ba113b5..1d2b79cf9 100644 --- a/cognee/tests/unit/modules/graph/cognee_graph_elements_test.py +++ b/cognee/tests/unit/modules/graph/cognee_graph_elements_test.py @@ -9,7 +9,7 @@ def test_node_initialization(): """Test that a Node is initialized correctly.""" node = Node("node1", {"attr1": "value1"}, dimension=2) assert node.id == "node1" - assert node.attributes == {"attr1": "value1", "vector_distance": np.inf} + assert node.attributes == {"attr1": "value1", "vector_distance": 3.5} assert len(node.status) == 2 assert np.all(node.status == 1) @@ -96,7 +96,7 @@ def test_edge_initialization(): edge = Edge(node1, node2, {"weight": 10}, directed=False, dimension=2) assert edge.node1 == node1 assert edge.node2 == node2 - assert edge.attributes == {"vector_distance": np.inf, "weight": 10} + assert edge.attributes == {"vector_distance": 3.5, "weight": 10} assert edge.directed is 
False assert len(edge.status) == 2 assert np.all(edge.status == 1) diff --git a/cognee/tests/unit/modules/graph/cognee_graph_test.py b/cognee/tests/unit/modules/graph/cognee_graph_test.py index 6888648c3..711479387 100644 --- a/cognee/tests/unit/modules/graph/cognee_graph_test.py +++ b/cognee/tests/unit/modules/graph/cognee_graph_test.py @@ -1,4 +1,5 @@ import pytest +from unittest.mock import AsyncMock from cognee.modules.graph.exceptions import EntityNotFoundError, EntityAlreadyExistsError from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph @@ -11,6 +12,30 @@ def setup_graph(): return CogneeGraph() +@pytest.fixture +def mock_adapter(): + """Fixture to create a mock adapter for database operations.""" + adapter = AsyncMock() + return adapter + + +@pytest.fixture +def mock_vector_engine(): + """Fixture to create a mock vector engine.""" + engine = AsyncMock() + engine.search = AsyncMock() + return engine + + +class MockScoredResult: + """Mock class for vector search results.""" + + def __init__(self, id, score, payload=None): + self.id = id + self.score = score + self.payload = payload or {} + + def test_add_node_success(setup_graph): """Test successful addition of a node.""" graph = setup_graph @@ -73,3 +98,433 @@ def test_get_edges_nonexistent_node(setup_graph): graph = setup_graph with pytest.raises(EntityNotFoundError, match="Node with id nonexistent does not exist."): graph.get_edges_from_node("nonexistent") + + +@pytest.mark.asyncio +async def test_project_graph_from_db_full_graph(setup_graph, mock_adapter): + """Test projecting a full graph from database.""" + graph = setup_graph + + nodes_data = [ + ("1", {"name": "Node1", "description": "First node"}), + ("2", {"name": "Node2", "description": "Second node"}), + ] + edges_data = [ + ("1", "2", "CONNECTS_TO", {"relationship_name": "connects"}), + ] + + mock_adapter.get_graph_data = AsyncMock(return_value=(nodes_data, edges_data)) + + await graph.project_graph_from_db( + 
adapter=mock_adapter, + node_properties_to_project=["name", "description"], + edge_properties_to_project=["relationship_name"], + ) + + assert len(graph.nodes) == 2 + assert len(graph.edges) == 1 + assert graph.get_node("1") is not None + assert graph.get_node("2") is not None + assert graph.edges[0].node1.id == "1" + assert graph.edges[0].node2.id == "2" + + +@pytest.mark.asyncio +async def test_project_graph_from_db_id_filtered(setup_graph, mock_adapter): + """Test projecting an ID-filtered graph from database.""" + graph = setup_graph + + nodes_data = [ + ("1", {"name": "Node1"}), + ("2", {"name": "Node2"}), + ] + edges_data = [ + ("1", "2", "CONNECTS_TO", {"relationship_name": "connects"}), + ] + + mock_adapter.get_id_filtered_graph_data = AsyncMock(return_value=(nodes_data, edges_data)) + + await graph.project_graph_from_db( + adapter=mock_adapter, + node_properties_to_project=["name"], + edge_properties_to_project=["relationship_name"], + relevant_ids_to_filter=["1", "2"], + ) + + assert len(graph.nodes) == 2 + assert len(graph.edges) == 1 + mock_adapter.get_id_filtered_graph_data.assert_called_once() + + +@pytest.mark.asyncio +async def test_project_graph_from_db_nodeset_subgraph(setup_graph, mock_adapter): + """Test projecting a nodeset subgraph filtered by node type and name.""" + graph = setup_graph + + nodes_data = [ + ("1", {"name": "Alice", "type": "Person"}), + ("2", {"name": "Bob", "type": "Person"}), + ] + edges_data = [ + ("1", "2", "KNOWS", {"relationship_name": "knows"}), + ] + + mock_adapter.get_nodeset_subgraph = AsyncMock(return_value=(nodes_data, edges_data)) + + await graph.project_graph_from_db( + adapter=mock_adapter, + node_properties_to_project=["name", "type"], + edge_properties_to_project=["relationship_name"], + node_type="Person", + node_name=["Alice"], + ) + + assert len(graph.nodes) == 2 + assert graph.get_node("1") is not None + assert len(graph.edges) == 1 + mock_adapter.get_nodeset_subgraph.assert_called_once() + + 
+@pytest.mark.asyncio +async def test_project_graph_from_db_empty_graph(setup_graph, mock_adapter): + """Test projecting empty graph raises EntityNotFoundError.""" + graph = setup_graph + + mock_adapter.get_graph_data = AsyncMock(return_value=([], [])) + + with pytest.raises(EntityNotFoundError, match="Empty graph projected from the database."): + await graph.project_graph_from_db( + adapter=mock_adapter, + node_properties_to_project=["name"], + edge_properties_to_project=[], + ) + + +@pytest.mark.asyncio +async def test_project_graph_from_db_missing_nodes(setup_graph, mock_adapter): + """Test that edges referencing missing nodes raise error.""" + graph = setup_graph + + nodes_data = [ + ("1", {"name": "Node1"}), + ] + edges_data = [ + ("1", "999", "CONNECTS_TO", {"relationship_name": "connects"}), + ] + + mock_adapter.get_graph_data = AsyncMock(return_value=(nodes_data, edges_data)) + + with pytest.raises(EntityNotFoundError, match="Edge references nonexistent nodes"): + await graph.project_graph_from_db( + adapter=mock_adapter, + node_properties_to_project=["name"], + edge_properties_to_project=["relationship_name"], + ) + + +@pytest.mark.asyncio +async def test_map_vector_distances_to_graph_nodes(setup_graph): + """Test mapping vector distances to graph nodes.""" + graph = setup_graph + + node1 = Node("1", {"name": "Node1"}) + node2 = Node("2", {"name": "Node2"}) + graph.add_node(node1) + graph.add_node(node2) + + node_distances = { + "Entity_name": [ + MockScoredResult("1", 0.95), + MockScoredResult("2", 0.87), + ] + } + + await graph.map_vector_distances_to_graph_nodes(node_distances) + + assert graph.get_node("1").attributes.get("vector_distance") == 0.95 + assert graph.get_node("2").attributes.get("vector_distance") == 0.87 + + +@pytest.mark.asyncio +async def test_map_vector_distances_partial_node_coverage(setup_graph): + """Test mapping vector distances when only some nodes have results.""" + graph = setup_graph + + node1 = Node("1", {"name": "Node1"}) + 
node2 = Node("2", {"name": "Node2"}) + node3 = Node("3", {"name": "Node3"}) + graph.add_node(node1) + graph.add_node(node2) + graph.add_node(node3) + + node_distances = { + "Entity_name": [ + MockScoredResult("1", 0.95), + MockScoredResult("2", 0.87), + ] + } + + await graph.map_vector_distances_to_graph_nodes(node_distances) + + assert graph.get_node("1").attributes.get("vector_distance") == 0.95 + assert graph.get_node("2").attributes.get("vector_distance") == 0.87 + assert graph.get_node("3").attributes.get("vector_distance") == 3.5 + + +@pytest.mark.asyncio +async def test_map_vector_distances_multiple_categories(setup_graph): + """Test mapping vector distances from multiple collection categories.""" + graph = setup_graph + + # Create nodes + node1 = Node("1") + node2 = Node("2") + node3 = Node("3") + node4 = Node("4") + graph.add_node(node1) + graph.add_node(node2) + graph.add_node(node3) + graph.add_node(node4) + + node_distances = { + "Entity_name": [ + MockScoredResult("1", 0.95), + MockScoredResult("2", 0.87), + ], + "TextSummary_text": [ + MockScoredResult("3", 0.92), + ], + } + + await graph.map_vector_distances_to_graph_nodes(node_distances) + + assert graph.get_node("1").attributes.get("vector_distance") == 0.95 + assert graph.get_node("2").attributes.get("vector_distance") == 0.87 + assert graph.get_node("3").attributes.get("vector_distance") == 0.92 + assert graph.get_node("4").attributes.get("vector_distance") == 3.5 + + +@pytest.mark.asyncio +async def test_map_vector_distances_to_graph_edges_with_payload(setup_graph, mock_vector_engine): + """Test mapping vector distances to edges when edge_distances provided.""" + graph = setup_graph + + node1 = Node("1") + node2 = Node("2") + graph.add_node(node1) + graph.add_node(node2) + + edge = Edge( + node1, + node2, + attributes={"edge_text": "CONNECTS_TO", "relationship_type": "connects"}, + ) + graph.add_edge(edge) + + edge_distances = [ + MockScoredResult("e1", 0.92, payload={"text": "CONNECTS_TO"}), + 
] + + await graph.map_vector_distances_to_graph_edges( + vector_engine=mock_vector_engine, + query_vector=[0.1, 0.2, 0.3], + edge_distances=edge_distances, + ) + + assert graph.edges[0].attributes.get("vector_distance") == 0.92 + + +@pytest.mark.asyncio +async def test_map_vector_distances_to_graph_edges_search(setup_graph, mock_vector_engine): + """Test mapping edge distances when searching for them.""" + graph = setup_graph + + node1 = Node("1") + node2 = Node("2") + graph.add_node(node1) + graph.add_node(node2) + + edge = Edge( + node1, + node2, + attributes={"edge_text": "CONNECTS_TO", "relationship_type": "connects"}, + ) + graph.add_edge(edge) + + mock_vector_engine.search.return_value = [ + MockScoredResult("e1", 0.88, payload={"text": "CONNECTS_TO"}), + ] + + await graph.map_vector_distances_to_graph_edges( + vector_engine=mock_vector_engine, + query_vector=[0.1, 0.2, 0.3], + edge_distances=None, + ) + + mock_vector_engine.search.assert_called_once() + assert graph.edges[0].attributes.get("vector_distance") == 0.88 + + +@pytest.mark.asyncio +async def test_map_vector_distances_partial_edge_coverage(setup_graph, mock_vector_engine): + """Test mapping edge distances when only some edges have results.""" + graph = setup_graph + + node1 = Node("1") + node2 = Node("2") + node3 = Node("3") + graph.add_node(node1) + graph.add_node(node2) + graph.add_node(node3) + + edge1 = Edge(node1, node2, attributes={"edge_text": "CONNECTS_TO"}) + edge2 = Edge(node2, node3, attributes={"edge_text": "DEPENDS_ON"}) + graph.add_edge(edge1) + graph.add_edge(edge2) + + edge_distances = [ + MockScoredResult("e1", 0.92, payload={"text": "CONNECTS_TO"}), + ] + + await graph.map_vector_distances_to_graph_edges( + vector_engine=mock_vector_engine, + query_vector=[0.1, 0.2, 0.3], + edge_distances=edge_distances, + ) + + assert graph.edges[0].attributes.get("vector_distance") == 0.92 + assert graph.edges[1].attributes.get("vector_distance") == 3.5 + + +@pytest.mark.asyncio +async def 
test_map_vector_distances_edges_fallback_to_relationship_type( + setup_graph, mock_vector_engine +): + """Test that edge mapping falls back to relationship_type when edge_text is missing.""" + graph = setup_graph + + node1 = Node("1") + node2 = Node("2") + graph.add_node(node1) + graph.add_node(node2) + + edge = Edge( + node1, + node2, + attributes={"relationship_type": "KNOWS"}, + ) + graph.add_edge(edge) + + edge_distances = [ + MockScoredResult("e1", 0.85, payload={"text": "KNOWS"}), + ] + + await graph.map_vector_distances_to_graph_edges( + vector_engine=mock_vector_engine, + query_vector=[0.1, 0.2, 0.3], + edge_distances=edge_distances, + ) + + assert graph.edges[0].attributes.get("vector_distance") == 0.85 + + +@pytest.mark.asyncio +async def test_map_vector_distances_no_edge_matches(setup_graph, mock_vector_engine): + """Test edge mapping when no edges match the distance results.""" + graph = setup_graph + + node1 = Node("1") + node2 = Node("2") + graph.add_node(node1) + graph.add_node(node2) + + edge = Edge( + node1, + node2, + attributes={"edge_text": "CONNECTS_TO", "relationship_type": "connects"}, + ) + graph.add_edge(edge) + + edge_distances = [ + MockScoredResult("e1", 0.92, payload={"text": "SOME_OTHER_EDGE"}), + ] + + await graph.map_vector_distances_to_graph_edges( + vector_engine=mock_vector_engine, + query_vector=[0.1, 0.2, 0.3], + edge_distances=edge_distances, + ) + + assert graph.edges[0].attributes.get("vector_distance") == 3.5 + + +@pytest.mark.asyncio +async def test_map_vector_distances_invalid_query_vector(setup_graph, mock_vector_engine): + """Test that invalid query vector raises error.""" + graph = setup_graph + + with pytest.raises(ValueError, match="Failed to generate query embedding"): + await graph.map_vector_distances_to_graph_edges( + vector_engine=mock_vector_engine, + query_vector=[], + edge_distances=None, + ) + + +@pytest.mark.asyncio +async def test_calculate_top_triplet_importances(setup_graph): + """Test calculating top 
triplet importances by score.""" + graph = setup_graph + + node1 = Node("1") + node2 = Node("2") + node3 = Node("3") + node4 = Node("4") + + node1.add_attribute("vector_distance", 0.9) + node2.add_attribute("vector_distance", 0.8) + node3.add_attribute("vector_distance", 0.7) + node4.add_attribute("vector_distance", 0.6) + + graph.add_node(node1) + graph.add_node(node2) + graph.add_node(node3) + graph.add_node(node4) + + edge1 = Edge(node1, node2) + edge2 = Edge(node2, node3) + edge3 = Edge(node3, node4) + + edge1.add_attribute("vector_distance", 0.85) + edge2.add_attribute("vector_distance", 0.75) + edge3.add_attribute("vector_distance", 0.65) + + graph.add_edge(edge1) + graph.add_edge(edge2) + graph.add_edge(edge3) + + top_triplets = await graph.calculate_top_triplet_importances(k=2) + + assert len(top_triplets) == 2 + + assert top_triplets[0] == edge3 + assert top_triplets[1] == edge2 + + +@pytest.mark.asyncio +async def test_calculate_top_triplet_importances_default_distances(setup_graph): + """Test calculating importances when nodes/edges have no vector distances.""" + graph = setup_graph + + node1 = Node("1") + node2 = Node("2") + graph.add_node(node1) + graph.add_node(node2) + + edge = Edge(node1, node2) + graph.add_edge(edge) + + top_triplets = await graph.calculate_top_triplet_importances(k=1) + + assert len(top_triplets) == 1 + assert top_triplets[0] == edge diff --git a/cognee/tests/unit/modules/retrieval/test_brute_force_triplet_search.py b/cognee/tests/unit/modules/retrieval/test_brute_force_triplet_search.py new file mode 100644 index 000000000..5eb6fb105 --- /dev/null +++ b/cognee/tests/unit/modules/retrieval/test_brute_force_triplet_search.py @@ -0,0 +1,582 @@ +import pytest +from unittest.mock import AsyncMock, patch + +from cognee.modules.retrieval.utils.brute_force_triplet_search import ( + brute_force_triplet_search, + get_memory_fragment, +) +from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph +from 
cognee.modules.graph.exceptions.exceptions import EntityNotFoundError + + +class MockScoredResult: + """Mock class for vector search results.""" + + def __init__(self, id, score, payload=None): + self.id = id + self.score = score + self.payload = payload or {} + + +@pytest.mark.asyncio +async def test_brute_force_triplet_search_empty_query(): + """Test that empty query raises ValueError.""" + with pytest.raises(ValueError, match="The query must be a non-empty string."): + await brute_force_triplet_search(query="") + + +@pytest.mark.asyncio +async def test_brute_force_triplet_search_none_query(): + """Test that None query raises ValueError.""" + with pytest.raises(ValueError, match="The query must be a non-empty string."): + await brute_force_triplet_search(query=None) + + +@pytest.mark.asyncio +async def test_brute_force_triplet_search_negative_top_k(): + """Test that negative top_k raises ValueError.""" + with pytest.raises(ValueError, match="top_k must be a positive integer."): + await brute_force_triplet_search(query="test query", top_k=-1) + + +@pytest.mark.asyncio +async def test_brute_force_triplet_search_zero_top_k(): + """Test that zero top_k raises ValueError.""" + with pytest.raises(ValueError, match="top_k must be a positive integer."): + await brute_force_triplet_search(query="test query", top_k=0) + + +@pytest.mark.asyncio +async def test_brute_force_triplet_search_wide_search_limit_global_search(): + """Test that wide_search_limit is applied for global search (node_name=None).""" + mock_vector_engine = AsyncMock() + mock_vector_engine.embedding_engine = AsyncMock() + mock_vector_engine.embedding_engine.embed_text = AsyncMock(return_value=[[0.1, 0.2, 0.3]]) + mock_vector_engine.search = AsyncMock(return_value=[]) + + with patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_vector_engine", + return_value=mock_vector_engine, + ): + await brute_force_triplet_search( + query="test", + node_name=None, # Global search + 
wide_search_top_k=75, + ) + + for call in mock_vector_engine.search.call_args_list: + assert call[1]["limit"] == 75 + + +@pytest.mark.asyncio +async def test_brute_force_triplet_search_wide_search_limit_filtered_search(): + """Test that wide_search_limit is None for filtered search (node_name provided).""" + mock_vector_engine = AsyncMock() + mock_vector_engine.embedding_engine = AsyncMock() + mock_vector_engine.embedding_engine.embed_text = AsyncMock(return_value=[[0.1, 0.2, 0.3]]) + mock_vector_engine.search = AsyncMock(return_value=[]) + + with patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_vector_engine", + return_value=mock_vector_engine, + ): + await brute_force_triplet_search( + query="test", + node_name=["Node1"], + wide_search_top_k=50, + ) + + for call in mock_vector_engine.search.call_args_list: + assert call[1]["limit"] is None + + +@pytest.mark.asyncio +async def test_brute_force_triplet_search_wide_search_default(): + """Test that wide_search_top_k defaults to 100.""" + mock_vector_engine = AsyncMock() + mock_vector_engine.embedding_engine = AsyncMock() + mock_vector_engine.embedding_engine.embed_text = AsyncMock(return_value=[[0.1, 0.2, 0.3]]) + mock_vector_engine.search = AsyncMock(return_value=[]) + + with patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_vector_engine", + return_value=mock_vector_engine, + ): + await brute_force_triplet_search(query="test", node_name=None) + + for call in mock_vector_engine.search.call_args_list: + assert call[1]["limit"] == 100 + + +@pytest.mark.asyncio +async def test_brute_force_triplet_search_default_collections(): + """Test that default collections are used when none provided.""" + mock_vector_engine = AsyncMock() + mock_vector_engine.embedding_engine = AsyncMock() + mock_vector_engine.embedding_engine.embed_text = AsyncMock(return_value=[[0.1, 0.2, 0.3]]) + mock_vector_engine.search = AsyncMock(return_value=[]) + + with patch( + 
"cognee.modules.retrieval.utils.brute_force_triplet_search.get_vector_engine", + return_value=mock_vector_engine, + ): + await brute_force_triplet_search(query="test") + + expected_collections = [ + "Entity_name", + "TextSummary_text", + "EntityType_name", + "DocumentChunk_text", + ] + + call_collections = [ + call[1]["collection_name"] for call in mock_vector_engine.search.call_args_list + ] + assert call_collections == expected_collections + + +@pytest.mark.asyncio +async def test_brute_force_triplet_search_custom_collections(): + """Test that custom collections are used when provided.""" + mock_vector_engine = AsyncMock() + mock_vector_engine.embedding_engine = AsyncMock() + mock_vector_engine.embedding_engine.embed_text = AsyncMock(return_value=[[0.1, 0.2, 0.3]]) + mock_vector_engine.search = AsyncMock(return_value=[]) + + custom_collections = ["CustomCol1", "CustomCol2"] + + with patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_vector_engine", + return_value=mock_vector_engine, + ): + await brute_force_triplet_search(query="test", collections=custom_collections) + + call_collections = [ + call[1]["collection_name"] for call in mock_vector_engine.search.call_args_list + ] + assert call_collections == custom_collections + + +@pytest.mark.asyncio +async def test_brute_force_triplet_search_all_collections_empty(): + """Test that empty list is returned when all collections return no results.""" + mock_vector_engine = AsyncMock() + mock_vector_engine.embedding_engine = AsyncMock() + mock_vector_engine.embedding_engine.embed_text = AsyncMock(return_value=[[0.1, 0.2, 0.3]]) + mock_vector_engine.search = AsyncMock(return_value=[]) + + with patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_vector_engine", + return_value=mock_vector_engine, + ): + results = await brute_force_triplet_search(query="test") + assert results == [] + + +# Tests for query embedding + + +@pytest.mark.asyncio +async def 
test_brute_force_triplet_search_embeds_query(): + """Test that query is embedded before searching.""" + query_text = "test query" + expected_vector = [0.1, 0.2, 0.3] + + mock_vector_engine = AsyncMock() + mock_vector_engine.embedding_engine = AsyncMock() + mock_vector_engine.embedding_engine.embed_text = AsyncMock(return_value=[expected_vector]) + mock_vector_engine.search = AsyncMock(return_value=[]) + + with patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_vector_engine", + return_value=mock_vector_engine, + ): + await brute_force_triplet_search(query=query_text) + + mock_vector_engine.embedding_engine.embed_text.assert_called_once_with([query_text]) + + for call in mock_vector_engine.search.call_args_list: + assert call[1]["query_vector"] == expected_vector + + +@pytest.mark.asyncio +async def test_brute_force_triplet_search_extracts_node_ids_global_search(): + """Test that node IDs are extracted from search results for global search.""" + scored_results = [ + MockScoredResult("node1", 0.95), + MockScoredResult("node2", 0.87), + MockScoredResult("node3", 0.92), + ] + + mock_vector_engine = AsyncMock() + mock_vector_engine.embedding_engine = AsyncMock() + mock_vector_engine.embedding_engine.embed_text = AsyncMock(return_value=[[0.1, 0.2, 0.3]]) + mock_vector_engine.search = AsyncMock(return_value=scored_results) + + mock_fragment = AsyncMock( + map_vector_distances_to_graph_nodes=AsyncMock(), + map_vector_distances_to_graph_edges=AsyncMock(), + calculate_top_triplet_importances=AsyncMock(return_value=[]), + ) + + with ( + patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_vector_engine", + return_value=mock_vector_engine, + ), + patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_memory_fragment", + return_value=mock_fragment, + ) as mock_get_fragment_fn, + ): + await brute_force_triplet_search(query="test", node_name=None) + + call_kwargs = mock_get_fragment_fn.call_args[1] + assert 
set(call_kwargs["relevant_ids_to_filter"]) == {"node1", "node2", "node3"} + + +@pytest.mark.asyncio +async def test_brute_force_triplet_search_reuses_provided_fragment(): + """Test that provided memory fragment is reused instead of creating new one.""" + provided_fragment = AsyncMock( + map_vector_distances_to_graph_nodes=AsyncMock(), + map_vector_distances_to_graph_edges=AsyncMock(), + calculate_top_triplet_importances=AsyncMock(return_value=[]), + ) + + mock_vector_engine = AsyncMock() + mock_vector_engine.embedding_engine = AsyncMock() + mock_vector_engine.embedding_engine.embed_text = AsyncMock(return_value=[[0.1, 0.2, 0.3]]) + mock_vector_engine.search = AsyncMock(return_value=[MockScoredResult("n1", 0.95)]) + + with ( + patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_vector_engine", + return_value=mock_vector_engine, + ), + patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_memory_fragment" + ) as mock_get_fragment, + ): + await brute_force_triplet_search( + query="test", + memory_fragment=provided_fragment, + node_name=["node"], + ) + + mock_get_fragment.assert_not_called() + + +@pytest.mark.asyncio +async def test_brute_force_triplet_search_creates_fragment_when_not_provided(): + """Test that memory fragment is created when not provided.""" + mock_vector_engine = AsyncMock() + mock_vector_engine.embedding_engine = AsyncMock() + mock_vector_engine.embedding_engine.embed_text = AsyncMock(return_value=[[0.1, 0.2, 0.3]]) + mock_vector_engine.search = AsyncMock(return_value=[MockScoredResult("n1", 0.95)]) + + mock_fragment = AsyncMock( + map_vector_distances_to_graph_nodes=AsyncMock(), + map_vector_distances_to_graph_edges=AsyncMock(), + calculate_top_triplet_importances=AsyncMock(return_value=[]), + ) + + with ( + patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_vector_engine", + return_value=mock_vector_engine, + ), + patch( + 
"cognee.modules.retrieval.utils.brute_force_triplet_search.get_memory_fragment", + return_value=mock_fragment, + ) as mock_get_fragment, + ): + await brute_force_triplet_search(query="test", node_name=["node"]) + + mock_get_fragment.assert_called_once() + + +@pytest.mark.asyncio +async def test_brute_force_triplet_search_passes_top_k_to_importance_calculation(): + """Test that custom top_k is passed to importance calculation.""" + mock_vector_engine = AsyncMock() + mock_vector_engine.embedding_engine = AsyncMock() + mock_vector_engine.embedding_engine.embed_text = AsyncMock(return_value=[[0.1, 0.2, 0.3]]) + mock_vector_engine.search = AsyncMock(return_value=[MockScoredResult("n1", 0.95)]) + + mock_fragment = AsyncMock( + map_vector_distances_to_graph_nodes=AsyncMock(), + map_vector_distances_to_graph_edges=AsyncMock(), + calculate_top_triplet_importances=AsyncMock(return_value=[]), + ) + + with ( + patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_vector_engine", + return_value=mock_vector_engine, + ), + patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_memory_fragment", + return_value=mock_fragment, + ), + ): + custom_top_k = 15 + await brute_force_triplet_search(query="test", top_k=custom_top_k, node_name=["n"]) + + mock_fragment.calculate_top_triplet_importances.assert_called_once_with(k=custom_top_k) + + +@pytest.mark.asyncio +async def test_get_memory_fragment_returns_empty_graph_on_entity_not_found(): + """Test that get_memory_fragment returns empty graph when entity not found.""" + mock_graph_engine = AsyncMock() + mock_graph_engine.project_graph_from_db = AsyncMock( + side_effect=EntityNotFoundError("Entity not found") + ) + + with patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_graph_engine", + return_value=mock_graph_engine, + ): + fragment = await get_memory_fragment() + + assert isinstance(fragment, CogneeGraph) + assert len(fragment.nodes) == 0 + + +@pytest.mark.asyncio +async def 
test_get_memory_fragment_returns_empty_graph_on_error(): + """Test that get_memory_fragment returns empty graph on generic error.""" + mock_graph_engine = AsyncMock() + mock_graph_engine.project_graph_from_db = AsyncMock(side_effect=Exception("Generic error")) + + with patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_graph_engine", + return_value=mock_graph_engine, + ): + fragment = await get_memory_fragment() + + assert isinstance(fragment, CogneeGraph) + assert len(fragment.nodes) == 0 + + +@pytest.mark.asyncio +async def test_brute_force_triplet_search_deduplicates_node_ids(): + """Test that duplicate node IDs across collections are deduplicated.""" + + def search_side_effect(*args, **kwargs): + collection_name = kwargs.get("collection_name") + if collection_name == "Entity_name": + return [ + MockScoredResult("node1", 0.95), + MockScoredResult("node2", 0.87), + ] + elif collection_name == "TextSummary_text": + return [ + MockScoredResult("node1", 0.90), + MockScoredResult("node3", 0.92), + ] + else: + return [] + + mock_vector_engine = AsyncMock() + mock_vector_engine.embedding_engine = AsyncMock() + mock_vector_engine.embedding_engine.embed_text = AsyncMock(return_value=[[0.1, 0.2, 0.3]]) + mock_vector_engine.search = AsyncMock(side_effect=search_side_effect) + + mock_fragment = AsyncMock( + map_vector_distances_to_graph_nodes=AsyncMock(), + map_vector_distances_to_graph_edges=AsyncMock(), + calculate_top_triplet_importances=AsyncMock(return_value=[]), + ) + + with ( + patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_vector_engine", + return_value=mock_vector_engine, + ), + patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_memory_fragment", + return_value=mock_fragment, + ) as mock_get_fragment_fn, + ): + await brute_force_triplet_search(query="test", node_name=None) + + call_kwargs = mock_get_fragment_fn.call_args[1] + assert set(call_kwargs["relevant_ids_to_filter"]) == {"node1", "node2", 
"node3"} + assert len(call_kwargs["relevant_ids_to_filter"]) == 3 + + +@pytest.mark.asyncio +async def test_brute_force_triplet_search_excludes_edge_collection(): + """Test that EdgeType_relationship_name collection is excluded from ID extraction.""" + + def search_side_effect(*args, **kwargs): + collection_name = kwargs.get("collection_name") + if collection_name == "Entity_name": + return [MockScoredResult("node1", 0.95)] + elif collection_name == "EdgeType_relationship_name": + return [MockScoredResult("edge1", 0.88)] + else: + return [] + + mock_vector_engine = AsyncMock() + mock_vector_engine.embedding_engine = AsyncMock() + mock_vector_engine.embedding_engine.embed_text = AsyncMock(return_value=[[0.1, 0.2, 0.3]]) + mock_vector_engine.search = AsyncMock(side_effect=search_side_effect) + + mock_fragment = AsyncMock( + map_vector_distances_to_graph_nodes=AsyncMock(), + map_vector_distances_to_graph_edges=AsyncMock(), + calculate_top_triplet_importances=AsyncMock(return_value=[]), + ) + + with ( + patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_vector_engine", + return_value=mock_vector_engine, + ), + patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_memory_fragment", + return_value=mock_fragment, + ) as mock_get_fragment_fn, + ): + await brute_force_triplet_search( + query="test", + node_name=None, + collections=["Entity_name", "EdgeType_relationship_name"], + ) + + call_kwargs = mock_get_fragment_fn.call_args[1] + assert call_kwargs["relevant_ids_to_filter"] == ["node1"] + + +@pytest.mark.asyncio +async def test_brute_force_triplet_search_skips_nodes_without_ids(): + """Test that nodes without ID attribute are skipped.""" + + class ScoredResultNoId: + """Mock result without id attribute.""" + + def __init__(self, score): + self.score = score + + def search_side_effect(*args, **kwargs): + collection_name = kwargs.get("collection_name") + if collection_name == "Entity_name": + return [ + MockScoredResult("node1", 
0.95), + ScoredResultNoId(0.90), + MockScoredResult("node2", 0.87), + ] + else: + return [] + + mock_vector_engine = AsyncMock() + mock_vector_engine.embedding_engine = AsyncMock() + mock_vector_engine.embedding_engine.embed_text = AsyncMock(return_value=[[0.1, 0.2, 0.3]]) + mock_vector_engine.search = AsyncMock(side_effect=search_side_effect) + + mock_fragment = AsyncMock( + map_vector_distances_to_graph_nodes=AsyncMock(), + map_vector_distances_to_graph_edges=AsyncMock(), + calculate_top_triplet_importances=AsyncMock(return_value=[]), + ) + + with ( + patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_vector_engine", + return_value=mock_vector_engine, + ), + patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_memory_fragment", + return_value=mock_fragment, + ) as mock_get_fragment_fn, + ): + await brute_force_triplet_search(query="test", node_name=None) + + call_kwargs = mock_get_fragment_fn.call_args[1] + assert set(call_kwargs["relevant_ids_to_filter"]) == {"node1", "node2"} + + +@pytest.mark.asyncio +async def test_brute_force_triplet_search_handles_tuple_results(): + """Test that both list and tuple results are handled correctly.""" + + def search_side_effect(*args, **kwargs): + collection_name = kwargs.get("collection_name") + if collection_name == "Entity_name": + return ( + MockScoredResult("node1", 0.95), + MockScoredResult("node2", 0.87), + ) + else: + return [] + + mock_vector_engine = AsyncMock() + mock_vector_engine.embedding_engine = AsyncMock() + mock_vector_engine.embedding_engine.embed_text = AsyncMock(return_value=[[0.1, 0.2, 0.3]]) + mock_vector_engine.search = AsyncMock(side_effect=search_side_effect) + + mock_fragment = AsyncMock( + map_vector_distances_to_graph_nodes=AsyncMock(), + map_vector_distances_to_graph_edges=AsyncMock(), + calculate_top_triplet_importances=AsyncMock(return_value=[]), + ) + + with ( + patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_vector_engine", + 
return_value=mock_vector_engine, + ), + patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_memory_fragment", + return_value=mock_fragment, + ) as mock_get_fragment_fn, + ): + await brute_force_triplet_search(query="test", node_name=None) + + call_kwargs = mock_get_fragment_fn.call_args[1] + assert set(call_kwargs["relevant_ids_to_filter"]) == {"node1", "node2"} + + +@pytest.mark.asyncio +async def test_brute_force_triplet_search_mixed_empty_collections(): + """Test ID extraction with mixed empty and non-empty collections.""" + + def search_side_effect(*args, **kwargs): + collection_name = kwargs.get("collection_name") + if collection_name == "Entity_name": + return [MockScoredResult("node1", 0.95)] + elif collection_name == "TextSummary_text": + return [] + elif collection_name == "EntityType_name": + return [MockScoredResult("node2", 0.92)] + else: + return [] + + mock_vector_engine = AsyncMock() + mock_vector_engine.embedding_engine = AsyncMock() + mock_vector_engine.embedding_engine.embed_text = AsyncMock(return_value=[[0.1, 0.2, 0.3]]) + mock_vector_engine.search = AsyncMock(side_effect=search_side_effect) + + mock_fragment = AsyncMock( + map_vector_distances_to_graph_nodes=AsyncMock(), + map_vector_distances_to_graph_edges=AsyncMock(), + calculate_top_triplet_importances=AsyncMock(return_value=[]), + ) + + with ( + patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_vector_engine", + return_value=mock_vector_engine, + ), + patch( + "cognee.modules.retrieval.utils.brute_force_triplet_search.get_memory_fragment", + return_value=mock_fragment, + ) as mock_get_fragment_fn, + ): + await brute_force_triplet_search(query="test", node_name=None) + + call_kwargs = mock_get_fragment_fn.call_args[1] + assert set(call_kwargs["relevant_ids_to_filter"]) == {"node1", "node2"}