From 30df102656bdaacee4c86359d4b09c0cbe0eb6d7 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Wed, 17 Sep 2025 10:42:11 +0200 Subject: [PATCH 01/32] docs: Add docstrings for permission related functions. --- .../data/methods/create_authorized_dataset.py | 9 +++++++++ .../modules/data/methods/get_authorized_dataset.py | 2 +- .../data/methods/get_authorized_dataset_by_name.py | 11 +++++++++++ .../layers/resolve_authorized_user_dataset.py | 13 +++++++++++++ .../layers/resolve_authorized_user_datasets.py | 2 +- .../authorized_give_permission_on_datasets.py | 12 ++++++++++++ .../methods/check_permission_on_dataset.py | 11 +++++++++++ .../methods/get_all_user_permission_datasets.py | 10 ++++++++++ .../methods/get_document_ids_for_user.py | 10 ++++++++++ .../users/permissions/methods/get_principal.py | 9 +++++++++ .../permissions/methods/get_principal_datasets.py | 11 +++++++++++ .../modules/users/permissions/methods/get_role.py | 10 ++++++++++ .../get_specific_user_permission_datasets.py | 6 +++--- .../modules/users/permissions/methods/get_tenant.py | 9 +++++++++ .../methods/give_default_permission_to_role.py | 9 +++++++++ .../methods/give_default_permission_to_tenant.py | 9 +++++++++ .../methods/give_default_permission_to_user.py | 9 +++++++++ .../methods/give_permission_on_dataset.py | 10 ++++++++++ .../modules/users/roles/methods/add_user_to_role.py | 11 +++++++++++ cognee/modules/users/roles/methods/create_role.py | 10 ++++++++++ .../users/tenants/methods/add_user_to_tenant.py | 12 ++++++++++++ .../modules/users/tenants/methods/create_tenant.py | 10 ++++++++++ 22 files changed, 200 insertions(+), 5 deletions(-) diff --git a/cognee/modules/data/methods/create_authorized_dataset.py b/cognee/modules/data/methods/create_authorized_dataset.py index e43381b35..08057a6bd 100644 --- a/cognee/modules/data/methods/create_authorized_dataset.py +++ b/cognee/modules/data/methods/create_authorized_dataset.py @@ -6,6 +6,15 @@ from .create_dataset import create_dataset async def create_authorized_dataset(dataset_name: str, user: User) -> Dataset: + """ + Create a new dataset and give all permissions on this dataset to the given user. + Args: + dataset_name: Name of the dataset. + user: The user object. + + Returns: + Dataset: The new authorized dataset. + """ db_engine = get_relational_engine() async with db_engine.get_async_session() as session: diff --git a/cognee/modules/data/methods/get_authorized_dataset.py b/cognee/modules/data/methods/get_authorized_dataset.py index 0e30b7e0e..6c97322c8 100644 --- a/cognee/modules/data/methods/get_authorized_dataset.py +++ b/cognee/modules/data/methods/get_authorized_dataset.py @@ -15,7 +15,7 @@ async def get_authorized_dataset( Get a specific dataset with permissions for a user. Args: - user_id (UUID): user id + user: User object dataset_id (UUID): dataset id permission_type (str): permission type(read, write, delete, share), default is read diff --git a/cognee/modules/data/methods/get_authorized_dataset_by_name.py b/cognee/modules/data/methods/get_authorized_dataset_by_name.py index 654dcb630..5dc1d86a0 100644 --- a/cognee/modules/data/methods/get_authorized_dataset_by_name.py +++ b/cognee/modules/data/methods/get_authorized_dataset_by_name.py @@ -11,6 +11,17 @@ from ..models import Dataset async def get_authorized_dataset_by_name( dataset_name: str, user: User, permission_type: str ) -> Optional[Dataset]: + """ + Get a specific dataset with the given name, with permissions for a given user. + + Args: + dataset_name: Name of the dataset. + user: User object. 
+        permission_type (str): permission type (read, write, delete, share), default is read
+
+    Returns:
+        Optional[Dataset]: dataset with permissions
+    """
     authorized_datasets = await get_authorized_existing_datasets([], permission_type, user)
 
     return next((dataset for dataset in authorized_datasets if dataset.name == dataset_name), None)
diff --git a/cognee/modules/pipelines/layers/resolve_authorized_user_dataset.py b/cognee/modules/pipelines/layers/resolve_authorized_user_dataset.py
index 30d0fef71..e135b8351 100644
--- a/cognee/modules/pipelines/layers/resolve_authorized_user_dataset.py
+++ b/cognee/modules/pipelines/layers/resolve_authorized_user_dataset.py
@@ -11,6 +11,19 @@ from cognee.modules.data.methods import (
 
 
 async def resolve_authorized_user_dataset(dataset_id: UUID, dataset_name: str, user: User):
+    """
+    Handles dataset creation and dataset authorization for Cognee.
+    Verifies that the provided user has the necessary permission for the provided dataset.
+    If the dataset does not exist, creates it and gives permissions to the user creating it.
+
+    Args:
+        dataset_id: Id of the dataset.
+        dataset_name: Name of the dataset.
+        user: Cognee user the request is being processed for; if None, the default user is used.
+
+    Returns:
+        Tuple[User, Dataset]: A tuple containing the user and the authorized dataset.
+    """
     if not user:
         user = await get_default_user()
 
diff --git a/cognee/modules/pipelines/layers/resolve_authorized_user_datasets.py b/cognee/modules/pipelines/layers/resolve_authorized_user_datasets.py
index 4f6fb8254..f91064995 100644
--- a/cognee/modules/pipelines/layers/resolve_authorized_user_datasets.py
+++ b/cognee/modules/pipelines/layers/resolve_authorized_user_datasets.py
@@ -25,7 +25,7 @@ async def resolve_authorized_user_datasets(
         datasets: Dataset names or Dataset UUID (in case Datasets already exist)
 
     Returns:
-
+        Tuple[User, List[Dataset]]: A tuple containing the user and the list of authorized datasets.
     """
     # If no user is provided use default user
     if user is None:
diff --git a/cognee/modules/users/permissions/methods/authorized_give_permission_on_datasets.py b/cognee/modules/users/permissions/methods/authorized_give_permission_on_datasets.py
index d8a3777b7..7960eb756 100644
--- a/cognee/modules/users/permissions/methods/authorized_give_permission_on_datasets.py
+++ b/cognee/modules/users/permissions/methods/authorized_give_permission_on_datasets.py
@@ -9,6 +9,18 @@ from uuid import UUID
 async def authorized_give_permission_on_datasets(
     principal_id: UUID, dataset_ids: Union[List[UUID], UUID], permission_name: str, owner_id: UUID
 ):
+    """
+    Give a permission on the given datasets to a user.
+    The request owner must have the necessary permission to share the datasets.
+    Args:
+        principal_id: Id of the user the datasets are shared with
+        dataset_ids: Ids of the datasets to share
+        permission_name: Name of the permission to give
+        owner_id: Id of the request owner
+
+    Returns:
+        None
+    """
     # If only a single dataset UUID is provided transform it to a list
     if not isinstance(dataset_ids, list):
         dataset_ids = [dataset_ids]
diff --git a/cognee/modules/users/permissions/methods/check_permission_on_dataset.py b/cognee/modules/users/permissions/methods/check_permission_on_dataset.py
index 467da7154..d489417e0 100644
--- a/cognee/modules/users/permissions/methods/check_permission_on_dataset.py
+++ b/cognee/modules/users/permissions/methods/check_permission_on_dataset.py
@@ -10,6 +10,17 @@ logger = get_logger()
 
 
 async def check_permission_on_dataset(user: User, permission_type: str, dataset_id: UUID):
+    """
+    Check if a user has a specific permission on a dataset.
+    Args:
+        user: User whose permission is checked
+        permission_type: Type of permission to check
+        dataset_id: Id of the dataset
+
+    Returns:
+        None
+
+    """
     if user is None:
         user = await get_default_user()
 
diff --git a/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py b/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py
index 5b242baa4..a2a2b5fdd 100644
--- a/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py
+++ b/cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py
@@ -9,6 +9,16 @@ logger = get_logger()
 
 
 async def get_all_user_permission_datasets(user: User, permission_type: str) -> list[Dataset]:
+    """
+    Return a list of datasets the user has permission for.
+    If the user is part of a tenant, return the datasets their roles have permission for.
+    Args:
+        user: User whose datasets are returned
+        permission_type: Type of permission
+
+    Returns:
+        list[Dataset]: List of datasets the user has permission for
+    """
     datasets = list()
     # Get all datasets User has explicit access to
     datasets.extend(await get_principal_datasets(user, permission_type))
diff --git a/cognee/modules/users/permissions/methods/get_document_ids_for_user.py b/cognee/modules/users/permissions/methods/get_document_ids_for_user.py
index 3b053d8e7..9b1db024e 100644
--- a/cognee/modules/users/permissions/methods/get_document_ids_for_user.py
+++ b/cognee/modules/users/permissions/methods/get_document_ids_for_user.py
@@ -8,6 +8,16 @@ from ...models import ACL, Permission
 
 
 async def get_document_ids_for_user(user_id: UUID, datasets: list[str] = None) -> list[str]:
+    """
+    Return a list of document ids for which the user has read permission.
+    If datasets are specified, return only documents from those datasets.
+    Args:
+        user_id: Id of the user
+        datasets: Optional list of dataset names to filter by
+
+    Returns:
+        list[str]: List of document ids for which the user has read permission
+    """
     db_engine = get_relational_engine()
 
     async with db_engine.get_async_session() as session:
diff --git a/cognee/modules/users/permissions/methods/get_principal.py b/cognee/modules/users/permissions/methods/get_principal.py
index 53d39651a..245190cf8 100644
--- a/cognee/modules/users/permissions/methods/get_principal.py
+++ b/cognee/modules/users/permissions/methods/get_principal.py
@@ -6,6 +6,15 @@ from ...models.Principal import Principal
 
 
 async def get_principal(principal_id: UUID):
+    """
+    Return information about a user based on their id.
+    Args:
+        principal_id: Id of the user
+
+    Returns:
+        Principal: Information about the user (principal)
+
+    """
     db_engine = get_relational_engine()
 
     async with db_engine.get_async_session() as session:
diff --git a/cognee/modules/users/permissions/methods/get_principal_datasets.py b/cognee/modules/users/permissions/methods/get_principal_datasets.py
index b2385182f..a9adb8f00 100644
--- a/cognee/modules/users/permissions/methods/get_principal_datasets.py
+++ b/cognee/modules/users/permissions/methods/get_principal_datasets.py
@@ -9,6 +9,17 @@ from ...models.ACL import ACL
 
 
 async def get_principal_datasets(principal: Principal, permission_type: str) -> list[Dataset]:
+    """
+    Return a list of datasets for which the user (principal) has a certain permission.
+    Args:
+        principal: The user (principal) whose datasets are returned
+        permission_type: Type of permission
+
+    Returns:
+        list[Dataset]: List of datasets for which the user (principal)
+        has the permission (permission_type).
+
+    """
     db_engine = get_relational_engine()
 
     async with db_engine.get_async_session() as session:
diff --git a/cognee/modules/users/permissions/methods/get_role.py b/cognee/modules/users/permissions/methods/get_role.py
index 007044c43..a703fc9f9 100644
--- a/cognee/modules/users/permissions/methods/get_role.py
+++ b/cognee/modules/users/permissions/methods/get_role.py
@@ -9,6 +9,16 @@ from ...models.Role import Role
 
 
 async def get_role(tenant_id: UUID, role_name: str):
+    """
+    Return the role with the given name for the given tenant.
+    Args:
+        tenant_id: Id of the given tenant
+        role_name: Name of the role
+
+    Returns:
+        The role for the given tenant.
+
+    """
     db_engine = get_relational_engine()
 
     async with db_engine.get_async_session() as session:
diff --git a/cognee/modules/users/permissions/methods/get_specific_user_permission_datasets.py b/cognee/modules/users/permissions/methods/get_specific_user_permission_datasets.py
index b6ad1291d..8dee4d782 100644
--- a/cognee/modules/users/permissions/methods/get_specific_user_permission_datasets.py
+++ b/cognee/modules/users/permissions/methods/get_specific_user_permission_datasets.py
@@ -15,9 +15,9 @@ async def get_specific_user_permission_datasets(
     Return a list of datasets user has given permission for.
     If a list of datasets is provided, verify for which datasets user has appropriate permission for and return list of datasets he has permission for.
     Args:
-        user_id:
-        permission_type:
-        dataset_ids:
+        user_id: Id of the user.
+        permission_type: Type of the permission.
+        dataset_ids: Ids of the provided datasets.
 
     Returns:
         list[Dataset]: List of datasets user has permission for
diff --git a/cognee/modules/users/permissions/methods/get_tenant.py b/cognee/modules/users/permissions/methods/get_tenant.py
index c5bf1a633..832ff71b8 100644
--- a/cognee/modules/users/permissions/methods/get_tenant.py
+++ b/cognee/modules/users/permissions/methods/get_tenant.py
@@ -8,6 +8,15 @@ from ...models.Tenant import Tenant
 
 
 async def get_tenant(tenant_id: UUID):
+    """
+    Return information about the tenant based on the given id.
+    Args:
+        tenant_id: Id of the given tenant
+
+    Returns:
+        Information about the given tenant.
+
+    """
     db_engine = get_relational_engine()
 
     async with db_engine.get_async_session() as session:
diff --git a/cognee/modules/users/permissions/methods/give_default_permission_to_role.py b/cognee/modules/users/permissions/methods/give_default_permission_to_role.py
index bf3b6a9c7..9d9b41c1b 100644
--- a/cognee/modules/users/permissions/methods/give_default_permission_to_role.py
+++ b/cognee/modules/users/permissions/methods/give_default_permission_to_role.py
@@ -16,6 +16,15 @@ from cognee.modules.users.models import (
 
 
 async def give_default_permission_to_role(role_id: UUID, permission_name: str):
+    """
+    Give the permission with the given name to the role with the given id as a default permission.
+    Args:
+        role_id: Id of the role
+        permission_name: Name of the permission
+
+    Returns:
+        None
+    """
     db_engine = get_relational_engine()
 
     async with db_engine.get_async_session() as session:
diff --git a/cognee/modules/users/permissions/methods/give_default_permission_to_tenant.py b/cognee/modules/users/permissions/methods/give_default_permission_to_tenant.py
index 57049ae2e..7baa8c244 100644
--- a/cognee/modules/users/permissions/methods/give_default_permission_to_tenant.py
+++ b/cognee/modules/users/permissions/methods/give_default_permission_to_tenant.py
@@ -16,6 +16,15 @@ from cognee.modules.users.models import (
 
 
 async def give_default_permission_to_tenant(tenant_id: UUID, permission_name: str):
+    """
+    Give the permission with the given name to the tenant with the given id as a default permission.
+    Args:
+        tenant_id: Id of the tenant
+        permission_name: Name of the permission
+
+    Returns:
+        None
+    """
     db_engine = get_relational_engine()
     async with db_engine.get_async_session() as session:
         tenant = (
diff --git a/cognee/modules/users/permissions/methods/give_default_permission_to_user.py b/cognee/modules/users/permissions/methods/give_default_permission_to_user.py
index 40913ff12..545122fd0 100644
--- a/cognee/modules/users/permissions/methods/give_default_permission_to_user.py
+++ b/cognee/modules/users/permissions/methods/give_default_permission_to_user.py
@@ -16,6 +16,15 @@ from cognee.modules.users.models import (
 
 
 async def give_default_permission_to_user(user_id: UUID, permission_name: str):
+    """
+    Give the permission with the given name to the user with the given id as a default permission.
+    Args:
+        user_id: Id of the user
+        permission_name: Name of the permission
+
+    Returns:
+        None
+    """
     db_engine = get_relational_engine()
     async with db_engine.get_async_session() as session:
         user = (await session.execute(select(User).where(User.id == user_id))).scalars().first()
diff --git a/cognee/modules/users/permissions/methods/give_permission_on_dataset.py b/cognee/modules/users/permissions/methods/give_permission_on_dataset.py
index 0ed536981..6d0272192 100644
--- a/cognee/modules/users/permissions/methods/give_permission_on_dataset.py
+++ b/cognee/modules/users/permissions/methods/give_permission_on_dataset.py
@@ -24,6 +24,16 @@ async def give_permission_on_dataset(
     dataset_id: UUID,
     permission_name: str,
 ):
+    """
+    Give a specific permission on a dataset to a user.
+    Args:
+        principal: Principal (user) who is being given the permission on the dataset
+        dataset_id: Id of the dataset
+        permission_name: Name of the permission to give
+
+    Returns:
+        None
+    """
     db_engine = get_relational_engine()
 
     async with db_engine.get_async_session() as session:
diff --git a/cognee/modules/users/roles/methods/add_user_to_role.py b/cognee/modules/users/roles/methods/add_user_to_role.py
index c6d8fdb63..de5e47775 100644
--- a/cognee/modules/users/roles/methods/add_user_to_role.py
+++ b/cognee/modules/users/roles/methods/add_user_to_role.py
@@ -21,6 +21,17 @@ from cognee.modules.users.models import (
 
 
 async def add_user_to_role(user_id: UUID, role_id: UUID, owner_id: UUID):
+    """
+    Add a user with the given id to the role with the given id.
+    Args:
+        user_id: Id of the user.
+        role_id: Id of the role.
+        owner_id: Id of the request owner.
+
+    Returns:
+        None
+
+    """
     db_engine = get_relational_engine()
     async with db_engine.get_async_session() as session:
         user = (await session.execute(select(User).where(User.id == user_id))).scalars().first()
diff --git a/cognee/modules/users/roles/methods/create_role.py b/cognee/modules/users/roles/methods/create_role.py
index 897c42394..bdba4ad31 100644
--- a/cognee/modules/users/roles/methods/create_role.py
+++ b/cognee/modules/users/roles/methods/create_role.py
@@ -16,6 +16,16 @@ async def create_role(
     role_name: str,
     owner_id: UUID,
 ):
+    """
+    Create a new role with the given name, if the request owner with the given id
+    has the necessary permission.
+    Args:
+        role_name: Name of the new role.
+        owner_id: Id of the request owner.
+
+    Returns:
+        None
+    """
     db_engine = get_relational_engine()
     async with db_engine.get_async_session() as session:
         user = await get_user(owner_id)
diff --git a/cognee/modules/users/tenants/methods/add_user_to_tenant.py b/cognee/modules/users/tenants/methods/add_user_to_tenant.py
index cf0ad0535..1374067a7 100644
--- a/cognee/modules/users/tenants/methods/add_user_to_tenant.py
+++ b/cognee/modules/users/tenants/methods/add_user_to_tenant.py
@@ -13,6 +13,18 @@ from cognee.modules.users.exceptions import (
 
 
 async def add_user_to_tenant(user_id: UUID, tenant_id: UUID, owner_id: UUID):
+    """
+    Add a user with the given id to the tenant with the given id.
+    This can only be successful if the request owner with the given id is the tenant owner.
+    Args:
+        user_id: Id of the user.
+        tenant_id: Id of the tenant.
+        owner_id: Id of the request owner.
+
+    Returns:
+        None
+
+    """
     db_engine = get_relational_engine()
     async with db_engine.get_async_session() as session:
         user = await get_user(user_id)
diff --git a/cognee/modules/users/tenants/methods/create_tenant.py b/cognee/modules/users/tenants/methods/create_tenant.py
index 5d68e8110..bd8abadd1 100644
--- a/cognee/modules/users/tenants/methods/create_tenant.py
+++ b/cognee/modules/users/tenants/methods/create_tenant.py
@@ -8,6 +8,16 @@ from cognee.modules.users.methods import get_user
 
 
 async def create_tenant(tenant_name: str, user_id: UUID):
+    """
+    Create a new tenant with the given name, for the user with the given id.
+    This user is the owner of the tenant.
+    Args:
+        tenant_name: Name of the new tenant.
+        user_id: Id of the user.
+
+    Returns:
+        None
+    """
     db_engine = get_relational_engine()
     async with db_engine.get_async_session() as session:
         try:

From 293a0e0053759686cd519e061a18e92e21ce32b7 Mon Sep 17 00:00:00 2001
From: Andrej Milicevic
Date: Wed, 17 Sep 2025 10:45:36 +0200
Subject: [PATCH 02/32] Fix formatting

---
 .../embeddings/OllamaEmbeddingEngine.py       |  6 +-----
 .../methods/get_authorized_dataset_by_name.py | 16 +++++++--------
 .../layers/resolve_authorized_user_dataset.py | 20 +++++++++----------
 3 files changed, 19 insertions(+), 23 deletions(-)

diff --git a/cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py b/cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py
index 29c57ed2e..3ecc7dbe8 100644
--- a/cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py
+++ b/cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py
@@ -94,11 +94,7 @@ class OllamaEmbeddingEngine(EmbeddingEngine):
         """
         Internal method to call the Ollama embeddings endpoint for a single prompt.
         """
-        payload = {
-            "model": self.model,
-            "prompt": prompt,
-            "input": prompt
-        }
+        payload = {"model": self.model, "prompt": prompt, "input": prompt}
         headers = {}
         api_key = os.getenv("LLM_API_KEY")
         if api_key:
diff --git a/cognee/modules/data/methods/get_authorized_dataset_by_name.py b/cognee/modules/data/methods/get_authorized_dataset_by_name.py
index 5dc1d86a0..ad50e25e9 100644
--- a/cognee/modules/data/methods/get_authorized_dataset_by_name.py
+++ b/cognee/modules/data/methods/get_authorized_dataset_by_name.py
@@ -12,16 +12,16 @@ async def get_authorized_dataset_by_name(
     dataset_name: str, user: User, permission_type: str
 ) -> Optional[Dataset]:
     """
-    Get a specific dataset with the given name, with permissions for a given user.
-
-    Args:
-        dataset_name: Name of the dataset.
-        user: User object.
-        permission_type (str): permission type (read, write, delete, share), default is read
-
-    Returns:
-        Optional[Dataset]: dataset with permissions
-    """
+    Get a specific dataset with the given name, with permissions for a given user.
+
+    Args:
+        dataset_name: Name of the dataset.
+        user: User object.
+        permission_type (str): permission type (read, write, delete, share), default is read
+
+    Returns:
+        Optional[Dataset]: dataset with permissions
+    """
     authorized_datasets = await get_authorized_existing_datasets([], permission_type, user)
 
     return next((dataset for dataset in authorized_datasets if dataset.name == dataset_name), None)
diff --git a/cognee/modules/pipelines/layers/resolve_authorized_user_dataset.py b/cognee/modules/pipelines/layers/resolve_authorized_user_dataset.py
index e135b8351..7e3d1c124 100644
--- a/cognee/modules/pipelines/layers/resolve_authorized_user_dataset.py
+++ b/cognee/modules/pipelines/layers/resolve_authorized_user_dataset.py
@@ -12,18 +12,18 @@ from cognee.modules.data.methods import (
 
 async def resolve_authorized_user_dataset(dataset_id: UUID, dataset_name: str, user: User):
     """
-    Handles dataset creation and dataset authorization for Cognee.
-    Verifies that the provided user has the necessary permission for the provided dataset.
-    If the dataset does not exist, creates it and gives permissions to the user creating it.
+    Handles dataset creation and dataset authorization for Cognee.
+    Verifies that the provided user has the necessary permission for the provided dataset.
+    If the dataset does not exist, creates it and gives permissions to the user creating it.
 
-    Args:
-        dataset_id: Id of the dataset.
-        dataset_name: Name of the dataset.
-        user: Cognee user the request is being processed for; if None, the default user is used.
+    Args:
+        dataset_id: Id of the dataset.
+        dataset_name: Name of the dataset.
+        user: Cognee user the request is being processed for; if None, the default user is used.
 
-    Returns:
-        Tuple[User, Dataset]: A tuple containing the user and the authorized dataset.
+    Returns:
+        Tuple[User, Dataset]: A tuple containing the user and the authorized dataset.
+    """
     if not user:
         user = await get_default_user()
 

From 475749b8decb2b20d7ebd3440c4af2bf5e1b961c Mon Sep 17 00:00:00 2001
From: Andrej Milicevic
Date: Wed, 17 Sep 2025 11:47:37 +0200
Subject: [PATCH 03/32] docs: Updated some docs, not a lot was necessary

---
 cognee/modules/graph/utils/retrieve_existing_edges.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/cognee/modules/graph/utils/retrieve_existing_edges.py b/cognee/modules/graph/utils/retrieve_existing_edges.py
index 20cb30a26..f0aefacd4 100644
--- a/cognee/modules/graph/utils/retrieve_existing_edges.py
+++ b/cognee/modules/graph/utils/retrieve_existing_edges.py
@@ -23,8 +23,6 @@ async def retrieve_existing_edges(
         chunk_graphs (list[KnowledgeGraph]): List of knowledge graphs corresponding to each
             data chunk. Each graph contains nodes (entities) and edges (relationships) that
             were extracted from the chunk content.
-        graph_engine (GraphDBInterface): Interface to the graph database that will be queried
-            to check for existing edges. Must implement the has_edges() method.
 
     Returns:
         dict[str, bool]: A mapping of edge keys to boolean values indicating existence.
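Taken together, the permission helpers documented in the first patch compose into a complete sharing flow: create an authorized dataset, grant a permission on it to another principal, then verify a permission. A minimal sketch under stated assumptions — the import paths below mirror the file paths touched by PATCH 01, `get_default_user` is assumed to be importable from cognee.modules.users.methods, and `other_principal_id` plus the dataset name are hypothetical placeholders:

    from uuid import UUID

    from cognee.modules.users.methods import get_default_user  # assumed export
    from cognee.modules.data.methods.create_authorized_dataset import create_authorized_dataset
    from cognee.modules.users.permissions.methods.authorized_give_permission_on_datasets import (
        authorized_give_permission_on_datasets,
    )
    from cognee.modules.users.permissions.methods.check_permission_on_dataset import (
        check_permission_on_dataset,
    )


    async def share_dataset_example(other_principal_id: UUID) -> None:
        # Create a dataset; the creating user receives all permissions on it.
        owner = await get_default_user()
        dataset = await create_authorized_dataset("my_dataset", owner)

        # Grant read access to another principal; per the docstring, the
        # request owner must hold the share permission for this to succeed.
        await authorized_give_permission_on_datasets(
            principal_id=other_principal_id,
            dataset_ids=dataset.id,  # a single UUID is normalized to a list internally
            permission_name="read",
            owner_id=owner.id,
        )

        # Verify the owner's read permission on the dataset (returns None).
        await check_permission_on_dataset(owner, "read", dataset.id)

    # Run with, e.g.: asyncio.run(share_dataset_example(some_uuid))

The permission names accepted here are the ones listed in the docstrings above: read, write, delete, and share.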
From f651991c86d6fdc9dce6362100c3719dc9c2f5f6 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Wed, 17 Sep 2025 12:02:38 +0200 Subject: [PATCH 04/32] feat: adds base class + renames rdflib implementation --- cognee/api/v1/cognify/cognify.py | 4 +-- .../get_default_tasks_by_indices.py | 4 +-- .../utils/expand_with_nodes_and_edges.py | 16 ++++----- .../ontology/base_ontology_resolver.py | 30 ++++++++++++++++ cognee/modules/ontology/models.py | 20 +++++++++++ .../ontology/rdf_xml/OntologyResolver.py | 35 +++++++------------ cognee/tasks/graph/extract_graph_from_data.py | 6 ++-- .../tasks/graph/extract_graph_from_data_v2.py | 4 +-- .../modules/ontology/test_ontology_adapter.py | 21 +++++------ 9 files changed, 90 insertions(+), 50 deletions(-) create mode 100644 cognee/modules/ontology/base_ontology_resolver.py create mode 100644 cognee/modules/ontology/models.py diff --git a/cognee/api/v1/cognify/cognify.py b/cognee/api/v1/cognify/cognify.py index e4f91b44c..a35658691 100644 --- a/cognee/api/v1/cognify/cognify.py +++ b/cognee/api/v1/cognify/cognify.py @@ -10,7 +10,7 @@ from cognee.infrastructure.llm import get_max_chunk_tokens from cognee.modules.pipelines import run_pipeline from cognee.modules.pipelines.tasks.task import Task from cognee.modules.chunking.TextChunker import TextChunker -from cognee.modules.ontology.rdf_xml.OntologyResolver import OntologyResolver +from cognee.modules.ontology.rdf_xml.OntologyResolver import RDFLibOntologyResolver from cognee.modules.users.models import User from cognee.tasks.documents import ( @@ -230,7 +230,7 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's Task( extract_graph_from_data, graph_model=graph_model, - ontology_adapter=OntologyResolver(ontology_file=ontology_file_path), + ontology_adapter=RDFLibOntologyResolver(ontology_file=ontology_file_path), custom_prompt=custom_prompt, task_config={"batch_size": 10}, ), # Generate knowledge graphs from the document chunks. 
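Beyond the import-site change above, the renamed class is used exactly like the old OntologyResolver. A minimal sketch of exercising it directly, mirroring the unit tests updated at the end of this patch (the example.org namespace and car triples are illustrative, and "ontology.owl" is a hypothetical file path; the module is renamed to RDFLibOntologyResolver.py in a later patch of this series):

    from rdflib import Graph, Namespace, RDF, OWL

    from cognee.modules.ontology.rdf_xml.OntologyResolver import RDFLibOntologyResolver

    # Build a tiny in-memory ontology: one class and one individual.
    ex = Namespace("http://example.org/test#")
    graph = Graph()
    graph.add((ex.Car, RDF.type, OWL.Class))
    graph.add((ex.Audi, RDF.type, ex.Car))

    resolver = RDFLibOntologyResolver()  # or RDFLibOntologyResolver(ontology_file="ontology.owl")
    resolver.graph = graph
    resolver.build_lookup()

    # Lookup keys are normalized to lowercase; close misspellings are
    # matched fuzzily (difflib, cutoff 0.8).
    print(resolver.find_closest_match("audi", "individuals"))  # -> "audi"

    # Subgraph extraction returns attached nodes, edge triples, and the
    # matched start node.
    nodes, relationships, start_node = resolver.get_subgraph("audi", node_type="individuals")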
diff --git a/cognee/eval_framework/corpus_builder/task_getters/get_default_tasks_by_indices.py b/cognee/eval_framework/corpus_builder/task_getters/get_default_tasks_by_indices.py index be532232f..677090a58 100644 --- a/cognee/eval_framework/corpus_builder/task_getters/get_default_tasks_by_indices.py +++ b/cognee/eval_framework/corpus_builder/task_getters/get_default_tasks_by_indices.py @@ -5,7 +5,7 @@ from cognee.modules.chunking.TextChunker import TextChunker from cognee.tasks.graph import extract_graph_from_data from cognee.tasks.storage import add_data_points from cognee.shared.data_models import KnowledgeGraph -from cognee.modules.ontology.rdf_xml.OntologyResolver import OntologyResolver +from cognee.modules.ontology.rdf_xml.OntologyResolver import RDFLibOntologyResolver async def get_default_tasks_by_indices( @@ -33,7 +33,7 @@ async def get_no_summary_tasks( # Get base tasks (0=classify, 1=check_permissions, 2=extract_chunks) base_tasks = await get_default_tasks_by_indices([0, 1, 2], chunk_size, chunker) - ontology_adapter = OntologyResolver(ontology_file=ontology_file_path) + ontology_adapter = RDFLibOntologyResolver(ontology_file=ontology_file_path) graph_task = Task( extract_graph_from_data, diff --git a/cognee/modules/graph/utils/expand_with_nodes_and_edges.py b/cognee/modules/graph/utils/expand_with_nodes_and_edges.py index 125f59e72..3bd62e6e0 100644 --- a/cognee/modules/graph/utils/expand_with_nodes_and_edges.py +++ b/cognee/modules/graph/utils/expand_with_nodes_and_edges.py @@ -8,7 +8,7 @@ from cognee.modules.engine.utils import ( generate_node_name, ) from cognee.shared.data_models import KnowledgeGraph -from cognee.modules.ontology.rdf_xml.OntologyResolver import OntologyResolver +from cognee.modules.ontology.rdf_xml.OntologyResolver import RDFLibOntologyResolver def _create_node_key(node_id: str, category: str) -> str: @@ -83,7 +83,7 @@ def _process_ontology_edges( def _create_type_node( node_type: str, - ontology_resolver: OntologyResolver, + ontology_resolver: RDFLibOntologyResolver, added_nodes_map: dict, added_ontology_nodes_map: dict, name_mapping: dict, @@ -141,7 +141,7 @@ def _create_entity_node( node_name: str, node_description: str, type_node: EntityType, - ontology_resolver: OntologyResolver, + ontology_resolver: RDFLibOntologyResolver, added_nodes_map: dict, added_ontology_nodes_map: dict, name_mapping: dict, @@ -198,7 +198,7 @@ def _create_entity_node( def _process_graph_nodes( data_chunk: DocumentChunk, graph: KnowledgeGraph, - ontology_resolver: OntologyResolver, + ontology_resolver: RDFLibOntologyResolver, added_nodes_map: dict, added_ontology_nodes_map: dict, name_mapping: dict, @@ -277,7 +277,7 @@ def _process_graph_edges( def expand_with_nodes_and_edges( data_chunks: list[DocumentChunk], chunk_graphs: list[KnowledgeGraph], - ontology_resolver: OntologyResolver = None, + ontology_resolver: RDFLibOntologyResolver = None, existing_edges_map: Optional[dict[str, bool]] = None, ): """ @@ -296,8 +296,8 @@ def expand_with_nodes_and_edges( chunk_graphs (list[KnowledgeGraph]): List of knowledge graphs corresponding to each data chunk. Each graph contains nodes (entities) and edges (relationships) extracted from the chunk content. - ontology_resolver (OntologyResolver, optional): Resolver for validating entities and - types against an ontology. If None, a default OntologyResolver is created. + ontology_resolver (RDFLibOntologyResolver, optional): Resolver for validating entities and + types against an ontology. If None, a default RDFLibOntologyResolver is created. 
            Defaults to None.
        existing_edges_map (dict[str, bool], optional): Mapping of existing edge keys to prevent
            duplicate edge creation. Keys are formatted as "{source_id}_{target_id}_{relation}".
@@ -320,7 +320,7 @@ def expand_with_nodes_and_edges(
         existing_edges_map = {}
 
     if ontology_resolver is None:
-        ontology_resolver = OntologyResolver()
+        ontology_resolver = RDFLibOntologyResolver()
 
     added_nodes_map = {}
     added_ontology_nodes_map = {}
diff --git a/cognee/modules/ontology/base_ontology_resolver.py b/cognee/modules/ontology/base_ontology_resolver.py
new file mode 100644
index 000000000..55826bfb0
--- /dev/null
+++ b/cognee/modules/ontology/base_ontology_resolver.py
@@ -0,0 +1,30 @@
+from abc import ABC, abstractmethod
+from typing import List, Tuple, Optional
+
+from cognee.modules.ontology.models import AttachedOntologyNode
+
+
+class BaseOntologyResolver(ABC):
+    """Abstract base class for ontology resolvers."""
+
+    @abstractmethod
+    def build_lookup(self) -> None:
+        """Build the lookup dictionary for ontology entities."""
+        pass
+
+    @abstractmethod
+    def refresh_lookup(self) -> None:
+        """Refresh the lookup dictionary."""
+        pass
+
+    @abstractmethod
+    def find_closest_match(self, name: str, category: str) -> Optional[str]:
+        """Find the closest match for a given name in the specified category."""
+        pass
+
+    @abstractmethod
+    def get_subgraph(
+        self, node_name: str, node_type: str = "individuals", directed: bool = True
+    ) -> Tuple[List[AttachedOntologyNode], List[Tuple[str, str, str]], Optional[AttachedOntologyNode]]:
+        """Get a subgraph for the given node."""
+        pass
diff --git a/cognee/modules/ontology/models.py b/cognee/modules/ontology/models.py
new file mode 100644
index 000000000..eefa9e5dd
--- /dev/null
+++ b/cognee/modules/ontology/models.py
@@ -0,0 +1,20 @@
+from typing import Any
+
+
+class AttachedOntologyNode:
+    """Lightweight wrapper to be able to parse any ontology solution and generalize cognee interface."""
+
+    def __init__(self, uri: Any, category: str):
+        self.uri = uri
+        self.name = self._extract_name(uri)
+        self.category = category
+
+    @staticmethod
+    def _extract_name(uri: Any) -> str:
+        uri_str = str(uri)
+        if "#" in uri_str:
+            return uri_str.split("#")[-1]
+        return uri_str.rstrip("/").split("/")[-1]
+
+    def __repr__(self):
+        return f"AttachedOntologyNode(name={self.name}, category={self.category})"
diff --git a/cognee/modules/ontology/rdf_xml/OntologyResolver.py b/cognee/modules/ontology/rdf_xml/OntologyResolver.py
index 7f3fa004d..3c1a55b5a 100644
--- a/cognee/modules/ontology/rdf_xml/OntologyResolver.py
+++ b/cognee/modules/ontology/rdf_xml/OntologyResolver.py
@@ -10,30 +10,19 @@ from cognee.modules.ontology.exceptions import (
     FindClosestMatchError,
     GetSubgraphError,
 )
+from cognee.modules.ontology.base_ontology_resolver import BaseOntologyResolver
+from cognee.modules.ontology.models import AttachedOntologyNode
 
 logger = get_logger("OntologyAdapter")
 
 
-class AttachedOntologyNode:
-    """Lightweight wrapper to be able to parse any ontology solution and generalize cognee interface."""
-
-    def __init__(self, uri: URIRef, category: str):
-        self.uri = uri
-        self.name = self._extract_name(uri)
-        self.category = category
-
-    @staticmethod
-    def _extract_name(uri: URIRef) -> str:
-        uri_str = str(uri)
-        if "#" in uri_str:
-            return uri_str.split("#")[-1]
-        return uri_str.rstrip("/").split("/")[-1]
-
-    def __repr__(self):
-        return f"AttachedOntologyNode(name={self.name}, category={self.category})"
-
-
-class OntologyResolver:
+class RDFLibOntologyResolver(BaseOntologyResolver):
+    """RDFLib-based ontology resolver implementation.
+
+    This implementation uses RDFLib to parse and work with RDF/OWL ontology files.
+    It provides fuzzy matching and subgraph extraction capabilities for ontology entities.
+    """
+
     def __init__(self, ontology_file: Optional[str] = None):
         self.ontology_file = ontology_file
         try:
@@ -60,7 +49,7 @@ class RDFLibOntologyResolver(BaseOntologyResolver):
             name = uri_str.rstrip("/").split("/")[-1]
         return name.lower().replace(" ", "_").strip()
 
-    def build_lookup(self):
+    def build_lookup(self) -> None:
         try:
             classes: Dict[str, URIRef] = {}
             individuals: Dict[str, URIRef] = {}
@@ -97,7 +86,7 @@ class RDFLibOntologyResolver(BaseOntologyResolver):
             logger.error("Failed to build lookup dictionary: %s", str(e))
             raise RuntimeError("Lookup build failed") from e
 
-    def refresh_lookup(self):
+    def refresh_lookup(self) -> None:
         self.build_lookup()
         logger.info("Ontology lookup refreshed.")
 
@@ -125,7 +114,7 @@ class RDFLibOntologyResolver(BaseOntologyResolver):
 
     def get_subgraph(
         self, node_name: str, node_type: str = "individuals", directed: bool = True
-    ) -> Tuple[List[Any], List[Tuple[str, str, str]], Optional[Any]]:
+    ) -> Tuple[List[AttachedOntologyNode], List[Tuple[str, str, str]], Optional[AttachedOntologyNode]]:
         nodes_set = set()
         edges: List[Tuple[str, str, str]] = []
         visited = set()
diff --git a/cognee/tasks/graph/extract_graph_from_data.py b/cognee/tasks/graph/extract_graph_from_data.py
index d81516206..2ad32f308 100644
--- a/cognee/tasks/graph/extract_graph_from_data.py
+++ b/cognee/tasks/graph/extract_graph_from_data.py
@@ -4,7 +4,7 @@ from pydantic import BaseModel
 
 from cognee.infrastructure.databases.graph import get_graph_engine
 from cognee.tasks.storage.add_data_points import add_data_points
-from cognee.modules.ontology.rdf_xml.OntologyResolver import OntologyResolver
+from cognee.modules.ontology.rdf_xml.OntologyResolver import RDFLibOntologyResolver
 from cognee.modules.chunking.models.DocumentChunk import DocumentChunk
 from cognee.modules.graph.utils import (
     expand_with_nodes_and_edges,
@@ -24,7 +24,7 @@ async def integrate_chunk_graphs(
     data_chunks: list[DocumentChunk],
     chunk_graphs: list,
     graph_model: Type[BaseModel],
-    ontology_adapter: OntologyResolver,
+    ontology_adapter: RDFLibOntologyResolver,
 ) -> List[DocumentChunk]:
     """Updates DocumentChunk objects, integrates data points and edges into databases."""
 
@@ -70,7 +70,7 @@ async def integrate_chunk_graphs(
 async def extract_graph_from_data(
     data_chunks: List[DocumentChunk],
     graph_model: Type[BaseModel],
-    ontology_adapter: OntologyResolver = None,
+    ontology_adapter: RDFLibOntologyResolver = None,
     custom_prompt: Optional[str] = None,
 ) -> List[DocumentChunk]:
     """
diff --git a/cognee/tasks/graph/extract_graph_from_data_v2.py b/cognee/tasks/graph/extract_graph_from_data_v2.py
index c1f43df5c..ce69f9b0e 100644
--- a/cognee/tasks/graph/extract_graph_from_data_v2.py
+++ b/cognee/tasks/graph/extract_graph_from_data_v2.py
@@ -3,7 +3,7 @@ from typing import List
 
 from cognee.modules.chunking.models.DocumentChunk import DocumentChunk
 from cognee.shared.data_models import KnowledgeGraph
-from cognee.modules.ontology.rdf_xml.OntologyResolver import OntologyResolver
+from cognee.modules.ontology.rdf_xml.OntologyResolver import RDFLibOntologyResolver
 from cognee.tasks.graph.cascade_extract.utils.extract_nodes import extract_nodes
 from cognee.tasks.graph.cascade_extract.utils.extract_content_nodes_and_relationship_names import (
     extract_content_nodes_and_relationship_names,
@@ -17,7 +17,7 @@ from cognee.tasks.graph.extract_graph_from_data import integrate_chunk_graphs
 
 
 async def extract_graph_from_data(
     data_chunks: List[DocumentChunk],
     n_rounds: int = 2,
-    ontology_adapter: OntologyResolver = None,
+    ontology_adapter: RDFLibOntologyResolver = None,
 ) -> List[DocumentChunk]:
     """Extract and update graph data from document chunks in multiple steps."""
     chunk_nodes = await asyncio.gather(
diff --git a/cognee/tests/unit/modules/ontology/test_ontology_adapter.py b/cognee/tests/unit/modules/ontology/test_ontology_adapter.py
index 8b406e53a..e0a6f1402 100644
--- a/cognee/tests/unit/modules/ontology/test_ontology_adapter.py
+++ b/cognee/tests/unit/modules/ontology/test_ontology_adapter.py
@@ -1,12 +1,13 @@
 import pytest
 from rdflib import Graph, Namespace, RDF, OWL, RDFS
-from cognee.modules.ontology.rdf_xml.OntologyResolver import OntologyResolver, AttachedOntologyNode
+from cognee.modules.ontology.rdf_xml.OntologyResolver import RDFLibOntologyResolver
+from cognee.modules.ontology.models import AttachedOntologyNode
 
 
 def test_ontology_adapter_initialization_success():
     """Test successful initialization of OntologyAdapter."""
-    adapter = OntologyResolver()
+    adapter = RDFLibOntologyResolver()
     adapter.build_lookup()
 
     assert isinstance(adapter.lookup, dict)
 
 
 def test_ontology_adapter_initialization_file_not_found():
     """Test OntologyAdapter initialization with nonexistent file."""
-    adapter = OntologyResolver(ontology_file="nonexistent.owl")
+    adapter = RDFLibOntologyResolver(ontology_file="nonexistent.owl")
 
     assert adapter.graph is None
 
@@ -27,7 +28,7 @@ def test_build_lookup():
 
     g.add((ns.Audi, RDF.type, ns.Car))
 
-    resolver = OntologyResolver()
+    resolver = RDFLibOntologyResolver()
     resolver.graph = g
     resolver.build_lookup()
 
@@ -50,7 +51,7 @@ def test_find_closest_match_exact():
     g.add((ns.Car, RDF.type, OWL.Class))
     g.add((ns.Audi, RDF.type, ns.Car))
 
-    resolver = OntologyResolver()
+    resolver = RDFLibOntologyResolver()
     resolver.graph = g
     resolver.build_lookup()
 
@@ -71,7 +72,7 @@ def test_find_closest_match_fuzzy():
     g.add((ns.Audi, RDF.type, ns.Car))
     g.add((ns.BMW, RDF.type, ns.Car))
 
-    resolver = OntologyResolver()
+    resolver = RDFLibOntologyResolver()
     resolver.graph = g
     resolver.build_lookup()
 
@@ -92,7 +93,7 @@ def test_find_closest_match_no_match():
     g.add((ns.Audi, RDF.type, ns.Car))
     g.add((ns.BMW, RDF.type, ns.Car))
 
-    resolver = OntologyResolver()
+    resolver = RDFLibOntologyResolver()
     resolver.graph = g
     resolver.build_lookup()
 
@@ -105,7 +106,7 @@ def test_get_subgraph_no_match_rdflib():
     """Test get_subgraph returns empty results for a non-existent node."""
     g = Graph()
 
-    resolver = OntologyResolver()
+    resolver = RDFLibOntologyResolver()
     resolver.graph = g
     resolver.build_lookup()
 
@@ -138,7 +139,7 @@ def test_get_subgraph_success_rdflib():
     g.add((ns.VW, owns, ns.Audi))
     g.add((ns.VW, owns, ns.Porsche))
 
-    resolver = OntologyResolver()
+    resolver = RDFLibOntologyResolver()
     resolver.graph = g
     resolver.build_lookup()
 
@@ -163,7 +164,7 @@ def test_refresh_lookup_rdflib():
     """Test that refresh_lookup rebuilds the lookup dict into a new object."""
     g = Graph()
 
-    resolver = OntologyResolver()
+    resolver = RDFLibOntologyResolver()
     resolver.graph = g
     resolver.build_lookup()
 

From 93a383b56a4e774a863a84847b4eb62ce61789cf Mon Sep 17 00:00:00 2001
From: hajdul88 <52442977+hajdul88@users.noreply.github.com>
Date: Wed, 17 Sep 2025 12:23:30 +0200
Subject: [PATCH 05/32] feat: adds matching strategies and moves resolver

---
 cognee/api/v1/cognify/cognify.py              |  2 +-
 .../get_default_tasks_by_indices.py           |  2 +-
.../utils/expand_with_nodes_and_edges.py | 2 +- .../ontology/base_ontology_resolver.py | 10 ++++ .../modules/ontology/matching_strategies.py | 55 +++++++++++++++++++ ...yResolver.py => RDFLibOntologyResolver.py} | 13 ++--- cognee/tasks/graph/extract_graph_from_data.py | 2 +- .../tasks/graph/extract_graph_from_data_v2.py | 2 +- .../modules/ontology/test_ontology_adapter.py | 2 +- 9 files changed, 76 insertions(+), 14 deletions(-) create mode 100644 cognee/modules/ontology/matching_strategies.py rename cognee/modules/ontology/rdf_xml/{OntologyResolver.py => RDFLibOntologyResolver.py} (95%) diff --git a/cognee/api/v1/cognify/cognify.py b/cognee/api/v1/cognify/cognify.py index a35658691..e933bafd8 100644 --- a/cognee/api/v1/cognify/cognify.py +++ b/cognee/api/v1/cognify/cognify.py @@ -10,7 +10,7 @@ from cognee.infrastructure.llm import get_max_chunk_tokens from cognee.modules.pipelines import run_pipeline from cognee.modules.pipelines.tasks.task import Task from cognee.modules.chunking.TextChunker import TextChunker -from cognee.modules.ontology.rdf_xml.OntologyResolver import RDFLibOntologyResolver +from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver from cognee.modules.users.models import User from cognee.tasks.documents import ( diff --git a/cognee/eval_framework/corpus_builder/task_getters/get_default_tasks_by_indices.py b/cognee/eval_framework/corpus_builder/task_getters/get_default_tasks_by_indices.py index 677090a58..fb10c7eed 100644 --- a/cognee/eval_framework/corpus_builder/task_getters/get_default_tasks_by_indices.py +++ b/cognee/eval_framework/corpus_builder/task_getters/get_default_tasks_by_indices.py @@ -5,7 +5,7 @@ from cognee.modules.chunking.TextChunker import TextChunker from cognee.tasks.graph import extract_graph_from_data from cognee.tasks.storage import add_data_points from cognee.shared.data_models import KnowledgeGraph -from cognee.modules.ontology.rdf_xml.OntologyResolver import RDFLibOntologyResolver +from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver async def get_default_tasks_by_indices( diff --git a/cognee/modules/graph/utils/expand_with_nodes_and_edges.py b/cognee/modules/graph/utils/expand_with_nodes_and_edges.py index 3bd62e6e0..bc6205d41 100644 --- a/cognee/modules/graph/utils/expand_with_nodes_and_edges.py +++ b/cognee/modules/graph/utils/expand_with_nodes_and_edges.py @@ -8,7 +8,7 @@ from cognee.modules.engine.utils import ( generate_node_name, ) from cognee.shared.data_models import KnowledgeGraph -from cognee.modules.ontology.rdf_xml.OntologyResolver import RDFLibOntologyResolver +from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver def _create_node_key(node_id: str, category: str) -> str: diff --git a/cognee/modules/ontology/base_ontology_resolver.py b/cognee/modules/ontology/base_ontology_resolver.py index 55826bfb0..86f51fcb7 100644 --- a/cognee/modules/ontology/base_ontology_resolver.py +++ b/cognee/modules/ontology/base_ontology_resolver.py @@ -2,10 +2,20 @@ from abc import ABC, abstractmethod from typing import List, Tuple, Optional from cognee.modules.ontology.models import AttachedOntologyNode +from cognee.modules.ontology.matching_strategies import MatchingStrategy, FuzzyMatchingStrategy class BaseOntologyResolver(ABC): """Abstract base class for ontology resolvers.""" + + def __init__(self, matching_strategy: Optional[MatchingStrategy] = None): + """Initialize the ontology resolver with a matching strategy. 
+ + Args: + matching_strategy: The strategy to use for entity matching. + Defaults to FuzzyMatchingStrategy if None. + """ + self.matching_strategy = matching_strategy or FuzzyMatchingStrategy() @abstractmethod def build_lookup(self) -> None: diff --git a/cognee/modules/ontology/matching_strategies.py b/cognee/modules/ontology/matching_strategies.py new file mode 100644 index 000000000..c576bf6e2 --- /dev/null +++ b/cognee/modules/ontology/matching_strategies.py @@ -0,0 +1,55 @@ +import difflib +from abc import ABC, abstractmethod +from typing import List, Optional + + +class MatchingStrategy(ABC): + """Abstract base class for ontology entity matching strategies.""" + + @abstractmethod + def find_match(self, name: str, candidates: List[str]) -> Optional[str]: + """Find the best match for a given name from a list of candidates. + + Args: + name: The name to match + candidates: List of candidate names to match against + + Returns: + The best matching candidate name, or None if no match found + """ + pass + + +class FuzzyMatchingStrategy(MatchingStrategy): + """Fuzzy matching strategy using difflib for approximate string matching.""" + + def __init__(self, cutoff: float = 0.8): + """Initialize fuzzy matching strategy. + + Args: + cutoff: Minimum similarity score (0.0 to 1.0) for a match to be considered valid + """ + self.cutoff = cutoff + + def find_match(self, name: str, candidates: List[str]) -> Optional[str]: + """Find the closest fuzzy match for a given name. + + Args: + name: The normalized name to match + candidates: List of normalized candidate names + + Returns: + The best matching candidate name, or None if no match meets the cutoff + """ + if not candidates: + return None + + # Check for exact match first + if name in candidates: + return name + + # Find fuzzy match + best_match = difflib.get_close_matches( + name, candidates, n=1, cutoff=self.cutoff + ) + return best_match[0] if best_match else None diff --git a/cognee/modules/ontology/rdf_xml/OntologyResolver.py b/cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py similarity index 95% rename from cognee/modules/ontology/rdf_xml/OntologyResolver.py rename to cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py index 3c1a55b5a..d8de5794a 100644 --- a/cognee/modules/ontology/rdf_xml/OntologyResolver.py +++ b/cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py @@ -12,6 +12,7 @@ from cognee.modules.ontology.exceptions import ( ) from cognee.modules.ontology.base_ontology_resolver import BaseOntologyResolver from cognee.modules.ontology.models import AttachedOntologyNode +from cognee.modules.ontology.matching_strategies import MatchingStrategy, FuzzyMatchingStrategy logger = get_logger("OntologyAdapter") @@ -23,7 +24,8 @@ class RDFLibOntologyResolver(BaseOntologyResolver): It provides fuzzy matching and subgraph extraction capabilities for ontology entities. 
""" - def __init__(self, ontology_file: Optional[str] = None): + def __init__(self, ontology_file: Optional[str] = None, matching_strategy: Optional[MatchingStrategy] = None): + super().__init__(matching_strategy) self.ontology_file = ontology_file try: if ontology_file and os.path.exists(ontology_file): @@ -94,13 +96,8 @@ class RDFLibOntologyResolver(BaseOntologyResolver): try: normalized_name = name.lower().replace(" ", "_").strip() possible_matches = list(self.lookup.get(category, {}).keys()) - if normalized_name in possible_matches: - return normalized_name - - best_match = difflib.get_close_matches( - normalized_name, possible_matches, n=1, cutoff=0.8 - ) - return best_match[0] if best_match else None + + return self.matching_strategy.find_match(normalized_name, possible_matches) except Exception as e: logger.error("Error in find_closest_match: %s", str(e)) raise FindClosestMatchError() from e diff --git a/cognee/tasks/graph/extract_graph_from_data.py b/cognee/tasks/graph/extract_graph_from_data.py index 2ad32f308..22cbc70fe 100644 --- a/cognee/tasks/graph/extract_graph_from_data.py +++ b/cognee/tasks/graph/extract_graph_from_data.py @@ -4,7 +4,7 @@ from pydantic import BaseModel from cognee.infrastructure.databases.graph import get_graph_engine from cognee.tasks.storage.add_data_points import add_data_points -from cognee.modules.ontology.rdf_xml.OntologyResolver import RDFLibOntologyResolver +from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver from cognee.modules.chunking.models.DocumentChunk import DocumentChunk from cognee.modules.graph.utils import ( expand_with_nodes_and_edges, diff --git a/cognee/tasks/graph/extract_graph_from_data_v2.py b/cognee/tasks/graph/extract_graph_from_data_v2.py index ce69f9b0e..d2b4924c7 100644 --- a/cognee/tasks/graph/extract_graph_from_data_v2.py +++ b/cognee/tasks/graph/extract_graph_from_data_v2.py @@ -3,7 +3,7 @@ from typing import List from cognee.modules.chunking.models.DocumentChunk import DocumentChunk from cognee.shared.data_models import KnowledgeGraph -from cognee.modules.ontology.rdf_xml.OntologyResolver import RDFLibOntologyResolver +from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver from cognee.tasks.graph.cascade_extract.utils.extract_nodes import extract_nodes from cognee.tasks.graph.cascade_extract.utils.extract_content_nodes_and_relationship_names import ( extract_content_nodes_and_relationship_names, diff --git a/cognee/tests/unit/modules/ontology/test_ontology_adapter.py b/cognee/tests/unit/modules/ontology/test_ontology_adapter.py index e0a6f1402..051cb3556 100644 --- a/cognee/tests/unit/modules/ontology/test_ontology_adapter.py +++ b/cognee/tests/unit/modules/ontology/test_ontology_adapter.py @@ -1,6 +1,6 @@ import pytest from rdflib import Graph, Namespace, RDF, OWL, RDFS -from cognee.modules.ontology.rdf_xml.OntologyResolver import RDFLibOntologyResolver +from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver from cognee.modules.ontology.models import AttachedOntologyNode From 00c3ba3a0ccbad28b203938c5d8a47eb7594b492 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Wed, 17 Sep 2025 12:24:39 +0200 Subject: [PATCH 06/32] ruff fix --- .../ontology/base_ontology_resolver.py | 10 ++++---- .../modules/ontology/matching_strategies.py | 24 +++++++++---------- .../rdf_xml/RDFLibOntologyResolver.py | 16 +++++++++---- 3 files changed, 28 insertions(+), 22 deletions(-) diff --git 
a/cognee/modules/ontology/base_ontology_resolver.py b/cognee/modules/ontology/base_ontology_resolver.py index 86f51fcb7..7005e6981 100644 --- a/cognee/modules/ontology/base_ontology_resolver.py +++ b/cognee/modules/ontology/base_ontology_resolver.py @@ -7,12 +7,12 @@ from cognee.modules.ontology.matching_strategies import MatchingStrategy, FuzzyM class BaseOntologyResolver(ABC): """Abstract base class for ontology resolvers.""" - + def __init__(self, matching_strategy: Optional[MatchingStrategy] = None): """Initialize the ontology resolver with a matching strategy. - + Args: - matching_strategy: The strategy to use for entity matching. + matching_strategy: The strategy to use for entity matching. Defaults to FuzzyMatchingStrategy if None. """ self.matching_strategy = matching_strategy or FuzzyMatchingStrategy() @@ -35,6 +35,8 @@ class BaseOntologyResolver(ABC): @abstractmethod def get_subgraph( self, node_name: str, node_type: str = "individuals", directed: bool = True - ) -> Tuple[List[AttachedOntologyNode], List[Tuple[str, str, str]], Optional[AttachedOntologyNode]]: + ) -> Tuple[ + List[AttachedOntologyNode], List[Tuple[str, str, str]], Optional[AttachedOntologyNode] + ]: """Get a subgraph for the given node.""" pass diff --git a/cognee/modules/ontology/matching_strategies.py b/cognee/modules/ontology/matching_strategies.py index c576bf6e2..0e8ba7b96 100644 --- a/cognee/modules/ontology/matching_strategies.py +++ b/cognee/modules/ontology/matching_strategies.py @@ -5,15 +5,15 @@ from typing import List, Optional class MatchingStrategy(ABC): """Abstract base class for ontology entity matching strategies.""" - + @abstractmethod def find_match(self, name: str, candidates: List[str]) -> Optional[str]: """Find the best match for a given name from a list of candidates. - + Args: name: The name to match candidates: List of candidate names to match against - + Returns: The best matching candidate name, or None if no match found """ @@ -22,34 +22,32 @@ class MatchingStrategy(ABC): class FuzzyMatchingStrategy(MatchingStrategy): """Fuzzy matching strategy using difflib for approximate string matching.""" - + def __init__(self, cutoff: float = 0.8): """Initialize fuzzy matching strategy. - + Args: cutoff: Minimum similarity score (0.0 to 1.0) for a match to be considered valid """ self.cutoff = cutoff - + def find_match(self, name: str, candidates: List[str]) -> Optional[str]: """Find the closest fuzzy match for a given name. - + Args: name: The normalized name to match candidates: List of normalized candidate names - + Returns: The best matching candidate name, or None if no match meets the cutoff """ if not candidates: return None - + # Check for exact match first if name in candidates: return name - + # Find fuzzy match - best_match = difflib.get_close_matches( - name, candidates, n=1, cutoff=self.cutoff - ) + best_match = difflib.get_close_matches(name, candidates, n=1, cutoff=self.cutoff) return best_match[0] if best_match else None diff --git a/cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py b/cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py index d8de5794a..c6b3e22be 100644 --- a/cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py +++ b/cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py @@ -19,12 +19,16 @@ logger = get_logger("OntologyAdapter") class RDFLibOntologyResolver(BaseOntologyResolver): """RDFLib-based ontology resolver implementation. - + This implementation uses RDFLib to parse and work with RDF/OWL ontology files. 
     It provides fuzzy matching and subgraph extraction capabilities for ontology entities.
     """
-    
-    def __init__(self, ontology_file: Optional[str] = None, matching_strategy: Optional[MatchingStrategy] = None):
+
+    def __init__(
+        self,
+        ontology_file: Optional[str] = None,
+        matching_strategy: Optional[MatchingStrategy] = None,
+    ):
         super().__init__(matching_strategy)
         self.ontology_file = ontology_file
         try:
@@ -96,7 +100,7 @@ class RDFLibOntologyResolver(BaseOntologyResolver):
         try:
             normalized_name = name.lower().replace(" ", "_").strip()
             possible_matches = list(self.lookup.get(category, {}).keys())
-            
+
             return self.matching_strategy.find_match(normalized_name, possible_matches)
         except Exception as e:
             logger.error("Error in find_closest_match: %s", str(e))
             raise FindClosestMatchError() from e
@@ -115,7 +119,9 @@ class RDFLibOntologyResolver(BaseOntologyResolver):
 
     def get_subgraph(
         self, node_name: str, node_type: str = "individuals", directed: bool = True
-    ) -> Tuple[List[AttachedOntologyNode], List[Tuple[str, str, str]], Optional[AttachedOntologyNode]]:
+    ) -> Tuple[
+        List[AttachedOntologyNode], List[Tuple[str, str, str]], Optional[AttachedOntologyNode]
+    ]:
         nodes_set = set()
         edges: List[Tuple[str, str, str]] = []
         visited = set()

From 631b2f37ce0b8bad90bc5cdb3bcdc7d35d4c9f0e Mon Sep 17 00:00:00 2001
From: hajdul88 <52442977+hajdul88@users.noreply.github.com>
Date: Wed, 17 Sep 2025 12:28:44 +0200
Subject: [PATCH 07/32] fix: deletes old ontology resolver instance

---
 cognee/tasks/graph/extract_graph_from_data.py    | 2 +-
 cognee/tasks/graph/extract_graph_from_data_v2.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cognee/tasks/graph/extract_graph_from_data.py b/cognee/tasks/graph/extract_graph_from_data.py
index 22cbc70fe..1ae28ca89 100644
--- a/cognee/tasks/graph/extract_graph_from_data.py
+++ b/cognee/tasks/graph/extract_graph_from_data.py
@@ -102,5 +102,5 @@ async def extract_graph_from_data(
     ]
 
     return await integrate_chunk_graphs(
-        data_chunks, chunk_graphs, graph_model, ontology_adapter or OntologyResolver()
+        data_chunks, chunk_graphs, graph_model, ontology_adapter
     )
diff --git a/cognee/tasks/graph/extract_graph_from_data_v2.py b/cognee/tasks/graph/extract_graph_from_data_v2.py
index d2b4924c7..5a4194fb1 100644
--- a/cognee/tasks/graph/extract_graph_from_data_v2.py
+++ b/cognee/tasks/graph/extract_graph_from_data_v2.py
@@ -44,5 +44,5 @@ async def extract_graph_from_data(
         data_chunks=data_chunks,
         chunk_graphs=chunk_graphs,
         graph_model=KnowledgeGraph,
-        ontology_adapter=ontology_adapter or OntologyResolver(),
+        ontology_adapter=ontology_adapter,
     )

From 75bef6e9299677f9e569f4ff2096b032578f7ae8 Mon Sep 17 00:00:00 2001
From: hajdul88 <52442977+hajdul88@users.noreply.github.com>
Date: Wed, 17 Sep 2025 12:30:19 +0200
Subject: [PATCH 08/32] ruff fix

---
 cognee/tasks/graph/extract_graph_from_data.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/cognee/tasks/graph/extract_graph_from_data.py b/cognee/tasks/graph/extract_graph_from_data.py
index 1ae28ca89..6681dd975 100644
--- a/cognee/tasks/graph/extract_graph_from_data.py
+++ b/cognee/tasks/graph/extract_graph_from_data.py
@@ -101,6 +101,4 @@ async def extract_graph_from_data(
         if edge.source_node_id in valid_node_ids and edge.target_node_id in valid_node_ids
     ]
 
-    return await integrate_chunk_graphs(
-        data_chunks, chunk_graphs, graph_model, ontology_adapter
-    )
+    return await integrate_chunk_graphs(data_chunks, chunk_graphs, graph_model, ontology_adapter)

From 6261fca0c4fe57ab9bfe6d66dc7a9c7e2550c608 Mon Sep 17 00:00:00 2001
From: hajdul88 <52442977+hajdul88@users.noreply.github.com>
Date: Wed, 17 Sep 2025 12:44:04 +0200
Subject: [PATCH 09/32] feat: adds default ontology resolver

---
 .../graph/utils/expand_with_nodes_and_edges.py |  3 ++-
 .../ontology/get_default_ontology_resolver.py  | 18 ++++++++++++++++++
 .../modules/ontology/test_ontology_adapter.py  |  7 ++++---
 3 files changed, 24 insertions(+), 4 deletions(-)
 create mode 100644 cognee/modules/ontology/get_default_ontology_resolver.py

diff --git a/cognee/modules/graph/utils/expand_with_nodes_and_edges.py b/cognee/modules/graph/utils/expand_with_nodes_and_edges.py
index bc6205d41..b3e8e8029 100644
--- a/cognee/modules/graph/utils/expand_with_nodes_and_edges.py
+++ b/cognee/modules/graph/utils/expand_with_nodes_and_edges.py
@@ -9,6 +9,7 @@ from cognee.modules.engine.utils import (
 )
 from cognee.shared.data_models import KnowledgeGraph
 from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver
+from cognee.modules.ontology.get_default_ontology_resolver import get_default_ontology_resolver
 
 
 def _create_node_key(node_id: str, category: str) -> str:
@@ -320,7 +321,7 @@ def expand_with_nodes_and_edges(
         existing_edges_map = {}
 
     if ontology_resolver is None:
-        ontology_resolver = RDFLibOntologyResolver()
+        ontology_resolver = get_default_ontology_resolver()
 
     added_nodes_map = {}
     added_ontology_nodes_map = {}
diff --git a/cognee/modules/ontology/get_default_ontology_resolver.py b/cognee/modules/ontology/get_default_ontology_resolver.py
new file mode 100644
index 000000000..9dc5c59ba
--- /dev/null
+++ b/cognee/modules/ontology/get_default_ontology_resolver.py
@@ -0,0 +1,18 @@
+from typing import Optional
+
+from cognee.modules.ontology.base_ontology_resolver import BaseOntologyResolver
+from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver
+from cognee.modules.ontology.matching_strategies import FuzzyMatchingStrategy
+
+
+def get_default_ontology_resolver(ontology_file: Optional[str] = None) -> BaseOntologyResolver:
+    """Get the default ontology resolver (RDFLib with fuzzy matching).
+ + Args: + ontology_file: Optional path to ontology file + + Returns: + Default RDFLib ontology resolver with fuzzy matching strategy + """ + fuzzy_strategy = FuzzyMatchingStrategy() + return RDFLibOntologyResolver(ontology_file=ontology_file, matching_strategy=fuzzy_strategy) diff --git a/cognee/tests/unit/modules/ontology/test_ontology_adapter.py b/cognee/tests/unit/modules/ontology/test_ontology_adapter.py index 051cb3556..401c6dc02 100644 --- a/cognee/tests/unit/modules/ontology/test_ontology_adapter.py +++ b/cognee/tests/unit/modules/ontology/test_ontology_adapter.py @@ -2,12 +2,13 @@ import pytest from rdflib import Graph, Namespace, RDF, OWL, RDFS from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver from cognee.modules.ontology.models import AttachedOntologyNode +from cognee.modules.ontology.get_default_ontology_resolver import get_default_ontology_resolver def test_ontology_adapter_initialization_success(): """Test successful initialization of OntologyAdapter.""" - adapter = RDFLibOntologyResolver() + adapter = get_default_ontology_resolver() adapter.build_lookup() assert isinstance(adapter.lookup, dict) @@ -106,7 +107,7 @@ def test_get_subgraph_no_match_rdflib(): """Test get_subgraph returns empty results for a non-existent node.""" g = Graph() - resolver = RDFLibOntologyResolver() + resolver = get_default_ontology_resolver() resolver.graph = g resolver.build_lookup() @@ -164,7 +165,7 @@ def test_refresh_lookup_rdflib(): """Test that refresh_lookup rebuilds the lookup dict into a new object.""" g = Graph() - resolver = RDFLibOntologyResolver() + resolver = get_default_ontology_resolver() resolver.graph = g resolver.build_lookup() From 7c046eafab20e8714ee985bb1cf9873c4e9ae3bf Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Wed, 17 Sep 2025 13:28:11 +0200 Subject: [PATCH 10/32] feat: adds get_ontology_resolver + typed dict to hold params --- .../utils/expand_with_nodes_and_edges.py | 5 +-- .../ontology/get_default_ontology_resolver.py | 18 ---------- .../modules/ontology/get_ontology_resolver.py | 36 +++++++++++++++++++ cognee/modules/ontology/ontology_config.py | 16 +++++++++ 4 files changed, 55 insertions(+), 20 deletions(-) delete mode 100644 cognee/modules/ontology/get_default_ontology_resolver.py create mode 100644 cognee/modules/ontology/get_ontology_resolver.py create mode 100644 cognee/modules/ontology/ontology_config.py diff --git a/cognee/modules/graph/utils/expand_with_nodes_and_edges.py b/cognee/modules/graph/utils/expand_with_nodes_and_edges.py index b3e8e8029..e18860744 100644 --- a/cognee/modules/graph/utils/expand_with_nodes_and_edges.py +++ b/cognee/modules/graph/utils/expand_with_nodes_and_edges.py @@ -9,7 +9,7 @@ from cognee.modules.engine.utils import ( ) from cognee.shared.data_models import KnowledgeGraph from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver -from cognee.modules.ontology.get_default_ontology_resolver import get_default_ontology_resolver +from cognee.modules.ontology.get_ontology_resolver import get_ontology_resolver def _create_node_key(node_id: str, category: str) -> str: @@ -321,7 +321,8 @@ def expand_with_nodes_and_edges( existing_edges_map = {} if ontology_resolver is None: - ontology_resolver = get_default_ontology_resolver() + config = get_ontology_resolver() + ontology_resolver = config["resolver"] added_nodes_map = {} added_ontology_nodes_map = {} diff --git a/cognee/modules/ontology/get_default_ontology_resolver.py 
b/cognee/modules/ontology/get_default_ontology_resolver.py deleted file mode 100644 index 9dc5c59ba..000000000 --- a/cognee/modules/ontology/get_default_ontology_resolver.py +++ /dev/null @@ -1,18 +0,0 @@ -from typing import Optional - -from cognee.modules.ontology.base_ontology_resolver import BaseOntologyResolver -from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver -from cognee.modules.ontology.matching_strategies import FuzzyMatchingStrategy - - -def get_default_ontology_resolver(ontology_file: Optional[str] = None) -> BaseOntologyResolver: - """Get the default ontology resolver (RDFLib with fuzzy matching). - - Args: - ontology_file: Optional path to ontology file - - Returns: - Default RDFLib ontology resolver with fuzzy matching strategy - """ - fuzzy_strategy = FuzzyMatchingStrategy() - return RDFLibOntologyResolver(ontology_file=ontology_file, matching_strategy=fuzzy_strategy) diff --git a/cognee/modules/ontology/get_ontology_resolver.py b/cognee/modules/ontology/get_ontology_resolver.py new file mode 100644 index 000000000..01377c162 --- /dev/null +++ b/cognee/modules/ontology/get_ontology_resolver.py @@ -0,0 +1,36 @@ +from typing import Optional + +from cognee.modules.ontology.base_ontology_resolver import BaseOntologyResolver +from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver +from cognee.modules.ontology.matching_strategies import MatchingStrategy, FuzzyMatchingStrategy +from cognee.modules.ontology.ontology_config import OntologyConfig + + +def get_ontology_resolver( + resolver: Optional[BaseOntologyResolver] = None, + matching_strategy: Optional[MatchingStrategy] = None, +) -> OntologyConfig: + """Get ontology resolver configuration with default or custom objects. + + Args: + resolver: Optional pre-configured ontology resolver instance + matching_strategy: Optional matching strategy instance + + Returns: + Ontology configuration with default RDFLib resolver and fuzzy matching strategy, + or custom objects if provided + """ + config: OntologyConfig = {} + + if resolver is not None: + config["resolver"] = resolver + else: + default_strategy = matching_strategy or FuzzyMatchingStrategy() + config["resolver"] = RDFLibOntologyResolver( + ontology_file=None, matching_strategy=default_strategy + ) + + if matching_strategy is not None and resolver is None: + config["matching_strategy"] = matching_strategy + + return config diff --git a/cognee/modules/ontology/ontology_config.py b/cognee/modules/ontology/ontology_config.py new file mode 100644 index 000000000..e28da9f92 --- /dev/null +++ b/cognee/modules/ontology/ontology_config.py @@ -0,0 +1,16 @@ +from typing import TypedDict, Optional + +from cognee.modules.ontology.base_ontology_resolver import BaseOntologyResolver +from cognee.modules.ontology.matching_strategies import MatchingStrategy + + +class OntologyConfig(TypedDict, total=False): + """Configuration for ontology resolver. 
+ + Attributes: + resolver: The ontology resolver instance to use + matching_strategy: The matching strategy to use + """ + + resolver: Optional[BaseOntologyResolver] + matching_strategy: Optional[MatchingStrategy] From 142d8068e12fc0380db4b596e3dc8af63dc1e88d Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Wed, 17 Sep 2025 13:30:20 +0200 Subject: [PATCH 11/32] chore: updates default empty ontology resolver tests --- .../unit/modules/ontology/test_ontology_adapter.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/cognee/tests/unit/modules/ontology/test_ontology_adapter.py b/cognee/tests/unit/modules/ontology/test_ontology_adapter.py index 401c6dc02..74383d5df 100644 --- a/cognee/tests/unit/modules/ontology/test_ontology_adapter.py +++ b/cognee/tests/unit/modules/ontology/test_ontology_adapter.py @@ -2,13 +2,14 @@ import pytest from rdflib import Graph, Namespace, RDF, OWL, RDFS from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver from cognee.modules.ontology.models import AttachedOntologyNode -from cognee.modules.ontology.get_default_ontology_resolver import get_default_ontology_resolver +from cognee.modules.ontology.get_ontology_resolver import get_ontology_resolver def test_ontology_adapter_initialization_success(): """Test successful initialization of OntologyAdapter.""" - adapter = get_default_ontology_resolver() + config = get_ontology_resolver() + adapter = config["resolver"] adapter.build_lookup() assert isinstance(adapter.lookup, dict) @@ -107,7 +108,8 @@ def test_get_subgraph_no_match_rdflib(): """Test get_subgraph returns empty results for a non-existent node.""" g = Graph() - resolver = get_default_ontology_resolver() + config = get_ontology_resolver() + resolver = config["resolver"] resolver.graph = g resolver.build_lookup() @@ -165,7 +167,8 @@ def test_refresh_lookup_rdflib(): """Test that refresh_lookup rebuilds the lookup dict into a new object.""" g = Graph() - resolver = get_default_ontology_resolver() + config = get_ontology_resolver() + resolver = config["resolver"] resolver.graph = g resolver.build_lookup() From e815a3fc140d79fb0f6987b7ef730b2a4cd437b6 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Wed, 17 Sep 2025 14:12:47 +0200 Subject: [PATCH 12/32] chore: changes ontology file path parameter to the new config structure --- cognee/api/v1/cognify/cognify.py | 14 +++++++++----- cognee/tasks/graph/extract_graph_from_data.py | 10 +++++++++- examples/python/ontology_demo_example.py | 9 ++++++++- examples/python/ontology_demo_example_2.py | 8 +++++++- 4 files changed, 33 insertions(+), 8 deletions(-) diff --git a/cognee/api/v1/cognify/cognify.py b/cognee/api/v1/cognify/cognify.py index e933bafd8..f90e487e0 100644 --- a/cognee/api/v1/cognify/cognify.py +++ b/cognee/api/v1/cognify/cognify.py @@ -10,7 +10,8 @@ from cognee.infrastructure.llm import get_max_chunk_tokens from cognee.modules.pipelines import run_pipeline from cognee.modules.pipelines.tasks.task import Task from cognee.modules.chunking.TextChunker import TextChunker -from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver +from cognee.modules.ontology.ontology_config import OntologyConfig +from cognee.modules.ontology.get_ontology_resolver import get_ontology_resolver from cognee.modules.users.models import User from cognee.tasks.documents import ( @@ -39,7 +40,7 @@ async def cognify( graph_model: BaseModel = KnowledgeGraph, 
chunker=TextChunker, chunk_size: int = None, - ontology_file_path: Optional[str] = None, + ontology_config: OntologyConfig = None, vector_db_config: dict = None, graph_db_config: dict = None, run_in_background: bool = False, @@ -188,11 +189,14 @@ async def cognify( - LLM_RATE_LIMIT_ENABLED: Enable rate limiting (default: False) - LLM_RATE_LIMIT_REQUESTS: Max requests per interval (default: 60) """ + if ontology_config is None: + ontology_config = get_ontology_resolver() + if temporal_cognify: tasks = await get_temporal_tasks(user, chunker, chunk_size) else: tasks = await get_default_tasks( - user, graph_model, chunker, chunk_size, ontology_file_path, custom_prompt + user, graph_model, chunker, chunk_size, ontology_config, custom_prompt ) # By calling get pipeline executor we get a function that will have the run_pipeline run in the background or a function that we will need to wait for @@ -216,7 +220,7 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's graph_model: BaseModel = KnowledgeGraph, chunker=TextChunker, chunk_size: int = None, - ontology_file_path: Optional[str] = None, + ontology_config: OntologyConfig = get_ontology_resolver(), custom_prompt: Optional[str] = None, ) -> list[Task]: default_tasks = [ @@ -230,7 +234,7 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's Task( extract_graph_from_data, graph_model=graph_model, - ontology_adapter=RDFLibOntologyResolver(ontology_file=ontology_file_path), + ontology_config=ontology_config, custom_prompt=custom_prompt, task_config={"batch_size": 10}, ), # Generate knowledge graphs from the document chunks. diff --git a/cognee/tasks/graph/extract_graph_from_data.py b/cognee/tasks/graph/extract_graph_from_data.py index 6681dd975..f0ef9c7f9 100644 --- a/cognee/tasks/graph/extract_graph_from_data.py +++ b/cognee/tasks/graph/extract_graph_from_data.py @@ -4,6 +4,8 @@ from pydantic import BaseModel from cognee.infrastructure.databases.graph import get_graph_engine from cognee.tasks.storage.add_data_points import add_data_points +from cognee.modules.ontology.ontology_config import OntologyConfig +from cognee.modules.ontology.get_ontology_resolver import get_ontology_resolver from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver from cognee.modules.chunking.models.DocumentChunk import DocumentChunk from cognee.modules.graph.utils import ( @@ -70,7 +72,7 @@ async def integrate_chunk_graphs( async def extract_graph_from_data( data_chunks: List[DocumentChunk], graph_model: Type[BaseModel], - ontology_adapter: RDFLibOntologyResolver = None, + ontology_config: OntologyConfig = None, custom_prompt: Optional[str] = None, ) -> List[DocumentChunk]: """ @@ -101,4 +103,10 @@ async def extract_graph_from_data( if edge.source_node_id in valid_node_ids and edge.target_node_id in valid_node_ids ] + # Extract resolver from config if provided, otherwise get default + if ontology_config is None: + ontology_config = get_ontology_resolver() + + ontology_adapter = ontology_config["resolver"] + return await integrate_chunk_graphs(data_chunks, chunk_graphs, graph_model, ontology_adapter) diff --git a/examples/python/ontology_demo_example.py b/examples/python/ontology_demo_example.py index 8243faef5..ea1ab8b72 100644 --- a/examples/python/ontology_demo_example.py +++ b/examples/python/ontology_demo_example.py @@ -5,6 +5,8 @@ import cognee from cognee.api.v1.search import SearchType from cognee.api.v1.visualize.visualize import visualize_graph from 
cognee.shared.logging_utils import setup_logging +from cognee.modules.ontology.get_ontology_resolver import get_ontology_resolver +from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver text_1 = """ 1. Audi @@ -60,7 +62,12 @@ async def main(): os.path.dirname(os.path.abspath(__file__)), "ontology_input_example/basic_ontology.owl" ) - await cognee.cognify(ontology_file_path=ontology_path) + # Create ontology config with custom ontology file + ontology_config = get_ontology_resolver( + resolver=RDFLibOntologyResolver(ontology_file=ontology_path) + ) + + await cognee.cognify(ontology_config=ontology_config) print("Knowledge with ontology created.") # Step 4: Query insights diff --git a/examples/python/ontology_demo_example_2.py b/examples/python/ontology_demo_example_2.py index 22fb19862..e897da2e5 100644 --- a/examples/python/ontology_demo_example_2.py +++ b/examples/python/ontology_demo_example_2.py @@ -5,6 +5,8 @@ import os import textwrap from cognee.api.v1.search import SearchType from cognee.api.v1.visualize.visualize import visualize_graph +from cognee.modules.ontology.get_ontology_resolver import get_ontology_resolver +from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver async def run_pipeline(ontology_path=None): @@ -17,7 +19,11 @@ async def run_pipeline(ontology_path=None): await cognee.add(scientific_papers_dir) - pipeline_run = await cognee.cognify(ontology_file_path=ontology_path) + ontology_config = get_ontology_resolver( + resolver=RDFLibOntologyResolver(ontology_file=ontology_path) + ) + + pipeline_run = await cognee.cognify(ontology_config=ontology_config) return pipeline_run From d2c7980e8317d7a3af79a2b3bbcd3fb77c786bbf Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Wed, 17 Sep 2025 14:14:39 +0200 Subject: [PATCH 13/32] chore: updates mutable default param --- cognee/api/v1/cognify/cognify.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cognee/api/v1/cognify/cognify.py b/cognee/api/v1/cognify/cognify.py index f90e487e0..2cb844d12 100644 --- a/cognee/api/v1/cognify/cognify.py +++ b/cognee/api/v1/cognify/cognify.py @@ -220,9 +220,11 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's graph_model: BaseModel = KnowledgeGraph, chunker=TextChunker, chunk_size: int = None, - ontology_config: OntologyConfig = get_ontology_resolver(), + ontology_config: OntologyConfig = None, custom_prompt: Optional[str] = None, ) -> list[Task]: + if ontology_config is None: + ontology_config = get_ontology_resolver() default_tasks = [ Task(classify_documents), Task(check_permissions_on_dataset, user=user, permissions=["write"]), From f4c70cc315dbb73aa8a2463ed8a085119034d535 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Wed, 17 Sep 2025 14:39:43 +0200 Subject: [PATCH 14/32] feat: adds tests for the new logic + fixes the case when only matching is provided --- .../modules/ontology/get_ontology_resolver.py | 5 +- .../modules/ontology/test_ontology_adapter.py | 207 ++++++++++++++++++ 2 files changed, 209 insertions(+), 3 deletions(-) diff --git a/cognee/modules/ontology/get_ontology_resolver.py b/cognee/modules/ontology/get_ontology_resolver.py index 01377c162..d75928af9 100644 --- a/cognee/modules/ontology/get_ontology_resolver.py +++ b/cognee/modules/ontology/get_ontology_resolver.py @@ -24,13 +24,12 @@ def get_ontology_resolver( if resolver is not None: config["resolver"] = resolver + 
config["matching_strategy"] = matching_strategy or resolver.matching_strategy else: default_strategy = matching_strategy or FuzzyMatchingStrategy() config["resolver"] = RDFLibOntologyResolver( ontology_file=None, matching_strategy=default_strategy ) - - if matching_strategy is not None and resolver is None: - config["matching_strategy"] = matching_strategy + config["matching_strategy"] = default_strategy return config diff --git a/cognee/tests/unit/modules/ontology/test_ontology_adapter.py b/cognee/tests/unit/modules/ontology/test_ontology_adapter.py index 74383d5df..9b7eeeae0 100644 --- a/cognee/tests/unit/modules/ontology/test_ontology_adapter.py +++ b/cognee/tests/unit/modules/ontology/test_ontology_adapter.py @@ -177,3 +177,210 @@ def test_refresh_lookup_rdflib(): resolver.refresh_lookup() assert resolver.lookup is not original_lookup + + +def test_fuzzy_matching_strategy_exact_match(): + """Test FuzzyMatchingStrategy finds exact matches.""" + from cognee.modules.ontology.matching_strategies import FuzzyMatchingStrategy + + strategy = FuzzyMatchingStrategy() + candidates = ["audi", "bmw", "mercedes"] + + result = strategy.find_match("audi", candidates) + assert result == "audi" + + +def test_fuzzy_matching_strategy_fuzzy_match(): + """Test FuzzyMatchingStrategy finds fuzzy matches.""" + from cognee.modules.ontology.matching_strategies import FuzzyMatchingStrategy + + strategy = FuzzyMatchingStrategy(cutoff=0.6) + candidates = ["audi", "bmw", "mercedes"] + + result = strategy.find_match("audii", candidates) + assert result == "audi" + + +def test_fuzzy_matching_strategy_no_match(): + """Test FuzzyMatchingStrategy returns None when no match meets cutoff.""" + from cognee.modules.ontology.matching_strategies import FuzzyMatchingStrategy + + strategy = FuzzyMatchingStrategy(cutoff=0.9) + candidates = ["audi", "bmw", "mercedes"] + + result = strategy.find_match("completely_different", candidates) + assert result is None + + +def test_fuzzy_matching_strategy_empty_candidates(): + """Test FuzzyMatchingStrategy handles empty candidates list.""" + from cognee.modules.ontology.matching_strategies import FuzzyMatchingStrategy + + strategy = FuzzyMatchingStrategy() + + result = strategy.find_match("audi", []) + assert result is None + + +def test_base_ontology_resolver_initialization(): + """Test BaseOntologyResolver initialization with default matching strategy.""" + from cognee.modules.ontology.base_ontology_resolver import BaseOntologyResolver + from cognee.modules.ontology.matching_strategies import FuzzyMatchingStrategy + + class TestOntologyResolver(BaseOntologyResolver): + def build_lookup(self): + pass + + def refresh_lookup(self): + pass + + def find_closest_match(self, name, category): + return None + + def get_subgraph(self, node_name, node_type="individuals", directed=True): + return [], [], None + + resolver = TestOntologyResolver() + assert isinstance(resolver.matching_strategy, FuzzyMatchingStrategy) + + +def test_base_ontology_resolver_custom_matching_strategy(): + """Test BaseOntologyResolver initialization with custom matching strategy.""" + from cognee.modules.ontology.base_ontology_resolver import BaseOntologyResolver + from cognee.modules.ontology.matching_strategies import MatchingStrategy + + class CustomMatchingStrategy(MatchingStrategy): + def find_match(self, name, candidates): + return "custom_match" + + class TestOntologyResolver(BaseOntologyResolver): + def build_lookup(self): + pass + + def refresh_lookup(self): + pass + + def find_closest_match(self, name, category): 
+ return None + + def get_subgraph(self, node_name, node_type="individuals", directed=True): + return [], [], None + + custom_strategy = CustomMatchingStrategy() + resolver = TestOntologyResolver(matching_strategy=custom_strategy) + assert resolver.matching_strategy == custom_strategy + + +def test_ontology_config_structure(): + """Test OntologyConfig TypedDict structure.""" + from cognee.modules.ontology.ontology_config import OntologyConfig + from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver + from cognee.modules.ontology.matching_strategies import FuzzyMatchingStrategy + + resolver = RDFLibOntologyResolver() + matching_strategy = FuzzyMatchingStrategy() + + config: OntologyConfig = {"resolver": resolver, "matching_strategy": matching_strategy} + + assert config["resolver"] == resolver + assert config["matching_strategy"] == matching_strategy + + +def test_get_ontology_resolver_default(): + """Test get_ontology_resolver returns default configuration.""" + from cognee.modules.ontology.get_ontology_resolver import get_ontology_resolver + from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver + from cognee.modules.ontology.matching_strategies import FuzzyMatchingStrategy + + config = get_ontology_resolver() + + assert isinstance(config["resolver"], RDFLibOntologyResolver) + assert isinstance(config["matching_strategy"], FuzzyMatchingStrategy) + assert config["resolver"].matching_strategy == config["matching_strategy"] + + +def test_get_ontology_resolver_custom_resolver(): + """Test get_ontology_resolver with custom resolver.""" + from cognee.modules.ontology.get_ontology_resolver import get_ontology_resolver + from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver + from cognee.modules.ontology.matching_strategies import FuzzyMatchingStrategy + + custom_resolver = RDFLibOntologyResolver(ontology_file="test.owl") + config = get_ontology_resolver(resolver=custom_resolver) + + assert config["resolver"] == custom_resolver + assert config["matching_strategy"] == custom_resolver.matching_strategy + assert isinstance(config["matching_strategy"], FuzzyMatchingStrategy) + + +def test_get_ontology_resolver_custom_matching_strategy(): + """Test get_ontology_resolver with custom matching strategy.""" + from cognee.modules.ontology.get_ontology_resolver import get_ontology_resolver + from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver + from cognee.modules.ontology.matching_strategies import FuzzyMatchingStrategy + + custom_strategy = FuzzyMatchingStrategy(cutoff=0.9) + config = get_ontology_resolver(matching_strategy=custom_strategy) + + assert isinstance(config["resolver"], RDFLibOntologyResolver) + assert config["matching_strategy"] == custom_strategy + assert config["resolver"].matching_strategy == custom_strategy + + +def test_get_ontology_resolver_both_custom(): + """Test get_ontology_resolver with both custom resolver and matching strategy.""" + from cognee.modules.ontology.get_ontology_resolver import get_ontology_resolver + from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver + from cognee.modules.ontology.matching_strategies import FuzzyMatchingStrategy + + custom_resolver = RDFLibOntologyResolver(ontology_file="test.owl") + custom_strategy = FuzzyMatchingStrategy(cutoff=0.9) + config = get_ontology_resolver(resolver=custom_resolver, matching_strategy=custom_strategy) + + assert config["resolver"] == 
custom_resolver
+    assert config["matching_strategy"] == custom_strategy
+
+
+def test_get_ontology_resolver_only_resolver_uses_resolver_strategy():
+    """Test that when only a resolver is passed, it uses the resolver's matching strategy."""
+    from cognee.modules.ontology.get_ontology_resolver import get_ontology_resolver
+    from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver
+    from cognee.modules.ontology.matching_strategies import FuzzyMatchingStrategy
+
+    custom_strategy = FuzzyMatchingStrategy(cutoff=0.8)
+    custom_resolver = RDFLibOntologyResolver(matching_strategy=custom_strategy)
+
+    config = get_ontology_resolver(resolver=custom_resolver)
+
+    assert config["resolver"] == custom_resolver
+    assert config["matching_strategy"] == custom_strategy
+    assert config["matching_strategy"] == custom_resolver.matching_strategy
+
+
+def test_rdflib_ontology_resolver_uses_matching_strategy():
+    """Test that RDFLibOntologyResolver uses the provided matching strategy."""
+    from cognee.modules.ontology.matching_strategies import MatchingStrategy
+
+    class TestMatchingStrategy(MatchingStrategy):
+        def find_match(self, name, candidates):
+            return "test_match" if candidates else None
+
+    ns = Namespace("http://example.org/test#")
+    g = Graph()
+    g.add((ns.Car, RDF.type, OWL.Class))
+    g.add((ns.Audi, RDF.type, ns.Car))
+
+    resolver = RDFLibOntologyResolver(matching_strategy=TestMatchingStrategy())
+    resolver.graph = g
+    resolver.build_lookup()
+
+    result = resolver.find_closest_match("Audi", "individuals")
+    assert result == "test_match"
+
+
+def test_rdflib_ontology_resolver_default_matching_strategy():
+    """Test that RDFLibOntologyResolver uses FuzzyMatchingStrategy by default."""
+    from cognee.modules.ontology.matching_strategies import FuzzyMatchingStrategy
+
+    resolver = RDFLibOntologyResolver()
+    assert isinstance(resolver.matching_strategy, FuzzyMatchingStrategy)

From 3426f2345229a290b77823496ceaad7e9bf6581b Mon Sep 17 00:00:00 2001
From: Hande <159312713+hande-k@users.noreply.github.com>
Date: Thu, 18 Sep 2025 10:41:25 +0200
Subject: [PATCH 15/32] feat: adds NONE as default observability option

---
 cognee/base_config.py                       |  9 ++++++++-
 cognee/modules/observability/get_observe.py | 14 ++++++++++++++
 cognee/modules/observability/observers.py   |  1 +
 3 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/cognee/base_config.py b/cognee/base_config.py
index 940846128..8de42cbad 100644
--- a/cognee/base_config.py
+++ b/cognee/base_config.py
@@ -10,13 +10,20 @@ import pydantic
 class BaseConfig(BaseSettings):
     data_root_directory: str = get_absolute_path(".data_storage")
     system_root_directory: str = get_absolute_path(".cognee_system")
-    monitoring_tool: object = Observer.LANGFUSE
+    monitoring_tool: object = Observer.NONE
 
     @pydantic.model_validator(mode="after")
     def validate_paths(self):
         # Require absolute paths for root directories
         self.data_root_directory = ensure_absolute_path(self.data_root_directory)
         self.system_root_directory = ensure_absolute_path(self.system_root_directory)
+
+        # Set monitoring tool based on available keys
+        if self.langfuse_public_key and self.langfuse_secret_key:
+            self.monitoring_tool = Observer.LANGFUSE
+        else:
+            self.monitoring_tool = Observer.NONE
+
         return self
 
     langfuse_public_key: Optional[str] = os.getenv("LANGFUSE_PUBLIC_KEY")
diff --git a/cognee/modules/observability/get_observe.py b/cognee/modules/observability/get_observe.py
index db3655482..9ee44e46a 100644
--- a/cognee/modules/observability/get_observe.py
+++ 
b/cognee/modules/observability/get_observe.py @@ -9,3 +9,17 @@ def get_observe(): from langfuse.decorators import observe return observe + elif monitoring == Observer.NONE: + # Return a no-op decorator that handles keyword arguments + def no_op_decorator(*args, **kwargs): + if len(args) == 1 and callable(args[0]) and not kwargs: + # Direct decoration: @observe + return args[0] + else: + # Parameterized decoration: @observe(as_type="generation") + def decorator(func): + return func + + return decorator + + return no_op_decorator diff --git a/cognee/modules/observability/observers.py b/cognee/modules/observability/observers.py index 7bd0380ec..9c4aff43b 100644 --- a/cognee/modules/observability/observers.py +++ b/cognee/modules/observability/observers.py @@ -4,6 +4,7 @@ from enum import Enum class Observer(str, Enum): """Monitoring tools""" + NONE = "none" LANGFUSE = "langfuse" LLMLITE = "llmlite" LANGSMITH = "langsmith" From 04ac0d52d287a349f105a35eeb3cd2e1402aadb1 Mon Sep 17 00:00:00 2001 From: Hande <159312713+hande-k@users.noreply.github.com> Date: Thu, 18 Sep 2025 14:17:53 +0200 Subject: [PATCH 16/32] remove defaulting none if not langfuse --- cognee/base_config.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/cognee/base_config.py b/cognee/base_config.py index 8de42cbad..cc2b912cf 100644 --- a/cognee/base_config.py +++ b/cognee/base_config.py @@ -21,8 +21,6 @@ class BaseConfig(BaseSettings): # Set monitoring tool based on available keys if self.langfuse_public_key and self.langfuse_secret_key: self.monitoring_tool = Observer.LANGFUSE - else: - self.monitoring_tool = Observer.NONE return self From 94373e5a01d948af80756987f3be990ad9652f0e Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Thu, 18 Sep 2025 17:24:23 +0200 Subject: [PATCH 17/32] feat: adds new config structure based on requirements --- cognee/api/v1/cognify/cognify.py | 23 +++-- .../utils/expand_with_nodes_and_edges.py | 8 +- .../ontology/get_default_ontology_resolver.py | 6 ++ .../modules/ontology/get_ontology_resolver.py | 35 -------- cognee/modules/ontology/ontology_config.py | 18 ++-- .../rdf_xml/RDFLibOntologyResolver.py | 2 +- cognee/tasks/graph/extract_graph_from_data.py | 22 ++--- .../modules/ontology/test_ontology_adapter.py | 90 +++++-------------- examples/python/ontology_demo_example.py | 14 +-- examples/python/ontology_demo_example_2.py | 12 +-- 10 files changed, 85 insertions(+), 145 deletions(-) create mode 100644 cognee/modules/ontology/get_default_ontology_resolver.py delete mode 100644 cognee/modules/ontology/get_ontology_resolver.py diff --git a/cognee/api/v1/cognify/cognify.py b/cognee/api/v1/cognify/cognify.py index 2cb844d12..f4bd5d1b4 100644 --- a/cognee/api/v1/cognify/cognify.py +++ b/cognee/api/v1/cognify/cognify.py @@ -10,8 +10,8 @@ from cognee.infrastructure.llm import get_max_chunk_tokens from cognee.modules.pipelines import run_pipeline from cognee.modules.pipelines.tasks.task import Task from cognee.modules.chunking.TextChunker import TextChunker -from cognee.modules.ontology.ontology_config import OntologyConfig -from cognee.modules.ontology.get_ontology_resolver import get_ontology_resolver +from cognee.modules.ontology.ontology_config import Config +from cognee.modules.ontology.get_default_ontology_resolver import get_default_ontology_resolver from cognee.modules.users.models import User from cognee.tasks.documents import ( @@ -40,7 +40,7 @@ async def cognify( graph_model: BaseModel = KnowledgeGraph, chunker=TextChunker, chunk_size: int = None, - 
ontology_config: OntologyConfig = None, + config: Config = None, vector_db_config: dict = None, graph_db_config: dict = None, run_in_background: bool = False, @@ -101,8 +101,6 @@ async def cognify( Formula: min(embedding_max_completion_tokens, llm_max_completion_tokens // 2) Default limits: ~512-8192 tokens depending on models. Smaller chunks = more granular but potentially fragmented knowledge. - ontology_file_path: Path to RDF/OWL ontology file for domain-specific entity types. - Useful for specialized fields like medical or legal documents. vector_db_config: Custom vector database configuration for embeddings storage. graph_db_config: Custom graph database configuration for relationship storage. run_in_background: If True, starts processing asynchronously and returns immediately. @@ -189,14 +187,14 @@ async def cognify( - LLM_RATE_LIMIT_ENABLED: Enable rate limiting (default: False) - LLM_RATE_LIMIT_REQUESTS: Max requests per interval (default: 60) """ - if ontology_config is None: - ontology_config = get_ontology_resolver() + if config is None: + config: Config = {"ontology_config": {"ontology_resolver": get_default_ontology_resolver()}} if temporal_cognify: tasks = await get_temporal_tasks(user, chunker, chunk_size) else: tasks = await get_default_tasks( - user, graph_model, chunker, chunk_size, ontology_config, custom_prompt + user, graph_model, chunker, chunk_size, config, custom_prompt ) # By calling get pipeline executor we get a function that will have the run_pipeline run in the background or a function that we will need to wait for @@ -220,11 +218,12 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's graph_model: BaseModel = KnowledgeGraph, chunker=TextChunker, chunk_size: int = None, - ontology_config: OntologyConfig = None, + config: Config = None, custom_prompt: Optional[str] = None, ) -> list[Task]: - if ontology_config is None: - ontology_config = get_ontology_resolver() + if config is None: + config: Config = {"ontology_config": {"ontology_resolver": get_default_ontology_resolver()}} + default_tasks = [ Task(classify_documents), Task(check_permissions_on_dataset, user=user, permissions=["write"]), @@ -236,7 +235,7 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's Task( extract_graph_from_data, graph_model=graph_model, - ontology_config=ontology_config, + config=config, custom_prompt=custom_prompt, task_config={"batch_size": 10}, ), # Generate knowledge graphs from the document chunks. 
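
For orientation, a minimal sketch (not part of the patch itself) of how a caller now assembles
the nested configuration this commit threads through cognify; Config and RDFLibOntologyResolver
are the types from this series, while the ontology path is a placeholder:

    from cognee.modules.ontology.ontology_config import Config
    from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver

    # The resolver is wrapped in the nested TypedDict structure defined in ontology_config.py.
    config: Config = {
        "ontology_config": {
            # Placeholder path; any RDF/OWL file readable by RDFLibOntologyResolver works here.
            "ontology_resolver": RDFLibOntologyResolver(ontology_file="path/to/ontology.owl")
        }
    }

    # cognify receives the whole config in place of the old ontology_file_path argument:
    # pipeline_run = await cognee.cognify(config=config)
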
diff --git a/cognee/modules/graph/utils/expand_with_nodes_and_edges.py b/cognee/modules/graph/utils/expand_with_nodes_and_edges.py index e18860744..ef72cd0e1 100644 --- a/cognee/modules/graph/utils/expand_with_nodes_and_edges.py +++ b/cognee/modules/graph/utils/expand_with_nodes_and_edges.py @@ -7,9 +7,10 @@ from cognee.modules.engine.utils import ( generate_node_id, generate_node_name, ) +from cognee.modules.ontology.base_ontology_resolver import BaseOntologyResolver from cognee.shared.data_models import KnowledgeGraph from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver -from cognee.modules.ontology.get_ontology_resolver import get_ontology_resolver +from cognee.modules.ontology.get_default_ontology_resolver import get_default_ontology_resolver def _create_node_key(node_id: str, category: str) -> str: @@ -278,7 +279,7 @@ def _process_graph_edges( def expand_with_nodes_and_edges( data_chunks: list[DocumentChunk], chunk_graphs: list[KnowledgeGraph], - ontology_resolver: RDFLibOntologyResolver = None, + ontology_resolver: BaseOntologyResolver = None, existing_edges_map: Optional[dict[str, bool]] = None, ): """ @@ -321,8 +322,7 @@ def expand_with_nodes_and_edges( existing_edges_map = {} if ontology_resolver is None: - config = get_ontology_resolver() - ontology_resolver = config["resolver"] + ontology_resolver = get_default_ontology_resolver() added_nodes_map = {} added_ontology_nodes_map = {} diff --git a/cognee/modules/ontology/get_default_ontology_resolver.py b/cognee/modules/ontology/get_default_ontology_resolver.py new file mode 100644 index 000000000..ae10fbde5 --- /dev/null +++ b/cognee/modules/ontology/get_default_ontology_resolver.py @@ -0,0 +1,6 @@ +from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver +from cognee.modules.ontology.matching_strategies import FuzzyMatchingStrategy + + +def get_default_ontology_resolver() -> RDFLibOntologyResolver: + return RDFLibOntologyResolver(ontology_file=None, matching_strategy=FuzzyMatchingStrategy()) diff --git a/cognee/modules/ontology/get_ontology_resolver.py b/cognee/modules/ontology/get_ontology_resolver.py deleted file mode 100644 index d75928af9..000000000 --- a/cognee/modules/ontology/get_ontology_resolver.py +++ /dev/null @@ -1,35 +0,0 @@ -from typing import Optional - -from cognee.modules.ontology.base_ontology_resolver import BaseOntologyResolver -from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver -from cognee.modules.ontology.matching_strategies import MatchingStrategy, FuzzyMatchingStrategy -from cognee.modules.ontology.ontology_config import OntologyConfig - - -def get_ontology_resolver( - resolver: Optional[BaseOntologyResolver] = None, - matching_strategy: Optional[MatchingStrategy] = None, -) -> OntologyConfig: - """Get ontology resolver configuration with default or custom objects. 
- - Args: - resolver: Optional pre-configured ontology resolver instance - matching_strategy: Optional matching strategy instance - - Returns: - Ontology configuration with default RDFLib resolver and fuzzy matching strategy, - or custom objects if provided - """ - config: OntologyConfig = {} - - if resolver is not None: - config["resolver"] = resolver - config["matching_strategy"] = matching_strategy or resolver.matching_strategy - else: - default_strategy = matching_strategy or FuzzyMatchingStrategy() - config["resolver"] = RDFLibOntologyResolver( - ontology_file=None, matching_strategy=default_strategy - ) - config["matching_strategy"] = default_strategy - - return config diff --git a/cognee/modules/ontology/ontology_config.py b/cognee/modules/ontology/ontology_config.py index e28da9f92..397411edc 100644 --- a/cognee/modules/ontology/ontology_config.py +++ b/cognee/modules/ontology/ontology_config.py @@ -5,12 +5,20 @@ from cognee.modules.ontology.matching_strategies import MatchingStrategy class OntologyConfig(TypedDict, total=False): - """Configuration for ontology resolver. + """Configuration containing ontology resolver. Attributes: - resolver: The ontology resolver instance to use - matching_strategy: The matching strategy to use + ontology_resolver: The ontology resolver instance to use """ - resolver: Optional[BaseOntologyResolver] - matching_strategy: Optional[MatchingStrategy] + ontology_resolver: Optional[BaseOntologyResolver] + + +class Config(TypedDict, total=False): + """Top-level configuration dictionary. + + Attributes: + ontology_config: Configuration containing ontology resolver + """ + + ontology_config: Optional[OntologyConfig] diff --git a/cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py b/cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py index c6b3e22be..2a7a03751 100644 --- a/cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py +++ b/cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py @@ -28,7 +28,7 @@ class RDFLibOntologyResolver(BaseOntologyResolver): self, ontology_file: Optional[str] = None, matching_strategy: Optional[MatchingStrategy] = None, - ): + ) -> None: super().__init__(matching_strategy) self.ontology_file = ontology_file try: diff --git a/cognee/tasks/graph/extract_graph_from_data.py b/cognee/tasks/graph/extract_graph_from_data.py index f0ef9c7f9..7c049546c 100644 --- a/cognee/tasks/graph/extract_graph_from_data.py +++ b/cognee/tasks/graph/extract_graph_from_data.py @@ -4,8 +4,8 @@ from pydantic import BaseModel from cognee.infrastructure.databases.graph import get_graph_engine from cognee.tasks.storage.add_data_points import add_data_points -from cognee.modules.ontology.ontology_config import OntologyConfig -from cognee.modules.ontology.get_ontology_resolver import get_ontology_resolver +from cognee.modules.ontology.ontology_config import Config +from cognee.modules.ontology.get_default_ontology_resolver import get_default_ontology_resolver from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver from cognee.modules.chunking.models.DocumentChunk import DocumentChunk from cognee.modules.graph.utils import ( @@ -26,7 +26,7 @@ async def integrate_chunk_graphs( data_chunks: list[DocumentChunk], chunk_graphs: list, graph_model: Type[BaseModel], - ontology_adapter: RDFLibOntologyResolver, + ontology_resolver: RDFLibOntologyResolver, ) -> List[DocumentChunk]: """Updates DocumentChunk objects, integrates data points and edges into databases.""" @@ -38,9 +38,9 @@ async def integrate_chunk_graphs( ) 
if not isinstance(graph_model, type) or not issubclass(graph_model, BaseModel): raise InvalidGraphModelError(graph_model) - if ontology_adapter is None or not hasattr(ontology_adapter, "get_subgraph"): + if ontology_resolver is None or not hasattr(ontology_resolver, "get_subgraph"): raise InvalidOntologyAdapterError( - type(ontology_adapter).__name__ if ontology_adapter else "None" + type(ontology_resolver).__name__ if ontology_resolver else "None" ) graph_engine = await get_graph_engine() @@ -57,7 +57,7 @@ async def integrate_chunk_graphs( ) graph_nodes, graph_edges = expand_with_nodes_and_edges( - data_chunks, chunk_graphs, ontology_adapter, existing_edges_map + data_chunks, chunk_graphs, ontology_resolver, existing_edges_map ) if len(graph_nodes) > 0: @@ -72,7 +72,7 @@ async def integrate_chunk_graphs( async def extract_graph_from_data( data_chunks: List[DocumentChunk], graph_model: Type[BaseModel], - ontology_config: OntologyConfig = None, + config: Config = None, custom_prompt: Optional[str] = None, ) -> List[DocumentChunk]: """ @@ -104,9 +104,9 @@ async def extract_graph_from_data( ] # Extract resolver from config if provided, otherwise get default - if ontology_config is None: - ontology_config = get_ontology_resolver() + if config is None: + config: Config = {"ontology_config": {"ontology_resolver": get_default_ontology_resolver()}} - ontology_adapter = ontology_config["resolver"] + ontology_resolver = config["ontology_config"]["ontology_resolver"] - return await integrate_chunk_graphs(data_chunks, chunk_graphs, graph_model, ontology_adapter) + return await integrate_chunk_graphs(data_chunks, chunk_graphs, graph_model, ontology_resolver) diff --git a/cognee/tests/unit/modules/ontology/test_ontology_adapter.py b/cognee/tests/unit/modules/ontology/test_ontology_adapter.py index 9b7eeeae0..88e9b314d 100644 --- a/cognee/tests/unit/modules/ontology/test_ontology_adapter.py +++ b/cognee/tests/unit/modules/ontology/test_ontology_adapter.py @@ -2,13 +2,13 @@ import pytest from rdflib import Graph, Namespace, RDF, OWL, RDFS from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver from cognee.modules.ontology.models import AttachedOntologyNode -from cognee.modules.ontology.get_ontology_resolver import get_ontology_resolver +from cognee.modules.ontology.get_default_ontology_resolver import get_default_ontology_resolver def test_ontology_adapter_initialization_success(): """Test successful initialization of OntologyAdapter.""" - config = get_ontology_resolver() + config = get_default_ontology_resolver() adapter = config["resolver"] adapter.build_lookup() @@ -108,7 +108,7 @@ def test_get_subgraph_no_match_rdflib(): """Test get_subgraph returns empty results for a non-existent node.""" g = Graph() - config = get_ontology_resolver() + config = get_default_ontology_resolver() resolver = config["resolver"] resolver.graph = g resolver.build_lookup() @@ -167,7 +167,7 @@ def test_refresh_lookup_rdflib(): """Test that refresh_lookup rebuilds the lookup dict into a new object.""" g = Graph() - config = get_ontology_resolver() + config = get_default_ontology_resolver() resolver = config["resolver"] resolver.graph = g resolver.build_lookup() @@ -272,89 +272,47 @@ def test_base_ontology_resolver_custom_matching_strategy(): def test_ontology_config_structure(): - """Test OntologyConfig TypedDict structure.""" - from cognee.modules.ontology.ontology_config import OntologyConfig + """Test TypedDict structure for ontology configuration.""" + from 
cognee.modules.ontology.ontology_config import Config from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver from cognee.modules.ontology.matching_strategies import FuzzyMatchingStrategy resolver = RDFLibOntologyResolver() matching_strategy = FuzzyMatchingStrategy() - config: OntologyConfig = {"resolver": resolver, "matching_strategy": matching_strategy} + config: Config = {"ontology_config": {"ontology_resolver": resolver}} - assert config["resolver"] == resolver - assert config["matching_strategy"] == matching_strategy + assert config["ontology_config"]["ontology_resolver"] == resolver def test_get_ontology_resolver_default(): """Test get_ontology_resolver returns default configuration.""" - from cognee.modules.ontology.get_ontology_resolver import get_ontology_resolver + from cognee.modules.ontology.get_ontology_resolver import get_default_ontology_resolver + from cognee.modules.ontology.ontology_config import Config from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver from cognee.modules.ontology.matching_strategies import FuzzyMatchingStrategy - config = get_ontology_resolver() + config: Config = get_default_ontology_resolver() - assert isinstance(config["resolver"], RDFLibOntologyResolver) - assert isinstance(config["matching_strategy"], FuzzyMatchingStrategy) - assert config["resolver"].matching_strategy == config["matching_strategy"] + assert isinstance(config["ontology_config"]["ontology_resolver"], RDFLibOntologyResolver) + assert isinstance( + config["ontology_config"]["ontology_resolver"].matching_strategy, FuzzyMatchingStrategy + ) -def test_get_ontology_resolver_custom_resolver(): - """Test get_ontology_resolver with custom resolver.""" - from cognee.modules.ontology.get_ontology_resolver import get_ontology_resolver +def test_get_default_ontology_resolver(): + """Test get_default_ontology_resolver returns default configuration.""" + from cognee.modules.ontology.get_ontology_resolver import get_default_ontology_resolver + from cognee.modules.ontology.ontology_config import Config from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver from cognee.modules.ontology.matching_strategies import FuzzyMatchingStrategy - custom_resolver = RDFLibOntologyResolver(ontology_file="test.owl") - config = get_ontology_resolver(resolver=custom_resolver) + config: Config = get_default_ontology_resolver() - assert config["resolver"] == custom_resolver - assert config["matching_strategy"] == custom_resolver.matching_strategy - assert isinstance(config["matching_strategy"], FuzzyMatchingStrategy) - - -def test_get_ontology_resolver_custom_matching_strategy(): - """Test get_ontology_resolver with custom matching strategy.""" - from cognee.modules.ontology.get_ontology_resolver import get_ontology_resolver - from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver - from cognee.modules.ontology.matching_strategies import FuzzyMatchingStrategy - - custom_strategy = FuzzyMatchingStrategy(cutoff=0.9) - config = get_ontology_resolver(matching_strategy=custom_strategy) - - assert isinstance(config["resolver"], RDFLibOntologyResolver) - assert config["matching_strategy"] == custom_strategy - assert config["resolver"].matching_strategy == custom_strategy - - -def test_get_ontology_resolver_both_custom(): - """Test get_ontology_resolver with both custom resolver and matching strategy.""" - from cognee.modules.ontology.get_ontology_resolver import get_ontology_resolver - 
from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver - from cognee.modules.ontology.matching_strategies import FuzzyMatchingStrategy - - custom_resolver = RDFLibOntologyResolver(ontology_file="test.owl") - custom_strategy = FuzzyMatchingStrategy(cutoff=0.9) - config = get_ontology_resolver(resolver=custom_resolver, matching_strategy=custom_strategy) - - assert config["resolver"] == custom_resolver - assert config["matching_strategy"] == custom_strategy - - -def test_get_ontology_resolver_only_resolver_uses_resolver_strategy(): - """Test that when only resolver is passed, it uses the resolver's matching strategy.""" - from cognee.modules.ontology.get_ontology_resolver import get_ontology_resolver - from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver - from cognee.modules.ontology.matching_strategies import FuzzyMatchingStrategy - - custom_strategy = FuzzyMatchingStrategy(cutoff=0.8) - custom_resolver = RDFLibOntologyResolver(matching_strategy=custom_strategy) - - config = get_ontology_resolver(resolver=custom_resolver) - - assert config["resolver"] == custom_resolver - assert config["matching_strategy"] == custom_strategy - assert config["matching_strategy"] == custom_resolver.matching_strategy + assert isinstance(config["ontology_config"]["ontology_resolver"], RDFLibOntologyResolver) + assert isinstance( + config["ontology_config"]["ontology_resolver"].matching_strategy, FuzzyMatchingStrategy + ) def test_rdflib_ontology_resolver_uses_matching_strategy(): diff --git a/examples/python/ontology_demo_example.py b/examples/python/ontology_demo_example.py index ea1ab8b72..5b18e6ed4 100644 --- a/examples/python/ontology_demo_example.py +++ b/examples/python/ontology_demo_example.py @@ -5,8 +5,8 @@ import cognee from cognee.api.v1.search import SearchType from cognee.api.v1.visualize.visualize import visualize_graph from cognee.shared.logging_utils import setup_logging -from cognee.modules.ontology.get_ontology_resolver import get_ontology_resolver from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver +from cognee.modules.ontology.ontology_config import Config text_1 = """ 1. 
Audi
@@ -60,7 +60,12 @@ async def main():
         os.path.dirname(os.path.abspath(__file__)), "ontology_input_example/basic_ontology.owl"
     )
 
-    # Create ontology config with custom ontology file
-    ontology_config = get_ontology_resolver(
-        resolver=RDFLibOntologyResolver(ontology_file=ontology_path)
-    )
+    # Create full config structure manually
+    config: Config = {
+        "ontology_config": {
+            "ontology_resolver": RDFLibOntologyResolver(ontology_file=ontology_path)
+        }
+    }
 
-    await cognee.cognify(ontology_config=ontology_config)
+    await cognee.cognify(config=config)
     print("Knowledge with ontology created.")
 
     # Step 4: Query insights
diff --git a/examples/python/ontology_demo_example_2.py b/examples/python/ontology_demo_example_2.py
index e897da2e5..01bcd9ae4 100644
--- a/examples/python/ontology_demo_example_2.py
+++ b/examples/python/ontology_demo_example_2.py
@@ -5,8 +5,8 @@ import os
 import textwrap
 from cognee.api.v1.search import SearchType
 from cognee.api.v1.visualize.visualize import visualize_graph
-from cognee.modules.ontology.get_ontology_resolver import get_ontology_resolver
 from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver
+from cognee.modules.ontology.ontology_config import Config
 
 
 async def run_pipeline(ontology_path=None):
@@ -19,11 +19,13 @@ async def run_pipeline(ontology_path=None):
 
     await cognee.add(scientific_papers_dir)
 
-    ontology_config = get_ontology_resolver(
-        resolver=RDFLibOntologyResolver(ontology_file=ontology_path)
-    )
+    config: Config = {
+        "ontology_config": {
+            "ontology_resolver": RDFLibOntologyResolver(ontology_file=ontology_path)
+        }
+    }
 
-    pipeline_run = await cognee.cognify(ontology_config=ontology_config)
+    pipeline_run = await cognee.cognify(config=config)
 
     return pipeline_run
 

From 9883c097ab9d2b4a5f3045a6ceb3e04190a02df7 Mon Sep 17 00:00:00 2001
From: Daulet Amirkhanov
Date: Thu, 18 Sep 2025 16:37:08 +0100
Subject: [PATCH 18/32] fix: make `cognee -ui` dependencies (api) part of core deps (#1439)

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.
---
 README.md      | 10 ----------
 pyproject.toml |  7 +++----
 uv.lock        | 14 ++++++--------
 3 files changed, 9 insertions(+), 22 deletions(-)

diff --git a/README.md b/README.md
index ae1f5f365..41bd1d4ea 100644
--- a/README.md
+++ b/README.md
@@ -176,16 +176,6 @@ You can also cognify your files and query using cognee UI.
Cognee UI 2 -### Installation for UI - -To use the cognee UI with full functionality, you need to install cognee with API dependencies: - -```bash -pip install 'cognee[api]' -``` - -The UI requires backend server functionality (uvicorn and other API dependencies) which are not included in the default cognee installation to keep it lightweight. - ### Running the UI Try cognee UI by running ``` cognee-cli -ui ``` command on your terminal. diff --git a/pyproject.toml b/pyproject.toml index 0c34d8ee1..9e6bbe896 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,14 +62,13 @@ dependencies = [ "pylance>=0.22.0,<1.0.0", "kuzu (==0.11.0)", "python-magic-bin<0.5 ; platform_system == 'Windows'", # Only needed for Windows -] - -[project.optional-dependencies] -api = [ "uvicorn>=0.34.0,<1.0.0", "gunicorn>=20.1.0,<24", "websockets>=15.0.1,<16.0.0" ] + +[project.optional-dependencies] +api=[] distributed = [ "modal>=1.0.5,<2.0.0", ] diff --git a/uv.lock b/uv.lock index 7e3d536d9..327e60ec1 100644 --- a/uv.lock +++ b/uv.lock @@ -823,6 +823,7 @@ dependencies = [ { name = "fastapi" }, { name = "fastapi-users", extra = ["sqlalchemy"] }, { name = "filetype" }, + { name = "gunicorn" }, { name = "instructor" }, { name = "jinja2" }, { name = "kuzu" }, @@ -856,17 +857,14 @@ dependencies = [ { name = "structlog" }, { name = "tiktoken" }, { name = "typing-extensions" }, + { name = "uvicorn" }, + { name = "websockets" }, ] [package.optional-dependencies] anthropic = [ { name = "anthropic" }, ] -api = [ - { name = "gunicorn" }, - { name = "uvicorn" }, - { name = "websockets" }, -] aws = [ { name = "s3fs", extra = ["boto3"] }, ] @@ -993,7 +991,7 @@ requires-dist = [ { name = "google-generativeai", marker = "extra == 'gemini'", specifier = ">=0.8.4,<0.9" }, { name = "graphiti-core", marker = "extra == 'graphiti'", specifier = ">=0.7.0,<0.8" }, { name = "groq", marker = "extra == 'groq'", specifier = ">=0.8.0,<1.0.0" }, - { name = "gunicorn", marker = "extra == 'api'", specifier = ">=20.1.0,<24" }, + { name = "gunicorn", specifier = ">=20.1.0,<24" }, { name = "instructor", specifier = ">=1.9.1,<2.0.0" }, { name = "jinja2", specifier = ">=3.1.3,<4" }, { name = "kuzu", specifier = "==0.11.0" }, @@ -1060,8 +1058,8 @@ requires-dist = [ { name = "tweepy", marker = "extra == 'dev'", specifier = ">=4.14.0,<5.0.0" }, { name = "typing-extensions", specifier = ">=4.12.2,<5.0.0" }, { name = "unstructured", extras = ["csv", "doc", "docx", "epub", "md", "odt", "org", "ppt", "pptx", "rst", "rtf", "tsv", "xlsx"], marker = "extra == 'docs'", specifier = ">=0.18.1,<19" }, - { name = "uvicorn", marker = "extra == 'api'", specifier = ">=0.34.0,<1.0.0" }, - { name = "websockets", marker = "extra == 'api'", specifier = ">=15.0.1,<16.0.0" }, + { name = "uvicorn", specifier = ">=0.34.0,<1.0.0" }, + { name = "websockets", specifier = ">=15.0.1,<16.0.0" }, ] provides-extras = ["api", "distributed", "neo4j", "neptune", "postgres", "postgres-binary", "notebook", "langchain", "llama-index", "gemini", "huggingface", "ollama", "mistral", "anthropic", "deepeval", "posthog", "falkordb", "groq", "chromadb", "docs", "codegraph", "evals", "gui", "graphiti", "aws", "dev", "debug"] From 7051367832840bb88155378bbe94cbac612b7ca1 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Thu, 18 Sep 2025 18:02:26 +0200 Subject: [PATCH 19/32] fix: fixes linting --- .../tests/unit/modules/ontology/test_ontology_adapter.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git 
a/cognee/tests/unit/modules/ontology/test_ontology_adapter.py b/cognee/tests/unit/modules/ontology/test_ontology_adapter.py index 88e9b314d..a1fc4a4eb 100644 --- a/cognee/tests/unit/modules/ontology/test_ontology_adapter.py +++ b/cognee/tests/unit/modules/ontology/test_ontology_adapter.py @@ -8,8 +8,7 @@ from cognee.modules.ontology.get_default_ontology_resolver import get_default_on def test_ontology_adapter_initialization_success(): """Test successful initialization of OntologyAdapter.""" - config = get_default_ontology_resolver() - adapter = config["resolver"] + adapter = get_default_ontology_resolver() adapter.build_lookup() assert isinstance(adapter.lookup, dict) @@ -108,8 +107,7 @@ def test_get_subgraph_no_match_rdflib(): """Test get_subgraph returns empty results for a non-existent node.""" g = Graph() - config = get_default_ontology_resolver() - resolver = config["resolver"] + resolver = get_default_ontology_resolver() resolver.graph = g resolver.build_lookup() @@ -277,8 +275,8 @@ def test_ontology_config_structure(): from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver from cognee.modules.ontology.matching_strategies import FuzzyMatchingStrategy - resolver = RDFLibOntologyResolver() matching_strategy = FuzzyMatchingStrategy() + resolver = RDFLibOntologyResolver(matching_strategy=matching_strategy) config: Config = {"ontology_config": {"ontology_resolver": resolver}} From 980c3e3677ba7aac22799206beace582c340b62c Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Thu, 18 Sep 2025 18:03:32 +0200 Subject: [PATCH 20/32] fix: fixes unit tests --- .../modules/ontology/test_ontology_adapter.py | 27 +++++++------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/cognee/tests/unit/modules/ontology/test_ontology_adapter.py b/cognee/tests/unit/modules/ontology/test_ontology_adapter.py index a1fc4a4eb..d40f1369a 100644 --- a/cognee/tests/unit/modules/ontology/test_ontology_adapter.py +++ b/cognee/tests/unit/modules/ontology/test_ontology_adapter.py @@ -165,8 +165,7 @@ def test_refresh_lookup_rdflib(): """Test that refresh_lookup rebuilds the lookup dict into a new object.""" g = Graph() - config = get_default_ontology_resolver() - resolver = config["resolver"] + resolver = get_default_ontology_resolver() resolver.graph = g resolver.build_lookup() @@ -284,33 +283,25 @@ def test_ontology_config_structure(): def test_get_ontology_resolver_default(): - """Test get_ontology_resolver returns default configuration.""" - from cognee.modules.ontology.get_ontology_resolver import get_default_ontology_resolver - from cognee.modules.ontology.ontology_config import Config + """Test get_default_ontology_resolver returns default resolver.""" from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver from cognee.modules.ontology.matching_strategies import FuzzyMatchingStrategy - config: Config = get_default_ontology_resolver() + resolver = get_default_ontology_resolver() - assert isinstance(config["ontology_config"]["ontology_resolver"], RDFLibOntologyResolver) - assert isinstance( - config["ontology_config"]["ontology_resolver"].matching_strategy, FuzzyMatchingStrategy - ) + assert isinstance(resolver, RDFLibOntologyResolver) + assert isinstance(resolver.matching_strategy, FuzzyMatchingStrategy) def test_get_default_ontology_resolver(): - """Test get_default_ontology_resolver returns default configuration.""" - from cognee.modules.ontology.get_ontology_resolver import 
get_default_ontology_resolver - from cognee.modules.ontology.ontology_config import Config + """Test get_default_ontology_resolver returns default resolver.""" from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver from cognee.modules.ontology.matching_strategies import FuzzyMatchingStrategy - config: Config = get_default_ontology_resolver() + resolver = get_default_ontology_resolver() - assert isinstance(config["ontology_config"]["ontology_resolver"], RDFLibOntologyResolver) - assert isinstance( - config["ontology_config"]["ontology_resolver"].matching_strategy, FuzzyMatchingStrategy - ) + assert isinstance(resolver, RDFLibOntologyResolver) + assert isinstance(resolver.matching_strategy, FuzzyMatchingStrategy) def test_rdflib_ontology_resolver_uses_matching_strategy(): From 7c33418ae973fad68ac6ac3bda4ef88d15372bbb Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Thu, 18 Sep 2025 18:24:15 +0200 Subject: [PATCH 21/32] chore: adds and updates docstrings --- .../utils/expand_with_nodes_and_edges.py | 2 +- cognee/tasks/graph/extract_graph_from_data.py | 25 ++++++++++++++++--- .../tasks/graph/extract_graph_from_data_v2.py | 18 ++++++++++--- .../modules/ontology/test_ontology_adapter.py | 10 ++++---- 4 files changed, 43 insertions(+), 12 deletions(-) diff --git a/cognee/modules/graph/utils/expand_with_nodes_and_edges.py b/cognee/modules/graph/utils/expand_with_nodes_and_edges.py index ef72cd0e1..5b603c163 100644 --- a/cognee/modules/graph/utils/expand_with_nodes_and_edges.py +++ b/cognee/modules/graph/utils/expand_with_nodes_and_edges.py @@ -298,7 +298,7 @@ def expand_with_nodes_and_edges( chunk_graphs (list[KnowledgeGraph]): List of knowledge graphs corresponding to each data chunk. Each graph contains nodes (entities) and edges (relationships) extracted from the chunk content. - ontology_resolver (RDFLibOntologyResolver, optional): Resolver for validating entities and + ontology_resolver (BaseOntologyResolver, optional): Resolver for validating entities and types against an ontology. If None, a default RDFLibOntologyResolver is created. Defaults to None. existing_edges_map (dict[str, bool], optional): Mapping of existing edge keys to prevent diff --git a/cognee/tasks/graph/extract_graph_from_data.py b/cognee/tasks/graph/extract_graph_from_data.py index 7c049546c..5c3b11821 100644 --- a/cognee/tasks/graph/extract_graph_from_data.py +++ b/cognee/tasks/graph/extract_graph_from_data.py @@ -6,7 +6,7 @@ from cognee.infrastructure.databases.graph import get_graph_engine from cognee.tasks.storage.add_data_points import add_data_points from cognee.modules.ontology.ontology_config import Config from cognee.modules.ontology.get_default_ontology_resolver import get_default_ontology_resolver -from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver +from cognee.modules.ontology.base_ontology_resolver import BaseOntologyResolver from cognee.modules.chunking.models.DocumentChunk import DocumentChunk from cognee.modules.graph.utils import ( expand_with_nodes_and_edges, @@ -26,9 +26,28 @@ async def integrate_chunk_graphs( data_chunks: list[DocumentChunk], chunk_graphs: list, graph_model: Type[BaseModel], - ontology_resolver: RDFLibOntologyResolver, + ontology_resolver: BaseOntologyResolver, ) -> List[DocumentChunk]: - """Updates DocumentChunk objects, integrates data points and edges into databases.""" + """Integrate chunk graphs with ontology validation and store in databases. 
+ + This function processes document chunks and their associated knowledge graphs, + validates entities against an ontology resolver, and stores the integrated + data points and edges in the configured databases. + + Args: + data_chunks: List of document chunks containing source data + chunk_graphs: List of knowledge graphs corresponding to each chunk + graph_model: Pydantic model class for graph data validation + ontology_resolver: Resolver for validating entities against ontology + + Returns: + List of updated DocumentChunk objects with integrated data + + Raises: + InvalidChunkGraphInputError: If input validation fails + InvalidGraphModelError: If graph model validation fails + InvalidOntologyAdapterError: If ontology resolver validation fails + """ if not isinstance(data_chunks, list) or not isinstance(chunk_graphs, list): raise InvalidChunkGraphInputError("data_chunks and chunk_graphs must be lists.") diff --git a/cognee/tasks/graph/extract_graph_from_data_v2.py b/cognee/tasks/graph/extract_graph_from_data_v2.py index 5a4194fb1..0a8869784 100644 --- a/cognee/tasks/graph/extract_graph_from_data_v2.py +++ b/cognee/tasks/graph/extract_graph_from_data_v2.py @@ -3,7 +3,7 @@ from typing import List from cognee.modules.chunking.models.DocumentChunk import DocumentChunk from cognee.shared.data_models import KnowledgeGraph -from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver +from cognee.modules.ontology.base_ontology_resolver import BaseOntologyResolver from cognee.tasks.graph.cascade_extract.utils.extract_nodes import extract_nodes from cognee.tasks.graph.cascade_extract.utils.extract_content_nodes_and_relationship_names import ( extract_content_nodes_and_relationship_names, @@ -17,9 +17,21 @@ from cognee.tasks.graph.extract_graph_from_data import integrate_chunk_graphs async def extract_graph_from_data( data_chunks: List[DocumentChunk], n_rounds: int = 2, - ontology_adapter: RDFLibOntologyResolver = None, + ontology_adapter: BaseOntologyResolver = None, ) -> List[DocumentChunk]: - """Extract and update graph data from document chunks in multiple steps.""" + """Extract and update graph data from document chunks using cascade extraction. + + This function performs multi-step graph extraction from document chunks, + using cascade extraction techniques to build comprehensive knowledge graphs. 
+ + Args: + data_chunks: List of document chunks to process + n_rounds: Number of extraction rounds to perform (default: 2) + ontology_adapter: Resolver for validating entities against ontology + + Returns: + List of updated DocumentChunk objects with extracted graph data + """ chunk_nodes = await asyncio.gather( *[extract_nodes(chunk.text, n_rounds) for chunk in data_chunks] ) diff --git a/cognee/tests/unit/modules/ontology/test_ontology_adapter.py b/cognee/tests/unit/modules/ontology/test_ontology_adapter.py index d40f1369a..4757e2595 100644 --- a/cognee/tests/unit/modules/ontology/test_ontology_adapter.py +++ b/cognee/tests/unit/modules/ontology/test_ontology_adapter.py @@ -6,7 +6,7 @@ from cognee.modules.ontology.get_default_ontology_resolver import get_default_on def test_ontology_adapter_initialization_success(): - """Test successful initialization of OntologyAdapter.""" + """Test successful initialization of RDFLibOntologyResolver from get_default_ontology_resolver.""" adapter = get_default_ontology_resolver() adapter.build_lookup() @@ -104,7 +104,7 @@ def test_find_closest_match_no_match(): def test_get_subgraph_no_match_rdflib(): - """Test get_subgraph returns empty results for a non-existent node.""" + """Test get_subgraph returns empty results for a non-existent node using RDFLibOntologyResolver.""" g = Graph() resolver = get_default_ontology_resolver() @@ -162,7 +162,7 @@ def test_get_subgraph_success_rdflib(): def test_refresh_lookup_rdflib(): - """Test that refresh_lookup rebuilds the lookup dict into a new object.""" + """Test that refresh_lookup rebuilds the lookup dict into a new object using RDFLibOntologyResolver.""" g = Graph() resolver = get_default_ontology_resolver() @@ -283,7 +283,7 @@ def test_ontology_config_structure(): def test_get_ontology_resolver_default(): - """Test get_default_ontology_resolver returns default resolver.""" + """Test get_default_ontology_resolver returns a properly configured RDFLibOntologyResolver with FuzzyMatchingStrategy.""" from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver from cognee.modules.ontology.matching_strategies import FuzzyMatchingStrategy @@ -294,7 +294,7 @@ def test_get_ontology_resolver_default(): def test_get_default_ontology_resolver(): - """Test get_default_ontology_resolver returns default resolver.""" + """Test get_default_ontology_resolver returns a properly configured RDFLibOntologyResolver with FuzzyMatchingStrategy.""" from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver from cognee.modules.ontology.matching_strategies import FuzzyMatchingStrategy From 9ee93a4260698a83ebf1fce992894b99fada8351 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Thu, 18 Sep 2025 18:25:00 +0200 Subject: [PATCH 22/32] ruff fix --- cognee/tasks/graph/extract_graph_from_data.py | 8 ++++---- cognee/tasks/graph/extract_graph_from_data_v2.py | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cognee/tasks/graph/extract_graph_from_data.py b/cognee/tasks/graph/extract_graph_from_data.py index 5c3b11821..391c6fabe 100644 --- a/cognee/tasks/graph/extract_graph_from_data.py +++ b/cognee/tasks/graph/extract_graph_from_data.py @@ -29,20 +29,20 @@ async def integrate_chunk_graphs( ontology_resolver: BaseOntologyResolver, ) -> List[DocumentChunk]: """Integrate chunk graphs with ontology validation and store in databases. 
- + This function processes document chunks and their associated knowledge graphs, validates entities against an ontology resolver, and stores the integrated data points and edges in the configured databases. - + Args: data_chunks: List of document chunks containing source data chunk_graphs: List of knowledge graphs corresponding to each chunk graph_model: Pydantic model class for graph data validation ontology_resolver: Resolver for validating entities against ontology - + Returns: List of updated DocumentChunk objects with integrated data - + Raises: InvalidChunkGraphInputError: If input validation fails InvalidGraphModelError: If graph model validation fails diff --git a/cognee/tasks/graph/extract_graph_from_data_v2.py b/cognee/tasks/graph/extract_graph_from_data_v2.py index 0a8869784..8cea6602e 100644 --- a/cognee/tasks/graph/extract_graph_from_data_v2.py +++ b/cognee/tasks/graph/extract_graph_from_data_v2.py @@ -20,15 +20,15 @@ async def extract_graph_from_data( ontology_adapter: BaseOntologyResolver = None, ) -> List[DocumentChunk]: """Extract and update graph data from document chunks using cascade extraction. - + This function performs multi-step graph extraction from document chunks, using cascade extraction techniques to build comprehensive knowledge graphs. - + Args: data_chunks: List of document chunks to process n_rounds: Number of extraction rounds to perform (default: 2) ontology_adapter: Resolver for validating entities against ontology - + Returns: List of updated DocumentChunk objects with extracted graph data """ From 2f225c9e036c6444da73d641f89fe63e96c5d438 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Fri, 19 Sep 2025 12:54:33 +0200 Subject: [PATCH 23/32] feat: adds ontology resolver env handling --- cognee/api/v1/cognify/cognify.py | 38 ++++++++++++++-- .../ontology/get_default_ontology_resolver.py | 37 ++++++++++++++- .../modules/ontology/ontology_env_config.py | 45 +++++++++++++++++++ cognee/tasks/graph/extract_graph_from_data.py | 22 ++++++++- 4 files changed, 136 insertions(+), 6 deletions(-) create mode 100644 cognee/modules/ontology/ontology_env_config.py diff --git a/cognee/api/v1/cognify/cognify.py b/cognee/api/v1/cognify/cognify.py index f4bd5d1b4..1292d243a 100644 --- a/cognee/api/v1/cognify/cognify.py +++ b/cognee/api/v1/cognify/cognify.py @@ -3,6 +3,7 @@ from pydantic import BaseModel from typing import Union, Optional from uuid import UUID +from cognee.modules.ontology.ontology_env_config import get_ontology_env_config from cognee.shared.logging_utils import get_logger from cognee.shared.data_models import KnowledgeGraph from cognee.infrastructure.llm import get_max_chunk_tokens @@ -11,7 +12,10 @@ from cognee.modules.pipelines import run_pipeline from cognee.modules.pipelines.tasks.task import Task from cognee.modules.chunking.TextChunker import TextChunker from cognee.modules.ontology.ontology_config import Config -from cognee.modules.ontology.get_default_ontology_resolver import get_default_ontology_resolver +from cognee.modules.ontology.get_default_ontology_resolver import ( + get_default_ontology_resolver, + get_ontology_resolver_from_env, +) from cognee.modules.users.models import User from cognee.tasks.documents import ( @@ -188,7 +192,21 @@ async def cognify( - LLM_RATE_LIMIT_REQUESTS: Max requests per interval (default: 60) """ if config is None: - config: Config = {"ontology_config": {"ontology_resolver": get_default_ontology_resolver()}} + ontology_config = get_ontology_env_config() + if ( + 
ontology_config.ontology_file_path + and ontology_config.ontology_resolver + and ontology_config.matching_strategy + ): + config: Config = { + "ontology_config": { + "ontology_resolver": get_ontology_resolver_from_env(**ontology_config.to_dict()) + } + } + else: + config: Config = { + "ontology_config": {"ontology_resolver": get_default_ontology_resolver()} + } if temporal_cognify: tasks = await get_temporal_tasks(user, chunker, chunk_size) @@ -222,7 +240,21 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's custom_prompt: Optional[str] = None, ) -> list[Task]: if config is None: - config: Config = {"ontology_config": {"ontology_resolver": get_default_ontology_resolver()}} + ontology_config = get_ontology_env_config() + if ( + ontology_config.ontology_file_path + and ontology_config.ontology_resolver + and ontology_config.matching_strategy + ): + config: Config = { + "ontology_config": { + "ontology_resolver": get_ontology_resolver_from_env(**ontology_config.to_dict()) + } + } + else: + config: Config = { + "ontology_config": {"ontology_resolver": get_default_ontology_resolver()} + } default_tasks = [ Task(classify_documents), diff --git a/cognee/modules/ontology/get_default_ontology_resolver.py b/cognee/modules/ontology/get_default_ontology_resolver.py index ae10fbde5..f9aebe59a 100644 --- a/cognee/modules/ontology/get_default_ontology_resolver.py +++ b/cognee/modules/ontology/get_default_ontology_resolver.py @@ -1,6 +1,41 @@ +from cognee.modules.ontology.base_ontology_resolver import BaseOntologyResolver from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver from cognee.modules.ontology.matching_strategies import FuzzyMatchingStrategy -def get_default_ontology_resolver() -> RDFLibOntologyResolver: +def get_default_ontology_resolver() -> BaseOntologyResolver: return RDFLibOntologyResolver(ontology_file=None, matching_strategy=FuzzyMatchingStrategy()) + + +def get_ontology_resolver_from_env( + ontology_resolver: str = "", matching_strategy: str = "", ontology_file_path: str = "" +) -> BaseOntologyResolver: + """ + Create and return an ontology resolver instance based on environment parameters. + + Currently, this function supports only the RDFLib-based ontology resolver + with a fuzzy matching strategy. + + Args: + ontology_resolver (str): The ontology resolver type to use. + Supported value: "rdflib". + matching_strategy (str): The matching strategy to apply. + Supported value: "fuzzy". + ontology_file_path (str): Path to the ontology file required for the resolver. + + Returns: + BaseOntologyResolver: An instance of the requested ontology resolver. + + Raises: + EnvironmentError: If the provided resolver or strategy is unsupported, + or if required parameters are missing. + """ + if ontology_resolver == "rdflib" and matching_strategy == "fuzzy" and ontology_file_path: + return RDFLibOntologyResolver( + matching_strategy=FuzzyMatchingStrategy(), ontology_file=ontology_file_path + ) + else: + raise EnvironmentError( + f"Unsupported ontology resolver: {ontology_resolver}. " + f"Supported resolvers are: RdfLib with FuzzyMatchingStrategy." 
+        )
diff --git a/cognee/modules/ontology/ontology_env_config.py b/cognee/modules/ontology/ontology_env_config.py
new file mode 100644
index 000000000..a351b35e7
--- /dev/null
+++ b/cognee/modules/ontology/ontology_env_config.py
@@ -0,0 +1,45 @@
+"""This module contains the configuration for ontology handling."""
+
+from functools import lru_cache
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+
+class OntologyEnvConfig(BaseSettings):
+    """
+    Represents the configuration for ontology handling, including parameters for
+    ontology file storage and resolution/matching strategies.
+
+    Public methods:
+    - to_dict
+
+    Instance variables:
+    - ontology_resolver
+    - matching_strategy
+    - ontology_file_path
+    - model_config
+    """
+
+    ontology_resolver: str = "rdflib"
+    matching_strategy: str = "fuzzy"
+    ontology_file_path: str = ""
+
+    model_config = SettingsConfigDict(env_file=".env", extra="allow", populate_by_name=True)
+
+    def to_dict(self) -> dict:
+        """
+        Return the configuration as a dictionary.
+        """
+        return {
+            "ontology_resolver": self.ontology_resolver,
+            "matching_strategy": self.matching_strategy,
+            "ontology_file_path": self.ontology_file_path,
+        }
+
+
+@lru_cache
+def get_ontology_env_config():
+    """
+    Retrieve the ontology configuration. This function utilizes caching to return a
+    singleton instance of the OntologyEnvConfig class for efficiency.
+    """
+    return OntologyEnvConfig()
diff --git a/cognee/tasks/graph/extract_graph_from_data.py b/cognee/tasks/graph/extract_graph_from_data.py
index 391c6fabe..e4dafe4e7 100644
--- a/cognee/tasks/graph/extract_graph_from_data.py
+++ b/cognee/tasks/graph/extract_graph_from_data.py
@@ -3,9 +3,13 @@ from typing import Type, List, Optional
 from pydantic import BaseModel

 from cognee.infrastructure.databases.graph import get_graph_engine
+from cognee.modules.ontology.ontology_env_config import get_ontology_env_config
 from cognee.tasks.storage.add_data_points import add_data_points
 from cognee.modules.ontology.ontology_config import Config
-from cognee.modules.ontology.get_default_ontology_resolver import get_default_ontology_resolver
+from cognee.modules.ontology.get_default_ontology_resolver import (
+    get_default_ontology_resolver,
+    get_ontology_resolver_from_env,
+)
 from cognee.modules.ontology.base_ontology_resolver import BaseOntologyResolver
 from cognee.modules.chunking.models.DocumentChunk import DocumentChunk
 from cognee.modules.graph.utils import (
@@ -124,7 +128,21 @@ async def extract_graph_from_data(

     # Extract resolver from config if provided, otherwise get default
     if config is None:
-        config: Config = {"ontology_config": {"ontology_resolver": get_default_ontology_resolver()}}
+        ontology_config = get_ontology_env_config()
+        if (
+            ontology_config.ontology_file_path
+            and ontology_config.ontology_resolver
+            and ontology_config.matching_strategy
+        ):
+            config: Config = {
+                "ontology_config": {
+                    "ontology_resolver": get_ontology_resolver_from_env(**ontology_config.to_dict())
+                }
+            }
+        else:
+            config: Config = {
+                "ontology_config": {"ontology_resolver": get_default_ontology_resolver()}
+            }

     ontology_resolver = config["ontology_config"]["ontology_resolver"]

From 57f864a58f2ba68e167db38b99cad58eb4359c16 Mon Sep 17 00:00:00 2001
From: hajdul88 <52442977+hajdul88@users.noreply.github.com>
Date: Fri, 19 Sep 2025 13:02:25 +0200
Subject: [PATCH 24/32] feat: adds tests for the env settings

---
 .../modules/ontology/test_ontology_adapter.py | 170 ++++++++++++++++++
 1 file changed, 170 insertions(+)

diff --git
a/cognee/tests/unit/modules/ontology/test_ontology_adapter.py b/cognee/tests/unit/modules/ontology/test_ontology_adapter.py index 4757e2595..efb472c1e 100644 --- a/cognee/tests/unit/modules/ontology/test_ontology_adapter.py +++ b/cognee/tests/unit/modules/ontology/test_ontology_adapter.py @@ -331,3 +331,173 @@ def test_rdflib_ontology_resolver_default_matching_strategy(): resolver = RDFLibOntologyResolver() assert isinstance(resolver.matching_strategy, FuzzyMatchingStrategy) + + +def test_get_ontology_resolver_from_env_success(): + """Test get_ontology_resolver_from_env returns correct resolver with valid parameters.""" + from cognee.modules.ontology.get_default_ontology_resolver import get_ontology_resolver_from_env + from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver + from cognee.modules.ontology.matching_strategies import FuzzyMatchingStrategy + + resolver = get_ontology_resolver_from_env( + ontology_resolver="rdflib", + matching_strategy="fuzzy", + ontology_file_path="/test/path.owl" + ) + + assert isinstance(resolver, RDFLibOntologyResolver) + assert isinstance(resolver.matching_strategy, FuzzyMatchingStrategy) + assert resolver.ontology_file == "/test/path.owl" + + +def test_get_ontology_resolver_from_env_unsupported_resolver(): + """Test get_ontology_resolver_from_env raises EnvironmentError for unsupported resolver.""" + from cognee.modules.ontology.get_default_ontology_resolver import get_ontology_resolver_from_env + + with pytest.raises(EnvironmentError) as exc_info: + get_ontology_resolver_from_env( + ontology_resolver="unsupported", + matching_strategy="fuzzy", + ontology_file_path="/test/path.owl" + ) + + assert "Unsupported ontology resolver: unsupported" in str(exc_info.value) + assert "Supported resolvers are: RdfLib with FuzzyMatchingStrategy" in str(exc_info.value) + + +def test_get_ontology_resolver_from_env_unsupported_strategy(): + """Test get_ontology_resolver_from_env raises EnvironmentError for unsupported strategy.""" + from cognee.modules.ontology.get_default_ontology_resolver import get_ontology_resolver_from_env + + with pytest.raises(EnvironmentError) as exc_info: + get_ontology_resolver_from_env( + ontology_resolver="rdflib", + matching_strategy="unsupported", + ontology_file_path="/test/path.owl" + ) + + assert "Unsupported ontology resolver: rdflib" in str(exc_info.value) + + +def test_get_ontology_resolver_from_env_empty_file_path(): + """Test get_ontology_resolver_from_env raises EnvironmentError for empty file path.""" + from cognee.modules.ontology.get_default_ontology_resolver import get_ontology_resolver_from_env + + with pytest.raises(EnvironmentError) as exc_info: + get_ontology_resolver_from_env( + ontology_resolver="rdflib", + matching_strategy="fuzzy", + ontology_file_path="" + ) + + assert "Unsupported ontology resolver: rdflib" in str(exc_info.value) + + +def test_get_ontology_resolver_from_env_none_file_path(): + """Test get_ontology_resolver_from_env raises EnvironmentError for None file path.""" + from cognee.modules.ontology.get_default_ontology_resolver import get_ontology_resolver_from_env + + with pytest.raises(EnvironmentError) as exc_info: + get_ontology_resolver_from_env( + ontology_resolver="rdflib", + matching_strategy="fuzzy", + ontology_file_path=None + ) + + assert "Unsupported ontology resolver: rdflib" in str(exc_info.value) + + +def test_get_ontology_resolver_from_env_empty_resolver(): + """Test get_ontology_resolver_from_env raises EnvironmentError for empty resolver.""" + from 
cognee.modules.ontology.get_default_ontology_resolver import get_ontology_resolver_from_env + + with pytest.raises(EnvironmentError) as exc_info: + get_ontology_resolver_from_env( + ontology_resolver="", + matching_strategy="fuzzy", + ontology_file_path="/test/path.owl" + ) + + assert "Unsupported ontology resolver:" in str(exc_info.value) + + +def test_get_ontology_resolver_from_env_empty_strategy(): + """Test get_ontology_resolver_from_env raises EnvironmentError for empty strategy.""" + from cognee.modules.ontology.get_default_ontology_resolver import get_ontology_resolver_from_env + + with pytest.raises(EnvironmentError) as exc_info: + get_ontology_resolver_from_env( + ontology_resolver="rdflib", + matching_strategy="", + ontology_file_path="/test/path.owl" + ) + + assert "Unsupported ontology resolver: rdflib" in str(exc_info.value) + + +def test_get_ontology_resolver_from_env_default_parameters(): + """Test get_ontology_resolver_from_env with default empty parameters raises EnvironmentError.""" + from cognee.modules.ontology.get_default_ontology_resolver import get_ontology_resolver_from_env + + with pytest.raises(EnvironmentError) as exc_info: + get_ontology_resolver_from_env() + + assert "Unsupported ontology resolver:" in str(exc_info.value) + + +def test_get_ontology_resolver_from_env_case_sensitivity(): + """Test get_ontology_resolver_from_env is case sensitive.""" + from cognee.modules.ontology.get_default_ontology_resolver import get_ontology_resolver_from_env + + with pytest.raises(EnvironmentError): + get_ontology_resolver_from_env( + ontology_resolver="RDFLIB", + matching_strategy="fuzzy", + ontology_file_path="/test/path.owl" + ) + + with pytest.raises(EnvironmentError): + get_ontology_resolver_from_env( + ontology_resolver="RdfLib", + matching_strategy="fuzzy", + ontology_file_path="/test/path.owl" + ) + + +def test_get_ontology_resolver_from_env_with_actual_file(): + """Test get_ontology_resolver_from_env works with actual file path.""" + from cognee.modules.ontology.get_default_ontology_resolver import get_ontology_resolver_from_env + from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver + from cognee.modules.ontology.matching_strategies import FuzzyMatchingStrategy + + resolver = get_ontology_resolver_from_env( + ontology_resolver="rdflib", + matching_strategy="fuzzy", + ontology_file_path="/path/to/ontology.owl" + ) + + assert isinstance(resolver, RDFLibOntologyResolver) + assert isinstance(resolver.matching_strategy, FuzzyMatchingStrategy) + assert resolver.ontology_file == "/path/to/ontology.owl" + + +def test_get_ontology_resolver_from_env_resolver_functionality(): + """Test that resolver created from env function works correctly.""" + from cognee.modules.ontology.get_default_ontology_resolver import get_ontology_resolver_from_env + + resolver = get_ontology_resolver_from_env( + ontology_resolver="rdflib", + matching_strategy="fuzzy", + ontology_file_path="/test/path.owl" + ) + + resolver.build_lookup() + assert isinstance(resolver.lookup, dict) + + result = resolver.find_closest_match("test", "individuals") + assert result is None # Should return None for non-existent entity + + nodes, relationships, start_node = resolver.get_subgraph("test", "individuals") + assert nodes == [] + assert relationships == [] + assert start_node is None From 765834c0a966688a55f6d416af361fcb5a17a19a Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Fri, 19 Sep 2025 13:39:08 +0200 Subject: [PATCH 25/32] ruff 
formatting --- .../modules/ontology/test_ontology_adapter.py | 34 ++++++------------- 1 file changed, 11 insertions(+), 23 deletions(-) diff --git a/cognee/tests/unit/modules/ontology/test_ontology_adapter.py b/cognee/tests/unit/modules/ontology/test_ontology_adapter.py index efb472c1e..dfab79732 100644 --- a/cognee/tests/unit/modules/ontology/test_ontology_adapter.py +++ b/cognee/tests/unit/modules/ontology/test_ontology_adapter.py @@ -340,9 +340,7 @@ def test_get_ontology_resolver_from_env_success(): from cognee.modules.ontology.matching_strategies import FuzzyMatchingStrategy resolver = get_ontology_resolver_from_env( - ontology_resolver="rdflib", - matching_strategy="fuzzy", - ontology_file_path="/test/path.owl" + ontology_resolver="rdflib", matching_strategy="fuzzy", ontology_file_path="/test/path.owl" ) assert isinstance(resolver, RDFLibOntologyResolver) @@ -358,7 +356,7 @@ def test_get_ontology_resolver_from_env_unsupported_resolver(): get_ontology_resolver_from_env( ontology_resolver="unsupported", matching_strategy="fuzzy", - ontology_file_path="/test/path.owl" + ontology_file_path="/test/path.owl", ) assert "Unsupported ontology resolver: unsupported" in str(exc_info.value) @@ -373,7 +371,7 @@ def test_get_ontology_resolver_from_env_unsupported_strategy(): get_ontology_resolver_from_env( ontology_resolver="rdflib", matching_strategy="unsupported", - ontology_file_path="/test/path.owl" + ontology_file_path="/test/path.owl", ) assert "Unsupported ontology resolver: rdflib" in str(exc_info.value) @@ -385,9 +383,7 @@ def test_get_ontology_resolver_from_env_empty_file_path(): with pytest.raises(EnvironmentError) as exc_info: get_ontology_resolver_from_env( - ontology_resolver="rdflib", - matching_strategy="fuzzy", - ontology_file_path="" + ontology_resolver="rdflib", matching_strategy="fuzzy", ontology_file_path="" ) assert "Unsupported ontology resolver: rdflib" in str(exc_info.value) @@ -399,9 +395,7 @@ def test_get_ontology_resolver_from_env_none_file_path(): with pytest.raises(EnvironmentError) as exc_info: get_ontology_resolver_from_env( - ontology_resolver="rdflib", - matching_strategy="fuzzy", - ontology_file_path=None + ontology_resolver="rdflib", matching_strategy="fuzzy", ontology_file_path=None ) assert "Unsupported ontology resolver: rdflib" in str(exc_info.value) @@ -413,9 +407,7 @@ def test_get_ontology_resolver_from_env_empty_resolver(): with pytest.raises(EnvironmentError) as exc_info: get_ontology_resolver_from_env( - ontology_resolver="", - matching_strategy="fuzzy", - ontology_file_path="/test/path.owl" + ontology_resolver="", matching_strategy="fuzzy", ontology_file_path="/test/path.owl" ) assert "Unsupported ontology resolver:" in str(exc_info.value) @@ -427,9 +419,7 @@ def test_get_ontology_resolver_from_env_empty_strategy(): with pytest.raises(EnvironmentError) as exc_info: get_ontology_resolver_from_env( - ontology_resolver="rdflib", - matching_strategy="", - ontology_file_path="/test/path.owl" + ontology_resolver="rdflib", matching_strategy="", ontology_file_path="/test/path.owl" ) assert "Unsupported ontology resolver: rdflib" in str(exc_info.value) @@ -453,14 +443,14 @@ def test_get_ontology_resolver_from_env_case_sensitivity(): get_ontology_resolver_from_env( ontology_resolver="RDFLIB", matching_strategy="fuzzy", - ontology_file_path="/test/path.owl" + ontology_file_path="/test/path.owl", ) with pytest.raises(EnvironmentError): get_ontology_resolver_from_env( ontology_resolver="RdfLib", matching_strategy="fuzzy", - ontology_file_path="/test/path.owl" + 
ontology_file_path="/test/path.owl",
        )


@@ -473,7 +463,7 @@ def test_get_ontology_resolver_from_env_with_actual_file():
     resolver = get_ontology_resolver_from_env(
         ontology_resolver="rdflib",
         matching_strategy="fuzzy",
-        ontology_file_path="/path/to/ontology.owl"
+        ontology_file_path="/path/to/ontology.owl",
     )

     assert isinstance(resolver, RDFLibOntologyResolver)
@@ -486,9 +476,7 @@ def test_get_ontology_resolver_from_env_resolver_functionality():
     from cognee.modules.ontology.get_default_ontology_resolver import get_ontology_resolver_from_env

     resolver = get_ontology_resolver_from_env(
-        ontology_resolver="rdflib",
-        matching_strategy="fuzzy",
-        ontology_file_path="/test/path.owl"
+        ontology_resolver="rdflib", matching_strategy="fuzzy", ontology_file_path="/test/path.owl"
     )

     resolver.build_lookup()

From 54cf0967cc2d7d360ffb8b70046e3f72fec6496f Mon Sep 17 00:00:00 2001
From: hajdul88 <52442977+hajdul88@users.noreply.github.com>
Date: Fri, 19 Sep 2025 13:46:01 +0200
Subject: [PATCH 26/32] chore: updates env template

---
 .env.template | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/.env.template b/.env.template
index 781e82428..ddcd41a6c 100644
--- a/.env.template
+++ b/.env.template
@@ -116,7 +116,15 @@ VECTOR_DB_PROVIDER="lancedb"
 VECTOR_DB_URL=
 VECTOR_DB_KEY=

+################################################################################
+# 🧩 Ontology resolver settings
+################################################################################
+# -- Ontology resolver params --------------------------------------
+# ONTOLOGY_RESOLVER=rdflib # Default: uses rdflib and an .owl file to read ontology structures
+# MATCHING_STRATEGY=fuzzy # Default: uses fuzzy matching with 80% similarity threshold
+# ONTOLOGY_FILE_PATH=YOUR_FULL_FILE_PATH # Default: empty
+# To add an ontology resolver, either configure it in code as shown in ontology_example or set the full file path and resolver settings via these env variables.
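These variables are read by the `OntologyEnvConfig` settings class and turned into a resolver by `get_ontology_resolver_from_env`, both added in PATCH 23. A minimal sketch of that flow, assuming the variables above are set in `.env` (the `.owl` path in the comments is a hypothetical placeholder):

```python
# Minimal sketch, assuming e.g. ONTOLOGY_RESOLVER=rdflib, MATCHING_STRATEGY=fuzzy,
# ONTOLOGY_FILE_PATH=/absolute/path/to/ontology.owl (hypothetical path) in .env.
from cognee.modules.ontology.ontology_env_config import get_ontology_env_config
from cognee.modules.ontology.get_default_ontology_resolver import (
    get_default_ontology_resolver,
    get_ontology_resolver_from_env,
)

env_config = get_ontology_env_config()  # cached pydantic-settings instance, reads .env

if (
    env_config.ontology_file_path
    and env_config.ontology_resolver
    and env_config.matching_strategy
):
    # All three values present: build an RDFLib resolver with fuzzy matching.
    resolver = get_ontology_resolver_from_env(**env_config.to_dict())
else:
    # Fall back to the default resolver (rdflib + fuzzy, no ontology file attached).
    resolver = get_default_ontology_resolver()
```

Since `ontology_resolver` and `matching_strategy` default to non-empty values (`rdflib`, `fuzzy`), the env-driven branch effectively hinges on `ONTOLOGY_FILE_PATH` being non-empty; when it is unset, `cognify` falls back to the default resolver rather than raising.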
################################################################################
# 🔄 MIGRATION (RELATIONAL → GRAPH) SETTINGS

From ea487d2ca3715c4233674b62e66ab391c8d1c19d Mon Sep 17 00:00:00 2001
From: hajdul88 <52442977+hajdul88@users.noreply.github.com>
Date: Fri, 19 Sep 2025 13:56:11 +0200
Subject: [PATCH 27/32] feat: adds default handling in expand_with_nodes_and_edges

---
 .../graph/utils/expand_with_nodes_and_edges.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/cognee/modules/graph/utils/expand_with_nodes_and_edges.py b/cognee/modules/graph/utils/expand_with_nodes_and_edges.py
index 5b603c163..39c1d4bd1 100644
--- a/cognee/modules/graph/utils/expand_with_nodes_and_edges.py
+++ b/cognee/modules/graph/utils/expand_with_nodes_and_edges.py
@@ -8,9 +8,11 @@ from cognee.modules.engine.utils import (
     generate_node_name,
 )
 from cognee.modules.ontology.base_ontology_resolver import BaseOntologyResolver
+from cognee.modules.ontology.ontology_env_config import get_ontology_env_config
 from cognee.shared.data_models import KnowledgeGraph
 from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver
-from cognee.modules.ontology.get_default_ontology_resolver import get_default_ontology_resolver
+from cognee.modules.ontology.get_default_ontology_resolver import get_default_ontology_resolver, \
+    get_ontology_resolver_from_env


 def _create_node_key(node_id: str, category: str) -> str:
@@ -322,7 +324,15 @@ def expand_with_nodes_and_edges(
         existing_edges_map = {}

     if ontology_resolver is None:
-        ontology_resolver = get_default_ontology_resolver()
+        ontology_config = get_ontology_env_config()
+        if (
+            ontology_config.ontology_file_path
+            and ontology_config.ontology_resolver
+            and ontology_config.matching_strategy
+        ):
+            ontology_resolver = get_ontology_resolver_from_env(**ontology_config.to_dict())
+        else:
+            ontology_resolver = get_default_ontology_resolver()

     added_nodes_map = {}
     added_ontology_nodes_map = {}

From 28ed9c4c736fbe2e07f11e3f36f5adc6f9fcbc4f Mon Sep 17 00:00:00 2001
From: hajdul88 <52442977+hajdul88@users.noreply.github.com>
Date: Fri, 19 Sep 2025 13:58:18 +0200
Subject: [PATCH 28/32] ruff formatting

---
 .../graph/utils/expand_with_nodes_and_edges.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/cognee/modules/graph/utils/expand_with_nodes_and_edges.py b/cognee/modules/graph/utils/expand_with_nodes_and_edges.py
index 39c1d4bd1..3b01f5af4 100644
--- a/cognee/modules/graph/utils/expand_with_nodes_and_edges.py
+++ b/cognee/modules/graph/utils/expand_with_nodes_and_edges.py
@@ -11,8 +11,10 @@ from cognee.modules.ontology.base_ontology_resolver import BaseOntologyResolver
 from cognee.modules.ontology.ontology_env_config import get_ontology_env_config
 from cognee.shared.data_models import KnowledgeGraph
 from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver
-from cognee.modules.ontology.get_default_ontology_resolver import get_default_ontology_resolver, \
-    get_ontology_resolver_from_env
+from cognee.modules.ontology.get_default_ontology_resolver import (
+    get_default_ontology_resolver,
+    get_ontology_resolver_from_env,
+)


 def _create_node_key(node_id: str, category: str) -> str:
@@ -326,9 +328,9 @@ def expand_with_nodes_and_edges(
     if ontology_resolver is None:
         ontology_config = get_ontology_env_config()
         if (
-            ontology_config.ontology_file_path
-            and
ontology_config.ontology_resolver + and ontology_config.matching_strategy ): ontology_resolver = get_ontology_resolver_from_env(**ontology_config.to_dict()) else: From f93338bc3431b9a757915b534719694df548ce19 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Fri, 19 Sep 2025 16:38:14 +0200 Subject: [PATCH 29/32] fix: fixes ontology notebook --- notebooks/ontology_demo.ipynb | 986 ++-------------------------------- 1 file changed, 50 insertions(+), 936 deletions(-) diff --git a/notebooks/ontology_demo.ipynb b/notebooks/ontology_demo.ipynb index e48d8467d..ef4a046b8 100644 --- a/notebooks/ontology_demo.ipynb +++ b/notebooks/ontology_demo.ipynb @@ -36,45 +36,33 @@ }, { "cell_type": "code", - "execution_count": 1, "id": "8cf7ba29f9a150af", - "metadata": { - "ExecuteTime": { - "end_time": "2025-03-26T16:17:55.937140Z", - "start_time": "2025-03-26T16:17:55.908542Z" - } - }, - "outputs": [], + "metadata": {}, "source": [ "# Install required package\n", "# !pip install cognee" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "code", - "execution_count": null, "id": "abb86851", "metadata": {}, - "outputs": [], "source": [ "import os\n", "\n", "# Set up OpenAI API key (required for Cognee's LLM functionality)\n", "if \"LLM_API_KEY\" not in os.environ:\n", " os.environ[\"LLM_API_KEY\"] = \"your-api-key-here\" # Replace with your API key" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "code", - "execution_count": null, "id": "d825d126b3a0ec26", - "metadata": { - "ExecuteTime": { - "end_time": "2025-03-26T16:18:09.382400Z", - "start_time": "2025-03-26T16:18:09.342349Z" - } - }, - "outputs": [], + "metadata": {}, "source": [ "# Import required libraries\n", "import cognee\n", @@ -85,7 +73,9 @@ "from cognee.api.v1.search import SearchType\n", "\n", "logger = get_logger()" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -102,17 +92,10 @@ }, { "cell_type": "code", - "execution_count": 13, "id": "4d0e4a58e4207a7d", - "metadata": { - "ExecuteTime": { - "end_time": "2025-04-09T17:12:54.006718Z", - "start_time": "2025-04-09T17:12:53.992906Z" - } - }, - "outputs": [], + "metadata": {}, "source": [ - "async def run_pipeline(ontology_path=None):\n", + "async def run_pipeline(config=None):\n", " # Clean existing data\n", " await cognee.prune.prune_data()\n", " await cognee.prune.prune_system(metadata=True)\n", @@ -130,7 +113,7 @@ " await cognee.add(scientific_papers_dir)\n", " \n", " # Cognify with optional ontology\n", - " return await cognee.cognify(ontology_file_path=ontology_path)\n", + " return await cognee.cognify(config=config)\n", "\n", "async def query_pipeline(questions):\n", " answers = []\n", @@ -141,7 +124,9 @@ " )\n", " answers.append(search_results)\n", " return answers" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -155,423 +140,11 @@ }, { "cell_type": "code", - "execution_count": 14, "id": "1363772d2b48f5c0", - "metadata": { - "ExecuteTime": { - "end_time": "2025-04-09T17:14:31.818452Z", - "start_time": "2025-04-09T17:12:55.491598Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "\u001b[2m2025-08-27T13:55:36.031761\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mDeleted Kuzu database files at /Users/daulet/Desktop/dev/cognee-claude/cognee/.cognee_system/databases/cognee_graph_kuzu\u001b[0m [\u001b[0m\u001b[1m\u001b[34mcognee.shared.logging_utils\u001b[0m]\u001b[0m\n" - ] - }, - { - 
"name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "--- Results WITH ontology ---\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "\u001b[2m2025-08-27T13:55:36.330304\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mDatabase deleted successfully.\u001b[0m [\u001b[0m\u001b[1m\u001b[34mcognee.shared.logging_utils\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:55:36.521821\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mPipeline run started: `88a655ee-2a8f-5e47-90b4-ccc5aee28ee5`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_with_telemetry()\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:55:36.683661\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task started: `resolve_data_directories`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "User a6d0292a-e5d5-4087-a06d-e6e40c92ddbd has registered.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "\u001b[2m2025-08-27T13:55:36.852839\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task started: `ingest_data`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:55:37.022061\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mReading PDF: /Users/daulet/Desktop/dev/cognee/examples/data/scientific_papers/nutrients-13-01241.pdf\u001b[0m [\u001b[0m\u001b[1m\u001b[34mcognee.infrastructure.loaders.external.pypdf_loader\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:55:37.159853\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task completed: `ingest_data`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:55:37.317975\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task completed: `resolve_data_directories`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:55:37.464301\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mPipeline run completed: `88a655ee-2a8f-5e47-90b4-ccc5aee28ee5`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_with_telemetry()\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:55:37.631226\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mPipeline run started: `88a655ee-2a8f-5e47-90b4-ccc5aee28ee5`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_with_telemetry()\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:55:37.806056\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task started: `resolve_data_directories`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:55:37.952328\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task started: `ingest_data`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:55:38.123930\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mReading PDF: /Users/daulet/Desktop/dev/cognee/examples/data/scientific_papers/TOJ-22-0073_152Mendoza.pdf\u001b[0m [\u001b[0m\u001b[1m\u001b[34mcognee.infrastructure.loaders.external.pypdf_loader\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:55:38.230010\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task completed: `ingest_data`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n", - "\n", - 
"\u001b[2m2025-08-27T13:55:38.400266\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task completed: `resolve_data_directories`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:55:38.544525\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mPipeline run completed: `88a655ee-2a8f-5e47-90b4-ccc5aee28ee5`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_with_telemetry()\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:55:38.712540\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mOntology file 'examples/python/ontology_input_example/enriched_medical_ontology_with_classes.owl' not found. No owl ontology will be attached to the graph.\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:55:38.726158\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mPipeline run started: `3a6c74ba-93cd-56db-a1c5-9c48aa366dc5`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_with_telemetry()\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:55:38.871531\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task started: `classify_documents`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:55:39.018586\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task started: `check_permissions_on_dataset`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:55:39.179788\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mAsync Generator task started: `extract_chunks_from_documents`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:55:39.369582\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task started: `extract_graph_from_data`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n", - "\u001b[92m14:55:39 - LiteLLM:INFO\u001b[0m: utils.py:3341 - \n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "\n", - "\u001b[1m\n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\u001b[0m\n", - "\u001b[92m14:55:39 - LiteLLM:INFO\u001b[0m: utils.py:3341 - \n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "\n", - "\u001b[1m\n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\u001b[0m\n", - "\u001b[92m14:55:39 - LiteLLM:INFO\u001b[0m: utils.py:3341 - \n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "\n", - "\u001b[1m\n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:56:11.658679\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'study' in category 'classes'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:56:11.660483\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'coffee consumption study' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:56:11.660854\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'person' in category 'classes'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:56:11.661218\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'laura torres-collado' in 
category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n",
-    [... several hundred stderr log lines trimmed: repeated OntologyAdapter "No close match found for '<entity>' in category 'classes'/'individuals'" messages, duplicated LiteLLM "completion() model= gpt-4o-mini; provider = openai" banners, task start/completion logs for pipeline run `3a6c74ba-93cd-56db-a1c5-9c48aa366dc5`, and graph projection (93 nodes, 194 edges) / vector collection retrieval timings ...]
"\u001b[2m2025-08-27T13:57:16.301946\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mVector collection retrieval completed: Retrieved distances from 6 collections in 0.02s\u001b[0m [\u001b[0m\u001b[1m\u001b[34mcognee.shared.logging_utils\u001b[0m]\u001b[0m\n", - "\u001b[92m14:57:16 - LiteLLM:INFO\u001b[0m: utils.py:3341 - \n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "\n", - "\u001b[1m\n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\u001b[0m\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Q: What are common risk factors for Type 2 Diabetes?\n", - "A: ['Common risk factors for Type 2 Diabetes include:\\n1. Obesity, particularly with a high body mass index (BMI).\\n2. Physical inactivity or low levels of exercise.\\n3. Unhealthy diet, particularly high in sugar and fats.\\n4. Family history of diabetes.\\n5. Age, especially being over 45 years old.\\n6. High blood pressure or hypertension.\\n7. High cholesterol levels.\\n8. History of gestational diabetes or giving birth to a baby over 9 lbs.\\n9. Ethnicity, with higher risk in certain populations (e.g., African American, Hispanic).\\n10. Insulin resistance or metabolic syndrome.']\n", - "\n", - "Q: What preventive measures reduce the risk of Hypertension?\n", - "A: ['Preventive measures that reduce the risk of hypertension include moderate coffee consumption, which has been associated with a decreased risk of developing hypertension, heart failure, and atrial fibrillation. Additionally, adjustments in lifestyle factors, such as avoiding excessive coffee consumption, especially boiled or unfiltered varieties, which can raise cholesterol levels, can further help lower hypertension risk.']\n", - "\n", - "Q: What symptoms indicate possible Cardiovascular Disease?\n", - "A: ['Symptoms that indicate possible Cardiovascular Disease (CVD) may include, but are not limited to, chest pain, shortness of breath, fatigue, dizziness, and palpitations. Additionally, factors such as high blood pressure, high cholesterol, diabetes, and obesity can also be signs of increased risk for CVD.']\n", - "\n", - "Q: What diseases are associated with Obesity?\n", - "A: ['Diseases associated with obesity include cardiovascular disease, cancer, and diabetes. 
-    "A: ['Diseases associated with obesity include cardiovascular disease, cancer, and diabetes. Obesity can exacerbate these conditions and increase the risk of their occurrence.']\n",
-    "\n"
-   ]
-  }
- ],
+ "metadata": {},
  "source": [
+  "from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver\n",
+  "from cognee.modules.ontology.ontology_config import Config\n",
   "# Test questions\n",
   "questions = [\n",
   "    \"What are common risk factors for Type 2 Diabetes?\",\n",
@@ -581,450 +154,28 @@
   "]\n",
   "\n",
   "# Path to medical ontology\n",
-  "ontology_path = \"examples/python/ontology_input_example/enriched_medical_ontology_with_classes.owl\" # Update with your ontology path\n",
+  "ontology_path = \"../examples/python/ontology_input_example/enriched_medical_ontology_with_classes.owl\" # Update with your ontology path\n",
+  "\n",
+  "config: Config = {\n",
+  "    \"ontology_config\": {\n",
+  "        \"ontology_resolver\": RDFLibOntologyResolver(ontology_file=ontology_path)\n",
+  "    }\n",
+  "}\n",
   "\n",
   "# Run with ontology\n",
   "print(\"\\n--- Results WITH ontology ---\\n\")\n",
-  "await run_pipeline(ontology_path=ontology_path)\n",
+  "await run_pipeline(config=config)\n",
   "answers_with = await query_pipeline(questions)\n",
   "for q, a in zip(questions, answers_with):\n",
   "    print(f\"Q: {q}\\nA: {a}\\n\")"
  ],
+ "outputs": [],
+ "execution_count": null
 },
 {
  "cell_type": "code",
- "execution_count": 15,
  "id": "3aa18f4cdd5ceff6",
- "metadata": {
-  "ExecuteTime": {
-   "end_time": "2025-04-09T14:32:24.891560Z",
-   "start_time": "2025-04-09T14:30:47.863808Z"
-  }
- },
- "outputs": [
"\u001b[2m2025-08-27T13:57:26.013387\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task completed: `ingest_data`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:26.153761\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task completed: `resolve_data_directories`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:26.292634\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mPipeline run completed: `f4f6b83c-3555-5296-a812-107346770fbd`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_with_telemetry()\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:26.458350\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mPipeline run started: `f4f6b83c-3555-5296-a812-107346770fbd`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_with_telemetry()\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:26.606166\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task started: `resolve_data_directories`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:26.761724\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task started: `ingest_data`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:26.945575\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mReading PDF: /Users/daulet/Desktop/dev/cognee/examples/data/scientific_papers/TOJ-22-0073_152Mendoza.pdf\u001b[0m [\u001b[0m\u001b[1m\u001b[34mcognee.infrastructure.loaders.external.pypdf_loader\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:27.052394\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task completed: `ingest_data`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:27.201593\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task completed: `resolve_data_directories`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:27.360901\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mPipeline run completed: `f4f6b83c-3555-5296-a812-107346770fbd`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_with_telemetry()\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:27.532808\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mOntology file 'None' not found. 
No owl ontology will be attached to the graph.\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:27.561598\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mPipeline run started: `05779e2b-4ff1-5b13-8fc4-7fd789498ec4`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_with_telemetry()\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:27.722699\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task started: `classify_documents`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:27.871031\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task started: `check_permissions_on_dataset`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:28.023426\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mAsync Generator task started: `extract_chunks_from_documents`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:28.206266\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task started: `extract_graph_from_data`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n", - "\u001b[92m14:57:28 - LiteLLM:INFO\u001b[0m: utils.py:3341 - \n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "\n", - "\u001b[1m\n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\u001b[0m\n", - "\u001b[92m14:57:28 - LiteLLM:INFO\u001b[0m: utils.py:3341 - \n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "\n", - "\u001b[1m\n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\u001b[0m\n", - "\u001b[92m14:57:28 - LiteLLM:INFO\u001b[0m: utils.py:3341 - \n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "\n", - "\u001b[1m\n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.514070\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'study' in category 'classes'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.515588\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'coffee consumption study' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.516028\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'person' in category 'classes'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.516456\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'laura torres-collado' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.516897\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'laura maría compañ-gabucio' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.517277\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'sandra gonzález-palacios' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.517597\u001b[0m 
[\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'leyre notario-barandiaran' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.518313\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'alejandro oncina-cánovas' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.518865\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'jesús vioque' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.519339\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'manuela garcía-de la hera' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.519682\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'valencia nutrition study' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.520013\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'diet' in category 'classes'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.520298\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'mediterranean diet' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.520547\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'date' in category 'classes'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.520885\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'date study received' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.521219\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'date study accepted' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.521484\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'date study published' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.521798\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'mortality' in category 'classes'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.522131\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'cvd mortality' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.522432\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'cancer mortality' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.522689\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'all-cause 
mortality' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.523071\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'health behavior' in category 'classes'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.523338\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'coffee consumption' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.523618\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'valencia nutrition survey' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.523894\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'health outcome' in category 'classes'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.524143\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'mortality' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.524424\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'health condition' in category 'classes'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.524691\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'cardiovascular diseases' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.524970\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'cancer' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.525248\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'beverage' in category 'classes'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.525530\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'caffeinated coffee' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.525831\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'decaffeinated coffee' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.526090\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'statistical measure' in category 'classes'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.526348\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'hazard ratio' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.526990\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'time unit' in category 'classes'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - 
"\n", - "\u001b[2m2025-08-27T13:57:49.527685\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'person-years' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.528244\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'time period' in category 'classes'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.528578\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'study duration' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.528876\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'cumulative incidence' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.529258\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'concept' in category 'classes'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.529535\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'adult life' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.529802\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'chronic illness' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.530362\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'all-cause mortality' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.530650\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'longitudinal studies' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.530924\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'mediterranean lifestyle' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.531196\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'research study' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.531491\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'self-reported data' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.531710\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'spanish population' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.531986\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'sample size' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.532239\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo 
close match found for 'response bias' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.532464\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'funding sources' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.532711\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'ethical approval' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:49.532977\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'informed consent' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:52.613755\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task started: `summarize_text`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n", - "\u001b[92m14:57:52 - LiteLLM:INFO\u001b[0m: utils.py:3341 - \n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "\n", - "\u001b[1m\n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\u001b[0m\n", - "\u001b[92m14:57:52 - LiteLLM:INFO\u001b[0m: utils.py:3341 - \n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "\n", - "\u001b[1m\n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\u001b[0m\n", - "\u001b[92m14:57:52 - LiteLLM:INFO\u001b[0m: utils.py:3341 - \n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "\n", - "\u001b[1m\n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:57:57.853670\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task started: `add_data_points`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:03.401474\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task completed: `add_data_points`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:03.562616\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task completed: `summarize_text`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:03.705987\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task completed: `extract_graph_from_data`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:03.855674\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mAsync Generator task completed: `extract_chunks_from_documents`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:04.012884\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task completed: `check_permissions_on_dataset`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:04.169312\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task completed: `classify_documents`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:04.474564\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mPipeline run completed: 
`05779e2b-4ff1-5b13-8fc4-7fd789498ec4`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_with_telemetry()\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:04.632222\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mPipeline run started: `05779e2b-4ff1-5b13-8fc4-7fd789498ec4`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_with_telemetry()\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:04.779116\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task started: `classify_documents`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:04.932294\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task started: `check_permissions_on_dataset`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:05.105238\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mAsync Generator task started: `extract_chunks_from_documents`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:05.297050\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task started: `extract_graph_from_data`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n", - "\u001b[92m14:58:05 - LiteLLM:INFO\u001b[0m: utils.py:3341 - \n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "\n", - "\u001b[1m\n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\u001b[0m\n", - "\u001b[92m14:58:05 - LiteLLM:INFO\u001b[0m: utils.py:3341 - \n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "\n", - "\u001b[1m\n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:29.909979\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'person' in category 'classes'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:29.912597\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'michael f. mendoza' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:29.913179\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'ralf martz sulague' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:29.913706\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'therese posas-mendoza' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:29.914114\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'carl j. 
lavie' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:29.914529\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'beverage' in category 'classes'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:29.914920\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'coffee' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:29.915255\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'health domain' in category 'classes'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:29.915646\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'cardiovascular health' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:29.916079\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'disease' in category 'classes'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:29.916464\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'hypertension' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:29.916802\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'substance' in category 'classes'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:29.917174\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'cholesterol' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:29.917500\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'atrial fibrillation' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:29.917880\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'coronary heart disease' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:29.918233\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'phenolic acid' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:29.918754\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'diterpenes' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:29.919133\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'gene' in category 'classes'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:29.919446\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'cyp1a2' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:29.919786\u001b[0m 
[\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'health advisory' in category 'classes'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:29.920133\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'caffeine consumption during pregnancy' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:29.920442\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'caffeine' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:29.920789\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'myocardial infarction' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:29.921121\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'heart failure' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:29.921517\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'chemical compounds' in category 'classes'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:29.921853\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'antioxidants' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:29.922179\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'medical condition' in category 'classes'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:29.922542\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'cardiovascular disease' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:29.922792\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'medical procedure' in category 'classes'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:29.923044\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'cardiac surgery' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:29.923349\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'health metric' in category 'classes'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:29.923688\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'mortality' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:29.923894\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'research' in category 'classes'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:29.924157\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'ali-hassan-sayegh et al (2014)' in 
category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:29.924444\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'ding et al (2015)' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:29.924734\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'lopez-garcia et al (2008)' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:29.925225\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'de koning gans et al (2010)' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:29.925735\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'andersen et al (2006)' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:29.926028\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'kleemola et al (2000)' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:29.926485\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'kim et al (2019)' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:29.926770\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'chemical compound' in category 'classes'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:29.927582\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'blood pressure' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:29.927856\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'concept' in category 'classes'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:29.928277\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mNo close match found for 'moderation' in category 'individuals'\u001b[0m [\u001b[0m\u001b[1m\u001b[34mOntologyAdapter\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:33.804451\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task started: `summarize_text`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n", - "\u001b[92m14:58:33 - LiteLLM:INFO\u001b[0m: utils.py:3341 - \n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "\n", - "\u001b[1m\n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\u001b[0m\n", - "\u001b[92m14:58:33 - LiteLLM:INFO\u001b[0m: utils.py:3341 - \n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "\n", - "\u001b[1m\n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:40.232682\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task started: `add_data_points`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:44.523716\u001b[0m 
[\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task completed: `add_data_points`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:44.668235\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task completed: `summarize_text`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:44.816078\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task completed: `extract_graph_from_data`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:44.967879\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mAsync Generator task completed: `extract_chunks_from_documents`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:45.126329\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task completed: `check_permissions_on_dataset`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:45.296802\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mCoroutine task completed: `classify_documents`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_base\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:45.447048\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mPipeline run completed: `05779e2b-4ff1-5b13-8fc4-7fd789498ec4`\u001b[0m [\u001b[0m\u001b[1m\u001b[34mrun_tasks_with_telemetry()\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:45.774857\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mGraph projection completed: 102 nodes, 214 edges in 0.00s\u001b[0m [\u001b[0m\u001b[1m\u001b[34mCogneeGraph\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:46.269205\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mVector collection retrieval completed: Retrieved distances from 6 collections in 0.02s\u001b[0m [\u001b[0m\u001b[1m\u001b[34mcognee.shared.logging_utils\u001b[0m]\u001b[0m\n", - "\u001b[92m14:58:46 - LiteLLM:INFO\u001b[0m: utils.py:3341 - \n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "\n", - "\u001b[1m\n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:49.120648\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mGraph projection completed: 102 nodes, 214 edges in 0.00s\u001b[0m [\u001b[0m\u001b[1m\u001b[34mCogneeGraph\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:49.625746\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mVector collection retrieval completed: Retrieved distances from 6 collections in 0.01s\u001b[0m [\u001b[0m\u001b[1m\u001b[34mcognee.shared.logging_utils\u001b[0m]\u001b[0m\n", - "\u001b[92m14:58:49 - LiteLLM:INFO\u001b[0m: utils.py:3341 - \n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "\n", - "\u001b[1m\n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:53.119208\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mGraph projection completed: 102 nodes, 214 edges in 0.01s\u001b[0m [\u001b[0m\u001b[1m\u001b[34mCogneeGraph\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:53.576759\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mVector collection retrieval completed: Retrieved distances from 6 collections in 0.02s\u001b[0m 
[\u001b[0m\u001b[1m\u001b[34mcognee.shared.logging_utils\u001b[0m]\u001b[0m\n", - "\u001b[92m14:58:53 - LiteLLM:INFO\u001b[0m: utils.py:3341 - \n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "\n", - "\u001b[1m\n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:56.395448\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mGraph projection completed: 102 nodes, 214 edges in 0.01s\u001b[0m [\u001b[0m\u001b[1m\u001b[34mCogneeGraph\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:56.961329\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mVector collection retrieval completed: Retrieved distances from 6 collections in 0.02s\u001b[0m [\u001b[0m\u001b[1m\u001b[34mcognee.shared.logging_utils\u001b[0m]\u001b[0m\n", - "\u001b[92m14:58:57 - LiteLLM:INFO\u001b[0m: utils.py:3341 - \n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "\n", - "\u001b[1m\n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\u001b[0m\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Q: What are common risk factors for Type 2 Diabetes?\n", - "A: ['Common risk factors for Type 2 Diabetes include:\\n1. High body mass index (BMI) \\n2. Physical inactivity \\n3. Poor diet (such as low adherence to a Mediterranean diet)\\n4. Smoking\\n5. Age (increased risk with older age)\\n6. Family history of diabetes\\n7. High blood pressure\\n8. High blood cholesterol\\n9. Waist circumference (indicating abdominal obesity)\\n10. Presence of chronic diseases (e.g., cardiovascular diseases, hypertension)']\n", - "\n", - "Q: What preventive measures reduce the risk of Hypertension?\n", - "A: ['Preventive measures to reduce the risk of hypertension include:\\n1. **Moderate Coffee Consumption**: Studies suggest that moderate coffee intake may lower the risk of developing hypertension.\\n2. **Diet and Lifestyle**: Adopting a healthy diet, particularly one resembling the Mediterranean diet, and maintaining a healthy lifestyle plays a crucial role in cardiovascular health.\\n3. **Managing Genetics and Smoking Status**: Outcomes of coffee consumption on blood pressure may vary based on genetic factors, especially those related to caffeine metabolism, and smoking habits should be considered.']\n", - "\n", - "Q: What symptoms indicate possible Cardiovascular Disease?\n", - "A: ['Symptoms indicating possible cardiovascular disease may include: \\n- Chest pain or discomfort (angina) \\n- Shortness of breath \\n- Fatigue or weakness \\n- Palpitations or irregular heartbeat \\n- Dizziness or fainting \\n- Swelling in the legs, ankles, or feet \\n- Pain or numbness in the arms or legs. \\nFurther evaluation by a healthcare provider is essential for accurate diagnosis and treatment.']\n", - "\n", - "Q: What diseases are associated with Obesity?\n", - "A: ['Diseases associated with obesity include cardiovascular diseases, hypertension, and type 2 diabetes. 
Obesity is linked with an increased risk for these conditions, which can lead to complications in heart health and overall mortality.']\n", - "\n" - ] - } - ], + "metadata": {}, "source": [ "# Run without ontology\n", "print(\"\\n--- Results WITHOUT ontology ---\\n\")\n", @@ -1032,7 +183,9 @@ "answers_without = await query_pipeline(questions)\n", "for q, a in zip(questions, answers_without):\n", " print(f\"Q: {q}\\nA: {a}\\n\")" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -1046,45 +199,8 @@ }, { "cell_type": "code", - "execution_count": 16, "id": "36ee2a360f47a054", - "metadata": { - "ExecuteTime": { - "end_time": "2025-04-09T15:25:33.512697Z", - "start_time": "2025-04-09T15:25:33.471854Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "\u001b[2m2025-08-27T13:58:58.679995\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mGraph visualization saved as /Users/daulet/graph_visualization.html\u001b[0m [\u001b[0m\u001b[1m\u001b[34mcognee.shared.logging_utils\u001b[0m]\u001b[0m\n", - "\n", - "\u001b[2m2025-08-27T13:58:58.682148\u001b[0m [\u001b[32m\u001b[1minfo \u001b[0m] \u001b[1mThe HTML file has been stored on your home directory! Navigate there with cd ~\u001b[0m [\u001b[0m\u001b[1m\u001b[34mcognee.shared.logging_utils\u001b[0m]\u001b[0m\n" - ] - }, - { - "data": { - "text/plain": [ - "'/Users/daulet/graph_visualization.html'" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], + "metadata": {}, "source": [ "import webbrowser\n", "import os\n", @@ -1094,7 +210,9 @@ "html_file = os.path.join(home_dir, \"graph_visualization.html\")\n", "display(html_file)\n", "webbrowser.open(f\"file://{html_file}\")" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -1132,22 +250,8 @@ }, { "cell_type": "code", - "execution_count": null, "id": "8d2a0fe555a7bc0f", "metadata": {}, - "outputs": [ - { - "ename": "", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n", - "\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n", - "\u001b[1;31mClick here for more info. \n", - "\u001b[1;31mView Jupyter log for further details." - ] - } - ], "source": [ "# Only exit in interactive mode, not during GitHub Actions\n", "import os\n", @@ -1158,7 +262,17 @@ " os._exit(0)\n", "else:\n", " print(\"Skipping kernel exit - running in GitHub Actions\")" - ] + ], + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "code", + "source": "", + "id": "adb6601890237b6a", + "outputs": [], + "execution_count": null } ], "metadata": { From 6821f900ee7d53b1e951b18aaf8a3a9b2f0587cf Mon Sep 17 00:00:00 2001 From: Boris Arzentar Date: Fri, 19 Sep 2025 17:27:17 +0200 Subject: [PATCH 30/32] version: v0.3.4 --- pyproject.toml | 2 +- uv.lock | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 540e9cbbc..7a30f5038 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "cognee" -version = "0.3.4.dev4" +version = "0.3.4" description = "Cognee - is a library for enriching LLM context with a semantic layer for better understanding and reasoning." 
 authors = [
     { name = "Vasilije Markovic" },
diff --git a/uv.lock b/uv.lock
index 91e4680f2..a09a543bc 100644
--- a/uv.lock
+++ b/uv.lock
@@ -811,7 +811,7 @@ wheels = [
 
 [[package]]
 name = "cognee"
-version = "0.3.4.dev4"
+version = "0.3.4"
 source = { editable = "." }
 dependencies = [
     { name = "aiofiles" },

From 38c05ba71ac1074cc617c6e0582160ea755fccdd Mon Sep 17 00:00:00 2001
From: Boris Arzentar
Date: Fri, 19 Sep 2025 17:55:08 +0200
Subject: [PATCH 31/32] fix: ruff formatting error

---
 cognee/base_config.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cognee/base_config.py b/cognee/base_config.py
index 100262dfc..a2ad06249 100644
--- a/cognee/base_config.py
+++ b/cognee/base_config.py
@@ -13,7 +13,6 @@ class BaseConfig(BaseSettings):
     cache_root_directory: str = get_absolute_path(".cognee_cache")
     monitoring_tool: object = Observer.NONE
 
-
     @pydantic.model_validator(mode="after")
     def validate_paths(self):
         # Adding this here temporarily to ensure that the cache root directory is set correctly for S3 storage automatically

From 96eb0d448a2ed77295bdaac6f83119fb635810b2 Mon Sep 17 00:00:00 2001
From: Chaitany <67532224+patelchaitany@users.noreply.github.com>
Date: Fri, 19 Sep 2025 21:54:33 +0530
Subject: [PATCH 32/32] feat(#1357): Lexical chunk retriever (#1392)

## Description

I implemented a lexical chunk retriever. The LexicalRetriever class inherits from BaseRetriever, and the DocumentChunks are lazy-loaded the first time a query is made, which saves time during object initialization. The get_context and get_completion functions are implemented the same way as in ChunksRetriever; the only difference is that the DocumentChunks are converted to match the output type of ChunksRetriever, using the get_own_properties function in the utils.

## Type of Change

- [-] Bug fix (non-breaking change that fixes an issue)
- [-] New feature (non-breaking change that adds functionality)
- [-] Breaking change (fix or feature that would cause existing functionality to change)
- [-] Documentation update
- [-] Code refactoring
- [-] Performance improvement
- [-] Other (please specify):

## Changes Made

- Added LexicalRetriever base class with customizable tokenizer & scorer
- Implemented caching of DocumentChunk tokens and payloads
- Added robust initialization with error handling and logging
- Implemented get_context with top_k ranking and optional scores
- Implemented get_completion consistent with BaseRetriever interface
- Added JaccardChunksRetriever demo using set/multiset Jaccard similarity
- Support for stopwords and multiset frequency-aware similarity
- Integrated logging for initialization, scoring, and retrieval

## Testing

- Manual tests: initialized retriever, retrieved chunks with toy corpus
- Edge cases: empty corpus, empty query, scorer/tokenizer errors
- Verified Jaccard similarity results for single/multiset cases
- Code formatted and linted

## Screenshots/Videos (if applicable)

## Pre-submission Checklist

- [-] **I have tested my changes thoroughly before submitting this PR**
- [-] **This PR contains minimal changes necessary to address the issue/feature**
- [-] My code follows the project's coding standards and style guidelines
- [-] I have added tests that prove my fix is effective or that my feature works
- [-] I have added necessary documentation (if applicable)
- [-] All new and existing tests pass
- [-] I have searched existing PRs to ensure this change hasn't been submitted already
- [-] I have linked any relevant issues in the description
- [-] My commits have clear and descriptive messages
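## Usage Sketch

A minimal, illustrative way to exercise the new retriever directly (a sketch, not canonical usage: it assumes a graph store already populated with DocumentChunk nodes — initialize() raises NoDataError otherwise — and the stopword list and query below are placeholders, not part of this PR):

```python
import asyncio

from cognee.modules.retrieval.jaccard_retrival import JaccardChunksRetriever


async def main():
    # Multiset Jaccard compares token frequencies rather than plain token
    # sets; stopwords are lowercased and filtered before scoring.
    retriever = JaccardChunksRetriever(
        top_k=5,
        with_scores=True,
        stop_words=["the", "a", "an", "of"],
        multiset_jaccard=True,
    )
    # get_completion falls back to get_context when no context is passed,
    # so this returns (payload, score) pairs because with_scores=True.
    for payload, score in await retriever.get_completion("coffee and heart health"):
        print(f"{score:.3f}  {payload['text'][:80]}")


asyncio.run(main())
```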
## Related Issues

Relates to #1392

## Additional Notes

In cognee/modules/chunking/models/DocumentChunk.py, don't remove the Optional from the is_part_of attribute.

## DCO Affirmation

I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.

---------

Co-authored-by: Andrej Milicevic
Co-authored-by: Igor Ilic <30923996+dexters1@users.noreply.github.com>
Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>
Co-authored-by: Igor Ilic
Co-authored-by: Vasilije <8619304+Vasilije1990@users.noreply.github.com>
Co-authored-by: Boris
Co-authored-by: lxobr <122801072+lxobr@users.noreply.github.com>
---
 .../disable_independent_workflows.sh          |   2 +-
 .github/workflows/test_gemini.yml             |  29 -----
 .github/workflows/test_llms.yml               |  86 +++++++++++++
 .github/workflows/test_openrouter.yml         |  30 -----
 .github/workflows/test_suites.yml             |  23 ++--
 cognee/api/v1/search/search.py                |   3 +
 cognee/modules/retrieval/jaccard_retrival.py  |  56 +++++++++
 cognee/modules/retrieval/lexical_retriever.py | 117 ++++++++++++++++++
 .../search/methods/get_search_type_tools.py   |   5 +
 cognee/modules/search/types/SearchType.py     |   1 +
 10 files changed, 276 insertions(+), 76 deletions(-)
 delete mode 100644 .github/workflows/test_gemini.yml
 create mode 100644 .github/workflows/test_llms.yml
 delete mode 100644 .github/workflows/test_openrouter.yml
 create mode 100644 cognee/modules/retrieval/jaccard_retrival.py
 create mode 100644 cognee/modules/retrieval/lexical_retriever.py

diff --git a/.github/workflows/disable_independent_workflows.sh b/.github/workflows/disable_independent_workflows.sh
index 693c3092d..ff57da80d 100755
--- a/.github/workflows/disable_independent_workflows.sh
+++ b/.github/workflows/disable_independent_workflows.sh
@@ -10,7 +10,7 @@ WORKFLOWS=(
   "test_kuzu.yml"
   "test_multimetric_qa_eval_run.yaml"
   "test_graphrag_vs_rag_notebook.yml"
-  "test_gemini.yml"
+  "test_llms.yml"
   "test_multimedia_example.yaml"
   "test_deduplication.yml"
   "test_eval_framework.yml"
diff --git a/.github/workflows/test_gemini.yml b/.github/workflows/test_gemini.yml
deleted file mode 100644
index 544e15a5e..000000000
--- a/.github/workflows/test_gemini.yml
+++ /dev/null
@@ -1,29 +0,0 @@
-name: test | gemini
-
-on:
-  workflow_call:
-
-jobs:
-  test-gemini:
-    name: Run Gemini Test
-    runs-on: ubuntu-22.04
-    steps:
-      - name: Check out repository
-        uses: actions/checkout@v4
-
-      - name: Cognee Setup
-        uses: ./.github/actions/cognee_setup
-        with:
-          python-version: '3.11.x'
-
-      - name: Run Gemini Simple Example
-        env:
-          LLM_PROVIDER: "gemini"
-          LLM_API_KEY: ${{ secrets.GEMINI_API_KEY }}
-          LLM_MODEL: "gemini/gemini-1.5-flash"
-          EMBEDDING_PROVIDER: "gemini"
-          EMBEDDING_API_KEY: ${{ secrets.GEMINI_API_KEY }}
-          EMBEDDING_MODEL: "gemini/text-embedding-004"
-          EMBEDDING_DIMENSIONS: "768"
-          EMBEDDING_MAX_TOKENS: "8076"
-        run: uv run python ./examples/python/simple_example.py
diff --git a/.github/workflows/test_llms.yml b/.github/workflows/test_llms.yml
new file mode 100644
index 000000000..5a0f947c9
--- /dev/null
+++ b/.github/workflows/test_llms.yml
@@ -0,0 +1,86 @@
+name: LLM Test Suites
+
+permissions:
+  contents: read
+
+on:
+  workflow_call:
+
+env:
+  RUNTIME__LOG_LEVEL: ERROR
+  ENV: 'dev'
+
+jobs:
+  test-gemini:
+    name: Run Gemini Test
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@v4
+
+      - name: Cognee Setup
+        uses: ./.github/actions/cognee_setup
+        with:
+          python-version: '3.11.x'
+
+      - name: Run Gemini Simple Example
+        env:
+          LLM_PROVIDER: "gemini"
+          LLM_API_KEY: ${{ secrets.GEMINI_API_KEY }}
+          LLM_MODEL: "gemini/gemini-1.5-flash"
+          EMBEDDING_PROVIDER: "gemini"
+          EMBEDDING_API_KEY: ${{ secrets.GEMINI_API_KEY }}
+          EMBEDDING_MODEL: "gemini/text-embedding-004"
+          EMBEDDING_DIMENSIONS: "768"
+          EMBEDDING_MAX_TOKENS: "8076"
+        run: uv run python ./examples/python/simple_example.py
+
+  test-fastembed:
+    name: Run Fastembed Test
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@v4
+
+      - name: Cognee Setup
+        uses: ./.github/actions/cognee_setup
+        with:
+          python-version: '3.11.x'
+
+      - name: Run Fastembed Simple Example
+        env:
+          LLM_PROVIDER: "openai"
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_MODEL: ${{ secrets.LLM_MODEL }}
+          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+          LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+          EMBEDDING_PROVIDER: "fastembed"
+          EMBEDDING_MODEL: "sentence-transformers/all-MiniLM-L6-v2"
+          EMBEDDING_DIMENSIONS: "384"
+          EMBEDDING_MAX_TOKENS: "256"
+        run: uv run python ./examples/python/simple_example.py
+
+  test-openrouter:
+    name: Run OpenRouter Test
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@v4
+
+      - name: Cognee Setup
+        uses: ./.github/actions/cognee_setup
+        with:
+          python-version: '3.11.x'
+
+      - name: Run OpenRouter Simple Example
+        env:
+          LLM_PROVIDER: "custom"
+          LLM_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
+          LLM_MODEL: "openrouter/x-ai/grok-code-fast-1"
+          LLM_ENDPOINT: "https://openrouter.ai/api/v1"
+          EMBEDDING_PROVIDER: "openai"
+          EMBEDDING_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          EMBEDDING_MODEL: "openai/text-embedding-3-large"
+          EMBEDDING_DIMENSIONS: "3072"
+          EMBEDDING_MAX_TOKENS: "8191"
+        run: uv run python ./examples/python/simple_example.py
\ No newline at end of file
diff --git a/.github/workflows/test_openrouter.yml b/.github/workflows/test_openrouter.yml
deleted file mode 100644
index 9c2dcdebe..000000000
--- a/.github/workflows/test_openrouter.yml
+++ /dev/null
@@ -1,30 +0,0 @@
-name: test | openrouter
-
-on:
-  workflow_call:
-
-jobs:
-  test-openrouter:
-    name: Run OpenRouter Test
-    runs-on: ubuntu-22.04
-    steps:
-      - name: Check out repository
-        uses: actions/checkout@v4
-
-      - name: Cognee Setup
-        uses: ./.github/actions/cognee_setup
-        with:
-          python-version: '3.11.x'
-
-      - name: Run OpenRouter Simple Example
-        env:
-          LLM_PROVIDER: "custom"
-          LLM_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
-          LLM_MODEL: "openrouter/x-ai/grok-code-fast-1"
-          LLM_ENDPOINT: "https://openrouter.ai/api/v1"
-          EMBEDDING_PROVIDER: "openai"
-          EMBEDDING_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-          EMBEDDING_MODEL: "openai/text-embedding-3-large"
-          EMBEDDING_DIMENSIONS: "3072"
-          EMBEDDING_MAX_TOKENS: "8191"
-        run: uv run python ./examples/python/simple_example.py
diff --git a/.github/workflows/test_suites.yml b/.github/workflows/test_suites.yml
index 86f89249d..ff18f2962 100644
--- a/.github/workflows/test_suites.yml
+++ b/.github/workflows/test_suites.yml
@@ -115,16 +115,10 @@ jobs:
     secrets: inherit
 
   # Additional LLM tests
-  gemini-tests:
-    name: Gemini Tests
-    needs: [basic-tests, e2e-tests]
-    uses: ./.github/workflows/test_gemini.yml
-    secrets: inherit
-
-  openrouter-tests:
-    name: OpenRouter Tests
-    needs: [basic-tests, e2e-tests]
-    uses: ./.github/workflows/test_openrouter.yml
+  llm-tests:
+    name: LLM Test Suite
+    needs: [ basic-tests, e2e-tests ]
+    uses: ./.github/workflows/test_llms.yml
     secrets: inherit
 
   # Ollama tests moved to the end
@@ -138,8 +132,7 @@ jobs:
       different-operating-systems-tests,
       vector-db-tests,
       example-tests,
-      gemini-tests,
-      openrouter-tests,
+      llm-tests,
       mcp-test,
       relational-db-migration-tests,
       docker-compose-test,
@@ -161,8 +154,7 @@ jobs:
       example-tests,
       db-examples-tests,
       mcp-test,
-      gemini-tests,
-      openrouter-tests,
+      llm-tests,
       ollama-tests,
       relational-db-migration-tests,
       docker-compose-test,
@@ -183,8 +175,7 @@ jobs:
             "${{ needs.example-tests.result }}" == "success" &&
             "${{ needs.db-examples-tests.result }}" == "success" &&
             "${{ needs.relational-db-migration-tests.result }}" == "success" &&
-            "${{ needs.gemini-tests.result }}" == "success" &&
-            "${{ needs.openrouter-tests.result }}" == "success" &&
+            "${{ needs.llm-tests.result }}" == "success" &&
             "${{ needs.docker-compose-test.result }}" == "success" &&
             "${{ needs.docker-ci-test.result }}" == "success" &&
             "${{ needs.ollama-tests.result }}" == "success" ]]; then
diff --git a/cognee/api/v1/search/search.py b/cognee/api/v1/search/search.py
index dcebce012..7209c6036 100644
--- a/cognee/api/v1/search/search.py
+++ b/cognee/api/v1/search/search.py
@@ -82,6 +82,9 @@ async def search(
         Best for: General-purpose queries or when you're unsure which search type is best.
         Returns: The results from the automatically selected search type.
 
+    **CHUNKS_LEXICAL**:
+        Token-based lexical chunk search (e.g., Jaccard). Best for: exact-term matching, stopword-aware lookups.
+        Returns: Ranked text chunks (optionally with scores).
 
     Args:
         query_text: Your question or search query in natural language.
diff --git a/cognee/modules/retrieval/jaccard_retrival.py b/cognee/modules/retrieval/jaccard_retrival.py
new file mode 100644
index 000000000..91d2b67f7
--- /dev/null
+++ b/cognee/modules/retrieval/jaccard_retrival.py
@@ -0,0 +1,56 @@
+from cognee.modules.retrieval.lexical_retriever import LexicalRetriever
+import re
+from collections import Counter
+from typing import Optional
+class JaccardChunksRetriever(LexicalRetriever):
+    """
+    Retriever that specializes LexicalRetriever to use Jaccard similarity.
+    """
+
+    def __init__(self, top_k: int = 10, with_scores: bool = False,
+                 stop_words: Optional[list[str]] = None, multiset_jaccard: bool = False):
+        """
+        Parameters
+        ----------
+        top_k : int
+            Number of top results to return.
+        with_scores : bool
+            If True, return (payload, score) pairs. Otherwise, only payloads.
+        stop_words : list[str], optional
+            List of tokens to filter out.
+        multiset_jaccard : bool
+            If True, use multiset Jaccard (frequency aware).
+        """
+        self.stop_words = {t.lower() for t in stop_words} if stop_words else set()
+        self.multiset_jaccard = multiset_jaccard
+
+        super().__init__(
+            tokenizer=self._tokenizer,
+            scorer=self._scorer,
+            top_k=top_k,
+            with_scores=with_scores
+        )
+
+    def _tokenizer(self, text: str) -> list[str]:
+        """
+        Tokenizer: lowercases, splits on word characters (\w+), filters stopwords.
+        """
+        tokens = re.findall(r"\w+", text.lower())
+        return [t for t in tokens if t not in self.stop_words]
+
+    def _scorer(self, query_tokens: list[str], chunk_tokens: list[str]) -> float:
+        """
+        Jaccard similarity scorer.
+        - If multiset_jaccard=True, uses frequency-aware Jaccard.
+        - Otherwise, normal set Jaccard.
+ """ + if self.multiset_jaccard: + q_counts, c_counts = Counter(query_tokens), Counter(chunk_tokens) + numerator = sum(min(q_counts[t], c_counts[t]) for t in set(q_counts) | set(c_counts)) + denominator = sum(max(q_counts[t], c_counts[t]) for t in set(q_counts) | set(c_counts)) + return numerator / denominator if denominator else 0.0 + else: + q_set, c_set = set(query_tokens), set(chunk_tokens) + if not q_set or not c_set: + return 0.0 + return len(q_set & c_set) / len(q_set | c_set) diff --git a/cognee/modules/retrieval/lexical_retriever.py b/cognee/modules/retrieval/lexical_retriever.py new file mode 100644 index 000000000..2292b64c8 --- /dev/null +++ b/cognee/modules/retrieval/lexical_retriever.py @@ -0,0 +1,117 @@ +import asyncio +from typing import Any, Callable, Optional +from heapq import nlargest + +from cognee.infrastructure.databases.graph import get_graph_engine +from cognee.modules.retrieval.base_retriever import BaseRetriever +from cognee.modules.retrieval.exceptions.exceptions import NoDataError +from cognee.shared.logging_utils import get_logger + + +logger = get_logger("LexicalRetriever") + + +class LexicalRetriever(BaseRetriever): + + def __init__(self, tokenizer: Callable, scorer: Callable, top_k: int = 10, with_scores: bool = False): + if not callable(tokenizer) or not callable(scorer): + raise TypeError("tokenizer and scorer must be callables") + if not isinstance(top_k, int) or top_k <= 0: + raise ValueError("top_k must be a positive integer") + + self.tokenizer = tokenizer + self.scorer = scorer + self.top_k = top_k + self.with_scores = bool(with_scores) + + # Cache keyed by dataset context + self.chunks: dict[str, Any] = {} # {chunk_id: tokens} + self.payloads: dict[str, Any] = {} # {chunk_id: original_document} + self._initialized = False + self._init_lock = asyncio.Lock() + + async def initialize(self): + """Initialize retriever by reading all DocumentChunks from graph_engine.""" + async with self._init_lock: + if self._initialized: + return + + logger.info("Initializing LexicalRetriever by loading DocumentChunks from graph engine") + + try: + graph_engine = await get_graph_engine() + nodes, _ = await graph_engine.get_filtered_graph_data([{"type": ["DocumentChunk"]}]) + except Exception as e: + logger.error("Graph engine initialization failed") + raise NoDataError("Graph engine initialization failed") from e + + chunk_count = 0 + for node in nodes: + try: + chunk_id, document = node + except Exception: + logger.warning("Skipping node with unexpected shape: %r", node) + continue + + if document.get("type") == "DocumentChunk" and document.get("text"): + try: + tokens = self.tokenizer(document["text"]) + if not tokens: + continue + self.chunks[str(document.get("id",chunk_id))] = tokens + self.payloads[str(document.get("id",chunk_id))] = document + chunk_count += 1 + except Exception as e: + logger.error("Tokenizer failed for chunk %s: %s", chunk_id, str(e)) + + if chunk_count == 0: + logger.error("Initialization completed but no valid chunks were loaded.") + raise NoDataError("No valid chunks loaded during initialization.") + + self._initialized = True + logger.info("Initialized with %d document chunks", len(self.chunks)) + + async def get_context(self, query: str) -> Any: + """Retrieves relevant chunks for the given query.""" + if not self._initialized: + await self.initialize() + + if not self.chunks: + logger.warning("No chunks available in retriever") + return [] + + try: + query_tokens = self.tokenizer(query) + except Exception as e: + logger.error("Failed to 
tokenize query: %s", str(e)) + return [] + + if not query_tokens: + logger.warning("Query produced no tokens") + return [] + + results = [] + for chunk_id, chunk_tokens in self.chunks.items(): + try: + score = self.scorer(query_tokens, chunk_tokens) + if not isinstance(score, (int, float)): + logger.warning("Non-numeric score for chunk %s → treated as 0.0", chunk_id) + score = 0.0 + except Exception as e: + logger.error("Scorer failed for chunk %s: %s", chunk_id, str(e)) + score = 0.0 + results.append((chunk_id, score)) + + top_results = nlargest(self.top_k, results, key=lambda x: x[1]) + logger.info("Retrieved %d/%d chunks for query (len=%d)", len(top_results), len(results), len(query_tokens)) + + if self.with_scores: + return [(self.payloads[chunk_id], score) for chunk_id, score in top_results] + else: + return [self.payloads[chunk_id] for chunk_id, _ in top_results] + + async def get_completion(self, query: str, context: Optional[Any] = None) -> Any: + """Returns context for the given query (retrieves if not provided).""" + if context is None: + context = await self.get_context(query) + return context diff --git a/cognee/modules/search/methods/get_search_type_tools.py b/cognee/modules/search/methods/get_search_type_tools.py index 551f77a16..c5ea53a62 100644 --- a/cognee/modules/search/methods/get_search_type_tools.py +++ b/cognee/modules/search/methods/get_search_type_tools.py @@ -15,6 +15,7 @@ from cognee.modules.retrieval.completion_retriever import CompletionRetriever from cognee.modules.retrieval.graph_completion_retriever import GraphCompletionRetriever from cognee.modules.retrieval.temporal_retriever import TemporalRetriever from cognee.modules.retrieval.coding_rules_retriever import CodingRulesRetriever +from cognee.modules.retrieval.jaccard_retrival import JaccardChunksRetriever from cognee.modules.retrieval.graph_summary_completion_retriever import ( GraphSummaryCompletionRetriever, ) @@ -152,6 +153,10 @@ async def get_search_type_tools( TemporalRetriever(top_k=top_k).get_completion, TemporalRetriever(top_k=top_k).get_context, ], + SearchType.CHUNKS_LEXICAL: (lambda _r=JaccardChunksRetriever(top_k=top_k): [ + _r.get_completion, + _r.get_context, + ])(), SearchType.CODING_RULES: [ CodingRulesRetriever(rules_nodeset_name=node_name).get_existing_rules, ], diff --git a/cognee/modules/search/types/SearchType.py b/cognee/modules/search/types/SearchType.py index f5a23efff..418aec0b5 100644 --- a/cognee/modules/search/types/SearchType.py +++ b/cognee/modules/search/types/SearchType.py @@ -17,3 +17,4 @@ class SearchType(Enum): FEEDBACK = "FEEDBACK" TEMPORAL = "TEMPORAL" CODING_RULES = "CODING_RULES" + CHUNKS_LEXICAL = "CHUNKS_LEXICAL"
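For reviewers who want to exercise the new search type end to end, a sketch along these lines should work once data has been added and cognified; the sample text, query, and SearchType import path (taken from the module shown in this diff) are illustrative rather than canonical:

```python
import asyncio

import cognee
from cognee.modules.search.types.SearchType import SearchType


async def main():
    # Populate the graph so DocumentChunk nodes exist for the retriever.
    await cognee.add("Coffee consumption may be associated with cardiovascular outcomes.")
    await cognee.cognify()

    # CHUNKS_LEXICAL routes to JaccardChunksRetriever(top_k=top_k) via
    # get_search_type_tools and returns lexically ranked chunk payloads.
    results = await cognee.search(
        query_text="coffee cardiovascular",
        query_type=SearchType.CHUNKS_LEXICAL,
    )
    print(results)


asyncio.run(main())
```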