From d9761118b28fcd60dfcbe3c6e85c47f28f6cbd52 Mon Sep 17 00:00:00 2001 From: Boris Arzentar Date: Fri, 25 Jul 2025 12:01:38 +0200 Subject: [PATCH] refactor: remove unnecessary methods and enforce config objects --- cognee/api/v1/cognify/cognify.py | 26 ++-- cognee/api/v1/config/config.py | 6 +- cognee/base_config.py | 10 +- cognee/context_global_variables.py | 38 +++--- cognee/eval_framework/eval_config.py | 2 + cognee/infrastructure/data/chunking/config.py | 21 +-- .../data/chunking/create_chunking_engine.py | 53 +++----- .../data/chunking/get_chunking_engine.py | 8 +- .../infrastructure/databases/graph/config.py | 72 ++--------- .../databases/graph/get_graph_engine.py | 26 ++-- .../databases/graph/kuzu/adapter.py | 6 +- .../databases/graph/neo4j_driver/adapter.py | 12 +- .../databases/graph/networkx/adapter.py | 9 +- .../hybrid/falkordb/FalkorDBAdapter.py | 10 +- .../databases/relational/config.py | 42 ------ .../relational/get_relational_engine.py | 10 +- .../sqlalchemy/SqlAlchemyAdapter.py | 4 +- .../infrastructure/databases/vector/config.py | 32 ++--- .../databases/vector/create_vector_engine.py | 5 +- .../databases/vector/embeddings/config.py | 23 ---- .../databases/vector/get_vector_engine.py | 11 +- .../vector/pgvector/create_db_and_tables.py | 8 +- .../infrastructure/engine/models/DataPoint.py | 121 ++---------------- cognee/infrastructure/files/storage/config.py | 14 +- .../files/storage/get_storage_config.py | 9 +- cognee/infrastructure/llm/config.py | 33 ----- cognee/modules/cognify/config.py | 8 -- cognee/modules/data/deletion/prune_data.py | 2 +- cognee/modules/data/models/Data.py | 13 -- cognee/modules/data/models/Dataset.py | 10 -- .../data/models/graph_relationship_ledger.py | 11 -- cognee/modules/ingestion/save_data_to_file.py | 2 +- .../logging/log_cognee_configuration.py | 36 ++++++ .../modules/pipelines/operations/pipeline.py | 8 +- .../modules/settings/get_current_settings.py | 2 +- cognee/shared/logging_utils.py | 47 +------ 
cognee/shared/utils.py | 11 +- cognee/tests/test_chromadb.py | 2 +- cognee/tests/test_falkordb.py | 2 +- cognee/tests/test_kuzu.py | 2 +- cognee/tests/test_library.py | 2 +- cognee/tests/test_memgraph.py | 2 +- cognee/tests/test_neo4j.py | 2 +- cognee/tests/test_pgvector.py | 2 +- cognee/tests/test_qdrant.py | 2 +- cognee/tests/test_remote_kuzu.py | 2 +- cognee/tests/test_s3_file_storage.py | 2 +- 47 files changed, 235 insertions(+), 546 deletions(-) create mode 100644 cognee/modules/logging/log_cognee_configuration.py diff --git a/cognee/api/v1/cognify/cognify.py b/cognee/api/v1/cognify/cognify.py index 7c7821460..06b59cbdf 100644 --- a/cognee/api/v1/cognify/cognify.py +++ b/cognee/api/v1/cognify/cognify.py @@ -7,13 +7,13 @@ from cognee.shared.logging_utils import get_logger from cognee.shared.data_models import KnowledgeGraph from cognee.infrastructure.llm import get_max_chunk_tokens -from cognee.modules.pipelines import cognee_pipeline -from cognee.modules.pipelines.tasks.task import Task +from cognee.modules.users.models import User from cognee.modules.chunking.TextChunker import TextChunker from cognee.modules.ontology.rdf_xml.OntologyResolver import OntologyResolver +from cognee.modules.pipelines.tasks.task import Task +from cognee.modules.pipelines import cognee_pipeline from cognee.modules.pipelines.models.PipelineRunInfo import PipelineRunCompleted, PipelineRunErrored from cognee.modules.pipelines.queues.pipeline_run_info_queues import push_to_queue -from cognee.modules.users.models import User from cognee.tasks.documents import ( check_permissions_on_dataset, @@ -30,14 +30,14 @@ update_status_lock = asyncio.Lock() async def cognify( - datasets: Union[str, list[str], list[UUID]] = None, - user: User = None, - graph_model: BaseModel = KnowledgeGraph, + datasets: Optional[Union[str, list[str], list[UUID]]] = None, + user: Optional[User] = None, + graph_model: Optional[BaseModel] = KnowledgeGraph, chunker=TextChunker, - chunk_size: int = None, + chunk_size: 
Optional[int] = None, ontology_file_path: Optional[str] = None, - vector_db_config: dict = None, - graph_db_config: dict = None, + vector_db_config: Optional[dict] = None, + graph_db_config: Optional[dict] = None, run_in_background: bool = False, ): """ @@ -209,8 +209,8 @@ async def run_cognify_blocking( tasks, user, datasets, - graph_db_config: dict = None, - vector_db_config: dict = False, + graph_db_config=None, + vector_db_config=None, ): total_run_info = {} @@ -234,8 +234,8 @@ async def run_cognify_as_background_process( tasks, user, datasets, - graph_db_config: dict = None, - vector_db_config: dict = False, + graph_db_config=None, + vector_db_config=None, ): # Convert dataset to list if it's a string if isinstance(datasets, str): diff --git a/cognee/api/v1/config/config.py b/cognee/api/v1/config/config.py index b3113e8c7..279a13b8b 100644 --- a/cognee/api/v1/config/config.py +++ b/cognee/api/v1/config/config.py @@ -2,7 +2,7 @@ import os from cognee.base_config import get_base_config -from cognee.exceptions import InvalidValueError, InvalidAttributeError +from cognee.exceptions import InvalidAttributeError from cognee.modules.cognify.config import get_cognify_config from cognee.infrastructure.data.chunking.config import get_chunk_config from cognee.infrastructure.databases.vector import get_vectordb_config @@ -105,12 +105,12 @@ class config: chunk_config.chunk_engine = chunk_engine @staticmethod - def set_chunk_overlap(chunk_overlap: object): + def set_chunk_overlap(chunk_overlap: int): chunk_config = get_chunk_config() chunk_config.chunk_overlap = chunk_overlap @staticmethod - def set_chunk_size(chunk_size: object): + def set_chunk_size(chunk_size: int): chunk_config = get_chunk_config() chunk_config.chunk_size = chunk_size diff --git a/cognee/base_config.py b/cognee/base_config.py index 5081acaff..1c71fe8e1 100644 --- a/cognee/base_config.py +++ b/cognee/base_config.py @@ -10,18 +10,10 @@ class BaseConfig(BaseSettings): data_root_directory: str = 
get_absolute_path(".data_storage") system_root_directory: str = get_absolute_path(".cognee_system") monitoring_tool: object = Observer.LANGFUSE - langfuse_public_key: Optional[str] = os.getenv("LANGFUSE_PUBLIC_KEY") - langfuse_secret_key: Optional[str] = os.getenv("LANGFUSE_SECRET_KEY") - langfuse_host: Optional[str] = os.getenv("LANGFUSE_HOST") default_user_email: Optional[str] = os.getenv("DEFAULT_USER_EMAIL") default_user_password: Optional[str] = os.getenv("DEFAULT_USER_PASSWORD") - model_config = SettingsConfigDict(env_file=".env", extra="allow") - def to_dict(self) -> dict: - return { - "data_root_directory": self.data_root_directory, - "monitoring_tool": self.monitoring_tool, - } + model_config = SettingsConfigDict(env_file=".env", extra="allow") @lru_cache diff --git a/cognee/context_global_variables.py b/cognee/context_global_variables.py index f6642e3dc..11757048d 100644 --- a/cognee/context_global_variables.py +++ b/cognee/context_global_variables.py @@ -1,17 +1,19 @@ import os from contextvars import ContextVar -from typing import Union +from typing import Optional, Union from uuid import UUID from cognee.base_config import get_base_config +from cognee.infrastructure.databases.graph.config import GraphConfig +from cognee.infrastructure.databases.vector.config import VectorConfig from cognee.infrastructure.databases.utils import get_or_create_dataset_database -from cognee.infrastructure.files.storage.config import file_storage_config +from cognee.infrastructure.files.storage.config import StorageConfig, file_storage_config from cognee.modules.users.methods import get_user # Note: ContextVar allows us to use different graph db configurations in Cognee # for different async tasks, threads and processes -vector_db_config = ContextVar("vector_db_config", default=None) -graph_db_config = ContextVar("graph_db_config", default=None) +vector_db_config = ContextVar[Optional[VectorConfig]]("vector_db_config", default=None) +graph_db_config = 
ContextVar[Optional[GraphConfig]]("graph_db_config", default=None) async def set_database_global_context_variables(dataset: Union[str, UUID], user_id: UUID): @@ -51,24 +53,24 @@ async def set_database_global_context_variables(dataset: Union[str, UUID], user_ ) # Set vector and graph database configuration based on dataset database information - vector_config = { - "vector_db_url": os.path.join( - databases_directory_path, dataset_database.vector_database_name + vector_config = VectorConfig( + vector_db_url=os.path.join( + databases_directory_path, str(dataset_database.vector_database_name) ), - "vector_db_key": "", - "vector_db_provider": "lancedb", - } + vector_db_key="", + vector_db_provider="lancedb", + ) - graph_config = { - "graph_database_provider": "kuzu", - "graph_file_path": os.path.join( - databases_directory_path, dataset_database.graph_database_name + graph_config = GraphConfig( + graph_database_provider="kuzu", + graph_file_path=os.path.join( + databases_directory_path, str(dataset_database.graph_database_name) ), - } + ) - storage_config = { - "data_root_directory": data_root_directory, - } + storage_config = StorageConfig( + data_root_directory=data_root_directory, + ) # Use ContextVar to use these graph and vector configurations are used # in the current async context across Cognee diff --git a/cognee/eval_framework/eval_config.py b/cognee/eval_framework/eval_config.py index 7baf58cf9..7fff0a003 100644 --- a/cognee/eval_framework/eval_config.py +++ b/cognee/eval_framework/eval_config.py @@ -1,4 +1,5 @@ from functools import lru_cache +from deprecated import deprecated from pydantic_settings import BaseSettings, SettingsConfigDict from typing import List, Optional @@ -45,6 +46,7 @@ class EvalConfig(BaseSettings): model_config = SettingsConfigDict(env_file=".env", extra="allow") + @deprecated(reason="Call model_dump() instead of calling to_dict() method.") def to_dict(self) -> dict: return { "building_corpus_from_scratch": 
self.building_corpus_from_scratch, diff --git a/cognee/infrastructure/data/chunking/config.py b/cognee/infrastructure/data/chunking/config.py index 3c81ee7dd..a6a49d99a 100644 --- a/cognee/infrastructure/data/chunking/config.py +++ b/cognee/infrastructure/data/chunking/config.py @@ -1,7 +1,6 @@ from functools import lru_cache from pydantic_settings import BaseSettings, SettingsConfigDict -from cognee.infrastructure.data.chunking.DefaultChunkEngine import DefaultChunkEngine from cognee.shared.data_models import ChunkStrategy, ChunkEngine @@ -12,27 +11,11 @@ class ChunkConfig(BaseSettings): chunk_size: int = 1500 chunk_overlap: int = 10 - chunk_strategy: object = ChunkStrategy.PARAGRAPH - chunk_engine: object = ChunkEngine.DEFAULT_ENGINE + chunk_strategy: ChunkStrategy = ChunkStrategy.PARAGRAPH + chunk_engine: ChunkEngine = ChunkEngine.DEFAULT_ENGINE model_config = SettingsConfigDict(env_file=".env", extra="allow") - def to_dict(self) -> dict: - """ - Convert the chunk settings to a dictionary format. - - Returns: - -------- - - - dict: A dictionary representation of the chunk configuration settings. - """ - return { - "chunk_size": self.chunk_size, - "chunk_overlap": self.chunk_overlap, - "chunk_strategy": self.chunk_strategy, - "chunk_engine": self.chunk_engine, - } - @lru_cache def get_chunk_config(): diff --git a/cognee/infrastructure/data/chunking/create_chunking_engine.py b/cognee/infrastructure/data/chunking/create_chunking_engine.py index bd6246800..844c2ebd1 100644 --- a/cognee/infrastructure/data/chunking/create_chunking_engine.py +++ b/cognee/infrastructure/data/chunking/create_chunking_engine.py @@ -1,25 +1,12 @@ -from typing import Dict +from cognee.shared.data_models import ChunkEngine, ChunkStrategy -from cognee.shared.data_models import ChunkEngine - - -class ChunkingConfig(Dict): - """ - Represent configuration settings for chunking operations, inheriting from the built-in - Dict class. 
The class contains the following public attributes: - - - vector_db_url: A string representing the URL of the vector database. - - vector_db_key: A string representing the key for accessing the vector database. - - vector_db_provider: A string representing the provider of the vector database. - """ - - vector_db_url: str - vector_db_key: str - vector_db_provider: str - - -def create_chunking_engine(config: ChunkingConfig): +def create_chunking_engine( + chunk_size: int, + chunk_overlap: int, + chunk_engine: ChunkEngine, + chunk_strategy: ChunkStrategy, +): """ Create a chunking engine based on the provided configuration. @@ -30,7 +17,7 @@ def create_chunking_engine(config: ChunkingConfig): Parameters: ----------- - - config (ChunkingConfig): Configuration object containing the settings for the + - config (ChunkConfig): Configuration object containing the settings for the chunking engine, including the engine type, chunk size, chunk overlap, and chunk strategy. @@ -40,27 +27,27 @@ def create_chunking_engine(config: ChunkingConfig): An instance of the selected chunking engine class (LangchainChunkEngine, DefaultChunkEngine, or HaystackChunkEngine). 
""" - if config["chunk_engine"] == ChunkEngine.LANGCHAIN_ENGINE: + if chunk_engine == ChunkEngine.LANGCHAIN_ENGINE: from cognee.infrastructure.data.chunking.LangchainChunkingEngine import LangchainChunkEngine return LangchainChunkEngine( - chunk_size=config["chunk_size"], - chunk_overlap=config["chunk_overlap"], - chunk_strategy=config["chunk_strategy"], + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + chunk_strategy=chunk_strategy, ) - elif config["chunk_engine"] == ChunkEngine.DEFAULT_ENGINE: + elif chunk_engine == ChunkEngine.DEFAULT_ENGINE: from cognee.infrastructure.data.chunking.DefaultChunkEngine import DefaultChunkEngine return DefaultChunkEngine( - chunk_size=config["chunk_size"], - chunk_overlap=config["chunk_overlap"], - chunk_strategy=config["chunk_strategy"], + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + chunk_strategy=chunk_strategy, ) - elif config["chunk_engine"] == ChunkEngine.HAYSTACK_ENGINE: + elif chunk_engine == ChunkEngine.HAYSTACK_ENGINE: from cognee.infrastructure.data.chunking.HaystackChunkEngine import HaystackChunkEngine return HaystackChunkEngine( - chunk_size=config["chunk_size"], - chunk_overlap=config["chunk_overlap"], - chunk_strategy=config["chunk_strategy"], + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + chunk_strategy=chunk_strategy, ) diff --git a/cognee/infrastructure/data/chunking/get_chunking_engine.py b/cognee/infrastructure/data/chunking/get_chunking_engine.py index 9d54ed49b..a717514b7 100644 --- a/cognee/infrastructure/data/chunking/get_chunking_engine.py +++ b/cognee/infrastructure/data/chunking/get_chunking_engine.py @@ -13,4 +13,10 @@ def get_chunk_engine(): Returns an instance of the chunking engine created based on the configuration settings. 
""" - return create_chunking_engine(get_chunk_config().to_dict()) + chunk_config = get_chunk_config() + return create_chunking_engine( + chunk_engine=chunk_config.chunk_engine, + chunk_size=chunk_config.chunk_size, + chunk_overlap=chunk_config.chunk_overlap, + chunk_strategy=chunk_config.chunk_strategy, + ) diff --git a/cognee/infrastructure/databases/graph/config.py b/cognee/infrastructure/databases/graph/config.py index 3575f7c07..2c2cfa28f 100644 --- a/cognee/infrastructure/databases/graph/config.py +++ b/cognee/infrastructure/databases/graph/config.py @@ -1,10 +1,11 @@ """This module contains the configuration for the graph database.""" import os +import pydantic +from typing import Optional from functools import lru_cache from pydantic_settings import BaseSettings, SettingsConfigDict -import pydantic -from pydantic import Field + from cognee.base_config import get_base_config from cognee.shared.data_models import KnowledgeGraph @@ -14,10 +15,6 @@ class GraphConfig(BaseSettings): Represents the configuration for a graph system, including parameters for graph file storage and database connections. - Public methods: - - to_dict - - to_hashable_dict - Instance variables: - graph_filename - graph_database_provider @@ -31,10 +28,7 @@ class GraphConfig(BaseSettings): - model_config """ - # Using Field we are able to dynamically load current GRAPH_DATABASE_PROVIDER value in the model validator part - # and determine default graph db file and path based on this parameter if no values are provided - graph_database_provider: str = Field("kuzu", env="GRAPH_DATABASE_PROVIDER") - + graph_database_provider: str = "kuzu" graph_database_url: str = "" graph_database_username: str = "" graph_database_password: str = "" @@ -64,53 +58,6 @@ class GraphConfig(BaseSettings): return values - def to_dict(self) -> dict: - """ - Return the configuration as a dictionary. 
- - This dictionary contains all the configurations related to the graph, which includes - details for file storage and database connectivity. - - Returns: - -------- - - - dict: A dictionary representation of the configuration settings. - """ - return { - "graph_filename": self.graph_filename, - "graph_database_provider": self.graph_database_provider, - "graph_database_url": self.graph_database_url, - "graph_database_username": self.graph_database_username, - "graph_database_password": self.graph_database_password, - "graph_database_port": self.graph_database_port, - "graph_file_path": self.graph_file_path, - "graph_model": self.graph_model, - "graph_topology": self.graph_topology, - "model_config": self.model_config, - } - - def to_hashable_dict(self) -> dict: - """ - Return a hashable dictionary with essential database configuration parameters. - - This dictionary excludes certain non-hashable objects and focuses on unique identifiers - for database configurations. - - Returns: - -------- - - - dict: A dictionary representation of the essential database configuration - settings. - """ - return { - "graph_database_provider": self.graph_database_provider, - "graph_database_url": self.graph_database_url, - "graph_database_username": self.graph_database_username, - "graph_database_password": self.graph_database_password, - "graph_database_port": self.graph_database_port, - "graph_file_path": self.graph_file_path, - } - @lru_cache def get_graph_config(): @@ -126,15 +73,18 @@ def get_graph_config(): - GraphConfig: A GraphConfig instance containing the graph configuration settings. """ + context_config = get_graph_context_config() + + if context_config: + return context_config + return GraphConfig() -def get_graph_context_config(): +def get_graph_context_config() -> Optional[GraphConfig]: """This function will get the appropriate graph db config based on async context. 
This allows the use of multiple graph databases for different threads, async tasks and parallelization """ from cognee.context_global_variables import graph_db_config - if graph_db_config.get(): - return graph_db_config.get() - return get_graph_config().to_hashable_dict() + return graph_db_config.get() diff --git a/cognee/infrastructure/databases/graph/get_graph_engine.py b/cognee/infrastructure/databases/graph/get_graph_engine.py index 4607b93ca..fce0f68f9 100644 --- a/cognee/infrastructure/databases/graph/get_graph_engine.py +++ b/cognee/infrastructure/databases/graph/get_graph_engine.py @@ -2,7 +2,7 @@ from functools import lru_cache -from .config import get_graph_context_config +from .config import get_graph_config from .graph_db_interface import GraphDBInterface from .supported_databases import supported_databases @@ -10,20 +10,16 @@ from .supported_databases import supported_databases async def get_graph_engine() -> GraphDBInterface: """Factory function to get the appropriate graph client based on the graph type.""" # Get appropriate graph configuration based on current async context - config = get_graph_context_config() + config = get_graph_config() - graph_client = create_graph_engine(**config) - - # Async functions can't be cached. After creating and caching the graph engine - # handle all necessary async operations for different graph types bellow. 
- - # Run any adapter‐specific async initialization - if hasattr(graph_client, "initialize"): - await graph_client.initialize() - - # Handle loading of graph for NetworkX - if config["graph_database_provider"].lower() == "networkx" and graph_client.graph is None: - await graph_client.load_graph_from_file() + graph_client = create_graph_engine( + graph_database_provider=config.graph_database_provider, + graph_file_path=config.graph_file_path, + graph_database_url=config.graph_database_url, + graph_database_username=config.graph_database_username, + graph_database_password=config.graph_database_password, + graph_database_port=config.graph_database_port, + ) return graph_client @@ -35,7 +31,7 @@ def create_graph_engine( graph_database_url="", graph_database_username="", graph_database_password="", - graph_database_port="", + graph_database_port=None, ): """ Create a graph engine based on the specified provider type. diff --git a/cognee/infrastructure/databases/graph/kuzu/adapter.py b/cognee/infrastructure/databases/graph/kuzu/adapter.py index db9127e25..1541c954a 100644 --- a/cognee/infrastructure/databases/graph/kuzu/adapter.py +++ b/cognee/infrastructure/databases/graph/kuzu/adapter.py @@ -104,7 +104,6 @@ class KuzuAdapter(GraphDBInterface): max_db_size=4096 * 1024 * 1024, ) - self.db.init_database() self.connection = Connection(self.db) # Create node table with essential fields and timestamp @@ -1465,11 +1464,8 @@ class KuzuAdapter(GraphDBInterface): It raises exceptions for failures occurring during deletion processes. 
""" try: - # Use DETACH DELETE to remove both nodes and their relationships in one operation - await self.query("MATCH (n:Node) DETACH DELETE n") - logger.info("Cleared all data from graph while preserving structure") - if self.connection: + self.connection.close() self.connection = None if self.db: self.db.close() diff --git a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py index b23bf8e00..2c238f448 100644 --- a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +++ b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py @@ -8,9 +8,10 @@ from neo4j import AsyncSession from neo4j import AsyncGraphDatabase from neo4j.exceptions import Neo4jError from contextlib import asynccontextmanager -from typing import Optional, Any, List, Dict, Type, Tuple +from typing import AsyncGenerator, Optional, Any, List, Dict, Type, Tuple from cognee.infrastructure.engine import DataPoint +from cognee.infrastructure.utils.run_sync import run_sync from cognee.shared.logging_utils import get_logger, ERROR from cognee.infrastructure.databases.graph.graph_db_interface import ( GraphDBInterface, @@ -67,16 +68,17 @@ class Neo4jAdapter(GraphDBInterface): notifications_min_severity="OFF", ) - async def initialize(self) -> None: """ Initializes the database: adds uniqueness constraint on id and performs indexing """ - await self.query( - (f"CREATE CONSTRAINT IF NOT EXISTS FOR (n:`{BASE_LABEL}`) REQUIRE n.id IS UNIQUE;") + run_sync( + self.query( + (f"CREATE CONSTRAINT IF NOT EXISTS FOR (n:`{BASE_LABEL}`) REQUIRE n.id IS UNIQUE;") + ) ) @asynccontextmanager - async def get_session(self) -> AsyncSession: + async def get_session(self) -> AsyncGenerator[AsyncSession]: """ Get a session for database operations. 
""" diff --git a/cognee/infrastructure/databases/graph/networkx/adapter.py b/cognee/infrastructure/databases/graph/networkx/adapter.py index 7431b3fc8..a0a34dac6 100644 --- a/cognee/infrastructure/databases/graph/networkx/adapter.py +++ b/cognee/infrastructure/databases/graph/networkx/adapter.py @@ -11,6 +11,7 @@ from typing import Dict, Any, List, Union, Type, Tuple from cognee.infrastructure.databases.exceptions.exceptions import NodesetFilterNotSupportedError from cognee.infrastructure.files.storage import get_file_storage +from cognee.infrastructure.utils.run_sync import run_sync from cognee.shared.logging_utils import get_logger from cognee.infrastructure.databases.graph.graph_db_interface import ( GraphDBInterface, @@ -42,6 +43,8 @@ class NetworkXAdapter(GraphDBInterface): def __init__(self, filename="cognee_graph.pkl"): self.filename = filename + run_sync(self.load_graph_from_file()) + async def get_graph_data(self): """ Retrieve graph data including nodes and edges. @@ -576,7 +579,7 @@ class NetworkXAdapter(GraphDBInterface): await file_storage.store(file_path, json_data, overwrite=True) - async def load_graph_from_file(self, file_path: str = None): + async def load_graph_from_file(self): """ Load graph data asynchronously from a specified file in JSON format. @@ -586,8 +589,8 @@ class NetworkXAdapter(GraphDBInterface): - file_path (str): The file path from which to load the graph data; if None, loads from the default filename. 
(default None) """ - if not file_path: - file_path = self.filename + file_path = self.filename + try: file_dir_path = os.path.dirname(file_path) file_name = os.path.basename(file_path) diff --git a/cognee/infrastructure/databases/hybrid/falkordb/FalkorDBAdapter.py b/cognee/infrastructure/databases/hybrid/falkordb/FalkorDBAdapter.py index 35ce7c77e..cb50cc76a 100644 --- a/cognee/infrastructure/databases/hybrid/falkordb/FalkorDBAdapter.py +++ b/cognee/infrastructure/databases/hybrid/falkordb/FalkorDBAdapter.py @@ -17,7 +17,7 @@ from cognee.infrastructure.databases.graph.graph_db_interface import ( EdgeData, Node, ) -from cognee.infrastructure.databases.vector.embeddings import EmbeddingEngine +from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine from cognee.infrastructure.databases.vector.vector_db_interface import VectorDBInterface from cognee.infrastructure.engine import DataPoint @@ -80,7 +80,7 @@ class FalkorDBAdapter(VectorDBInterface, GraphDBInterface): self, database_url: str, database_port: int, - embedding_engine=EmbeddingEngine, + embedding_engine: EmbeddingEngine, ): self.driver = FalkorDB( host=database_url, @@ -213,7 +213,7 @@ class FalkorDBAdapter(VectorDBInterface, GraphDBInterface): A tuple containing the query string and parameters dictionary. 
""" node_label = type(data_point).__name__ - property_names = DataPoint.get_embeddable_property_names(data_point) + property_names = data_point.get_embeddable_property_names() properties = { **data_point.model_dump(), @@ -357,7 +357,7 @@ class FalkorDBAdapter(VectorDBInterface, GraphDBInterface): vector_map = {} for data_point in data_points: - property_names = DataPoint.get_embeddable_property_names(data_point) + property_names = data_point.get_embeddable_property_names() key = str(data_point.id) vector_map[key] = {} @@ -377,7 +377,7 @@ class FalkorDBAdapter(VectorDBInterface, GraphDBInterface): vectorized_values[vector_map[str(data_point.id)][property_name]] if vector_map[str(data_point.id)][property_name] is not None else None - for property_name in DataPoint.get_embeddable_property_names(data_point) + for property_name in data_point.get_embeddable_property_names() ] query, params = await self.create_data_point_query(data_point, vectorized_data) diff --git a/cognee/infrastructure/databases/relational/config.py b/cognee/infrastructure/databases/relational/config.py index 4e3c26042..971a5f85f 100644 --- a/cognee/infrastructure/databases/relational/config.py +++ b/cognee/infrastructure/databases/relational/config.py @@ -32,26 +32,6 @@ class RelationalConfig(BaseSettings): return values - def to_dict(self) -> dict: - """ - Return the database configuration as a dictionary. - - Returns: - -------- - - - dict: A dictionary containing database configuration settings including db_path, - db_name, db_host, db_port, db_username, db_password, and db_provider. - """ - return { - "db_path": self.db_path, - "db_name": self.db_name, - "db_host": self.db_host, - "db_port": self.db_port, - "db_username": self.db_username, - "db_password": self.db_password, - "db_provider": self.db_provider, - } - @lru_cache def get_relational_config(): @@ -75,9 +55,6 @@ class MigrationConfig(BaseSettings): """ Manage and configure migration settings for a database, inheriting from BaseSettings. 
- Public methods: - - to_dict: Convert the migration configuration to a dictionary format. - Instance variables: - migration_db_path: Path to the migration database. - migration_db_name: Name of the migration database. @@ -98,25 +75,6 @@ class MigrationConfig(BaseSettings): model_config = SettingsConfigDict(env_file=".env", extra="allow") - def to_dict(self) -> dict: - """ - Convert migration configuration to dictionary format. - - Returns: - -------- - - - dict: A dictionary containing the migration configuration details. - """ - return { - "migration_db_path": self.migration_db_path, - "migration_db_name": self.migration_db_name, - "migration_db_host": self.migration_db_host, - "migration_db_port": self.migration_db_port, - "migration_db_username": self.migration_db_username, - "migration_db_password": self.migration_db_password, - "migration_db_provider": self.migration_db_provider, - } - @lru_cache def get_migration_config(): diff --git a/cognee/infrastructure/databases/relational/get_relational_engine.py b/cognee/infrastructure/databases/relational/get_relational_engine.py index a346f4954..71e365851 100644 --- a/cognee/infrastructure/databases/relational/get_relational_engine.py +++ b/cognee/infrastructure/databases/relational/get_relational_engine.py @@ -18,4 +18,12 @@ def get_relational_engine(): """ relational_config = get_relational_config() - return create_relational_engine(**relational_config.to_dict()) + return create_relational_engine( + db_path=relational_config.db_path, + db_name=relational_config.db_name, + db_host=relational_config.db_host, + db_port=relational_config.db_port, + db_username=relational_config.db_username, + db_password=relational_config.db_password, + db_provider=relational_config.db_provider, + ) diff --git a/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py b/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py index 8ac8d04c4..27cd6ed53 100644 --- 
a/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +++ b/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py @@ -294,10 +294,10 @@ class SQLAlchemyAdapter: storage_config = get_storage_config() if ( - storage_config["data_root_directory"] + storage_config.data_root_directory in raw_data_location_entities[0].raw_data_location ): - file_storage = get_file_storage(storage_config["data_root_directory"]) + file_storage = get_file_storage(storage_config.data_root_directory) file_path = os.path.basename(raw_data_location_entities[0].raw_data_location) diff --git a/cognee/infrastructure/databases/vector/config.py b/cognee/infrastructure/databases/vector/config.py index 07a3d1e05..b07d06311 100644 --- a/cognee/infrastructure/databases/vector/config.py +++ b/cognee/infrastructure/databases/vector/config.py @@ -1,4 +1,5 @@ import os +from typing import Optional import pydantic from functools import lru_cache from pydantic_settings import BaseSettings, SettingsConfigDict @@ -10,10 +11,6 @@ class VectorConfig(BaseSettings): """ Manage the configuration settings for the vector database. - Public methods: - - - to_dict: Convert the configuration to a dictionary. - Instance variables: - vector_db_url: The URL of the vector database. @@ -39,22 +36,6 @@ class VectorConfig(BaseSettings): return values - def to_dict(self) -> dict: - """ - Convert the configuration settings to a dictionary. - - Returns: - -------- - - - dict: A dictionary containing the vector database configuration settings. - """ - return { - "vector_db_url": self.vector_db_url, - "vector_db_port": self.vector_db_port, - "vector_db_key": self.vector_db_key, - "vector_db_provider": self.vector_db_provider, - } - @lru_cache def get_vectordb_config(): @@ -71,13 +52,16 @@ def get_vectordb_config(): - VectorConfig: An instance of `VectorConfig` containing the vector database configuration. 
""" + context_config = get_vectordb_context_config() + + if context_config: + return context_config + return VectorConfig() -def get_vectordb_context_config(): +def get_vectordb_context_config() -> Optional[VectorConfig]: """This function will get the appropriate vector db config based on async context.""" from cognee.context_global_variables import vector_db_config - if vector_db_config.get(): - return vector_db_config.get() - return get_vectordb_config().to_dict() + return vector_db_config.get() diff --git a/cognee/infrastructure/databases/vector/create_vector_engine.py b/cognee/infrastructure/databases/vector/create_vector_engine.py index 8bbceaf7f..314f7e358 100644 --- a/cognee/infrastructure/databases/vector/create_vector_engine.py +++ b/cognee/infrastructure/databases/vector/create_vector_engine.py @@ -1,3 +1,4 @@ +from typing import Optional from .supported_databases import supported_databases from .embeddings import get_embedding_engine @@ -8,7 +9,7 @@ from functools import lru_cache def create_vector_engine( vector_db_provider: str, vector_db_url: str, - vector_db_port: str = "", + vector_db_port: Optional[int] = None, vector_db_key: str = "", ): """ @@ -26,7 +27,7 @@ def create_vector_engine( ----------- - vector_db_url (str): The URL for the vector database instance. - - vector_db_port (str): The port for the vector database instance. Required for some + - vector_db_port (int): The port for the vector database instance. Required for some providers. - vector_db_key (str): The API key or access token for the vector database instance. 
- vector_db_provider (str): The name of the vector database provider to use (e.g., diff --git a/cognee/infrastructure/databases/vector/embeddings/config.py b/cognee/infrastructure/databases/vector/embeddings/config.py index 2ae60d64a..fb05f92d3 100644 --- a/cognee/infrastructure/databases/vector/embeddings/config.py +++ b/cognee/infrastructure/databases/vector/embeddings/config.py @@ -7,9 +7,6 @@ class EmbeddingConfig(BaseSettings): """ Manage configuration settings for embedding operations, including provider, model details, API configuration, and tokenizer settings. - - Public methods: - - to_dict: Serialize the configuration settings to a dictionary. """ embedding_provider: Optional[str] = "openai" @@ -22,26 +19,6 @@ class EmbeddingConfig(BaseSettings): huggingface_tokenizer: Optional[str] = None model_config = SettingsConfigDict(env_file=".env", extra="allow") - def to_dict(self) -> dict: - """ - Serialize all embedding configuration settings to a dictionary. - - Returns: - -------- - - - dict: A dictionary containing the embedding configuration settings. 
- """ - return { - "embedding_provider": self.embedding_provider, - "embedding_model": self.embedding_model, - "embedding_dimensions": self.embedding_dimensions, - "embedding_endpoint": self.embedding_endpoint, - "embedding_api_key": self.embedding_api_key, - "embedding_api_version": self.embedding_api_version, - "embedding_max_tokens": self.embedding_max_tokens, - "huggingface_tokenizer": self.huggingface_tokenizer, - } - @lru_cache def get_embedding_config(): diff --git a/cognee/infrastructure/databases/vector/get_vector_engine.py b/cognee/infrastructure/databases/vector/get_vector_engine.py index d1f253ac6..81384d9bf 100644 --- a/cognee/infrastructure/databases/vector/get_vector_engine.py +++ b/cognee/infrastructure/databases/vector/get_vector_engine.py @@ -1,7 +1,14 @@ -from .config import get_vectordb_context_config +from .config import get_vectordb_config from .create_vector_engine import create_vector_engine def get_vector_engine(): # Get appropriate vector db configuration based on current async context - return create_vector_engine(**get_vectordb_context_config()) + vector_config = get_vectordb_config() + + return create_vector_engine( + vector_db_provider=vector_config.vector_db_provider, + vector_db_url=vector_config.vector_db_url, + vector_db_port=vector_config.vector_db_port, + vector_db_key=vector_config.vector_db_key, + ) diff --git a/cognee/infrastructure/databases/vector/pgvector/create_db_and_tables.py b/cognee/infrastructure/databases/vector/pgvector/create_db_and_tables.py index c2477086d..5cb4195c9 100644 --- a/cognee/infrastructure/databases/vector/pgvector/create_db_and_tables.py +++ b/cognee/infrastructure/databases/vector/pgvector/create_db_and_tables.py @@ -1,12 +1,14 @@ from sqlalchemy import text -from ..get_vector_engine import get_vector_engine, get_vectordb_context_config + +from ..config import get_vectordb_config +from ..get_vector_engine import get_vector_engine async def create_db_and_tables(): # Get appropriate vector db 
configuration based on current async context - vector_config = get_vectordb_context_config() + vector_config = get_vectordb_config() vector_engine = get_vector_engine() - if vector_config["vector_db_provider"] == "pgvector": + if vector_config.vector_db_provider == "pgvector": async with vector_engine.engine.begin() as connection: await connection.execute(text("CREATE EXTENSION IF NOT EXISTS vector;")) diff --git a/cognee/infrastructure/engine/models/DataPoint.py b/cognee/infrastructure/engine/models/DataPoint.py index 812380eaa..6614d8728 100644 --- a/cognee/infrastructure/engine/models/DataPoint.py +++ b/cognee/infrastructure/engine/models/DataPoint.py @@ -1,12 +1,10 @@ -import pickle from uuid import UUID, uuid4 -from pydantic import BaseModel, Field, ConfigDict -from datetime import datetime, timezone +from typing import Optional, List from typing_extensions import TypedDict -from typing import Optional, Any, Dict, List +from datetime import datetime, timezone +from pydantic import BaseModel, Field, ConfigDict -# Define metadata type class MetaData(TypedDict): """ Represent a metadata structure with type and index fields. @@ -16,7 +14,6 @@ class MetaData(TypedDict): index_fields: list[str] -# Updated DataPoint model with versioning and new fields class DataPoint(BaseModel): """ Model representing a data point with versioning and metadata support. @@ -26,12 +23,6 @@ class DataPoint(BaseModel): - get_embeddable_properties - get_embeddable_property_names - update_version - - to_json - - from_json - - to_pickle - - from_pickle - - to_dict - - from_dict """ model_config = ConfigDict(arbitrary_types_allowed=True) @@ -85,52 +76,39 @@ class DataPoint(BaseModel): return attribute.strip() return attribute - @classmethod - def get_embeddable_properties(self, data_point: "DataPoint"): + def get_embeddable_properties(self): """ Retrieve a list of embeddable properties from the data point. 
This method returns a list of attribute values based on the index fields defined in the data point's metadata. If there are no index fields, it returns an empty list. - Parameters: - ----------- - - - data_point ('DataPoint'): The DataPoint instance from which to retrieve embeddable - properties. - Returns: -------- A list of embeddable property values, or an empty list if none exist. """ - if data_point.metadata and len(data_point.metadata["index_fields"]) > 0: - return [ - getattr(data_point, field, None) for field in data_point.metadata["index_fields"] - ] + if self.metadata and len(self.metadata["index_fields"]) > 0: + return [getattr(self, field, None) for field in self.metadata["index_fields"]] return [] - @classmethod - def get_embeddable_property_names(self, data_point: "DataPoint"): + def get_embeddable_property_names(self): """ Retrieve the names of embeddable properties defined in the metadata. If no index fields are defined in the metadata, this method will return an empty list. - Parameters: - ----------- - - - data_point ('DataPoint'): The DataPoint instance from which to retrieve embeddable - property names. - Returns: -------- A list of property names corresponding to the index fields, or an empty list if none exist. """ - return data_point.metadata["index_fields"] or [] + if self.metadata: + return self.metadata["index_fields"] or [] + + return [] def update_version(self): """ @@ -141,80 +119,3 @@ class DataPoint(BaseModel): """ self.version += 1 self.updated_at = int(datetime.now(timezone.utc).timestamp() * 1000) - - # JSON Serialization - def to_json(self) -> str: - """ - Serialize the DataPoint instance to a JSON string format. - - This method uses the model's built-in serialization functionality to convert the - instance into a JSON-compatible string. - - Returns: - -------- - - - str: The JSON string representation of the DataPoint instance. 
- """ - return self.json() - - @classmethod - def from_json(self, json_str: str): - """ - Deserialize a DataPoint instance from a JSON string. - - The method transforms the input JSON string back into a DataPoint instance using model - validation. - - Parameters: - ----------- - - - json_str (str): The JSON string representation of a DataPoint instance to be - deserialized. - - Returns: - -------- - - A new DataPoint instance created from the JSON data. - """ - return self.model_validate_json(json_str) - - def to_dict(self, **kwargs) -> Dict[str, Any]: - """ - Convert the DataPoint instance to a dictionary representation. - - This method uses the model's built-in functionality to serialize the instance attributes - to a dictionary, which can optionally include additional arguments. - - Parameters: - ----------- - - - **kwargs: Additional keyword arguments for serialization options. - - Returns: - -------- - - - Dict[str, Any]: A dictionary representation of the DataPoint instance. - """ - return self.model_dump(**kwargs) - - @classmethod - def from_dict(cls, data: Dict[str, Any]) -> "DataPoint": - """ - Instantiate a DataPoint from a dictionary of attribute values. - - The method validates the incoming dictionary data against the model's schema and - constructs a new DataPoint instance accordingly. - - Parameters: - ----------- - - - data (Dict[str, Any]): A dictionary containing the attributes of a DataPoint - instance. - - Returns: - -------- - - - 'DataPoint': A new DataPoint instance constructed from the provided dictionary - data. 
- """ - return cls.model_validate(data) diff --git a/cognee/infrastructure/files/storage/config.py b/cognee/infrastructure/files/storage/config.py index 6d1b3754f..0a0232fa6 100644 --- a/cognee/infrastructure/files/storage/config.py +++ b/cognee/infrastructure/files/storage/config.py @@ -1,4 +1,16 @@ +from typing import Optional from contextvars import ContextVar +from pydantic_settings import BaseSettings, SettingsConfigDict -file_storage_config = ContextVar("file_storage_config", default=None) +class StorageConfig(BaseSettings): + """ + Manage configuration settings for file storage. + """ + + data_root_directory: str = "" + + model_config = SettingsConfigDict(env_file=".env", extra="allow") + + +file_storage_config = ContextVar[Optional[StorageConfig]]("file_storage_config", default=None) diff --git a/cognee/infrastructure/files/storage/get_storage_config.py b/cognee/infrastructure/files/storage/get_storage_config.py index dc6d50ee2..04c14fd90 100644 --- a/cognee/infrastructure/files/storage/get_storage_config.py +++ b/cognee/infrastructure/files/storage/get_storage_config.py @@ -1,17 +1,18 @@ from cognee.base_config import get_base_config -from .config import file_storage_config +from .config import file_storage_config, StorageConfig def get_global_storage_config(): base_config = get_base_config() - return { - "data_root_directory": base_config.data_root_directory, - } + return StorageConfig( + data_root_directory=base_config.data_root_directory, + ) def get_storage_config(): context_config = file_storage_config.get() + if context_config: return context_config diff --git a/cognee/infrastructure/llm/config.py b/cognee/infrastructure/llm/config.py index b26d3d463..98849a91b 100644 --- a/cognee/infrastructure/llm/config.py +++ b/cognee/infrastructure/llm/config.py @@ -29,7 +29,6 @@ class LLMConfig(BaseSettings): Public methods include: - ensure_env_vars_for_ollama - - to_dict """ llm_provider: str = "openai" @@ -125,38 +124,6 @@ class LLMConfig(BaseSettings): 
return self - def to_dict(self) -> dict: - """ - Convert the LLMConfig instance into a dictionary representation. - - Returns: - -------- - - - dict: A dictionary containing the configuration settings of the LLMConfig - instance. - """ - return { - "provider": self.llm_provider, - "model": self.llm_model, - "endpoint": self.llm_endpoint, - "api_key": self.llm_api_key, - "api_version": self.llm_api_version, - "temperature": self.llm_temperature, - "streaming": self.llm_streaming, - "max_tokens": self.llm_max_tokens, - "transcription_model": self.transcription_model, - "graph_prompt_path": self.graph_prompt_path, - "rate_limit_enabled": self.llm_rate_limit_enabled, - "rate_limit_requests": self.llm_rate_limit_requests, - "rate_limit_interval": self.llm_rate_limit_interval, - "embedding_rate_limit_enabled": self.embedding_rate_limit_enabled, - "embedding_rate_limit_requests": self.embedding_rate_limit_requests, - "embedding_rate_limit_interval": self.embedding_rate_limit_interval, - "fallback_api_key": self.fallback_api_key, - "fallback_endpoint": self.fallback_endpoint, - "fallback_model": self.fallback_model, - } - @lru_cache def get_llm_config(): diff --git a/cognee/modules/cognify/config.py b/cognee/modules/cognify/config.py index 4ba0f4bd6..0542de718 100644 --- a/cognee/modules/cognify/config.py +++ b/cognee/modules/cognify/config.py @@ -1,8 +1,6 @@ from functools import lru_cache from pydantic_settings import BaseSettings, SettingsConfigDict from cognee.shared.data_models import DefaultContentPrediction, SummarizedContent -from typing import Optional -import os class CognifyConfig(BaseSettings): @@ -10,12 +8,6 @@ class CognifyConfig(BaseSettings): summarization_model: object = SummarizedContent model_config = SettingsConfigDict(env_file=".env", extra="allow") - def to_dict(self) -> dict: - return { - "classification_model": self.classification_model, - "summarization_model": self.summarization_model, - } - @lru_cache def get_cognify_config(): diff --git 
a/cognee/modules/data/deletion/prune_data.py b/cognee/modules/data/deletion/prune_data.py index 947601167..2793b1ee6 100644 --- a/cognee/modules/data/deletion/prune_data.py +++ b/cognee/modules/data/deletion/prune_data.py @@ -3,5 +3,5 @@ from cognee.infrastructure.files.storage import get_file_storage, get_storage_co async def prune_data(): storage_config = get_storage_config() - data_root_directory = storage_config["data_root_directory"] + data_root_directory = storage_config.data_root_directory await get_file_storage(data_root_directory).remove_all() diff --git a/cognee/modules/data/models/Data.py b/cognee/modules/data/models/Data.py index c22cc338e..f02266e69 100644 --- a/cognee/modules/data/models/Data.py +++ b/cognee/modules/data/models/Data.py @@ -34,16 +34,3 @@ class Data(Base): lazy="noload", cascade="all, delete", ) - - def to_json(self) -> dict: - return { - "id": str(self.id), - "name": self.name, - "extension": self.extension, - "mimeType": self.mime_type, - "rawDataLocation": self.raw_data_location, - "createdAt": self.created_at.isoformat(), - "updatedAt": self.updated_at.isoformat() if self.updated_at else None, - "nodeSet": self.node_set, - # "datasets": [dataset.to_json() for dataset in self.datasets] - } diff --git a/cognee/modules/data/models/Dataset.py b/cognee/modules/data/models/Dataset.py index 797401d5a..2984c33f1 100644 --- a/cognee/modules/data/models/Dataset.py +++ b/cognee/modules/data/models/Dataset.py @@ -28,13 +28,3 @@ class Dataset(Base): lazy="noload", cascade="all, delete", ) - - def to_json(self) -> dict: - return { - "id": str(self.id), - "name": self.name, - "createdAt": self.created_at.isoformat(), - "updatedAt": self.updated_at.isoformat() if self.updated_at else None, - "ownerId": str(self.owner_id), - "data": [data.to_json() for data in self.data], - } diff --git a/cognee/modules/data/models/graph_relationship_ledger.py b/cognee/modules/data/models/graph_relationship_ledger.py index 2027714db..0a81c51a0 100644 --- 
a/cognee/modules/data/models/graph_relationship_ledger.py +++ b/cognee/modules/data/models/graph_relationship_ledger.py @@ -27,14 +27,3 @@ class GraphRelationshipLedger(Base): Index("idx_graph_relationship_ledger_source_node_id", "source_node_id"), Index("idx_graph_relationship_ledger_destination_node_id", "destination_node_id"), ) - - def to_json(self) -> dict: - return { - "id": str(self.id), - "source_node_id": str(self.parent_id), - "destination_node_id": str(self.child_id), - "creator_function": self.creator_function, - "created_at": self.created_at.isoformat(), - "deleted_at": self.deleted_at.isoformat() if self.deleted_at else None, - "user_id": str(self.user_id), - } diff --git a/cognee/modules/ingestion/save_data_to_file.py b/cognee/modules/ingestion/save_data_to_file.py index 0aa2948f6..163141987 100644 --- a/cognee/modules/ingestion/save_data_to_file.py +++ b/cognee/modules/ingestion/save_data_to_file.py @@ -7,7 +7,7 @@ from .classify import classify async def save_data_to_file(data: Union[str, BinaryIO], filename: str = None): storage_config = get_storage_config() - data_root_directory = storage_config["data_root_directory"] + data_root_directory = storage_config.data_root_directory classified_data = classify(data, filename) diff --git a/cognee/modules/logging/log_cognee_configuration.py b/cognee/modules/logging/log_cognee_configuration.py new file mode 100644 index 000000000..525da0d73 --- /dev/null +++ b/cognee/modules/logging/log_cognee_configuration.py @@ -0,0 +1,36 @@ +def log_cognee_configuration(logger): + """Log the current database configuration for all database types""" + # NOTE: Has to be imported at runtime to avoid circular import + from cognee.infrastructure.databases.relational.config import get_relational_config + from cognee.infrastructure.databases.vector.config import get_vectordb_config + from cognee.infrastructure.databases.graph.config import get_graph_config + + try: + # Log relational database configuration + relational_config = 
get_relational_config() + logger.info(f"Relational database: {relational_config.db_provider}") + if relational_config.db_provider == "postgres": + logger.info(f"Postgres host: {relational_config.db_host}:{relational_config.db_port}") + logger.info(f"Postgres database: {relational_config.db_name}") + elif relational_config.db_provider == "sqlite": + logger.info(f"SQLite path: {relational_config.db_path}/{relational_config.db_name}") + logger.info(f"SQLite database: {relational_config.db_name}") + + # Log vector database configuration + vector_config = get_vectordb_config() + logger.info(f"Vector database: {vector_config.vector_db_provider}") + if vector_config.vector_db_provider == "lancedb": + logger.info(f"Vector database path: {vector_config.vector_db_url}") + else: + logger.info(f"Vector database URL: {vector_config.vector_db_url}") + + # Log graph database configuration + graph_config = get_graph_config() + logger.info(f"Graph database: {graph_config.graph_database_provider}") + if graph_config.graph_database_provider == "kuzu": + logger.info(f"Graph database path: {graph_config.graph_file_path}") + else: + logger.info(f"Graph database URL: {graph_config.graph_database_url}") + + except Exception as e: + logger.warning(f"Could not retrieve database configuration: {str(e)}") diff --git a/cognee/modules/pipelines/operations/pipeline.py b/cognee/modules/pipelines/operations/pipeline.py index e58c15254..f62cf25f1 100644 --- a/cognee/modules/pipelines/operations/pipeline.py +++ b/cognee/modules/pipelines/operations/pipeline.py @@ -1,7 +1,9 @@ import asyncio from uuid import UUID -from typing import Union +from typing import Optional, Union +from cognee.infrastructure.databases.graph.config import GraphConfig +from cognee.infrastructure.databases.vector.config import VectorConfig from cognee.shared.logging_utils import get_logger from cognee.modules.data.methods.get_dataset_data import get_dataset_data from cognee.modules.data.models import Data, Dataset @@ -50,8 
+52,8 @@ async def cognee_pipeline( datasets: Union[str, list[str], list[UUID]] = None, user: User = None, pipeline_name: str = "custom_pipeline", - vector_db_config: dict = None, - graph_db_config: dict = None, + vector_db_config: Optional[VectorConfig] = None, + graph_db_config: Optional[GraphConfig] = None, ): # Note: These context variables allow different value assignment for databases in Cognee # per async task, thread, process and etc. diff --git a/cognee/modules/settings/get_current_settings.py b/cognee/modules/settings/get_current_settings.py index 2fb1d3c81..2f495577d 100644 --- a/cognee/modules/settings/get_current_settings.py +++ b/cognee/modules/settings/get_current_settings.py @@ -38,7 +38,7 @@ def get_current_settings() -> SettingsDict: vector_config = get_vectordb_config() relational_config = get_relational_config() - return dict( + return SettingsDict( llm={ "provider": llm_config.llm_provider, "model": llm_config.llm_model, diff --git a/cognee/shared/logging_utils.py b/cognee/shared/logging_utils.py index 989bcba64..9156455ba 100644 --- a/cognee/shared/logging_utils.py +++ b/cognee/shared/logging_utils.py @@ -4,12 +4,12 @@ import logging import structlog import traceback import platform -from datetime import datetime +from typing import Protocol from pathlib import Path -import importlib.metadata +from datetime import datetime from cognee import __version__ as cognee_version -from typing import Protocol +from cognee.modules.logging.log_cognee_configuration import log_cognee_configuration # Configure external library logging @@ -165,44 +165,6 @@ def get_logger(name=None, level=None) -> LoggerInterface: return logger -def log_database_configuration(logger): - """Log the current database configuration for all database types""" - # NOTE: Has to be imporated at runtime to avoid circular import - from cognee.infrastructure.databases.relational.config import get_relational_config - from cognee.infrastructure.databases.vector.config import 
get_vectordb_config - from cognee.infrastructure.databases.graph.config import get_graph_config - - try: - # Log relational database configuration - relational_config = get_relational_config() - logger.info(f"Relational database: {relational_config.db_provider}") - if relational_config.db_provider == "postgres": - logger.info(f"Postgres host: {relational_config.db_host}:{relational_config.db_port}") - logger.info(f"Postgres database: {relational_config.db_name}") - elif relational_config.db_provider == "sqlite": - logger.info(f"SQLite path: {relational_config.db_path}") - logger.info(f"SQLite database: {relational_config.db_name}") - - # Log vector database configuration - vector_config = get_vectordb_config() - logger.info(f"Vector database: {vector_config.vector_db_provider}") - if vector_config.vector_db_provider == "lancedb": - logger.info(f"Vector database path: {vector_config.vector_db_url}") - else: - logger.info(f"Vector database URL: {vector_config.vector_db_url}") - - # Log graph database configuration - graph_config = get_graph_config() - logger.info(f"Graph database: {graph_config.graph_database_provider}") - if graph_config.graph_database_provider == "kuzu": - logger.info(f"Graph database path: {graph_config.graph_file_path}") - else: - logger.info(f"Graph database URL: {graph_config.graph_database_url}") - - except Exception as e: - logger.warning(f"Could not retrieve database configuration: {str(e)}") - - def cleanup_old_logs(logs_dir, max_files): """ Removes old log files, keeping only the most recent ones. @@ -396,9 +358,8 @@ def setup_logging(log_level=None, name=None): ) logger.info("Want to learn more? 
Visit the Cognee documentation: https://docs.cognee.ai") - # Log database configuration - log_database_configuration(logger) + log_cognee_configuration(logger) # Return the configured logger return logger diff --git a/cognee/shared/utils.py b/cognee/shared/utils.py index f965119fd..655be3d09 100644 --- a/cognee/shared/utils.py +++ b/cognee/shared/utils.py @@ -1,18 +1,13 @@ """This module contains utility functions for the cognee.""" import os +import pathlib import requests -from datetime import datetime, timezone -import networkx as nx -import matplotlib.pyplot as plt import http.server import socketserver -from threading import Thread -import pathlib from uuid import uuid4 - -from cognee.base_config import get_base_config -from cognee.infrastructure.databases.graph import get_graph_engine +from threading import Thread +from datetime import datetime, timezone # Analytics Proxy Url, currently hosted by Vercel diff --git a/cognee/tests/test_chromadb.py b/cognee/tests/test_chromadb.py index cf4046c07..98de63d3f 100644 --- a/cognee/tests/test_chromadb.py +++ b/cognee/tests/test_chromadb.py @@ -158,7 +158,7 @@ async def main(): assert len(history) == 8, "Search history is not correct." 
await cognee.prune.prune_data() - data_root_directory = get_storage_config()["data_root_directory"] + data_root_directory = get_storage_config().data_root_directory assert not os.path.isdir(data_root_directory), "Local data files are not deleted" await cognee.prune.prune_system(metadata=True) diff --git a/cognee/tests/test_falkordb.py b/cognee/tests/test_falkordb.py index a42470e54..921e61437 100755 --- a/cognee/tests/test_falkordb.py +++ b/cognee/tests/test_falkordb.py @@ -121,7 +121,7 @@ async def main(): # Assert local data files are cleaned properly await cognee.prune.prune_data() - data_root_directory = get_storage_config()["data_root_directory"] + data_root_directory = get_storage_config().data_root_directory assert not os.path.isdir(data_root_directory), "Local data files are not deleted" # Assert relational, vector and graph databases have been cleaned properly diff --git a/cognee/tests/test_kuzu.py b/cognee/tests/test_kuzu.py index d60c6085e..933a13a4a 100644 --- a/cognee/tests/test_kuzu.py +++ b/cognee/tests/test_kuzu.py @@ -113,7 +113,7 @@ async def main(): ) await cognee.prune.prune_data() - data_root_directory = get_storage_config()["data_root_directory"] + data_root_directory = get_storage_config().data_root_directory assert not os.path.isdir(data_root_directory), "Local data files are not deleted" await cognee.prune.prune_system(metadata=True) diff --git a/cognee/tests/test_library.py b/cognee/tests/test_library.py index c5e6cc64b..dabca0040 100755 --- a/cognee/tests/test_library.py +++ b/cognee/tests/test_library.py @@ -79,7 +79,7 @@ async def main(): # Assert local data files are cleaned properly await cognee.prune.prune_data() - data_root_directory = get_storage_config()["data_root_directory"] + data_root_directory = get_storage_config().data_root_directory assert not os.path.isdir(data_root_directory), "Local data files are not deleted" # Assert relational, vector and graph databases have been cleaned properly diff --git 
a/cognee/tests/test_memgraph.py b/cognee/tests/test_memgraph.py index b21cf3735..0a21da98e 100644 --- a/cognee/tests/test_memgraph.py +++ b/cognee/tests/test_memgraph.py @@ -92,7 +92,7 @@ async def main(): assert len(history) == 8, "Search history is not correct." await cognee.prune.prune_data() - data_root_directory = get_storage_config()["data_root_directory"] + data_root_directory = get_storage_config().data_root_directory assert not os.path.isdir(data_root_directory), "Local data files are not deleted" await cognee.prune.prune_system(metadata=True) diff --git a/cognee/tests/test_neo4j.py b/cognee/tests/test_neo4j.py index dcbb38963..dd2f341ec 100644 --- a/cognee/tests/test_neo4j.py +++ b/cognee/tests/test_neo4j.py @@ -117,7 +117,7 @@ async def main(): ) await cognee.prune.prune_data() - data_root_directory = get_storage_config()["data_root_directory"] + data_root_directory = get_storage_config().data_root_directory assert not os.path.isdir(data_root_directory), "Local data files are not deleted" await cognee.prune.prune_system(metadata=True) diff --git a/cognee/tests/test_pgvector.py b/cognee/tests/test_pgvector.py index f08043ed5..59fd99664 100644 --- a/cognee/tests/test_pgvector.py +++ b/cognee/tests/test_pgvector.py @@ -167,7 +167,7 @@ async def main(): await test_local_file_deletion(text, explanation_file_path) await cognee.prune.prune_data() - data_root_directory = get_storage_config()["data_root_directory"] + data_root_directory = get_storage_config().data_root_directory assert not os.path.isdir(data_root_directory), "Local data files are not deleted" await cognee.prune.prune_system(metadata=True) diff --git a/cognee/tests/test_qdrant.py b/cognee/tests/test_qdrant.py index 94e1ac074..477272bdf 100644 --- a/cognee/tests/test_qdrant.py +++ b/cognee/tests/test_qdrant.py @@ -84,7 +84,7 @@ async def main(): assert len(history) == 6, "Search history is not correct." 
await cognee.prune.prune_data() - data_root_directory = get_storage_config()["data_root_directory"] + data_root_directory = get_storage_config().data_root_directory assert not os.path.isdir(data_root_directory), "Local data files are not deleted" await cognee.prune.prune_system(metadata=True) diff --git a/cognee/tests/test_remote_kuzu.py b/cognee/tests/test_remote_kuzu.py index bbff48096..b1a4233d2 100644 --- a/cognee/tests/test_remote_kuzu.py +++ b/cognee/tests/test_remote_kuzu.py @@ -93,7 +93,7 @@ async def main(): assert len(history) == 6, "Search history is not correct." await cognee.prune.prune_data() - data_root_directory = get_storage_config()["data_root_directory"] + data_root_directory = get_storage_config().data_root_directory assert not os.path.isdir(data_root_directory), "Local data files are not deleted" await cognee.prune.prune_system(metadata=True) diff --git a/cognee/tests/test_s3_file_storage.py b/cognee/tests/test_s3_file_storage.py index 3a20b232c..a940b8b5c 100755 --- a/cognee/tests/test_s3_file_storage.py +++ b/cognee/tests/test_s3_file_storage.py @@ -75,7 +75,7 @@ async def main(): # Assert local data files are cleaned properly await cognee.prune.prune_data() - data_root_directory = get_storage_config()["data_root_directory"] + data_root_directory = get_storage_config().data_root_directory assert not os.path.isdir(data_root_directory), "Local data files are not deleted" # Assert relational, vector and graph databases have been cleaned properly