diff --git a/cognee/infrastructure/InfrastructureConfig.py b/cognee/infrastructure/InfrastructureConfig.py index 5cb046e1c..1363c78f7 100644 --- a/cognee/infrastructure/InfrastructureConfig.py +++ b/cognee/infrastructure/InfrastructureConfig.py @@ -2,6 +2,7 @@ import logging import os from cognee.config import Config +from .data.chunking.config import get_chunk_config from .databases.relational import DuckDBAdapter, DatabaseEngine from .databases.vector.vector_db_interface import VectorDBInterface from .databases.vector.embeddings.DefaultEmbeddingEngine import DefaultEmbeddingEngine @@ -18,6 +19,7 @@ config.load() from cognee.infrastructure.databases.relational.config import get_relationaldb_config relational = get_relationaldb_config() +chunk_config = get_chunk_config() class InfrastructureConfig(): system_root_directory: str = config.system_root_directory @@ -38,7 +40,7 @@ class InfrastructureConfig(): connect_documents = config.connect_documents database_directory_path: str = None database_file_path: str = None - chunk_strategy = config.chunk_strategy + chunk_strategy = chunk_config.chunk_strategy chunk_engine = None graph_topology = config.graph_topology monitoring_tool = config.monitoring_tool @@ -86,10 +88,10 @@ class InfrastructureConfig(): self.connect_documents = config.connect_documents if self.chunk_strategy is None: - self.chunk_strategy = config.chunk_strategy + self.chunk_strategy = chunk_config.chunk_strategy if self.chunk_engine is None: - self.chunk_engine = DefaultChunkEngine() + self.chunk_engine = chunk_config.chunk_engine if self.graph_topology is None: self.graph_topology = config.graph_topology diff --git a/cognee/infrastructure/data/chunking/config.py b/cognee/infrastructure/data/chunking/config.py new file mode 100644 index 000000000..cf55ff9d2 --- /dev/null +++ b/cognee/infrastructure/data/chunking/config.py @@ -0,0 +1,27 @@ +from functools import lru_cache +from pydantic_settings import BaseSettings, SettingsConfigDict + +from cognee.infrastructure.data.chunking.DefaultChunkEngine import DefaultChunkEngine +from cognee.shared.data_models import ChunkStrategy + + +class ChunkConfig(BaseSettings): + chunk_size: int = 1500 + chunk_overlap: int = 0 + chunk_strategy: object = ChunkStrategy.PARAGRAPH + chunk_engine: object = DefaultChunkEngine() + + + model_config = SettingsConfigDict(env_file = ".env", extra = "allow") + + def to_dict(self) -> dict: + return { + "chunk_size": self.chunk_size, + "chunk_overlap": self.chunk_overlap, + "chunk_strategy": self.chunk_strategy + } + + +@lru_cache +def get_chunk_config(): + return ChunkConfig() diff --git a/cognee/tests/test_library.py b/cognee/tests/test_library.py index 535aae892..9012f515b 100755 --- a/cognee/tests/test_library.py +++ b/cognee/tests/test_library.py @@ -24,9 +24,9 @@ async def main(): dataset_name = "cs_explanations" - # explanation_file_path = "test_data/Natural_language_processing.txt" + explanation_file_path = os.path.join(os.getcwd(), "test_data/Natural_language_processing.txt") # - # await cognee.add([explanation_file_path], dataset_name) + await cognee.add([explanation_file_path], dataset_name) # dataset_name = "short_stories" # # data_directory_path is defined above @@ -46,15 +46,15 @@ async def main(): Some notable LLMs are OpenAI's GPT series of models (e.g., GPT-3.5 and GPT-4, used in ChatGPT and Microsoft Copilot), Google's PaLM and Gemini (the latter of which is currently used in the chatbot of the same name), xAI's Grok, Meta's LLaMA family of open-source models, Anthropic's Claude models, Mistral AI's open source models, and Databricks' open source DBRX. """ - - dataset_name = "cs_explanations" - await cognee.add( - [ - text_1, - text_2 - ], - dataset_name - ) + # + # dataset_name = "cs_explanations" + # await cognee.add( + # [ + # text_1, + # text_2 + # ], + # dataset_name + # ) await cognee.cognify([ "cs_explanations"])