added chunking config options

2024-06-09 20:41:33 +02:00 · 2024-06-09 20:41:33 +02:00 · 00b60a9aef
commit 00b60a9aef
parent f79631d5da dee036ceaf
6 changed files with 72 additions and 5 deletions
--- a/cognee/api/v1/topology/add_topology.py
+++ b/cognee/api/v1/topology/add_topology.py
@ -1,10 +1,43 @@
 import pandas as pd
 from pydantic import BaseModel
 from typing import List, Dict, Any, Union, Optional
 from cognee.infrastructure.databases.graph.get_graph_client import get_graph_client
 from cognee.modules.topology.topology import TopologyEngine, GitHubRepositoryModel
 from cognee.infrastructure.databases.graph.config import get_graph_config
 import os
 import pandas as pd
 import json
 from pydantic import BaseModel, Field
 from typing import Dict, List, Optional, Union, Type, Any
 from cognee.infrastructure.databases.graph.get_graph_client import get_graph_client
 # Pydantic model for a typed relationship with optional source and target
 # identifiers. The `description` strings on each field are runtime metadata,
 # so documentation is kept in comments rather than a class docstring.
 class Relationship(BaseModel):
    # Required (`...` means no default): the relationship label.
    type: str = Field(..., description="The type of relationship, e.g., 'belongs_to'.")
    # Optional, defaults to None: identifier of the source side.
    source: Optional[str] = Field(None, description="The identifier of the source id of in the relationship being a directory or subdirectory")
    # Optional, defaults to None: identifier of the target side.
    target: Optional[str] = Field(None, description="The identifier of the target id in the relationship being the directory, subdirectory or file")
    # Optional, defaults to None: free-form extra key/value data.
    properties: Optional[Dict[str, Any]] = Field(None, description="A dictionary of additional properties and values related to the relationship.")
 # Pydantic model describing one entity entry; used as the element type of
 # `JSONModel.entities`.
 class JSONEntity(BaseModel):
    # Required entity name.
    name: str
    # Optional, defaults to None.
    # NOTE(review): presumably overrides the type/label assigned to the entity - confirm against the consumer.
    set_type_as: Optional[str] = None
    # Required list of strings.
    # NOTE(review): presumably names of input columns stored as properties - confirm.
    property_columns: List[str]
    # Optional free-text description, defaults to None.
    description: Optional[str] = None
 # Pydantic model describing one pattern entry; used as the element type of
 # `JSONModel.patterns`.
 # NOTE(review): head/relation/tail look like a (subject, predicate, object)
 # triple over entity names - confirm against the consumer.
 class JSONPattern(BaseModel):
    # Required string fields.
    head: str
    relation: str
    tail: str
    # Optional free-text description, defaults to None.
    description: Optional[str] = None
 # Pydantic model grouping a node identifier with lists of JSONEntity and
 # JSONPattern entries. All three fields are required (no defaults).
 class JSONModel(BaseModel):
    # Required string identifier.
    node_id: str
    # Required list of entity definitions.
    entities: List[JSONEntity]
    # Required list of pattern definitions.
    patterns: List[JSONPattern]
 # Hard-coded user identifier; the code that reads it is not visible in this hunk.
 USER_ID = "default_user"
 async def add_topology(directory: str = "example", model: BaseModel = GitHubRepositoryModel) -> Any:
@ -44,11 +77,12 @@ async def add_topology(directory: str = "example", model: BaseModel = GitHubRepo
        """ Flatten the entire repository model, starting with the top-level model """
        return recursive_flatten(repo_model)
-    flt_topology = flatten_repository(topology)
+    async def add_graph_topology():
-    df = pd.DataFrame(flt_topology)
+        flt_topology = flatten_repository(topology)
        df = pd.DataFrame(flt_topology)
    print(df.head(10))
    for _, row in df.iterrows():
        node_data = row.to_dict()
--- a/cognee/infrastructure/data/chunking/LangchainChunkingEngine.py
+++ b/cognee/infrastructure/data/chunking/LangchainChunkingEngine.py
@ -29,6 +29,9 @@ class LangchainChunkEngine():
        if chunk_strategy == ChunkStrategy.CODE:
            chunked_data = LangchainChunkEngine.chunk_data_by_code(source_data,chunk_size, chunk_overlap)
        elif chunk_strategy == ChunkStrategy.LANGCHAIN_CHARACTER:
            chunked_data = LangchainChunkEngine.chunk_data_by_character(source_data,chunk_size, chunk_overlap)
        else:
            chunked_data = DefaultChunkEngine.chunk_data_by_paragraph(source_data,chunk_size, chunk_overlap)
        return chunked_data
@ -50,3 +53,12 @@ class LangchainChunkEngine():
        return only_content
    def chunk_data_by_character(self, data_chunks, chunk_size, chunk_overlap):
        """ Split text into chunks using LangChain's RecursiveCharacterTextSplitter.

        Args:
            data_chunks: The text to split. A single string is treated as one
                document; any other iterable is treated as a sequence of texts.
            chunk_size: Maximum chunk size, forwarded to the splitter as
                ``chunk_size``.
            chunk_overlap: Overlap between consecutive chunks, forwarded to the
                splitter as ``chunk_overlap``.

        Returns:
            A list containing the ``page_content`` string of every chunk.
        """
        # Imported inside the method so the dependency is only needed when
        # this strategy is actually used.
        from langchain_text_splitters import RecursiveCharacterTextSplitter

        # The sizes must be passed by keyword: the splitter's leading
        # positional parameters are `separators` and `keep_separator`, so
        # positional sizes would be silently misinterpreted.
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

        # `create_documents` expects a list of texts and returns Document
        # objects, which is what exposes `.page_content` below.
        texts = [data_chunks] if isinstance(data_chunks, str) else list(data_chunks)
        documents = splitter.create_documents(texts)

        return [document.page_content for document in documents]
--- a/cognee/shared/data_models.py
+++ b/cognee/shared/data_models.py
@ -35,6 +35,7 @@ class ChunkStrategy(Enum):
    PARAGRAPH = "paragraph"
    SENTENCE = "sentence"
    CODE    = "code"
    LANGCHAIN_CHARACTER = "langchain_character"
 class MemorySummary(BaseModel):
    """ Memory summary. """
--- a/docs/blog/index.md
+++ b/docs/blog/index.md
@ -2,13 +2,31 @@
 The goal of the blog is to discuss broader topics around the cognee project, including the motivation behind the project, the technical details, and the future of the project.
-## cognee library announcements
+
 ## knowledge graphs + rags
 In progress
 [//]: # (1. [LLMOps stack + Graphs]&#40;posts/llmops-and-knowledge-graphs.md&#41;)
 [//]: # (2. [Where do knowledge graphs fit, and where do they not? A case study with dynamo.fyi]&#40;posts/where-do-knowledge-graphs-fit.md&#41;)
 [//]: # (3. [Knowledge Graphs vs basic RAGs, some metrics]&#40;posts/knowledge-graphs-vs-basic-rags.md&#41;)
 [//]: # ()
 ## product announcements
 This section covers the release notes for the cognee library. It includes the new features, bug fixes, and improvements in each release.
 1. [Cognee - library release](posts/cognee-library-release.md)
 2. [Cognee - v0.1.11 announcement](posts/cognee-v0.1.1.md)
 3. [New website for cognee](posts/new-website-for-cognee.md)
 [//]: # (2. [Cognee - v0.1.4 announcement]&#40;posts/cognee-v0.1.4.md&#41;)
 ## Towards deterministic data pipelines for LLMs step by step
 This series mostly deals with product discovery, data engineering, and the development of robust AI memory data pipelines.
--- a/docs/blog/posts/llmops-and-knowledge-graphs.md
+++ b/docs/blog/posts/llmops-and-knowledge-graphs.md
@ -0,0 +1 @@
 Test
--- a/docs/research.md
+++ b/docs/research.md
@ -5,6 +5,7 @@ The page is dedicated to collecting all research that was collected in the past
 This is not an exhaustive list, and any PRs would be welcome
 ### Research Papers
 - [2024/06/04] [Transformers and episodic memory](https://arxiv.org/abs/2405.14992)
 - [2024/03/24] [Graph Chain-of-Thought: Augmenting Large Language Models by Reasoning on Graphs](https://arxiv.org/abs/2404.07103)
 - [2024/03/24] [Leave No Context Behind: Efficient Infinite Context Transformers with Infini-attention](https://arxiv.org/abs/2404.07143)
 - [2024/03/24] [Compound AI systems](https://bair.berkeley.edu/blog/2024/02/18/compound-ai-systems/)