added chunking config options

This commit is contained in:
Vasilije 2024-06-09 20:41:33 +02:00
commit 00b60a9aef
6 changed files with 72 additions and 5 deletions

View file

@ -1,10 +1,43 @@
import pandas as pd import pandas as pd
from pydantic import BaseModel from pydantic import BaseModel
from typing import List, Dict, Any, Union, Optional from typing import List, Dict, Any, Union, Optional
from cognee.infrastructure.databases.graph.get_graph_client import get_graph_client from cognee.infrastructure.databases.graph.get_graph_client import get_graph_client
from cognee.modules.topology.topology import TopologyEngine, GitHubRepositoryModel from cognee.modules.topology.topology import TopologyEngine, GitHubRepositoryModel
from cognee.infrastructure.databases.graph.config import get_graph_config from cognee.infrastructure.databases.graph.config import get_graph_config
import os
import pandas as pd
import json
from pydantic import BaseModel, Field
from typing import Dict, List, Optional, Union, Type, Any
from cognee.infrastructure.databases.graph.get_graph_client import get_graph_client
# Relationship record: a required relationship type, optional source/target
# identifiers, and an optional bag of extra properties.
# NOTE: documented with comments rather than a class docstring so the
# generated model schema is left exactly as it was.
class Relationship(BaseModel):
    # Required: the Ellipsis default marks the field as mandatory.
    type: str = Field(
        ...,
        description="The type of relationship, e.g., 'belongs_to'.",
    )
    # Optional endpoint identifiers; both default to None.
    source: Optional[str] = Field(
        None,
        description="The identifier of the source id of in the relationship being a directory or subdirectory",
    )
    target: Optional[str] = Field(
        None,
        description="The identifier of the target id in the relationship being the directory, subdirectory or file",
    )
    # Optional free-form key/value metadata attached to the relationship.
    properties: Optional[Dict[str, Any]] = Field(
        None,
        description="A dictionary of additional properties and values related to the relationship.",
    )
# Entity entry of a JSON-described model: a required name and list of
# property columns, plus two optional string fields.
class JSONEntity(BaseModel):
    # Required entity name.
    name: str

    # Optional; presumably a type label to assign to the entity -- confirm
    # against the code that consumes this model.
    set_type_as: Optional[str] = None

    # Required list of column names (as strings).
    property_columns: List[str]

    # Optional human-readable description.
    description: Optional[str] = None
# Pattern entry of a JSON-described model: three required strings
# (head, relation, tail) and an optional description.
class JSONPattern(BaseModel):
    # Required pattern components, all plain strings.
    head: str
    relation: str
    tail: str

    # Optional human-readable description.
    description: Optional[str] = None
# Top-level JSON-described model: a node identifier together with its
# entity and pattern entries.
class JSONModel(BaseModel):
    # Required identifier string.
    node_id: str

    # Required collections; each item is validated as the nested model type.
    entities: List[JSONEntity]
    patterns: List[JSONPattern]
USER_ID = "default_user" USER_ID = "default_user"
async def add_topology(directory: str = "example", model: BaseModel = GitHubRepositoryModel) -> Any: async def add_topology(directory: str = "example", model: BaseModel = GitHubRepositoryModel) -> Any:
@ -44,11 +77,12 @@ async def add_topology(directory: str = "example", model: BaseModel = GitHubRepo
""" Flatten the entire repository model, starting with the top-level model """ """ Flatten the entire repository model, starting with the top-level model """
return recursive_flatten(repo_model) return recursive_flatten(repo_model)
flt_topology = flatten_repository(topology) async def add_graph_topology():
df = pd.DataFrame(flt_topology) flt_topology = flatten_repository(topology)
df = pd.DataFrame(flt_topology)
print(df.head(10))
for _, row in df.iterrows(): for _, row in df.iterrows():
node_data = row.to_dict() node_data = row.to_dict()

View file

@ -29,6 +29,9 @@ class LangchainChunkEngine():
if chunk_strategy == ChunkStrategy.CODE: if chunk_strategy == ChunkStrategy.CODE:
chunked_data = LangchainChunkEngine.chunk_data_by_code(source_data,chunk_size, chunk_overlap) chunked_data = LangchainChunkEngine.chunk_data_by_code(source_data,chunk_size, chunk_overlap)
elif chunk_strategy == ChunkStrategy.LANGCHAIN_CHARACTER:
chunked_data = LangchainChunkEngine.chunk_data_by_character(source_data,chunk_size, chunk_overlap)
else: else:
chunked_data = DefaultChunkEngine.chunk_data_by_paragraph(source_data,chunk_size, chunk_overlap) chunked_data = DefaultChunkEngine.chunk_data_by_paragraph(source_data,chunk_size, chunk_overlap)
return chunked_data return chunked_data
@ -50,3 +53,12 @@ class LangchainChunkEngine():
return only_content return only_content
def chunk_data_by_character(self, data_chunks, chunk_size, chunk_overlap):
    """Split text into character-based chunks using LangChain.

    Args:
        data_chunks: The text to split. A single string is treated as one
            document; any other iterable is treated as a collection of
            strings, each split independently.
        chunk_size: Maximum size of each chunk, forwarded to the splitter.
        chunk_overlap: Overlap between consecutive chunks, forwarded to the
            splitter.

    Returns:
        A list with the text content of every produced chunk, in order.
    """
    # NOTE(review): the dispatcher visible in this change calls this through
    # the class with three positional arguments; with a `self` parameter that
    # call would bind the text to `self`. Confirm whether this should be a
    # @staticmethod (signature left untouched here).

    # Function-local import, as in the original implementation.
    from langchain_text_splitters import RecursiveCharacterTextSplitter

    # The sizes must be passed by keyword: the first positional parameters of
    # RecursiveCharacterTextSplitter are `separators` and `keep_separator`,
    # so positional sizes would silently configure the wrong options.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    # create_documents expects a list of texts and returns Document objects
    # exposing `page_content`; the splitter has no `split` method.
    texts = [data_chunks] if isinstance(data_chunks, str) else list(data_chunks)
    documents = splitter.create_documents(texts)

    return [document.page_content for document in documents]

View file

@ -35,6 +35,7 @@ class ChunkStrategy(Enum):
PARAGRAPH = "paragraph" PARAGRAPH = "paragraph"
SENTENCE = "sentence" SENTENCE = "sentence"
CODE = "code" CODE = "code"
LANGCHAIN_CHARACTER = "langchain_character"
class MemorySummary(BaseModel): class MemorySummary(BaseModel):
""" Memory summary. """ """ Memory summary. """

View file

@ -2,13 +2,31 @@
The goal of the blog is to discuss broader topics around the cognee project, including the motivation behind the project, the technical details, and the future of the project. The goal of the blog is to discuss broader topics around the cognee project, including the motivation behind the project, the technical details, and the future of the project.
## cognee library announcements
## knowledge graphs + rags
In progress
[//]: # (1. [LLMOps stack + Graphs](posts/llmops-and-knowledge-graphs.md))
[//]: # (2. [Where do knowledge graphs fit, and where do they not? A case study with dynamo.fyi](posts/where-do-knowledge-graphs-fit.md))
[//]: # (3. [Knowledge Graphs vs basic RAGs, some metrics](posts/knowledge-graphs-vs-basic-rags.md))
[//]: # ()
## product announcements
This section covers the release notes for the cognee library. It includes the new features, bug fixes, and improvements in each release. This section covers the release notes for the cognee library. It includes the new features, bug fixes, and improvements in each release.
1. [Cognee - library release](posts/cognee-library-release.md) 1. [Cognee - library release](posts/cognee-library-release.md)
2. [Cognee - v0.1.11 announcement](posts/cognee-v0.1.1.md)
3. [New website for cognee](posts/new-website-for-cognee.md)
[//]: # (2. [Cognee - v0.1.4 announcement](posts/cognee-v0.1.4.md))
## Towards deterministic data pipelines for LLMs step by step ## Towards deterministic data pipelines for LLMs step by step
This series mostly deals with product discovery, data engineering, and the development of robust AI memory data pipelines. This series mostly deals with product discovery, data engineering, and the development of robust AI memory data pipelines.

View file

@ -0,0 +1 @@
Test

View file

@ -5,6 +5,7 @@ The page is dedicated to collecting all research that was collected in the past
This is not an exhaustive list, and any PRs would be welcome This is not an exhaustive list, and any PRs would be welcome
### Research Papers ### Research Papers
- [2024/06/04] [Transformers and episodic memory](https://arxiv.org/abs/2405.14992)
- [2024/03/24] [Graph Chain-of-Thought: Augmenting Large Language Models by Reasoning on Graphs](https://arxiv.org/abs/2404.07103) - [2024/03/24] [Graph Chain-of-Thought: Augmenting Large Language Models by Reasoning on Graphs](https://arxiv.org/abs/2404.07103)
- [2024/03/24] [Leave No Context Behind: Efficient Infinite Context Transformers with Infini-attention](https://arxiv.org/abs/2404.07143) - [2024/03/24] [Leave No Context Behind: Efficient Infinite Context Transformers with Infini-attention](https://arxiv.org/abs/2404.07143)
- [2024/03/24] [Compound AI systems](https://bair.berkeley.edu/blog/2024/02/18/compound-ai-systems/) - [2024/03/24] [Compound AI systems](https://bair.berkeley.edu/blog/2024/02/18/compound-ai-systems/)