added chunking config options
This commit is contained in:
commit
00b60a9aef
6 changed files with 72 additions and 5 deletions
|
|
@ -1,10 +1,43 @@
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
from typing import List, Dict, Any, Union, Optional
|
from typing import List, Dict, Any, Union, Optional
|
||||||
from cognee.infrastructure.databases.graph.get_graph_client import get_graph_client
|
from cognee.infrastructure.databases.graph.get_graph_client import get_graph_client
|
||||||
from cognee.modules.topology.topology import TopologyEngine, GitHubRepositoryModel
|
from cognee.modules.topology.topology import TopologyEngine, GitHubRepositoryModel
|
||||||
from cognee.infrastructure.databases.graph.config import get_graph_config
|
from cognee.infrastructure.databases.graph.config import get_graph_config
|
||||||
|
|
||||||
|
import os
|
||||||
|
import pandas as pd
|
||||||
|
import json
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
from typing import Dict, List, Optional, Union, Type, Any
|
||||||
|
from cognee.infrastructure.databases.graph.get_graph_client import get_graph_client
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class Relationship(BaseModel):
    # Describes one typed relationship between a source and a target identifier,
    # with an optional bag of extra properties. `#` comments are used instead of
    # a class docstring so the generated model schema is left untouched.

    # Required: the relationship label.
    type: str = Field(
        ...,
        description="The type of relationship, e.g., 'belongs_to'.",
    )

    # Optional endpoints of the relationship; both default to None.
    source: Optional[str] = Field(
        default=None,
        description="The identifier of the source id of in the relationship being a directory or subdirectory",
    )
    target: Optional[str] = Field(
        default=None,
        description="The identifier of the target id in the relationship being the directory, subdirectory or file",
    )

    # Optional free-form metadata attached to the relationship.
    properties: Optional[Dict[str, Any]] = Field(
        default=None,
        description="A dictionary of additional properties and values related to the relationship.",
    )
|
||||||
|
|
||||||
|
class JSONEntity(BaseModel):
    # One entity entry of a JSONModel (see the `entities` field there).
    # NOTE(review): field meanings below are inferred from their names only --
    # confirm against the code that consumes this model.

    # Required entity name.
    name: str

    # Optional; presumably overrides the type assigned to the entity -- TODO confirm.
    set_type_as: Optional[str] = None

    # Required list of column names; presumably the columns stored as
    # properties of the entity -- TODO confirm.
    property_columns: List[str]

    # Optional human-readable description.
    description: Optional[str] = None
|
||||||
|
|
||||||
|
class JSONPattern(BaseModel):
    # One pattern entry of a JSONModel (see the `patterns` field there),
    # expressed as three required strings plus an optional description.
    # NOTE(review): looks like a (head, relation, tail) triple linking two
    # entities -- confirm against the consumer of this model.

    head: str
    relation: str
    tail: str

    # Optional human-readable description.
    description: Optional[str] = None
|
||||||
|
|
||||||
|
class JSONModel(BaseModel):
    # Top-level container: a node identifier together with the entity and
    # pattern definitions that belong to it. All three fields are required.

    node_id: str

    # Entity definitions, in the order supplied.
    entities: List[JSONEntity]

    # Pattern definitions, in the order supplied.
    patterns: List[JSONPattern]
|
||||||
USER_ID = "default_user"
|
USER_ID = "default_user"
|
||||||
|
|
||||||
async def add_topology(directory: str = "example", model: BaseModel = GitHubRepositoryModel) -> Any:
|
async def add_topology(directory: str = "example", model: BaseModel = GitHubRepositoryModel) -> Any:
|
||||||
|
|
@ -44,11 +77,12 @@ async def add_topology(directory: str = "example", model: BaseModel = GitHubRepo
|
||||||
""" Flatten the entire repository model, starting with the top-level model """
|
""" Flatten the entire repository model, starting with the top-level model """
|
||||||
return recursive_flatten(repo_model)
|
return recursive_flatten(repo_model)
|
||||||
|
|
||||||
flt_topology = flatten_repository(topology)
|
async def add_graph_topology():
|
||||||
|
|
||||||
df = pd.DataFrame(flt_topology)
|
flt_topology = flatten_repository(topology)
|
||||||
|
|
||||||
|
df = pd.DataFrame(flt_topology)
|
||||||
|
|
||||||
print(df.head(10))
|
|
||||||
|
|
||||||
for _, row in df.iterrows():
|
for _, row in df.iterrows():
|
||||||
node_data = row.to_dict()
|
node_data = row.to_dict()
|
||||||
|
|
|
||||||
|
|
@ -29,6 +29,9 @@ class LangchainChunkEngine():
|
||||||
|
|
||||||
if chunk_strategy == ChunkStrategy.CODE:
|
if chunk_strategy == ChunkStrategy.CODE:
|
||||||
chunked_data = LangchainChunkEngine.chunk_data_by_code(source_data,chunk_size, chunk_overlap)
|
chunked_data = LangchainChunkEngine.chunk_data_by_code(source_data,chunk_size, chunk_overlap)
|
||||||
|
|
||||||
|
elif chunk_strategy == ChunkStrategy.LANGCHAIN_CHARACTER:
|
||||||
|
chunked_data = LangchainChunkEngine.chunk_data_by_character(source_data,chunk_size, chunk_overlap)
|
||||||
else:
|
else:
|
||||||
chunked_data = DefaultChunkEngine.chunk_data_by_paragraph(source_data,chunk_size, chunk_overlap)
|
chunked_data = DefaultChunkEngine.chunk_data_by_paragraph(source_data,chunk_size, chunk_overlap)
|
||||||
return chunked_data
|
return chunked_data
|
||||||
|
|
@ -50,3 +53,12 @@ class LangchainChunkEngine():
|
||||||
|
|
||||||
return only_content
|
return only_content
|
||||||
|
|
||||||
|
def chunk_data_by_character(self, data_chunks, chunk_size, chunk_overlap):
    """Split text into overlapping chunks with LangChain's recursive character splitter.

    Args:
        data_chunks: The text to split. Assumed to be a single string --
            TODO confirm against the callers.
        chunk_size: Maximum size of each chunk, as measured by the
            splitter's length function (character count by default).
        chunk_overlap: Amount of overlap between consecutive chunks.

    Returns:
        A list of chunk strings in their original order.
    """
    # NOTE(review): the strategy dispatcher invokes this as
    # `LangchainChunkEngine.chunk_data_by_character(source_data, chunk_size, chunk_overlap)`,
    # i.e. through the class with three arguments, which does not line up with
    # the `self` parameter here -- confirm whether this should be a @staticmethod.

    # Imported lazily so the dependency is only needed when this strategy is used.
    from langchain_text_splitters import RecursiveCharacterTextSplitter

    # The sizes must be passed by keyword: the first positional parameter of
    # RecursiveCharacterTextSplitter is `separators`, not `chunk_size`.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    # `split_text` returns the chunk strings directly; the splitter has no
    # `split` method, and no Document wrappers are needed just to read the text.
    return splitter.split_text(data_chunks)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -35,6 +35,7 @@ class ChunkStrategy(Enum):
|
||||||
PARAGRAPH = "paragraph"
|
PARAGRAPH = "paragraph"
|
||||||
SENTENCE = "sentence"
|
SENTENCE = "sentence"
|
||||||
CODE = "code"
|
CODE = "code"
|
||||||
|
LANGCHAIN_CHARACTER = "langchain_character"
|
||||||
|
|
||||||
class MemorySummary(BaseModel):
|
class MemorySummary(BaseModel):
|
||||||
""" Memory summary. """
|
""" Memory summary. """
|
||||||
|
|
|
||||||
|
|
@ -2,13 +2,31 @@
|
||||||
|
|
||||||
The goal of the blog is to discuss broader topics around the cognee project, including the motivation behind the project, the technical details, and the future of the project.
|
The goal of the blog is to discuss broader topics around the cognee project, including the motivation behind the project, the technical details, and the future of the project.
|
||||||
|
|
||||||
## cognee library announcements
|
|
||||||
|
## knowledge graphs + RAGs
|
||||||
|
|
||||||
|
In progress
|
||||||
|
|
||||||
|
[//]: # (1. [LLMOps stack + Graphs](posts/llmops-and-knowledge-graphs.md))
|
||||||
|
|
||||||
|
[//]: # (2. [Where do knowledge graphs fit, and where do they not? A case study with dynamo.fyi](posts/where-do-knowledge-graphs-fit.md))
|
||||||
|
|
||||||
|
[//]: # (3. [Knowledge Graphs vs basic RAGs, some metrics](posts/knowledge-graphs-vs-basic-rags.md))
|
||||||
|
|
||||||
|
[//]: # ()
|
||||||
|
|
||||||
|
|
||||||
|
## product announcements
|
||||||
|
|
||||||
This section covers the release notes for the cognee library. It includes the new features, bug fixes, and improvements in each release.
|
This section covers the release notes for the cognee library. It includes the new features, bug fixes, and improvements in each release.
|
||||||
|
|
||||||
1. [Cognee - library release](posts/cognee-library-release.md)
|
1. [Cognee - library release](posts/cognee-library-release.md)
|
||||||
|
2. [Cognee - v0.1.11 announcement](posts/cognee-v0.1.1.md)
|
||||||
|
3. [New website for cognee](posts/new-website-for-cognee.md)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
[//]: # (2. [Cognee - v0.1.4 announcement](posts/cognee-v0.1.4.md))
|
|
||||||
|
|
||||||
## Towards deterministic data pipelines for LLMs step by step
|
## Towards deterministic data pipelines for LLMs step by step
|
||||||
This series mostly deals with product discovery, data engineering, and the development of robust AI memory data pipelines.
|
This series mostly deals with product discovery, data engineering, and the development of robust AI memory data pipelines.
|
||||||
|
|
|
||||||
1
docs/blog/posts/llmops-and-knowledge-graphs.md
Normal file
1
docs/blog/posts/llmops-and-knowledge-graphs.md
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
Test
|
||||||
|
|
@ -5,6 +5,7 @@ The page is dedicated to collecting all research that was collected in the past
|
||||||
This is not an exhaustive list, and any PRs would be welcome
|
This is not an exhaustive list, and any PRs would be welcome
|
||||||
|
|
||||||
### Research Papers
|
### Research Papers
|
||||||
|
- [2024/06/04] [Transformers and episodic memory](https://arxiv.org/abs/2405.14992)
|
||||||
- [2024/03/24] [Graph Chain-of-Thought: Augmenting Large Language Models by Reasoning on Graphs](https://arxiv.org/abs/2404.07103)
|
- [2024/03/24] [Graph Chain-of-Thought: Augmenting Large Language Models by Reasoning on Graphs](https://arxiv.org/abs/2404.07103)
|
||||||
- [2024/03/24] [Leave No Context Behind: Efficient Infinite Context Transformers with Infini-attention](https://arxiv.org/abs/2404.07143)
|
- [2024/03/24] [Leave No Context Behind: Efficient Infinite Context Transformers with Infini-attention](https://arxiv.org/abs/2404.07143)
|
||||||
- [2024/03/24] [Compound AI systems](https://bair.berkeley.edu/blog/2024/02/18/compound-ai-systems/)
|
- [2024/03/24] [Compound AI systems](https://bair.berkeley.edu/blog/2024/02/18/compound-ai-systems/)
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue