working topology inference
This commit is contained in:
parent
01446deb6f
commit
7249c240d0
13 changed files with 1169 additions and 798 deletions
|
|
@ -1,4 +1,5 @@
|
||||||
import asyncio
|
import asyncio
|
||||||
|
import os
|
||||||
from uuid import uuid4
|
from uuid import uuid4
|
||||||
from typing import List, Union
|
from typing import List, Union
|
||||||
import logging
|
import logging
|
||||||
|
|
@ -6,7 +7,11 @@ import instructor
|
||||||
import nltk
|
import nltk
|
||||||
from openai import OpenAI
|
from openai import OpenAI
|
||||||
from nltk.corpus import stopwords
|
from nltk.corpus import stopwords
|
||||||
|
|
||||||
|
from cognee.api.v1.prune import prune
|
||||||
from cognee.config import Config
|
from cognee.config import Config
|
||||||
|
from cognee.infrastructure.data.chunking.LangchainChunkingEngine import LangchainChunkEngine
|
||||||
|
from cognee.infrastructure.databases.vector.embeddings.DefaultEmbeddingEngine import OpenAIEmbeddingEngine
|
||||||
from cognee.modules.cognify.graph.add_data_chunks import add_data_chunks
|
from cognee.modules.cognify.graph.add_data_chunks import add_data_chunks
|
||||||
from cognee.modules.cognify.graph.add_document_node import add_document_node
|
from cognee.modules.cognify.graph.add_document_node import add_document_node
|
||||||
from cognee.modules.cognify.graph.add_classification_nodes import add_classification_nodes
|
from cognee.modules.cognify.graph.add_classification_nodes import add_classification_nodes
|
||||||
|
|
@ -14,7 +19,6 @@ from cognee.modules.cognify.graph.add_cognitive_layer_graphs import add_cognitiv
|
||||||
from cognee.modules.cognify.graph.add_summary_nodes import add_summary_nodes
|
from cognee.modules.cognify.graph.add_summary_nodes import add_summary_nodes
|
||||||
from cognee.modules.cognify.graph.add_node_connections import group_nodes_by_layer, \
|
from cognee.modules.cognify.graph.add_node_connections import group_nodes_by_layer, \
|
||||||
graph_ready_output, connect_nodes_in_graph
|
graph_ready_output, connect_nodes_in_graph
|
||||||
from cognee.modules.cognify.graph.initialize_graph import initialize_graph
|
|
||||||
from cognee.modules.cognify.llm.resolve_cross_graph_references import resolve_cross_graph_references
|
from cognee.modules.cognify.llm.resolve_cross_graph_references import resolve_cross_graph_references
|
||||||
from cognee.infrastructure.databases.graph.get_graph_client import get_graph_client
|
from cognee.infrastructure.databases.graph.get_graph_client import get_graph_client
|
||||||
from cognee.modules.cognify.graph.add_label_nodes import add_label_nodes
|
from cognee.modules.cognify.graph.add_label_nodes import add_label_nodes
|
||||||
|
|
@ -27,6 +31,8 @@ from cognee.modules.data.get_content_categories import get_content_categories
|
||||||
from cognee.modules.data.get_content_summary import get_content_summary
|
from cognee.modules.data.get_content_summary import get_content_summary
|
||||||
from cognee.modules.data.get_cognitive_layers import get_cognitive_layers
|
from cognee.modules.data.get_cognitive_layers import get_cognitive_layers
|
||||||
from cognee.modules.data.get_layer_graphs import get_layer_graphs
|
from cognee.modules.data.get_layer_graphs import get_layer_graphs
|
||||||
|
from cognee.modules.topology.topology import TopologyEngine
|
||||||
|
from cognee.shared.data_models import ChunkStrategy
|
||||||
from cognee.utils import send_telemetry
|
from cognee.utils import send_telemetry
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -69,27 +75,12 @@ async def cognify(datasets: Union[str, List[str]] = None):
|
||||||
dataset_files = []
|
dataset_files = []
|
||||||
# datasets is a dataset name string
|
# datasets is a dataset name string
|
||||||
dataset_name = datasets.replace(".", "_").replace(" ", "_")
|
dataset_name = datasets.replace(".", "_").replace(" ", "_")
|
||||||
|
|
||||||
for added_dataset in added_datasets:
|
for added_dataset in added_datasets:
|
||||||
if dataset_name in added_dataset:
|
if dataset_name in added_dataset:
|
||||||
dataset_files.append((added_dataset, db_engine.get_files_metadata(added_dataset)))
|
dataset_files.append((added_dataset, db_engine.get_files_metadata(added_dataset)))
|
||||||
|
|
||||||
|
# print("dataset_files", dataset_files)
|
||||||
|
|
||||||
print(dataset_files)
|
|
||||||
|
|
||||||
# topology can be inferred, loaded, or extrapolated from the data in the end of the flow
|
|
||||||
# for code generation, we infer the topology from the folder structure as simple step
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
graph_topology = infrastructure_config.get_config()["graph_topology"]
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
await initialize_graph(USER_ID, graph_client=graph_client)
|
|
||||||
|
|
||||||
data_chunks = {}
|
data_chunks = {}
|
||||||
|
|
||||||
|
|
@ -102,6 +93,8 @@ async def cognify(datasets: Union[str, List[str]] = None):
|
||||||
try:
|
try:
|
||||||
file_type = guess_file_type(file)
|
file_type = guess_file_type(file)
|
||||||
text = extract_text_from_file(file, file_type)
|
text = extract_text_from_file(file, file_type)
|
||||||
|
if text is None:
|
||||||
|
text = ""
|
||||||
subchunks = chunk_engine.chunk_data(chunk_strategy, text, config.chunk_size, config.chunk_overlap)
|
subchunks = chunk_engine.chunk_data(chunk_strategy, text, config.chunk_size, config.chunk_overlap)
|
||||||
|
|
||||||
if dataset_name not in data_chunks:
|
if dataset_name not in data_chunks:
|
||||||
|
|
@ -125,22 +118,25 @@ async def process_text(chunk_collection: str, chunk_id: str, input_text: str, fi
|
||||||
|
|
||||||
graph_client = await get_graph_client(infrastructure_config.get_config()["graph_engine"])
|
graph_client = await get_graph_client(infrastructure_config.get_config()["graph_engine"])
|
||||||
|
|
||||||
|
graph_topology = infrastructure_config.get_config()["graph_topology"]
|
||||||
|
|
||||||
|
|
||||||
document_id = await add_document_node(
|
document_id = await add_document_node(
|
||||||
graph_client,
|
graph_client,
|
||||||
parent_node_id = f"DefaultGraphModel__{USER_ID}", #make a param of defaultgraph model to make sure when user passes his stuff, it doesn't break pipeline
|
parent_node_id = f"{file_metadata['name']}.{file_metadata['extension']}", #make a param of defaultgraph model to make sure when user passes his stuff, it doesn't break pipeline
|
||||||
document_metadata = file_metadata,
|
document_metadata = file_metadata,
|
||||||
)
|
)
|
||||||
|
#
|
||||||
await add_label_nodes(graph_client, document_id, chunk_id, file_metadata["keywords"].split("|"))
|
await add_label_nodes(graph_client, document_id, chunk_id, file_metadata["keywords"].split("|"))
|
||||||
|
|
||||||
classified_categories = await get_content_categories(input_text)
|
# classified_categories = await get_content_categories(input_text)
|
||||||
await add_classification_nodes(
|
# await add_classification_nodes(
|
||||||
graph_client,
|
# graph_client,
|
||||||
parent_node_id = document_id,
|
# parent_node_id = document_id,
|
||||||
categories = classified_categories,
|
# categories = classified_categories,
|
||||||
)
|
# )
|
||||||
|
|
||||||
print(f"Chunk ({chunk_id}) classified.")
|
# print(f"Chunk ({chunk_id}) classified.")
|
||||||
|
|
||||||
print("document_id", document_id)
|
print("document_id", document_id)
|
||||||
|
|
||||||
|
|
@ -149,53 +145,61 @@ async def process_text(chunk_collection: str, chunk_id: str, input_text: str, fi
|
||||||
|
|
||||||
print(f"Chunk ({chunk_id}) summarized.")
|
print(f"Chunk ({chunk_id}) summarized.")
|
||||||
|
|
||||||
cognitive_layers = await get_cognitive_layers(input_text, classified_categories)
|
# cognitive_layers = await get_cognitive_layers(input_text, classified_categories)
|
||||||
cognitive_layers = (await add_cognitive_layers(graph_client, document_id, cognitive_layers))[:2]
|
# cognitive_layers = (await add_cognitive_layers(graph_client, document_id, cognitive_layers))[:2]
|
||||||
|
#
|
||||||
layer_graphs = await get_layer_graphs(input_text, cognitive_layers)
|
# layer_graphs = await get_layer_graphs(input_text, cognitive_layers)
|
||||||
await add_cognitive_layer_graphs(graph_client, chunk_collection, chunk_id, layer_graphs)
|
# await add_cognitive_layer_graphs(graph_client, chunk_collection, chunk_id, layer_graphs)
|
||||||
|
#
|
||||||
if infrastructure_config.get_config()["connect_documents"] is True:
|
# if infrastructure_config.get_config()["connect_documents"] is True:
|
||||||
db_engine = infrastructure_config.get_config()["database_engine"]
|
# db_engine = infrastructure_config.get_config()["database_engine"]
|
||||||
relevant_documents_to_connect = db_engine.fetch_cognify_data(excluded_document_id = file_metadata["id"])
|
# relevant_documents_to_connect = db_engine.fetch_cognify_data(excluded_document_id = file_metadata["id"])
|
||||||
|
#
|
||||||
list_of_nodes = []
|
# list_of_nodes = []
|
||||||
|
#
|
||||||
relevant_documents_to_connect.append({
|
# relevant_documents_to_connect.append({
|
||||||
"layer_id": document_id,
|
# "layer_id": document_id,
|
||||||
})
|
# })
|
||||||
|
#
|
||||||
for document in relevant_documents_to_connect:
|
# for document in relevant_documents_to_connect:
|
||||||
node_descriptions_to_match = await graph_client.extract_node_description(document["layer_id"])
|
# node_descriptions_to_match = await graph_client.extract_node_description(document["layer_id"])
|
||||||
list_of_nodes.extend(node_descriptions_to_match)
|
# list_of_nodes.extend(node_descriptions_to_match)
|
||||||
|
#
|
||||||
nodes_by_layer = await group_nodes_by_layer(list_of_nodes)
|
# nodes_by_layer = await group_nodes_by_layer(list_of_nodes)
|
||||||
|
#
|
||||||
results = await resolve_cross_graph_references(nodes_by_layer)
|
# results = await resolve_cross_graph_references(nodes_by_layer)
|
||||||
|
#
|
||||||
relationships = graph_ready_output(results)
|
# relationships = graph_ready_output(results)
|
||||||
|
#
|
||||||
await connect_nodes_in_graph(
|
# await connect_nodes_in_graph(
|
||||||
graph_client,
|
# graph_client,
|
||||||
relationships,
|
# relationships,
|
||||||
score_threshold = infrastructure_config.get_config()["intra_layer_score_treshold"]
|
# score_threshold = infrastructure_config.get_config()["intra_layer_score_treshold"]
|
||||||
)
|
# )
|
||||||
|
#
|
||||||
send_telemetry("cognee.cognify")
|
# send_telemetry("cognee.cognify")
|
||||||
|
#
|
||||||
print(f"Chunk ({chunk_id}) cognified.")
|
# print(f"Chunk ({chunk_id}) cognified.")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
||||||
async def test():
|
async def test():
|
||||||
|
# await prune.prune_system()
|
||||||
|
# #
|
||||||
|
# from cognee.api.v1.add import add
|
||||||
|
# data_directory_path = os.path.abspath("../../../.data")
|
||||||
|
# # print(data_directory_path)
|
||||||
|
# # config.data_root_directory(data_directory_path)
|
||||||
|
# # cognee_directory_path = os.path.abspath("../.cognee_system")
|
||||||
|
# # config.system_root_directory(cognee_directory_path)
|
||||||
#
|
#
|
||||||
from cognee.api.v1.add import add
|
# await add("data://" +data_directory_path, "example")
|
||||||
|
|
||||||
await add(["A large language model (LLM) is a language model notable for its ability to achieve general-purpose language generation and other natural language processing tasks such as classification"], "code")
|
infrastructure_config.set_config( {"chunk_engine": LangchainChunkEngine() , "chunk_strategy": ChunkStrategy.CODE,'embedding_engine': OpenAIEmbeddingEngine()})
|
||||||
|
|
||||||
graph = await cognify()
|
graph = await cognify()
|
||||||
|
#
|
||||||
from cognee.utils import render_graph
|
from cognee.utils import render_graph
|
||||||
|
|
||||||
await render_graph(graph, include_color=True, include_nodes=False, include_size=False)
|
await render_graph(graph, include_color=True, include_nodes=False, include_size=False)
|
||||||
|
|
|
||||||
0
cognee/api/v1/topology/__init__.py
Normal file
0
cognee/api/v1/topology/__init__.py
Normal file
91
cognee/api/v1/topology/add_topology.py
Normal file
91
cognee/api/v1/topology/add_topology.py
Normal file
|
|
@ -0,0 +1,91 @@
|
||||||
|
from typing import List, Dict, Any, Union, Optional
|
||||||
|
|
||||||
|
from cognee.infrastructure import infrastructure_config
|
||||||
|
from cognee.infrastructure.databases.graph.get_graph_client import get_graph_client
|
||||||
|
|
||||||
|
from cognee.modules.topology.topology import TopologyEngine, GitHubRepositoryModel
|
||||||
|
import pandas as pd
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
USER_ID = "default_user"
|
||||||
|
async def add_topology(directory="example", model=GitHubRepositoryModel):
|
||||||
|
graph_db_type = infrastructure_config.get_config()["graph_engine"]
|
||||||
|
|
||||||
|
graph_client = await get_graph_client(graph_db_type)
|
||||||
|
|
||||||
|
graph_topology = infrastructure_config.get_config()["graph_topology"]
|
||||||
|
|
||||||
|
engine = TopologyEngine()
|
||||||
|
topology = await engine.infer_from_directory_structure(node_id =USER_ID , repository = directory, model=model)
|
||||||
|
|
||||||
|
def flatten_model(model: BaseModel, parent_id: Optional[str] = None) -> Dict[str, Any]:
|
||||||
|
"""Flatten a single Pydantic model to a dictionary handling nested structures."""
|
||||||
|
result = {**model.dict(), 'parent_id': parent_id}
|
||||||
|
if hasattr(model, 'default_relationship') and model.default_relationship:
|
||||||
|
result.update({
|
||||||
|
'relationship_type': model.default_relationship.type,
|
||||||
|
'relationship_source': model.default_relationship.source,
|
||||||
|
'relationship_target': model.default_relationship.target
|
||||||
|
})
|
||||||
|
return result
|
||||||
|
|
||||||
|
def recursive_flatten(items: Union[List[Any], BaseModel], parent_id: Optional[str] = None) -> List[Dict[str, Any]]:
|
||||||
|
"""Recursively flatten nested Pydantic models or lists of models."""
|
||||||
|
if isinstance(items, list):
|
||||||
|
return [entry for item in items for entry in recursive_flatten(item, parent_id)]
|
||||||
|
elif isinstance(items, BaseModel):
|
||||||
|
flat = [flatten_model(items, parent_id)]
|
||||||
|
for field, value in items:
|
||||||
|
if isinstance(value, (BaseModel, list)):
|
||||||
|
flat.extend(recursive_flatten(value, items.dict().get('node_id', None)))
|
||||||
|
return flat
|
||||||
|
else:
|
||||||
|
return []
|
||||||
|
|
||||||
|
def flatten_repository(repo_model):
|
||||||
|
""" Flatten the entire repository model, starting with the top-level model """
|
||||||
|
return recursive_flatten(repo_model)
|
||||||
|
|
||||||
|
flt_topology = flatten_repository(topology)
|
||||||
|
|
||||||
|
df =pd.DataFrame(flt_topology)
|
||||||
|
|
||||||
|
print(df.head(10))
|
||||||
|
|
||||||
|
|
||||||
|
for _, row in df.iterrows():
|
||||||
|
node_data = row.to_dict()
|
||||||
|
node_id = node_data.pop('node_id')
|
||||||
|
|
||||||
|
# Remove 'node_id' and get its value
|
||||||
|
await graph_client.add_node(node_id, node_data)
|
||||||
|
if pd.notna(row['relationship_source']) and pd.notna(row['relationship_target']):
|
||||||
|
await graph_client.add_edge(row['relationship_source'], row['relationship_target'], relationship_name=row['relationship_type'])
|
||||||
|
|
||||||
|
return graph_client.graph
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
async def test():
|
||||||
|
# await prune.prune_system()
|
||||||
|
# #
|
||||||
|
# from cognee.api.v1.add import add
|
||||||
|
# data_directory_path = os.path.abspath("../../../.data")
|
||||||
|
# # print(data_directory_path)
|
||||||
|
# # config.data_root_directory(data_directory_path)
|
||||||
|
# # cognee_directory_path = os.path.abspath("../.cognee_system")
|
||||||
|
# # config.system_root_directory(cognee_directory_path)
|
||||||
|
#
|
||||||
|
# await add("data://" +data_directory_path, "example")
|
||||||
|
|
||||||
|
graph = await add_topology()
|
||||||
|
|
||||||
|
graph_db_type = infrastructure_config.get_config()["graph_engine"]
|
||||||
|
|
||||||
|
graph_client = await get_graph_client(graph_db_type)
|
||||||
|
#
|
||||||
|
from cognee.utils import render_graph
|
||||||
|
|
||||||
|
await render_graph(graph_client.graph, include_color=True, include_nodes=False, include_size=False)
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
asyncio.run(test())
|
||||||
|
|
@ -52,6 +52,7 @@ class DefaultChunkEngine():
|
||||||
elif chunk_strategy == ChunkStrategy.EXACT:
|
elif chunk_strategy == ChunkStrategy.EXACT:
|
||||||
chunked_data = DefaultChunkEngine.chunk_data_exact(source_data, chunk_size, chunk_overlap)
|
chunked_data = DefaultChunkEngine.chunk_data_exact(source_data, chunk_size, chunk_overlap)
|
||||||
|
|
||||||
|
|
||||||
return chunked_data
|
return chunked_data
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,3 +1,4 @@
|
||||||
|
import asyncio
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
import instructor
|
import instructor
|
||||||
|
|
@ -8,8 +9,8 @@ from openai import AsyncOpenAI
|
||||||
|
|
||||||
from cognee.config import Config
|
from cognee.config import Config
|
||||||
from cognee.root_dir import get_absolute_path
|
from cognee.root_dir import get_absolute_path
|
||||||
from .EmbeddingEngine import EmbeddingEngine
|
from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine
|
||||||
|
from litellm import aembedding
|
||||||
config = Config()
|
config = Config()
|
||||||
config.load()
|
config.load()
|
||||||
|
|
||||||
|
|
@ -27,16 +28,28 @@ class DefaultEmbeddingEngine(EmbeddingEngine):
|
||||||
class OpenAIEmbeddingEngine(EmbeddingEngine):
|
class OpenAIEmbeddingEngine(EmbeddingEngine):
|
||||||
async def embed_text(self, text: List[str]) -> List[float]:
|
async def embed_text(self, text: List[str]) -> List[float]:
|
||||||
|
|
||||||
OPENAI_API_KEY = config.openai_key
|
|
||||||
|
|
||||||
aclient = instructor.patch(AsyncOpenAI())
|
response = await aembedding(config.openai_embedding_model, input=text)
|
||||||
text = text.replace("\n", " ")
|
|
||||||
response = await aclient.embeddings.create(input = text, model = config.openai_embedding_model)
|
|
||||||
embedding = response.data[0].embedding
|
# embedding = response.data[0].embedding
|
||||||
# embeddings_list = list(map(lambda embedding: embedding.tolist(), embedding_model.embed(text)))
|
# embeddings_list = list(map(lambda embedding: embedding.tolist(), embedding_model.embed(text)))
|
||||||
return embedding
|
return response.data[0]['embedding']
|
||||||
|
|
||||||
|
|
||||||
def get_vector_size(self) -> int:
|
def get_vector_size(self) -> int:
|
||||||
return config.openai_embedding_dimensions
|
return config.openai_embedding_dimensions
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
async def gg():
|
||||||
|
openai_embedding_engine = OpenAIEmbeddingEngine()
|
||||||
|
# print(openai_embedding_engine.embed_text(["Hello, how are you?"]))
|
||||||
|
# print(openai_embedding_engine.get_vector_size())
|
||||||
|
# default_embedding_engine = DefaultEmbeddingEngine()
|
||||||
|
sds = await openai_embedding_engine.embed_text(["Hello, sadasdas are you?"])
|
||||||
|
print(sds)
|
||||||
|
# print(default_embedding_engine.get_vector_size())
|
||||||
|
|
||||||
|
asyncio.run(gg())
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1 +1,4 @@
|
||||||
You are a topology master and need to extract the following topology information from the text provided to you
|
You are a topology master and need to extract the following topology information from the text provided to you.
|
||||||
|
Relationship part'S can't be empty, and have to be logical AND CONNECTING ELEMENTS OF THE TOPOLOGY
|
||||||
|
The source is the parent of the target. And the target is the child of the source.
|
||||||
|
Have in mind this model needs to become a graph later.
|
||||||
|
|
@ -37,24 +37,24 @@ async def add_label_nodes(graph_client, parent_node_id: str, chunk_id: str, keyw
|
||||||
])
|
])
|
||||||
|
|
||||||
# Add data to vector
|
# Add data to vector
|
||||||
keyword_data_points = [
|
# keyword_data_points = [
|
||||||
DataPoint(
|
# DataPoint(
|
||||||
id = str(uuid4()),
|
# id = str(uuid4()),
|
||||||
payload = dict(
|
# payload = dict(
|
||||||
value = keyword_data["keyword"],
|
# value = keyword_data["keyword"],
|
||||||
references = dict(
|
# references = dict(
|
||||||
node_id = keyword_node_id,
|
# node_id = keyword_node_id,
|
||||||
cognitive_layer = parent_node_id,
|
# cognitive_layer = parent_node_id,
|
||||||
),
|
# ),
|
||||||
),
|
# ),
|
||||||
embed_field = "value"
|
# embed_field = "value"
|
||||||
) for (keyword_node_id, keyword_data) in keyword_nodes
|
# ) for (keyword_node_id, keyword_data) in keyword_nodes
|
||||||
]
|
# ]
|
||||||
|
#
|
||||||
try:
|
# try:
|
||||||
await vector_client.create_collection(parent_node_id)
|
# await vector_client.create_collection(parent_node_id)
|
||||||
except Exception:
|
# except Exception:
|
||||||
# It's ok if the collection already exists.
|
# # It's ok if the collection already exists.
|
||||||
pass
|
# pass
|
||||||
|
#
|
||||||
await vector_client.create_data_points(parent_node_id, keyword_data_points)
|
# await vector_client.create_data_points(parent_node_id, keyword_data_points)
|
||||||
|
|
|
||||||
|
|
@ -40,12 +40,20 @@ async def add_node(client, parent_id: Optional[str], node_id: str, node_data: di
|
||||||
if node_id != "Relationship_default":
|
if node_id != "Relationship_default":
|
||||||
try:
|
try:
|
||||||
# Attempt to add the node to the graph database
|
# Attempt to add the node to the graph database
|
||||||
|
# print('NODE ID', node_id)
|
||||||
|
# print('NODE DATA', node_data)
|
||||||
result = await client.add_node(node_id, node_properties = node_data)
|
result = await client.add_node(node_id, node_properties = node_data)
|
||||||
|
print("added node", result)
|
||||||
|
|
||||||
# Add an edge if a parent ID is provided and the graph engine is NETWORKX
|
# Add an edge if a parent ID is provided and the graph engine is NETWORKX
|
||||||
if parent_id and "default_relationship" in node_data and infrastructure_config.get_config()["graph_engine"] == GraphDBType.NETWORKX:
|
if parent_id and "default_relationship" in node_data and infrastructure_config.get_config()["graph_engine"] == GraphDBType.NETWORKX:
|
||||||
print("Node id", node_id)
|
|
||||||
await client.add_edge(parent_id, node_id, relationship_name = node_data["default_relationship"]["type"], edge_properties = node_data)
|
try:
|
||||||
|
await client.add_edge(parent_id, node_id, relationship_name = node_data["default_relationship"]["type"], edge_properties = node_data)
|
||||||
|
print("added edge")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error adding edge: {e}")
|
||||||
|
pass
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# Log the exception; consider a logging framework for production use
|
# Log the exception; consider a logging framework for production use
|
||||||
print(f"Error adding node or edge: {e}")
|
print(f"Error adding node or edge: {e}")
|
||||||
|
|
@ -139,39 +147,67 @@ async def add_edge(client, parent_id: Optional[str], node_id: str, node_data: di
|
||||||
await client.add_edge(relationship_details['source'], relationship_details['target'], relationship_name = relationship_details['type'])
|
await client.add_edge(relationship_details['source'], relationship_details['target'], relationship_name = relationship_details['type'])
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
async def process_attribute(graph_client, parent_id: Optional[str], attribute: str, value: Any, created_node_ids=None):
|
async def process_attribute(graph_client, parent_id: Optional[str], attribute: str, value: Any, created_node_ids=None):
|
||||||
|
|
||||||
if created_node_ids is None:
|
if created_node_ids is None:
|
||||||
created_node_ids = []
|
created_node_ids = []
|
||||||
|
|
||||||
if isinstance(value, BaseModel):
|
if isinstance(value, BaseModel):
|
||||||
node_id = await generate_node_id(value)
|
node_id = await generate_node_id(value)
|
||||||
node_data = value.model_dump()
|
node_data = value.model_dump()
|
||||||
|
|
||||||
# Use the specified default relationship for the edge between the parent node and the current node
|
# Add node to the graph
|
||||||
|
|
||||||
created_node_id = await add_node(graph_client, parent_id, node_id, node_data)
|
created_node_id = await add_node(graph_client, parent_id, node_id, node_data)
|
||||||
|
created_node_ids.append(node_id)
|
||||||
|
|
||||||
created_node_ids.append(created_node_id)
|
# Add an edge if a default_relationship exists
|
||||||
|
if hasattr(value, 'default_relationship') and value.default_relationship:
|
||||||
# await add_edge(graph_client, parent_id, node_id, node_data, relationship_data,created_node_ids)
|
await add_edge(graph_client, parent_id, node_id, value.default_relationship.dict())
|
||||||
|
|
||||||
# Recursively process nested attributes to ensure all nodes and relationships are added to the graph
|
# Recursively process nested attributes to ensure all nodes and relationships are added to the graph
|
||||||
for sub_attr, sub_val in value.__dict__.items(): # Access attributes and their values directly
|
for sub_attr, sub_val in value.dict().items():
|
||||||
|
if isinstance(sub_val, (BaseModel, list)): # Check if the value is a model or list
|
||||||
out = await process_attribute(graph_client, node_id, sub_attr, sub_val)
|
await process_attribute(graph_client, node_id, sub_attr, sub_val, created_node_ids)
|
||||||
|
|
||||||
created_node_ids.extend(out)
|
|
||||||
|
|
||||||
elif isinstance(value, list) and all(isinstance(item, BaseModel) for item in value):
|
elif isinstance(value, list) and all(isinstance(item, BaseModel) for item in value):
|
||||||
# For lists of BaseModel instances, process each item in the list
|
# For lists of BaseModel instances, process each item in the list
|
||||||
for item in value:
|
for item in value:
|
||||||
out = await process_attribute(graph_client, parent_id, attribute, item, created_node_ids)
|
await process_attribute(graph_client, parent_id, attribute, item, created_node_ids)
|
||||||
created_node_ids.extend(out)
|
|
||||||
|
|
||||||
return created_node_ids
|
return created_node_ids
|
||||||
|
|
||||||
|
# async def process_attribute(graph_client, parent_id: Optional[str], attribute: str, value: Any, created_node_ids=None):
|
||||||
|
#
|
||||||
|
# if created_node_ids is None:
|
||||||
|
# created_node_ids = []
|
||||||
|
# if isinstance(value, BaseModel):
|
||||||
|
# node_id = await generate_node_id(value)
|
||||||
|
# node_data = value.model_dump()
|
||||||
|
#
|
||||||
|
# # Use the specified default relationship for the edge between the parent node and the current node
|
||||||
|
#
|
||||||
|
# created_node_id = await add_node(graph_client, parent_id, node_id, node_data)
|
||||||
|
#
|
||||||
|
# created_node_ids.append(created_node_id)
|
||||||
|
#
|
||||||
|
# # await add_edge(graph_client, parent_id, node_id, node_data, relationship_data,created_node_ids)
|
||||||
|
#
|
||||||
|
# # Recursively process nested attributes to ensure all nodes and relationships are added to the graph
|
||||||
|
# # for sub_attr, sub_val in value.__dict__.items(): # Access attributes and their values directly
|
||||||
|
# # print('SUB ATTR', sub_attr)
|
||||||
|
# # print('SUB VAL', sub_val)
|
||||||
|
# #
|
||||||
|
# # out = await process_attribute(graph_client, node_id, sub_attr, sub_val)
|
||||||
|
# #
|
||||||
|
# # created_node_ids.extend(out)
|
||||||
|
#
|
||||||
|
# elif isinstance(value, list) and all(isinstance(item, BaseModel) for item in value):
|
||||||
|
# # For lists of BaseModel instances, process each item in the list
|
||||||
|
# for item in value:
|
||||||
|
# out = await process_attribute(graph_client, parent_id, attribute, item, created_node_ids)
|
||||||
|
# created_node_ids.extend(out)
|
||||||
|
#
|
||||||
|
# return created_node_ids
|
||||||
|
|
||||||
|
|
||||||
async def process_attribute_edge(graph_client, parent_id: Optional[str], attribute: str, value: Any, created_node_ids=None):
|
async def process_attribute_edge(graph_client, parent_id: Optional[str], attribute: str, value: Any, created_node_ids=None):
|
||||||
|
|
||||||
|
|
@ -212,20 +248,22 @@ async def create_dynamic(graph_model, graph_client) :
|
||||||
|
|
||||||
created_node_ids.append(out)
|
created_node_ids.append(out)
|
||||||
|
|
||||||
|
print('CREATED NODE IDS', created_node_ids)
|
||||||
|
|
||||||
for attribute_name, attribute_value in graph_model:
|
for attribute_name, attribute_value in graph_model:
|
||||||
ids = await process_attribute(graph_client, root_id, attribute_name, attribute_value)
|
ids = await process_attribute(graph_client, root_id, attribute_name, attribute_value)
|
||||||
created_node_ids.extend(ids)
|
# created_node_ids.extend(ids)
|
||||||
|
#
|
||||||
flattened_and_deduplicated = list({
|
# flattened_and_deduplicated = list({
|
||||||
item["nodeId"]: item
|
# item["nodeId"]: item
|
||||||
# Use the 'nodeId' as the unique key in the dictionary comprehension
|
# # Use the 'nodeId' as the unique key in the dictionary comprehension
|
||||||
for sublist in created_node_ids if sublist # Ensure sublist is not None
|
# for sublist in created_node_ids if sublist # Ensure sublist is not None
|
||||||
for item in sublist # Iterate over items in the sublist
|
# for item in sublist # Iterate over items in the sublist
|
||||||
}.values())
|
# }.values())
|
||||||
|
#
|
||||||
for attribute_name, attribute_value in graph_model:
|
# for attribute_name, attribute_value in graph_model:
|
||||||
ids = await process_attribute_edge(graph_client, root_id, attribute_name, attribute_value, flattened_and_deduplicated)
|
# ids = await process_attribute_edge(graph_client, root_id, attribute_name, attribute_value, flattened_and_deduplicated)
|
||||||
|
#
|
||||||
return graph_client
|
return graph_client
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,43 +0,0 @@
|
||||||
from datetime import datetime
|
|
||||||
from cognee.shared.data_models import DefaultGraphModel, Relationship, UserProperties, UserLocation
|
|
||||||
from cognee.modules.cognify.graph.create import create_semantic_graph
|
|
||||||
|
|
||||||
async def initialize_graph(root_id: str, graphdatamodel=None, graph_client=None):
|
|
||||||
if graphdatamodel:
|
|
||||||
graph = graphdatamodel(node_id= root_id)
|
|
||||||
graph_ = await create_semantic_graph(graph, graph_client)
|
|
||||||
return graph_
|
|
||||||
else:
|
|
||||||
print("Creating default graph")
|
|
||||||
|
|
||||||
graph = DefaultGraphModel(
|
|
||||||
node_id = root_id,
|
|
||||||
user_properties = UserProperties(
|
|
||||||
custom_properties = { "age": "30" },
|
|
||||||
location = UserLocation(
|
|
||||||
location_id = "ny",
|
|
||||||
description = "New York",
|
|
||||||
default_relationship = Relationship(
|
|
||||||
type = "located_in",
|
|
||||||
source = "UserProperties",
|
|
||||||
target = "ny",
|
|
||||||
)
|
|
||||||
),
|
|
||||||
default_relationship = Relationship(
|
|
||||||
type = "has_properties",
|
|
||||||
source = root_id,
|
|
||||||
target = "UserProperties",
|
|
||||||
)
|
|
||||||
),
|
|
||||||
default_relationship = Relationship(
|
|
||||||
type = "has_properties",
|
|
||||||
source = root_id,
|
|
||||||
target = "UserProperties"
|
|
||||||
),
|
|
||||||
default_fields = {
|
|
||||||
"created_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
|
||||||
"updated_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
return await create_semantic_graph(graph, graph_client)
|
|
||||||
|
|
@ -10,7 +10,7 @@ from datetime import datetime
|
||||||
|
|
||||||
from cognee import config
|
from cognee import config
|
||||||
from cognee.infrastructure import infrastructure_config
|
from cognee.infrastructure import infrastructure_config
|
||||||
from infer_data_topology import infer_data_topology
|
from cognee.modules.topology.infer_data_topology import infer_data_topology
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -31,66 +31,124 @@ from infer_data_topology import infer_data_topology
|
||||||
# default_relationship: Relationship = Relationship(type = "has_properties")
|
# default_relationship: Relationship = Relationship(type = "has_properties")
|
||||||
#
|
#
|
||||||
class Relationship(BaseModel):
|
class Relationship(BaseModel):
|
||||||
type: str
|
type: str = Field(..., description="The type of relationship, e.g., 'belongs_to'.")
|
||||||
source: Optional[str] = None
|
source: Optional[str] = Field(None, description="The identifier of the source id of in the relationship being a directory or subdirectory")
|
||||||
target: Optional[str] = None
|
target: Optional[str] = Field(None, description="The identifier of the target id in the relationship being the directory, subdirectory or file")
|
||||||
properties: Optional[Dict[str, Any]] = None
|
properties: Optional[Dict[str, Any]] = Field(None, description="A dictionary of additional properties and values related to the relationship.")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class Document(BaseModel):
|
class Document(BaseModel):
|
||||||
id: str
|
node_id: str
|
||||||
title: str
|
title: str
|
||||||
description: Optional[str] = None
|
description: Optional[str] = None
|
||||||
default_relationship: Relationship = Field(default_factory=lambda: Relationship(type="belongs_to"))
|
default_relationship: Relationship
|
||||||
|
|
||||||
|
|
||||||
class DirectoryModel(BaseModel):
|
class DirectoryModel(BaseModel):
|
||||||
name: str
|
node_id: str
|
||||||
path: str
|
path: str
|
||||||
summary: str
|
summary: str
|
||||||
documents: List[Document] = []
|
documents: List[Document] = []
|
||||||
subdirectories: List['DirectoryModel'] = []
|
subdirectories: List['DirectoryModel'] = []
|
||||||
default_relationship: Relationship = Field(default_factory=lambda: Relationship(type="belongs_to"))
|
default_relationship: Relationship
|
||||||
|
|
||||||
DirectoryModel.update_forward_refs()
|
DirectoryModel.update_forward_refs()
|
||||||
|
|
||||||
class RepositoryMetadata(BaseModel):
|
class DirMetadata(BaseModel):
|
||||||
name: str
|
node_id: str
|
||||||
summary: str
|
summary: str
|
||||||
owner: str
|
owner: str
|
||||||
description: Optional[str] = None
|
description: Optional[str] = None
|
||||||
directories: List[DirectoryModel] = []
|
directories: List[DirectoryModel] = []
|
||||||
documents: List[Document] = []
|
documents: List[Document] = []
|
||||||
default_relationship: Relationship = Field(default_factory=lambda: Relationship(type="belongs_to"))
|
default_relationship: Relationship
|
||||||
|
|
||||||
class GitHubRepositoryModel(BaseModel):
|
class GitHubRepositoryModel(BaseModel):
|
||||||
metadata: RepositoryMetadata
|
node_id: str
|
||||||
|
metadata: DirMetadata
|
||||||
root_directory: DirectoryModel
|
root_directory: DirectoryModel
|
||||||
|
|
||||||
|
|
||||||
class TopologyEngine:
|
class TopologyEngine:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.models: Dict[str, Type[BaseModel]] = {}
|
self.models: Dict[str, Type[BaseModel]] = {}
|
||||||
|
|
||||||
async def infer(self, repository: str):
|
async def populate_model(self, directory_path, file_structure, parent_id=None):
|
||||||
|
directory_id = os.path.basename(directory_path) or "root"
|
||||||
|
directory = DirectoryModel(
|
||||||
|
node_id=directory_id,
|
||||||
|
path=directory_path,
|
||||||
|
summary=f"Contents of {directory_id}",
|
||||||
|
default_relationship=Relationship(type="contains", source=parent_id, target=directory_id)
|
||||||
|
)
|
||||||
|
|
||||||
|
for key, value in file_structure.items():
|
||||||
|
if isinstance(value, dict):
|
||||||
|
# Recurse into subdirectory
|
||||||
|
subdirectory_path = os.path.join(directory_path, key)
|
||||||
|
subdirectory = await self.populate_model(subdirectory_path, value, parent_id=directory_id)
|
||||||
|
directory.subdirectories.append(subdirectory)
|
||||||
|
elif isinstance(value, tuple) and value[0] == 'file':
|
||||||
|
# Handle file
|
||||||
|
document = Document(
|
||||||
|
node_id=key,
|
||||||
|
title=key,
|
||||||
|
default_relationship=Relationship(type="contained_by", source=key, target=directory_id)
|
||||||
|
)
|
||||||
|
directory.documents.append(document)
|
||||||
|
|
||||||
|
return directory
|
||||||
|
|
||||||
|
async def infer_from_directory_structure(self, node_id:str, repository: str, model):
|
||||||
|
""" Infer the topology of a repository from its file structure """
|
||||||
|
|
||||||
path = infrastructure_config.get_config()["data_root_directory"]
|
path = infrastructure_config.get_config()["data_root_directory"]
|
||||||
|
|
||||||
path = path +"/"+ str(repository)
|
path = path +"/"+ str(repository)
|
||||||
print(path)
|
print(path)
|
||||||
|
|
||||||
if not os.path.exists(path):
|
if not os.path.exists(path):
|
||||||
raise FileNotFoundError(f"No such directory: {path}")
|
raise FileNotFoundError(f"No such directory: {path}")
|
||||||
|
|
||||||
file_structure = {}
|
root = {}
|
||||||
for filename in glob.glob(f"{path}/**", recursive=True):
|
for filename in glob.glob(f"{path}/**", recursive=True):
|
||||||
|
parts = os.path.relpath(filename, start=path).split(os.path.sep)
|
||||||
|
current = root
|
||||||
|
for part in parts[:-1]: # Traverse/create to the last directory
|
||||||
|
if part not in current:
|
||||||
|
current[part] = {}
|
||||||
|
current = current[part]
|
||||||
|
last_part = parts[-1]
|
||||||
if os.path.isfile(filename):
|
if os.path.isfile(filename):
|
||||||
key = os.path.relpath(filename, start=path).replace(os.path.sep, "__")
|
current[last_part] = ("file", ...) # Placeholder for file content or metadata
|
||||||
file_structure[key] = (str, ...) # Assuming content as string for simplicity
|
elif os.path.isdir(filename):
|
||||||
|
if last_part not in current: # Only create a new directory entry if it doesn't exist
|
||||||
|
current[last_part] = {}
|
||||||
|
|
||||||
|
root_directory = await self.populate_model('/', root)
|
||||||
|
|
||||||
result = await infer_data_topology(str(file_structure), GitHubRepositoryModel)
|
# repository_metadata = await infer_data_topology(str(root), DirMetadata)
|
||||||
|
|
||||||
|
repository_metadata = DirMetadata(
|
||||||
|
node_id="repo1",
|
||||||
|
summary="Example repository",
|
||||||
|
owner="user1",
|
||||||
|
directories=[root_directory],
|
||||||
|
documents=[],
|
||||||
|
default_relationship=Relationship(type="contained_by", source="repo1", target=node_id)
|
||||||
|
)
|
||||||
|
|
||||||
|
active_model = GitHubRepositoryModel(
|
||||||
|
node_id=node_id,
|
||||||
|
metadata=repository_metadata,
|
||||||
|
root_directory=root_directory
|
||||||
|
)
|
||||||
|
|
||||||
|
return active_model
|
||||||
|
|
||||||
|
# print(github_repo_model)
|
||||||
|
|
||||||
return result
|
|
||||||
|
|
||||||
def load(self, model_name: str):
|
def load(self, model_name: str):
|
||||||
return self.models.get(model_name)
|
return self.models.get(model_name)
|
||||||
|
|
|
||||||
1429
poetry.lock
generated
1429
poetry.lock
generated
File diff suppressed because it is too large
Load diff
|
|
@ -63,8 +63,9 @@ tiktoken = "^0.6.0"
|
||||||
dspy-ai = "2.4.3"
|
dspy-ai = "2.4.3"
|
||||||
posthog = "^3.5.0"
|
posthog = "^3.5.0"
|
||||||
lancedb = "^0.6.10"
|
lancedb = "^0.6.10"
|
||||||
importlib-metadata = "6.1.0"
|
importlib-metadata = "6.8.0"
|
||||||
deepeval = "^0.21.36"
|
deepeval = "^0.21.36"
|
||||||
|
litellm = "^1.37.3"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue