working topology inference

This commit is contained in:
Vasilije 2024-05-11 23:10:00 +02:00
parent 01446deb6f
commit 7249c240d0
13 changed files with 1169 additions and 798 deletions

View file

@ -1,4 +1,5 @@
import asyncio import asyncio
import os
from uuid import uuid4 from uuid import uuid4
from typing import List, Union from typing import List, Union
import logging import logging
@ -6,7 +7,11 @@ import instructor
import nltk import nltk
from openai import OpenAI from openai import OpenAI
from nltk.corpus import stopwords from nltk.corpus import stopwords
from cognee.api.v1.prune import prune
from cognee.config import Config from cognee.config import Config
from cognee.infrastructure.data.chunking.LangchainChunkingEngine import LangchainChunkEngine
from cognee.infrastructure.databases.vector.embeddings.DefaultEmbeddingEngine import OpenAIEmbeddingEngine
from cognee.modules.cognify.graph.add_data_chunks import add_data_chunks from cognee.modules.cognify.graph.add_data_chunks import add_data_chunks
from cognee.modules.cognify.graph.add_document_node import add_document_node from cognee.modules.cognify.graph.add_document_node import add_document_node
from cognee.modules.cognify.graph.add_classification_nodes import add_classification_nodes from cognee.modules.cognify.graph.add_classification_nodes import add_classification_nodes
@ -14,7 +19,6 @@ from cognee.modules.cognify.graph.add_cognitive_layer_graphs import add_cognitiv
from cognee.modules.cognify.graph.add_summary_nodes import add_summary_nodes from cognee.modules.cognify.graph.add_summary_nodes import add_summary_nodes
from cognee.modules.cognify.graph.add_node_connections import group_nodes_by_layer, \ from cognee.modules.cognify.graph.add_node_connections import group_nodes_by_layer, \
graph_ready_output, connect_nodes_in_graph graph_ready_output, connect_nodes_in_graph
from cognee.modules.cognify.graph.initialize_graph import initialize_graph
from cognee.modules.cognify.llm.resolve_cross_graph_references import resolve_cross_graph_references from cognee.modules.cognify.llm.resolve_cross_graph_references import resolve_cross_graph_references
from cognee.infrastructure.databases.graph.get_graph_client import get_graph_client from cognee.infrastructure.databases.graph.get_graph_client import get_graph_client
from cognee.modules.cognify.graph.add_label_nodes import add_label_nodes from cognee.modules.cognify.graph.add_label_nodes import add_label_nodes
@ -27,6 +31,8 @@ from cognee.modules.data.get_content_categories import get_content_categories
from cognee.modules.data.get_content_summary import get_content_summary from cognee.modules.data.get_content_summary import get_content_summary
from cognee.modules.data.get_cognitive_layers import get_cognitive_layers from cognee.modules.data.get_cognitive_layers import get_cognitive_layers
from cognee.modules.data.get_layer_graphs import get_layer_graphs from cognee.modules.data.get_layer_graphs import get_layer_graphs
from cognee.modules.topology.topology import TopologyEngine
from cognee.shared.data_models import ChunkStrategy
from cognee.utils import send_telemetry from cognee.utils import send_telemetry
@ -69,27 +75,12 @@ async def cognify(datasets: Union[str, List[str]] = None):
dataset_files = [] dataset_files = []
# datasets is a dataset name string # datasets is a dataset name string
dataset_name = datasets.replace(".", "_").replace(" ", "_") dataset_name = datasets.replace(".", "_").replace(" ", "_")
for added_dataset in added_datasets: for added_dataset in added_datasets:
if dataset_name in added_dataset: if dataset_name in added_dataset:
dataset_files.append((added_dataset, db_engine.get_files_metadata(added_dataset))) dataset_files.append((added_dataset, db_engine.get_files_metadata(added_dataset)))
# print("dataset_files", dataset_files)
print(dataset_files)
# topology can be inferred, loaded, or extrapolated from the data in the end of the flow
# for code generation, we infer the topology from the folder structure as simple step
graph_topology = infrastructure_config.get_config()["graph_topology"]
await initialize_graph(USER_ID, graph_client=graph_client)
data_chunks = {} data_chunks = {}
@ -102,6 +93,8 @@ async def cognify(datasets: Union[str, List[str]] = None):
try: try:
file_type = guess_file_type(file) file_type = guess_file_type(file)
text = extract_text_from_file(file, file_type) text = extract_text_from_file(file, file_type)
if text is None:
text = ""
subchunks = chunk_engine.chunk_data(chunk_strategy, text, config.chunk_size, config.chunk_overlap) subchunks = chunk_engine.chunk_data(chunk_strategy, text, config.chunk_size, config.chunk_overlap)
if dataset_name not in data_chunks: if dataset_name not in data_chunks:
@ -125,22 +118,25 @@ async def process_text(chunk_collection: str, chunk_id: str, input_text: str, fi
graph_client = await get_graph_client(infrastructure_config.get_config()["graph_engine"]) graph_client = await get_graph_client(infrastructure_config.get_config()["graph_engine"])
graph_topology = infrastructure_config.get_config()["graph_topology"]
document_id = await add_document_node( document_id = await add_document_node(
graph_client, graph_client,
parent_node_id = f"DefaultGraphModel__{USER_ID}", #make a param of defaultgraph model to make sure when user passes his stuff, it doesn't break pipeline parent_node_id = f"{file_metadata['name']}.{file_metadata['extension']}", #make a param of defaultgraph model to make sure when user passes his stuff, it doesn't break pipeline
document_metadata = file_metadata, document_metadata = file_metadata,
) )
#
await add_label_nodes(graph_client, document_id, chunk_id, file_metadata["keywords"].split("|")) await add_label_nodes(graph_client, document_id, chunk_id, file_metadata["keywords"].split("|"))
classified_categories = await get_content_categories(input_text) # classified_categories = await get_content_categories(input_text)
await add_classification_nodes( # await add_classification_nodes(
graph_client, # graph_client,
parent_node_id = document_id, # parent_node_id = document_id,
categories = classified_categories, # categories = classified_categories,
) # )
print(f"Chunk ({chunk_id}) classified.") # print(f"Chunk ({chunk_id}) classified.")
print("document_id", document_id) print("document_id", document_id)
@ -149,53 +145,61 @@ async def process_text(chunk_collection: str, chunk_id: str, input_text: str, fi
print(f"Chunk ({chunk_id}) summarized.") print(f"Chunk ({chunk_id}) summarized.")
cognitive_layers = await get_cognitive_layers(input_text, classified_categories) # cognitive_layers = await get_cognitive_layers(input_text, classified_categories)
cognitive_layers = (await add_cognitive_layers(graph_client, document_id, cognitive_layers))[:2] # cognitive_layers = (await add_cognitive_layers(graph_client, document_id, cognitive_layers))[:2]
#
layer_graphs = await get_layer_graphs(input_text, cognitive_layers) # layer_graphs = await get_layer_graphs(input_text, cognitive_layers)
await add_cognitive_layer_graphs(graph_client, chunk_collection, chunk_id, layer_graphs) # await add_cognitive_layer_graphs(graph_client, chunk_collection, chunk_id, layer_graphs)
#
if infrastructure_config.get_config()["connect_documents"] is True: # if infrastructure_config.get_config()["connect_documents"] is True:
db_engine = infrastructure_config.get_config()["database_engine"] # db_engine = infrastructure_config.get_config()["database_engine"]
relevant_documents_to_connect = db_engine.fetch_cognify_data(excluded_document_id = file_metadata["id"]) # relevant_documents_to_connect = db_engine.fetch_cognify_data(excluded_document_id = file_metadata["id"])
#
list_of_nodes = [] # list_of_nodes = []
#
relevant_documents_to_connect.append({ # relevant_documents_to_connect.append({
"layer_id": document_id, # "layer_id": document_id,
}) # })
#
for document in relevant_documents_to_connect: # for document in relevant_documents_to_connect:
node_descriptions_to_match = await graph_client.extract_node_description(document["layer_id"]) # node_descriptions_to_match = await graph_client.extract_node_description(document["layer_id"])
list_of_nodes.extend(node_descriptions_to_match) # list_of_nodes.extend(node_descriptions_to_match)
#
nodes_by_layer = await group_nodes_by_layer(list_of_nodes) # nodes_by_layer = await group_nodes_by_layer(list_of_nodes)
#
results = await resolve_cross_graph_references(nodes_by_layer) # results = await resolve_cross_graph_references(nodes_by_layer)
#
relationships = graph_ready_output(results) # relationships = graph_ready_output(results)
#
await connect_nodes_in_graph( # await connect_nodes_in_graph(
graph_client, # graph_client,
relationships, # relationships,
score_threshold = infrastructure_config.get_config()["intra_layer_score_treshold"] # score_threshold = infrastructure_config.get_config()["intra_layer_score_treshold"]
) # )
#
send_telemetry("cognee.cognify") # send_telemetry("cognee.cognify")
#
print(f"Chunk ({chunk_id}) cognified.") # print(f"Chunk ({chunk_id}) cognified.")
if __name__ == "__main__": if __name__ == "__main__":
async def test(): async def test():
# await prune.prune_system()
# #
# from cognee.api.v1.add import add
# data_directory_path = os.path.abspath("../../../.data")
# # print(data_directory_path)
# # config.data_root_directory(data_directory_path)
# # cognee_directory_path = os.path.abspath("../.cognee_system")
# # config.system_root_directory(cognee_directory_path)
# #
from cognee.api.v1.add import add # await add("data://" +data_directory_path, "example")
await add(["A large language model (LLM) is a language model notable for its ability to achieve general-purpose language generation and other natural language processing tasks such as classification"], "code") infrastructure_config.set_config( {"chunk_engine": LangchainChunkEngine() , "chunk_strategy": ChunkStrategy.CODE,'embedding_engine': OpenAIEmbeddingEngine()})
graph = await cognify() graph = await cognify()
#
from cognee.utils import render_graph from cognee.utils import render_graph
await render_graph(graph, include_color=True, include_nodes=False, include_size=False) await render_graph(graph, include_color=True, include_nodes=False, include_size=False)

View file

View file

@ -0,0 +1,91 @@
from typing import List, Dict, Any, Union, Optional
from cognee.infrastructure import infrastructure_config
from cognee.infrastructure.databases.graph.get_graph_client import get_graph_client
from cognee.modules.topology.topology import TopologyEngine, GitHubRepositoryModel
import pandas as pd
from pydantic import BaseModel
USER_ID = "default_user"
async def add_topology(directory="example", model=GitHubRepositoryModel):
    """Infer the topology of a data directory and load it into the graph store.

    Runs the TopologyEngine over *directory*, flattens the resulting nested
    pydantic model into rows, and writes one graph node per row plus an edge
    for every row that carries relationship information.

    Args:
        directory: Dataset/repository name under the configured data root.
        model: Pydantic model class describing the expected topology shape.

    Returns:
        The underlying graph object held by the graph client.
    """
    graph_db_type = infrastructure_config.get_config()["graph_engine"]
    graph_client = await get_graph_client(graph_db_type)

    engine = TopologyEngine()
    topology = await engine.infer_from_directory_structure(
        node_id=USER_ID, repository=directory, model=model
    )

    def flatten_model(model: BaseModel, parent_id: Optional[str] = None) -> Dict[str, Any]:
        """Flatten a single pydantic model to a dict, lifting its default relationship."""
        result = {**model.dict(), "parent_id": parent_id}
        if hasattr(model, "default_relationship") and model.default_relationship:
            result.update({
                "relationship_type": model.default_relationship.type,
                "relationship_source": model.default_relationship.source,
                "relationship_target": model.default_relationship.target,
            })
        return result

    def recursive_flatten(items: Union[List[Any], BaseModel], parent_id: Optional[str] = None) -> List[Dict[str, Any]]:
        """Recursively flatten nested pydantic models or lists of models."""
        if isinstance(items, list):
            return [entry for item in items for entry in recursive_flatten(item, parent_id)]
        elif isinstance(items, BaseModel):
            flat = [flatten_model(items, parent_id)]
            # Descend into nested models/lists, parenting them to this node's id.
            for field, value in items:
                if isinstance(value, (BaseModel, list)):
                    flat.extend(recursive_flatten(value, items.dict().get("node_id", None)))
            return flat
        else:
            return []

    flt_topology = recursive_flatten(topology)
    df = pd.DataFrame(flt_topology)

    for _, row in df.iterrows():
        node_data = row.to_dict()
        # Fix: rows flattened from bare Relationship models have no 'node_id'
        # column; pop() without a default raised KeyError here. Skip such rows.
        node_id = node_data.pop("node_id", None)
        if node_id is None or (isinstance(node_id, float) and pd.isna(node_id)):
            continue
        await graph_client.add_node(node_id, node_data)
        # Fix: guard against the relationship columns being absent entirely
        # (KeyError) in addition to being NaN for this particular row.
        if (
            "relationship_source" in row
            and pd.notna(row["relationship_source"])
            and pd.notna(row["relationship_target"])
        ):
            await graph_client.add_edge(
                row["relationship_source"],
                row["relationship_target"],
                relationship_name=row["relationship_type"],
            )

    return graph_client.graph
if __name__ == "__main__":
    import asyncio

    async def test():
        """Smoke-test entry point: build the topology graph, then render it."""
        graph = await add_topology()

        graph_db_type = infrastructure_config.get_config()["graph_engine"]
        graph_client = await get_graph_client(graph_db_type)

        # Imported lazily so rendering deps are only needed when run as a script.
        from cognee.utils import render_graph
        await render_graph(graph_client.graph, include_color=True, include_nodes=False, include_size=False)

    asyncio.run(test())

View file

@ -52,6 +52,7 @@ class DefaultChunkEngine():
elif chunk_strategy == ChunkStrategy.EXACT: elif chunk_strategy == ChunkStrategy.EXACT:
chunked_data = DefaultChunkEngine.chunk_data_exact(source_data, chunk_size, chunk_overlap) chunked_data = DefaultChunkEngine.chunk_data_exact(source_data, chunk_size, chunk_overlap)
return chunked_data return chunked_data

View file

@ -1,3 +1,4 @@
import asyncio
from typing import List from typing import List
import instructor import instructor
@ -8,8 +9,8 @@ from openai import AsyncOpenAI
from cognee.config import Config from cognee.config import Config
from cognee.root_dir import get_absolute_path from cognee.root_dir import get_absolute_path
from .EmbeddingEngine import EmbeddingEngine from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine
from litellm import aembedding
config = Config() config = Config()
config.load() config.load()
@ -27,16 +28,28 @@ class DefaultEmbeddingEngine(EmbeddingEngine):
class OpenAIEmbeddingEngine(EmbeddingEngine): class OpenAIEmbeddingEngine(EmbeddingEngine):
async def embed_text(self, text: List[str]) -> List[float]: async def embed_text(self, text: List[str]) -> List[float]:
OPENAI_API_KEY = config.openai_key
aclient = instructor.patch(AsyncOpenAI()) response = await aembedding(config.openai_embedding_model, input=text)
text = text.replace("\n", " ")
response = await aclient.embeddings.create(input = text, model = config.openai_embedding_model)
embedding = response.data[0].embedding # embedding = response.data[0].embedding
# embeddings_list = list(map(lambda embedding: embedding.tolist(), embedding_model.embed(text))) # embeddings_list = list(map(lambda embedding: embedding.tolist(), embedding_model.embed(text)))
return embedding return response.data[0]['embedding']
def get_vector_size(self) -> int: def get_vector_size(self) -> int:
return config.openai_embedding_dimensions return config.openai_embedding_dimensions
if __name__ == "__main__":
async def gg():
openai_embedding_engine = OpenAIEmbeddingEngine()
# print(openai_embedding_engine.embed_text(["Hello, how are you?"]))
# print(openai_embedding_engine.get_vector_size())
# default_embedding_engine = DefaultEmbeddingEngine()
sds = await openai_embedding_engine.embed_text(["Hello, sadasdas are you?"])
print(sds)
# print(default_embedding_engine.get_vector_size())
asyncio.run(gg())

View file

@ -1 +1,4 @@
You are a topology master and need to extract the following topology information from the text provided to you You are a topology master and need to extract the following topology information from the text provided to you.
Relationship part'S can't be empty, and have to be logical AND CONNECTING ELEMENTS OF THE TOPOLOGY
The source is the parent of the target. And the target is the child of the source.
Have in mind this model needs to become a graph later.

View file

@ -37,24 +37,24 @@ async def add_label_nodes(graph_client, parent_node_id: str, chunk_id: str, keyw
]) ])
# Add data to vector # Add data to vector
keyword_data_points = [ # keyword_data_points = [
DataPoint( # DataPoint(
id = str(uuid4()), # id = str(uuid4()),
payload = dict( # payload = dict(
value = keyword_data["keyword"], # value = keyword_data["keyword"],
references = dict( # references = dict(
node_id = keyword_node_id, # node_id = keyword_node_id,
cognitive_layer = parent_node_id, # cognitive_layer = parent_node_id,
), # ),
), # ),
embed_field = "value" # embed_field = "value"
) for (keyword_node_id, keyword_data) in keyword_nodes # ) for (keyword_node_id, keyword_data) in keyword_nodes
] # ]
#
try: # try:
await vector_client.create_collection(parent_node_id) # await vector_client.create_collection(parent_node_id)
except Exception: # except Exception:
# It's ok if the collection already exists. # # It's ok if the collection already exists.
pass # pass
#
await vector_client.create_data_points(parent_node_id, keyword_data_points) # await vector_client.create_data_points(parent_node_id, keyword_data_points)

View file

@ -40,12 +40,20 @@ async def add_node(client, parent_id: Optional[str], node_id: str, node_data: di
if node_id != "Relationship_default": if node_id != "Relationship_default":
try: try:
# Attempt to add the node to the graph database # Attempt to add the node to the graph database
# print('NODE ID', node_id)
# print('NODE DATA', node_data)
result = await client.add_node(node_id, node_properties = node_data) result = await client.add_node(node_id, node_properties = node_data)
print("added node", result)
# Add an edge if a parent ID is provided and the graph engine is NETWORKX # Add an edge if a parent ID is provided and the graph engine is NETWORKX
if parent_id and "default_relationship" in node_data and infrastructure_config.get_config()["graph_engine"] == GraphDBType.NETWORKX: if parent_id and "default_relationship" in node_data and infrastructure_config.get_config()["graph_engine"] == GraphDBType.NETWORKX:
print("Node id", node_id)
await client.add_edge(parent_id, node_id, relationship_name = node_data["default_relationship"]["type"], edge_properties = node_data) try:
await client.add_edge(parent_id, node_id, relationship_name = node_data["default_relationship"]["type"], edge_properties = node_data)
print("added edge")
except Exception as e:
print(f"Error adding edge: {e}")
pass
except Exception as e: except Exception as e:
# Log the exception; consider a logging framework for production use # Log the exception; consider a logging framework for production use
print(f"Error adding node or edge: {e}") print(f"Error adding node or edge: {e}")
@ -139,39 +147,67 @@ async def add_edge(client, parent_id: Optional[str], node_id: str, node_data: di
await client.add_edge(relationship_details['source'], relationship_details['target'], relationship_name = relationship_details['type']) await client.add_edge(relationship_details['source'], relationship_details['target'], relationship_name = relationship_details['type'])
async def process_attribute(graph_client, parent_id: Optional[str], attribute: str, value: Any, created_node_ids=None): async def process_attribute(graph_client, parent_id: Optional[str], attribute: str, value: Any, created_node_ids=None):
if created_node_ids is None: if created_node_ids is None:
created_node_ids = [] created_node_ids = []
if isinstance(value, BaseModel): if isinstance(value, BaseModel):
node_id = await generate_node_id(value) node_id = await generate_node_id(value)
node_data = value.model_dump() node_data = value.model_dump()
# Use the specified default relationship for the edge between the parent node and the current node # Add node to the graph
created_node_id = await add_node(graph_client, parent_id, node_id, node_data) created_node_id = await add_node(graph_client, parent_id, node_id, node_data)
created_node_ids.append(node_id)
created_node_ids.append(created_node_id) # Add an edge if a default_relationship exists
if hasattr(value, 'default_relationship') and value.default_relationship:
# await add_edge(graph_client, parent_id, node_id, node_data, relationship_data,created_node_ids) await add_edge(graph_client, parent_id, node_id, value.default_relationship.dict())
# Recursively process nested attributes to ensure all nodes and relationships are added to the graph # Recursively process nested attributes to ensure all nodes and relationships are added to the graph
for sub_attr, sub_val in value.__dict__.items(): # Access attributes and their values directly for sub_attr, sub_val in value.dict().items():
if isinstance(sub_val, (BaseModel, list)): # Check if the value is a model or list
out = await process_attribute(graph_client, node_id, sub_attr, sub_val) await process_attribute(graph_client, node_id, sub_attr, sub_val, created_node_ids)
created_node_ids.extend(out)
elif isinstance(value, list) and all(isinstance(item, BaseModel) for item in value): elif isinstance(value, list) and all(isinstance(item, BaseModel) for item in value):
# For lists of BaseModel instances, process each item in the list # For lists of BaseModel instances, process each item in the list
for item in value: for item in value:
out = await process_attribute(graph_client, parent_id, attribute, item, created_node_ids) await process_attribute(graph_client, parent_id, attribute, item, created_node_ids)
created_node_ids.extend(out)
return created_node_ids return created_node_ids
# async def process_attribute(graph_client, parent_id: Optional[str], attribute: str, value: Any, created_node_ids=None):
#
# if created_node_ids is None:
# created_node_ids = []
# if isinstance(value, BaseModel):
# node_id = await generate_node_id(value)
# node_data = value.model_dump()
#
# # Use the specified default relationship for the edge between the parent node and the current node
#
# created_node_id = await add_node(graph_client, parent_id, node_id, node_data)
#
# created_node_ids.append(created_node_id)
#
# # await add_edge(graph_client, parent_id, node_id, node_data, relationship_data,created_node_ids)
#
# # Recursively process nested attributes to ensure all nodes and relationships are added to the graph
# # for sub_attr, sub_val in value.__dict__.items(): # Access attributes and their values directly
# # print('SUB ATTR', sub_attr)
# # print('SUB VAL', sub_val)
# #
# # out = await process_attribute(graph_client, node_id, sub_attr, sub_val)
# #
# # created_node_ids.extend(out)
#
# elif isinstance(value, list) and all(isinstance(item, BaseModel) for item in value):
# # For lists of BaseModel instances, process each item in the list
# for item in value:
# out = await process_attribute(graph_client, parent_id, attribute, item, created_node_ids)
# created_node_ids.extend(out)
#
# return created_node_ids
async def process_attribute_edge(graph_client, parent_id: Optional[str], attribute: str, value: Any, created_node_ids=None): async def process_attribute_edge(graph_client, parent_id: Optional[str], attribute: str, value: Any, created_node_ids=None):
@ -212,20 +248,22 @@ async def create_dynamic(graph_model, graph_client) :
created_node_ids.append(out) created_node_ids.append(out)
print('CREATED NODE IDS', created_node_ids)
for attribute_name, attribute_value in graph_model: for attribute_name, attribute_value in graph_model:
ids = await process_attribute(graph_client, root_id, attribute_name, attribute_value) ids = await process_attribute(graph_client, root_id, attribute_name, attribute_value)
created_node_ids.extend(ids) # created_node_ids.extend(ids)
#
flattened_and_deduplicated = list({ # flattened_and_deduplicated = list({
item["nodeId"]: item # item["nodeId"]: item
# Use the 'nodeId' as the unique key in the dictionary comprehension # # Use the 'nodeId' as the unique key in the dictionary comprehension
for sublist in created_node_ids if sublist # Ensure sublist is not None # for sublist in created_node_ids if sublist # Ensure sublist is not None
for item in sublist # Iterate over items in the sublist # for item in sublist # Iterate over items in the sublist
}.values()) # }.values())
#
for attribute_name, attribute_value in graph_model: # for attribute_name, attribute_value in graph_model:
ids = await process_attribute_edge(graph_client, root_id, attribute_name, attribute_value, flattened_and_deduplicated) # ids = await process_attribute_edge(graph_client, root_id, attribute_name, attribute_value, flattened_and_deduplicated)
#
return graph_client return graph_client

View file

@ -1,43 +0,0 @@
from datetime import datetime
from cognee.shared.data_models import DefaultGraphModel, Relationship, UserProperties, UserLocation
from cognee.modules.cognify.graph.create import create_semantic_graph
async def initialize_graph(root_id: str, graphdatamodel=None, graph_client=None):
    """Create the initial semantic graph for a user.

    If *graphdatamodel* is supplied, it is instantiated with ``node_id=root_id``
    and turned into a graph; otherwise a hard-coded ``DefaultGraphModel`` is
    built and used.

    Args:
        root_id: Identifier used as the root node id of the graph.
        graphdatamodel: Optional pydantic model class overriding the default.
        graph_client: Graph client passed through to create_semantic_graph.

    Returns:
        The result of ``create_semantic_graph`` for the chosen model.
    """
    if graphdatamodel:
        # Caller-supplied model takes precedence over the default graph shape.
        graph = graphdatamodel(node_id= root_id)
        graph_ = await create_semantic_graph(graph, graph_client)
        return graph_
    else:
        print("Creating default graph")
        # NOTE(review): properties below are placeholder demo values
        # ("age": "30", New York location) — presumably meant to be replaced
        # by real user data; confirm before relying on them.
        graph = DefaultGraphModel(
            node_id = root_id,
            user_properties = UserProperties(
                custom_properties = { "age": "30" },
                location = UserLocation(
                    location_id = "ny",
                    description = "New York",
                    default_relationship = Relationship(
                        type = "located_in",
                        source = "UserProperties",
                        target = "ny",
                    )
                ),
                default_relationship = Relationship(
                    type = "has_properties",
                    source = root_id,
                    target = "UserProperties",
                )
            ),
            default_relationship = Relationship(
                type = "has_properties",
                source = root_id,
                target = "UserProperties"
            ),
            default_fields = {
                # Timestamps are formatted strings, not datetime objects.
                "created_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                "updated_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            }
        )
        return await create_semantic_graph(graph, graph_client)

View file

@ -10,7 +10,7 @@ from datetime import datetime
from cognee import config from cognee import config
from cognee.infrastructure import infrastructure_config from cognee.infrastructure import infrastructure_config
from infer_data_topology import infer_data_topology from cognee.modules.topology.infer_data_topology import infer_data_topology
@ -31,66 +31,124 @@ from infer_data_topology import infer_data_topology
# default_relationship: Relationship = Relationship(type = "has_properties") # default_relationship: Relationship = Relationship(type = "has_properties")
# #
class Relationship(BaseModel): class Relationship(BaseModel):
type: str type: str = Field(..., description="The type of relationship, e.g., 'belongs_to'.")
source: Optional[str] = None source: Optional[str] = Field(None, description="The identifier of the source id of in the relationship being a directory or subdirectory")
target: Optional[str] = None target: Optional[str] = Field(None, description="The identifier of the target id in the relationship being the directory, subdirectory or file")
properties: Optional[Dict[str, Any]] = None properties: Optional[Dict[str, Any]] = Field(None, description="A dictionary of additional properties and values related to the relationship.")
class Document(BaseModel): class Document(BaseModel):
id: str node_id: str
title: str title: str
description: Optional[str] = None description: Optional[str] = None
default_relationship: Relationship = Field(default_factory=lambda: Relationship(type="belongs_to")) default_relationship: Relationship
class DirectoryModel(BaseModel): class DirectoryModel(BaseModel):
name: str node_id: str
path: str path: str
summary: str summary: str
documents: List[Document] = [] documents: List[Document] = []
subdirectories: List['DirectoryModel'] = [] subdirectories: List['DirectoryModel'] = []
default_relationship: Relationship = Field(default_factory=lambda: Relationship(type="belongs_to")) default_relationship: Relationship
DirectoryModel.update_forward_refs() DirectoryModel.update_forward_refs()
class RepositoryMetadata(BaseModel): class DirMetadata(BaseModel):
name: str node_id: str
summary: str summary: str
owner: str owner: str
description: Optional[str] = None description: Optional[str] = None
directories: List[DirectoryModel] = [] directories: List[DirectoryModel] = []
documents: List[Document] = [] documents: List[Document] = []
default_relationship: Relationship = Field(default_factory=lambda: Relationship(type="belongs_to")) default_relationship: Relationship
class GitHubRepositoryModel(BaseModel): class GitHubRepositoryModel(BaseModel):
metadata: RepositoryMetadata node_id: str
metadata: DirMetadata
root_directory: DirectoryModel root_directory: DirectoryModel
class TopologyEngine: class TopologyEngine:
def __init__(self): def __init__(self):
self.models: Dict[str, Type[BaseModel]] = {} self.models: Dict[str, Type[BaseModel]] = {}
async def infer(self, repository: str): async def populate_model(self, directory_path, file_structure, parent_id=None):
directory_id = os.path.basename(directory_path) or "root"
directory = DirectoryModel(
node_id=directory_id,
path=directory_path,
summary=f"Contents of {directory_id}",
default_relationship=Relationship(type="contains", source=parent_id, target=directory_id)
)
for key, value in file_structure.items():
if isinstance(value, dict):
# Recurse into subdirectory
subdirectory_path = os.path.join(directory_path, key)
subdirectory = await self.populate_model(subdirectory_path, value, parent_id=directory_id)
directory.subdirectories.append(subdirectory)
elif isinstance(value, tuple) and value[0] == 'file':
# Handle file
document = Document(
node_id=key,
title=key,
default_relationship=Relationship(type="contained_by", source=key, target=directory_id)
)
directory.documents.append(document)
return directory
async def infer_from_directory_structure(self, node_id:str, repository: str, model):
""" Infer the topology of a repository from its file structure """
path = infrastructure_config.get_config()["data_root_directory"] path = infrastructure_config.get_config()["data_root_directory"]
path = path +"/"+ str(repository) path = path +"/"+ str(repository)
print(path) print(path)
if not os.path.exists(path): if not os.path.exists(path):
raise FileNotFoundError(f"No such directory: {path}") raise FileNotFoundError(f"No such directory: {path}")
file_structure = {} root = {}
for filename in glob.glob(f"{path}/**", recursive=True): for filename in glob.glob(f"{path}/**", recursive=True):
parts = os.path.relpath(filename, start=path).split(os.path.sep)
current = root
for part in parts[:-1]: # Traverse/create to the last directory
if part not in current:
current[part] = {}
current = current[part]
last_part = parts[-1]
if os.path.isfile(filename): if os.path.isfile(filename):
key = os.path.relpath(filename, start=path).replace(os.path.sep, "__") current[last_part] = ("file", ...) # Placeholder for file content or metadata
file_structure[key] = (str, ...) # Assuming content as string for simplicity elif os.path.isdir(filename):
if last_part not in current: # Only create a new directory entry if it doesn't exist
current[last_part] = {}
root_directory = await self.populate_model('/', root)
result = await infer_data_topology(str(file_structure), GitHubRepositoryModel) # repository_metadata = await infer_data_topology(str(root), DirMetadata)
repository_metadata = DirMetadata(
node_id="repo1",
summary="Example repository",
owner="user1",
directories=[root_directory],
documents=[],
default_relationship=Relationship(type="contained_by", source="repo1", target=node_id)
)
active_model = GitHubRepositoryModel(
node_id=node_id,
metadata=repository_metadata,
root_directory=root_directory
)
return active_model
# print(github_repo_model)
return result
def load(self, model_name: str): def load(self, model_name: str):
return self.models.get(model_name) return self.models.get(model_name)

1429
poetry.lock generated

File diff suppressed because it is too large Load diff

View file

@ -63,8 +63,9 @@ tiktoken = "^0.6.0"
dspy-ai = "2.4.3" dspy-ai = "2.4.3"
posthog = "^3.5.0" posthog = "^3.5.0"
lancedb = "^0.6.10" lancedb = "^0.6.10"
importlib-metadata = "6.1.0" importlib-metadata = "6.8.0"
deepeval = "^0.21.36" deepeval = "^0.21.36"
litellm = "^1.37.3"