working topology inference

This commit is contained in:
Vasilije 2024-05-11 23:10:00 +02:00
parent 01446deb6f
commit 7249c240d0
13 changed files with 1169 additions and 798 deletions

View file

@ -1,4 +1,5 @@
import asyncio import asyncio
import os
from uuid import uuid4 from uuid import uuid4
from typing import List, Union from typing import List, Union
import logging import logging
@ -6,7 +7,11 @@ import instructor
import nltk import nltk
from openai import OpenAI from openai import OpenAI
from nltk.corpus import stopwords from nltk.corpus import stopwords
from cognee.api.v1.prune import prune
from cognee.config import Config from cognee.config import Config
from cognee.infrastructure.data.chunking.LangchainChunkingEngine import LangchainChunkEngine
from cognee.infrastructure.databases.vector.embeddings.DefaultEmbeddingEngine import OpenAIEmbeddingEngine
from cognee.modules.cognify.graph.add_data_chunks import add_data_chunks from cognee.modules.cognify.graph.add_data_chunks import add_data_chunks
from cognee.modules.cognify.graph.add_document_node import add_document_node from cognee.modules.cognify.graph.add_document_node import add_document_node
from cognee.modules.cognify.graph.add_classification_nodes import add_classification_nodes from cognee.modules.cognify.graph.add_classification_nodes import add_classification_nodes
@ -14,7 +19,6 @@ from cognee.modules.cognify.graph.add_cognitive_layer_graphs import add_cognitiv
from cognee.modules.cognify.graph.add_summary_nodes import add_summary_nodes from cognee.modules.cognify.graph.add_summary_nodes import add_summary_nodes
from cognee.modules.cognify.graph.add_node_connections import group_nodes_by_layer, \ from cognee.modules.cognify.graph.add_node_connections import group_nodes_by_layer, \
graph_ready_output, connect_nodes_in_graph graph_ready_output, connect_nodes_in_graph
from cognee.modules.cognify.graph.initialize_graph import initialize_graph
from cognee.modules.cognify.llm.resolve_cross_graph_references import resolve_cross_graph_references from cognee.modules.cognify.llm.resolve_cross_graph_references import resolve_cross_graph_references
from cognee.infrastructure.databases.graph.get_graph_client import get_graph_client from cognee.infrastructure.databases.graph.get_graph_client import get_graph_client
from cognee.modules.cognify.graph.add_label_nodes import add_label_nodes from cognee.modules.cognify.graph.add_label_nodes import add_label_nodes
@ -27,6 +31,8 @@ from cognee.modules.data.get_content_categories import get_content_categories
from cognee.modules.data.get_content_summary import get_content_summary from cognee.modules.data.get_content_summary import get_content_summary
from cognee.modules.data.get_cognitive_layers import get_cognitive_layers from cognee.modules.data.get_cognitive_layers import get_cognitive_layers
from cognee.modules.data.get_layer_graphs import get_layer_graphs from cognee.modules.data.get_layer_graphs import get_layer_graphs
from cognee.modules.topology.topology import TopologyEngine
from cognee.shared.data_models import ChunkStrategy
from cognee.utils import send_telemetry from cognee.utils import send_telemetry
@ -69,27 +75,12 @@ async def cognify(datasets: Union[str, List[str]] = None):
dataset_files = [] dataset_files = []
# datasets is a dataset name string # datasets is a dataset name string
dataset_name = datasets.replace(".", "_").replace(" ", "_") dataset_name = datasets.replace(".", "_").replace(" ", "_")
for added_dataset in added_datasets: for added_dataset in added_datasets:
if dataset_name in added_dataset: if dataset_name in added_dataset:
dataset_files.append((added_dataset, db_engine.get_files_metadata(added_dataset))) dataset_files.append((added_dataset, db_engine.get_files_metadata(added_dataset)))
# print("dataset_files", dataset_files)
print(dataset_files)
# topology can be inferred, loaded, or extrapolated from the data in the end of the flow
# for code generation, we infer the topology from the folder structure as simple step
graph_topology = infrastructure_config.get_config()["graph_topology"]
await initialize_graph(USER_ID, graph_client=graph_client)
data_chunks = {} data_chunks = {}
@ -102,6 +93,8 @@ async def cognify(datasets: Union[str, List[str]] = None):
try: try:
file_type = guess_file_type(file) file_type = guess_file_type(file)
text = extract_text_from_file(file, file_type) text = extract_text_from_file(file, file_type)
if text is None:
text = ""
subchunks = chunk_engine.chunk_data(chunk_strategy, text, config.chunk_size, config.chunk_overlap) subchunks = chunk_engine.chunk_data(chunk_strategy, text, config.chunk_size, config.chunk_overlap)
if dataset_name not in data_chunks: if dataset_name not in data_chunks:
@ -125,22 +118,25 @@ async def process_text(chunk_collection: str, chunk_id: str, input_text: str, fi
graph_client = await get_graph_client(infrastructure_config.get_config()["graph_engine"]) graph_client = await get_graph_client(infrastructure_config.get_config()["graph_engine"])
graph_topology = infrastructure_config.get_config()["graph_topology"]
document_id = await add_document_node( document_id = await add_document_node(
graph_client, graph_client,
parent_node_id = f"DefaultGraphModel__{USER_ID}", #make a param of defaultgraph model to make sure when user passes his stuff, it doesn't break pipeline parent_node_id = f"{file_metadata['name']}.{file_metadata['extension']}", #make a param of defaultgraph model to make sure when user passes his stuff, it doesn't break pipeline
document_metadata = file_metadata, document_metadata = file_metadata,
) )
#
await add_label_nodes(graph_client, document_id, chunk_id, file_metadata["keywords"].split("|")) await add_label_nodes(graph_client, document_id, chunk_id, file_metadata["keywords"].split("|"))
classified_categories = await get_content_categories(input_text) # classified_categories = await get_content_categories(input_text)
await add_classification_nodes( # await add_classification_nodes(
graph_client, # graph_client,
parent_node_id = document_id, # parent_node_id = document_id,
categories = classified_categories, # categories = classified_categories,
) # )
print(f"Chunk ({chunk_id}) classified.") # print(f"Chunk ({chunk_id}) classified.")
print("document_id", document_id) print("document_id", document_id)
@ -149,53 +145,61 @@ async def process_text(chunk_collection: str, chunk_id: str, input_text: str, fi
print(f"Chunk ({chunk_id}) summarized.") print(f"Chunk ({chunk_id}) summarized.")
cognitive_layers = await get_cognitive_layers(input_text, classified_categories) # cognitive_layers = await get_cognitive_layers(input_text, classified_categories)
cognitive_layers = (await add_cognitive_layers(graph_client, document_id, cognitive_layers))[:2] # cognitive_layers = (await add_cognitive_layers(graph_client, document_id, cognitive_layers))[:2]
#
layer_graphs = await get_layer_graphs(input_text, cognitive_layers) # layer_graphs = await get_layer_graphs(input_text, cognitive_layers)
await add_cognitive_layer_graphs(graph_client, chunk_collection, chunk_id, layer_graphs) # await add_cognitive_layer_graphs(graph_client, chunk_collection, chunk_id, layer_graphs)
#
if infrastructure_config.get_config()["connect_documents"] is True: # if infrastructure_config.get_config()["connect_documents"] is True:
db_engine = infrastructure_config.get_config()["database_engine"] # db_engine = infrastructure_config.get_config()["database_engine"]
relevant_documents_to_connect = db_engine.fetch_cognify_data(excluded_document_id = file_metadata["id"]) # relevant_documents_to_connect = db_engine.fetch_cognify_data(excluded_document_id = file_metadata["id"])
#
list_of_nodes = [] # list_of_nodes = []
#
relevant_documents_to_connect.append({ # relevant_documents_to_connect.append({
"layer_id": document_id, # "layer_id": document_id,
}) # })
#
for document in relevant_documents_to_connect: # for document in relevant_documents_to_connect:
node_descriptions_to_match = await graph_client.extract_node_description(document["layer_id"]) # node_descriptions_to_match = await graph_client.extract_node_description(document["layer_id"])
list_of_nodes.extend(node_descriptions_to_match) # list_of_nodes.extend(node_descriptions_to_match)
#
nodes_by_layer = await group_nodes_by_layer(list_of_nodes) # nodes_by_layer = await group_nodes_by_layer(list_of_nodes)
#
results = await resolve_cross_graph_references(nodes_by_layer) # results = await resolve_cross_graph_references(nodes_by_layer)
#
relationships = graph_ready_output(results) # relationships = graph_ready_output(results)
#
await connect_nodes_in_graph( # await connect_nodes_in_graph(
graph_client, # graph_client,
relationships, # relationships,
score_threshold = infrastructure_config.get_config()["intra_layer_score_treshold"] # score_threshold = infrastructure_config.get_config()["intra_layer_score_treshold"]
) # )
#
send_telemetry("cognee.cognify") # send_telemetry("cognee.cognify")
#
print(f"Chunk ({chunk_id}) cognified.") # print(f"Chunk ({chunk_id}) cognified.")
if __name__ == "__main__": if __name__ == "__main__":
async def test(): async def test():
# await prune.prune_system()
# #
# from cognee.api.v1.add import add
# data_directory_path = os.path.abspath("../../../.data")
# # print(data_directory_path)
# # config.data_root_directory(data_directory_path)
# # cognee_directory_path = os.path.abspath("../.cognee_system")
# # config.system_root_directory(cognee_directory_path)
# #
from cognee.api.v1.add import add # await add("data://" +data_directory_path, "example")
await add(["A large language model (LLM) is a language model notable for its ability to achieve general-purpose language generation and other natural language processing tasks such as classification"], "code") infrastructure_config.set_config( {"chunk_engine": LangchainChunkEngine() , "chunk_strategy": ChunkStrategy.CODE,'embedding_engine': OpenAIEmbeddingEngine()})
graph = await cognify() graph = await cognify()
#
from cognee.utils import render_graph from cognee.utils import render_graph
await render_graph(graph, include_color=True, include_nodes=False, include_size=False) await render_graph(graph, include_color=True, include_nodes=False, include_size=False)

View file

View file

@ -0,0 +1,91 @@
from typing import List, Dict, Any, Union, Optional
from cognee.infrastructure import infrastructure_config
from cognee.infrastructure.databases.graph.get_graph_client import get_graph_client
from cognee.modules.topology.topology import TopologyEngine, GitHubRepositoryModel
import pandas as pd
from pydantic import BaseModel
USER_ID = "default_user"
async def add_topology(directory="example", model=GitHubRepositoryModel):
    """Infer the topology of a data directory and load it into the graph store.

    Runs the TopologyEngine over *directory*, flattens the resulting nested
    pydantic model into rows, and writes one graph node per row plus an edge
    for every row that carries relationship information.

    Args:
        directory: Dataset/repository name under the configured data root.
        model: Pydantic model class describing the expected topology shape.

    Returns:
        The underlying graph object held by the graph client.
    """
    graph_db_type = infrastructure_config.get_config()["graph_engine"]
    graph_client = await get_graph_client(graph_db_type)

    engine = TopologyEngine()
    topology = await engine.infer_from_directory_structure(
        node_id=USER_ID, repository=directory, model=model
    )

    def flatten_model(model: BaseModel, parent_id: Optional[str] = None) -> Dict[str, Any]:
        """Flatten a single pydantic model to a dict, lifting its default relationship."""
        result = {**model.dict(), "parent_id": parent_id}
        if hasattr(model, "default_relationship") and model.default_relationship:
            result.update({
                "relationship_type": model.default_relationship.type,
                "relationship_source": model.default_relationship.source,
                "relationship_target": model.default_relationship.target,
            })
        return result

    def recursive_flatten(items: Union[List[Any], BaseModel], parent_id: Optional[str] = None) -> List[Dict[str, Any]]:
        """Recursively flatten nested pydantic models or lists of models."""
        if isinstance(items, list):
            return [entry for item in items for entry in recursive_flatten(item, parent_id)]
        elif isinstance(items, BaseModel):
            flat = [flatten_model(items, parent_id)]
            # Descend into nested models/lists, parenting them to this node's id.
            for field, value in items:
                if isinstance(value, (BaseModel, list)):
                    flat.extend(recursive_flatten(value, items.dict().get("node_id", None)))
            return flat
        else:
            return []

    flt_topology = recursive_flatten(topology)
    df = pd.DataFrame(flt_topology)

    for _, row in df.iterrows():
        node_data = row.to_dict()
        # Fix: rows flattened from bare Relationship models have no 'node_id'
        # column; pop() without a default raised KeyError here. Skip such rows.
        node_id = node_data.pop("node_id", None)
        if node_id is None or (isinstance(node_id, float) and pd.isna(node_id)):
            continue
        await graph_client.add_node(node_id, node_data)
        # Fix: guard against the relationship columns being absent entirely
        # (KeyError) in addition to being NaN for this particular row.
        if (
            "relationship_source" in row
            and pd.notna(row["relationship_source"])
            and pd.notna(row["relationship_target"])
        ):
            await graph_client.add_edge(
                row["relationship_source"],
                row["relationship_target"],
                relationship_name=row["relationship_type"],
            )

    return graph_client.graph
if __name__ == "__main__":
    import asyncio

    async def test():
        """Smoke-test entry point: build the topology graph, then render it."""
        graph = await add_topology()

        graph_db_type = infrastructure_config.get_config()["graph_engine"]
        graph_client = await get_graph_client(graph_db_type)

        # Imported lazily so rendering deps are only needed when run as a script.
        from cognee.utils import render_graph
        await render_graph(graph_client.graph, include_color=True, include_nodes=False, include_size=False)

    asyncio.run(test())

View file

@ -52,6 +52,7 @@ class DefaultChunkEngine():
elif chunk_strategy == ChunkStrategy.EXACT: elif chunk_strategy == ChunkStrategy.EXACT:
chunked_data = DefaultChunkEngine.chunk_data_exact(source_data, chunk_size, chunk_overlap) chunked_data = DefaultChunkEngine.chunk_data_exact(source_data, chunk_size, chunk_overlap)
return chunked_data return chunked_data

View file

@ -1,3 +1,4 @@
import asyncio
from typing import List from typing import List
import instructor import instructor
@ -8,8 +9,8 @@ from openai import AsyncOpenAI
from cognee.config import Config from cognee.config import Config
from cognee.root_dir import get_absolute_path from cognee.root_dir import get_absolute_path
from .EmbeddingEngine import EmbeddingEngine from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine
from litellm import aembedding
config = Config() config = Config()
config.load() config.load()
@ -27,16 +28,28 @@ class DefaultEmbeddingEngine(EmbeddingEngine):
class OpenAIEmbeddingEngine(EmbeddingEngine): class OpenAIEmbeddingEngine(EmbeddingEngine):
async def embed_text(self, text: List[str]) -> List[float]: async def embed_text(self, text: List[str]) -> List[float]:
OPENAI_API_KEY = config.openai_key
aclient = instructor.patch(AsyncOpenAI()) response = await aembedding(config.openai_embedding_model, input=text)
text = text.replace("\n", " ")
response = await aclient.embeddings.create(input = text, model = config.openai_embedding_model)
embedding = response.data[0].embedding # embedding = response.data[0].embedding
# embeddings_list = list(map(lambda embedding: embedding.tolist(), embedding_model.embed(text))) # embeddings_list = list(map(lambda embedding: embedding.tolist(), embedding_model.embed(text)))
return embedding return response.data[0]['embedding']
def get_vector_size(self) -> int: def get_vector_size(self) -> int:
return config.openai_embedding_dimensions return config.openai_embedding_dimensions
if __name__ == "__main__":
async def gg():
openai_embedding_engine = OpenAIEmbeddingEngine()
# print(openai_embedding_engine.embed_text(["Hello, how are you?"]))
# print(openai_embedding_engine.get_vector_size())
# default_embedding_engine = DefaultEmbeddingEngine()
sds = await openai_embedding_engine.embed_text(["Hello, sadasdas are you?"])
print(sds)
# print(default_embedding_engine.get_vector_size())
asyncio.run(gg())

View file

@ -1 +1,4 @@
You are a topology master and need to extract the following topology information from the text provided to you You are a topology master and need to extract the following topology information from the text provided to you.
Relationship part'S can't be empty, and have to be logical AND CONNECTING ELEMENTS OF THE TOPOLOGY
The source is the parent of the target. And the target is the child of the source.
Have in mind this model needs to become a graph later.

View file

@ -37,24 +37,24 @@ async def add_label_nodes(graph_client, parent_node_id: str, chunk_id: str, keyw
]) ])
# Add data to vector # Add data to vector
keyword_data_points = [ # keyword_data_points = [
DataPoint( # DataPoint(
id = str(uuid4()), # id = str(uuid4()),
payload = dict( # payload = dict(
value = keyword_data["keyword"], # value = keyword_data["keyword"],
references = dict( # references = dict(
node_id = keyword_node_id, # node_id = keyword_node_id,
cognitive_layer = parent_node_id, # cognitive_layer = parent_node_id,
), # ),
), # ),
embed_field = "value" # embed_field = "value"
) for (keyword_node_id, keyword_data) in keyword_nodes # ) for (keyword_node_id, keyword_data) in keyword_nodes
] # ]
#
try: # try:
await vector_client.create_collection(parent_node_id) # await vector_client.create_collection(parent_node_id)
except Exception: # except Exception:
# It's ok if the collection already exists. # # It's ok if the collection already exists.
pass # pass
#
await vector_client.create_data_points(parent_node_id, keyword_data_points) # await vector_client.create_data_points(parent_node_id, keyword_data_points)

View file

@ -40,12 +40,20 @@ async def add_node(client, parent_id: Optional[str], node_id: str, node_data: di
if node_id != "Relationship_default": if node_id != "Relationship_default":
try: try:
# Attempt to add the node to the graph database # Attempt to add the node to the graph database
# print('NODE ID', node_id)
# print('NODE DATA', node_data)
result = await client.add_node(node_id, node_properties = node_data) result = await client.add_node(node_id, node_properties = node_data)
print("added node", result)
# Add an edge if a parent ID is provided and the graph engine is NETWORKX # Add an edge if a parent ID is provided and the graph engine is NETWORKX
if parent_id and "default_relationship" in node_data and infrastructure_config.get_config()["graph_engine"] == GraphDBType.NETWORKX: if parent_id and "default_relationship" in node_data and infrastructure_config.get_config()["graph_engine"] == GraphDBType.NETWORKX:
print("Node id", node_id)
await client.add_edge(parent_id, node_id, relationship_name = node_data["default_relationship"]["type"], edge_properties = node_data) try:
await client.add_edge(parent_id, node_id, relationship_name = node_data["default_relationship"]["type"], edge_properties = node_data)
print("added edge")
except Exception as e:
print(f"Error adding edge: {e}")
pass
except Exception as e: except Exception as e:
# Log the exception; consider a logging framework for production use # Log the exception; consider a logging framework for production use
print(f"Error adding node or edge: {e}") print(f"Error adding node or edge: {e}")
@ -139,39 +147,67 @@ async def add_edge(client, parent_id: Optional[str], node_id: str, node_data: di
await client.add_edge(relationship_details['source'], relationship_details['target'], relationship_name = relationship_details['type']) await client.add_edge(relationship_details['source'], relationship_details['target'], relationship_name = relationship_details['type'])
async def process_attribute(graph_client, parent_id: Optional[str], attribute: str, value: Any, created_node_ids=None): async def process_attribute(graph_client, parent_id: Optional[str], attribute: str, value: Any, created_node_ids=None):
if created_node_ids is None: if created_node_ids is None:
created_node_ids = [] created_node_ids = []
if isinstance(value, BaseModel): if isinstance(value, BaseModel):
node_id = await generate_node_id(value) node_id = await generate_node_id(value)
node_data = value.model_dump() node_data = value.model_dump()
# Use the specified default relationship for the edge between the parent node and the current node # Add node to the graph
created_node_id = await add_node(graph_client, parent_id, node_id, node_data) created_node_id = await add_node(graph_client, parent_id, node_id, node_data)
created_node_ids.append(node_id)
created_node_ids.append(created_node_id) # Add an edge if a default_relationship exists
if hasattr(value, 'default_relationship') and value.default_relationship:
# await add_edge(graph_client, parent_id, node_id, node_data, relationship_data,created_node_ids) await add_edge(graph_client, parent_id, node_id, value.default_relationship.dict())
# Recursively process nested attributes to ensure all nodes and relationships are added to the graph # Recursively process nested attributes to ensure all nodes and relationships are added to the graph
for sub_attr, sub_val in value.__dict__.items(): # Access attributes and their values directly for sub_attr, sub_val in value.dict().items():
if isinstance(sub_val, (BaseModel, list)): # Check if the value is a model or list
out = await process_attribute(graph_client, node_id, sub_attr, sub_val) await process_attribute(graph_client, node_id, sub_attr, sub_val, created_node_ids)
created_node_ids.extend(out)
elif isinstance(value, list) and all(isinstance(item, BaseModel) for item in value): elif isinstance(value, list) and all(isinstance(item, BaseModel) for item in value):
# For lists of BaseModel instances, process each item in the list # For lists of BaseModel instances, process each item in the list
for item in value: for item in value:
out = await process_attribute(graph_client, parent_id, attribute, item, created_node_ids) await process_attribute(graph_client, parent_id, attribute, item, created_node_ids)
created_node_ids.extend(out)
return created_node_ids return created_node_ids
# async def process_attribute(graph_client, parent_id: Optional[str], attribute: str, value: Any, created_node_ids=None):
#
# if created_node_ids is None:
# created_node_ids = []
# if isinstance(value, BaseModel):
# node_id = await generate_node_id(value)
# node_data = value.model_dump()
#
# # Use the specified default relationship for the edge between the parent node and the current node
#
# created_node_id = await add_node(graph_client, parent_id, node_id, node_data)
#
# created_node_ids.append(created_node_id)
#
# # await add_edge(graph_client, parent_id, node_id, node_data, relationship_data,created_node_ids)
#
# # Recursively process nested attributes to ensure all nodes and relationships are added to the graph
# # for sub_attr, sub_val in value.__dict__.items(): # Access attributes and their values directly
# # print('SUB ATTR', sub_attr)
# # print('SUB VAL', sub_val)
# #
# # out = await process_attribute(graph_client, node_id, sub_attr, sub_val)
# #
# # created_node_ids.extend(out)
#
# elif isinstance(value, list) and all(isinstance(item, BaseModel) for item in value):
# # For lists of BaseModel instances, process each item in the list
# for item in value:
# out = await process_attribute(graph_client, parent_id, attribute, item, created_node_ids)
# created_node_ids.extend(out)
#
# return created_node_ids
async def process_attribute_edge(graph_client, parent_id: Optional[str], attribute: str, value: Any, created_node_ids=None): async def process_attribute_edge(graph_client, parent_id: Optional[str], attribute: str, value: Any, created_node_ids=None):
@ -212,20 +248,22 @@ async def create_dynamic(graph_model, graph_client) :
created_node_ids.append(out) created_node_ids.append(out)
print('CREATED NODE IDS', created_node_ids)
for attribute_name, attribute_value in graph_model: for attribute_name, attribute_value in graph_model:
ids = await process_attribute(graph_client, root_id, attribute_name, attribute_value) ids = await process_attribute(graph_client, root_id, attribute_name, attribute_value)
created_node_ids.extend(ids) # created_node_ids.extend(ids)
#
flattened_and_deduplicated = list({ # flattened_and_deduplicated = list({
item["nodeId"]: item # item["nodeId"]: item
# Use the 'nodeId' as the unique key in the dictionary comprehension # # Use the 'nodeId' as the unique key in the dictionary comprehension
for sublist in created_node_ids if sublist # Ensure sublist is not None # for sublist in created_node_ids if sublist # Ensure sublist is not None
for item in sublist # Iterate over items in the sublist # for item in sublist # Iterate over items in the sublist
}.values()) # }.values())
#
for attribute_name, attribute_value in graph_model: # for attribute_name, attribute_value in graph_model:
ids = await process_attribute_edge(graph_client, root_id, attribute_name, attribute_value, flattened_and_deduplicated) # ids = await process_attribute_edge(graph_client, root_id, attribute_name, attribute_value, flattened_and_deduplicated)
#
return graph_client return graph_client

View file

@ -1,43 +0,0 @@
from datetime import datetime
from cognee.shared.data_models import DefaultGraphModel, Relationship, UserProperties, UserLocation
from cognee.modules.cognify.graph.create import create_semantic_graph
async def initialize_graph(root_id: str, graphdatamodel=None, graph_client=None):
    """Create the initial semantic graph for a user.

    If *graphdatamodel* is supplied, it is instantiated with ``node_id=root_id``
    and turned into a graph; otherwise a hard-coded ``DefaultGraphModel`` is
    built and used.

    Args:
        root_id: Identifier used as the root node id of the graph.
        graphdatamodel: Optional pydantic model class overriding the default.
        graph_client: Graph client passed through to create_semantic_graph.

    Returns:
        The result of ``create_semantic_graph`` for the chosen model.
    """
    if graphdatamodel:
        # Caller-supplied model takes precedence over the default graph shape.
        graph = graphdatamodel(node_id= root_id)
        graph_ = await create_semantic_graph(graph, graph_client)
        return graph_
    else:
        print("Creating default graph")
        # NOTE(review): properties below are placeholder demo values
        # ("age": "30", New York location) — presumably meant to be replaced
        # by real user data; confirm before relying on them.
        graph = DefaultGraphModel(
            node_id = root_id,
            user_properties = UserProperties(
                custom_properties = { "age": "30" },
                location = UserLocation(
                    location_id = "ny",
                    description = "New York",
                    default_relationship = Relationship(
                        type = "located_in",
                        source = "UserProperties",
                        target = "ny",
                    )
                ),
                default_relationship = Relationship(
                    type = "has_properties",
                    source = root_id,
                    target = "UserProperties",
                )
            ),
            default_relationship = Relationship(
                type = "has_properties",
                source = root_id,
                target = "UserProperties"
            ),
            default_fields = {
                # Timestamps are formatted strings, not datetime objects.
                "created_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                "updated_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            }
        )
        return await create_semantic_graph(graph, graph_client)

View file

@ -10,7 +10,7 @@ from datetime import datetime
from cognee import config from cognee import config
from cognee.infrastructure import infrastructure_config from cognee.infrastructure import infrastructure_config
from infer_data_topology import infer_data_topology from cognee.modules.topology.infer_data_topology import infer_data_topology
@ -31,66 +31,124 @@ from infer_data_topology import infer_data_topology
# default_relationship: Relationship = Relationship(type = "has_properties") # default_relationship: Relationship = Relationship(type = "has_properties")
# #
class Relationship(BaseModel): class Relationship(BaseModel):
type: str type: str = Field(..., description="The type of relationship, e.g., 'belongs_to'.")
source: Optional[str] = None source: Optional[str] = Field(None, description="The identifier of the source id of in the relationship being a directory or subdirectory")
target: Optional[str] = None target: Optional[str] = Field(None, description="The identifier of the target id in the relationship being the directory, subdirectory or file")
properties: Optional[Dict[str, Any]] = None properties: Optional[Dict[str, Any]] = Field(None, description="A dictionary of additional properties and values related to the relationship.")
class Document(BaseModel): class Document(BaseModel):
id: str node_id: str
title: str title: str
description: Optional[str] = None description: Optional[str] = None
default_relationship: Relationship = Field(default_factory=lambda: Relationship(type="belongs_to")) default_relationship: Relationship
class DirectoryModel(BaseModel): class DirectoryModel(BaseModel):
name: str node_id: str
path: str path: str
summary: str summary: str
documents: List[Document] = [] documents: List[Document] = []
subdirectories: List['DirectoryModel'] = [] subdirectories: List['DirectoryModel'] = []
default_relationship: Relationship = Field(default_factory=lambda: Relationship(type="belongs_to")) default_relationship: Relationship
DirectoryModel.update_forward_refs() DirectoryModel.update_forward_refs()
class RepositoryMetadata(BaseModel): class DirMetadata(BaseModel):
name: str node_id: str
summary: str summary: str
owner: str owner: str
description: Optional[str] = None description: Optional[str] = None
directories: List[DirectoryModel] = [] directories: List[DirectoryModel] = []
documents: List[Document] = [] documents: List[Document] = []
default_relationship: Relationship = Field(default_factory=lambda: Relationship(type="belongs_to")) default_relationship: Relationship
class GitHubRepositoryModel(BaseModel): class GitHubRepositoryModel(BaseModel):
metadata: RepositoryMetadata node_id: str
metadata: DirMetadata
root_directory: DirectoryModel root_directory: DirectoryModel
class TopologyEngine: class TopologyEngine:
def __init__(self): def __init__(self):
self.models: Dict[str, Type[BaseModel]] = {} self.models: Dict[str, Type[BaseModel]] = {}
async def infer(self, repository: str): async def populate_model(self, directory_path, file_structure, parent_id=None):
directory_id = os.path.basename(directory_path) or "root"
directory = DirectoryModel(
node_id=directory_id,
path=directory_path,
summary=f"Contents of {directory_id}",
default_relationship=Relationship(type="contains", source=parent_id, target=directory_id)
)
for key, value in file_structure.items():
if isinstance(value, dict):
# Recurse into subdirectory
subdirectory_path = os.path.join(directory_path, key)
subdirectory = await self.populate_model(subdirectory_path, value, parent_id=directory_id)
directory.subdirectories.append(subdirectory)
elif isinstance(value, tuple) and value[0] == 'file':
# Handle file
document = Document(
node_id=key,
title=key,
default_relationship=Relationship(type="contained_by", source=key, target=directory_id)
)
directory.documents.append(document)
return directory
async def infer_from_directory_structure(self, node_id:str, repository: str, model):
""" Infer the topology of a repository from its file structure """
path = infrastructure_config.get_config()["data_root_directory"] path = infrastructure_config.get_config()["data_root_directory"]
path = path +"/"+ str(repository) path = path +"/"+ str(repository)
print(path) print(path)
if not os.path.exists(path): if not os.path.exists(path):
raise FileNotFoundError(f"No such directory: {path}") raise FileNotFoundError(f"No such directory: {path}")
file_structure = {} root = {}
for filename in glob.glob(f"{path}/**", recursive=True): for filename in glob.glob(f"{path}/**", recursive=True):
parts = os.path.relpath(filename, start=path).split(os.path.sep)
current = root
for part in parts[:-1]: # Traverse/create to the last directory
if part not in current:
current[part] = {}
current = current[part]
last_part = parts[-1]
if os.path.isfile(filename): if os.path.isfile(filename):
key = os.path.relpath(filename, start=path).replace(os.path.sep, "__") current[last_part] = ("file", ...) # Placeholder for file content or metadata
file_structure[key] = (str, ...) # Assuming content as string for simplicity elif os.path.isdir(filename):
if last_part not in current: # Only create a new directory entry if it doesn't exist
current[last_part] = {}
root_directory = await self.populate_model('/', root)
result = await infer_data_topology(str(file_structure), GitHubRepositoryModel) # repository_metadata = await infer_data_topology(str(root), DirMetadata)
repository_metadata = DirMetadata(
node_id="repo1",
summary="Example repository",
owner="user1",
directories=[root_directory],
documents=[],
default_relationship=Relationship(type="contained_by", source="repo1", target=node_id)
)
active_model = GitHubRepositoryModel(
node_id=node_id,
metadata=repository_metadata,
root_directory=root_directory
)
return active_model
# print(github_repo_model)
return result
def load(self, model_name: str): def load(self, model_name: str):
return self.models.get(model_name) return self.models.get(model_name)

1429
poetry.lock generated

File diff suppressed because it is too large Load diff

View file

@ -63,8 +63,9 @@ tiktoken = "^0.6.0"
dspy-ai = "2.4.3" dspy-ai = "2.4.3"
posthog = "^3.5.0" posthog = "^3.5.0"
lancedb = "^0.6.10" lancedb = "^0.6.10"
importlib-metadata = "6.1.0" importlib-metadata = "6.8.0"
deepeval = "^0.21.36" deepeval = "^0.21.36"
litellm = "^1.37.3"