Intermediate commit

This commit is contained in:
Boris Arzentar 2024-04-24 19:35:36 +02:00
parent 1c2c72b8bf
commit f655ee8194
12 changed files with 176 additions and 64 deletions

.data/code/example.txt Normal file
View file

@@ -0,0 +1,28 @@
'''
Given a string, find the length of the longest substring without repeating characters.
Examples:
Given "abcabcbb", the answer is "abc", with the length of 3.
Given "bbbbb", the answer is "b", with the length of 1.
Given "pwwkew", the answer is "wke", with the length of 3. Note that the answer must be a substring; "pwke" is a subsequence and not a substring.
'''
class Solution(object):
    def lengthOfLongestSubstring(self, s):
        """
        :type s: str
        :rtype: int
        """
        mapSet = {}
        start, result = 0, 0
        for end in range(len(s)):
            if s[end] in mapSet:
                start = max(mapSet[s[end]], start)
            result = max(result, end-start+1)
            mapSet[s[end]] = end+1
        return result
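
For illustration, a quick usage sketch of the class above (a hypothetical session, not part of the committed file):

    solution = Solution()
    print(solution.lengthOfLongestSubstring("abcabcbb"))  # 3 ("abc")
    print(solution.lengthOfLongestSubstring("bbbbb"))     # 1 ("b")
    print(solution.lengthOfLongestSubstring("pwwkew"))    # 3 ("wke")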

View file

@@ -5,20 +5,20 @@ async def add_classification_nodes(graph_client, parent_node_id: str, categories
         data_type = category["data_type"].upper().replace(' ', '_')
         category_name = category["category_name"].upper().replace(' ', '_').replace("'", "").replace("/", "_")
-        data_type_node_id = f"DATA_TYPE__{data_type}"
+        data_type_node_id = data_type
         data_type_node = await graph_client.extract_node(data_type_node_id)
         if not data_type_node:
-            data_type_node = await graph_client.add_node(data_type_node_id, dict(name = data_type, entity_type = "DataType"))
+            data_type_node = await graph_client.add_node(data_type_node_id, dict(name = data_type, type = "DataType"))
             await graph_client.add_edge(data_type_node_id, parent_node_id, "classified_as", dict(relationship_name = "classified_as"))
 
-        category_node_id = f"DATA_CATEGORY__{category_name}"
+        category_node_id = category_name
        category_node = await graph_client.extract_node(category_node_id)
         if not category_node:
-            category_node = await graph_client.add_node(category_node_id, dict(name = category_name, entity_type = "DataCategory"))
+            category_node = await graph_client.add_node(category_node_id, dict(name = category_name, type = "DataCategory"))
             await graph_client.add_edge(category_node_id, parent_node_id, "classified_as", dict(relationship_name = "classified_as"))
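
The net effect of this change on node ids, sketched for a hypothetical data type "TEXT":

    data_type = "TEXT"
    old_node_id = f"DATA_TYPE__{data_type}"  # before: "DATA_TYPE__TEXT"
    new_node_id = data_type                  # after:  "TEXT"

Category ids lose their "DATA_CATEGORY__" prefix the same way.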

View file

@@ -3,7 +3,6 @@ from uuid import uuid4
 from typing import List, Tuple, TypedDict
 from cognee.infrastructure import infrastructure_config
 from cognee.infrastructure.databases.vector import DataPoint
-from cognee.shared.data_models import KnowledgeGraph
 from cognee.utils import extract_pos_tags, extract_named_entities, extract_sentiment_vader
 
 class GraphLike(TypedDict):
@@ -18,47 +17,50 @@ async def add_cognitive_layer_graphs(
     layer_graphs: List[Tuple[str, GraphLike]],
 ):
     vector_client = infrastructure_config.get_config("vector_engine")
+    graph_model = infrastructure_config.get_config("graph_model")
 
     for (layer_id, layer_graph) in layer_graphs:
         graph_nodes = []
         graph_edges = []
 
-        if not isinstance(layer_graph, KnowledgeGraph):
-            layer_graph = KnowledgeGraph.parse_obj(layer_graph)
+        if not isinstance(layer_graph, graph_model):
+            layer_graph = graph_model.parse_obj(layer_graph)
 
         for node in layer_graph.nodes:
-            node_id = generate_proposition_node_id(node.id)
+            node_id = generate_node_id(node.id)
 
-            entity_type_node_id = generate_type_node_id(node.entity_type)
-            entity_type_node = await graph_client.extract_node(entity_type_node_id)
+            type_node_id = generate_node_id(node.type)
+            type_node = await graph_client.extract_node(type_node_id)
 
-            if not entity_type_node:
-                node_name = node.entity_type.lower().capitalize()
+            if not type_node:
+                node_name = node.type.lower().capitalize()
 
-                entity_type_node = (
-                    entity_type_node_id,
+                type_node = (
+                    type_node_id,
                     dict(
-                        id = entity_type_node_id,
+                        id = type_node_id,
                         name = node_name,
-                        entity_type = node_name,
+                        type = node_name,
                         created_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                         updated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                     )
                 )
 
-                graph_nodes.append(entity_type_node)
+                graph_nodes.append(type_node)
 
             # Add relationship between document and entity type: "Document contains Person"
             graph_edges.append((
                 layer_id,
-                entity_type_node_id,
+                type_node_id,
                 "contains",
                 dict(relationship_name = "contains"),
             ))
 
-            pos_tags = extract_pos_tags(node.entity_description)
-            named_entities = extract_named_entities(node.entity_description)
-            sentiment = extract_sentiment_vader(node.entity_description)
+            # pos_tags = extract_pos_tags(node.description)
+            # named_entities = extract_named_entities(node.description)
+            # sentiment = extract_sentiment_vader(node.description)
+
+            id, type, name, description, *node_properties = node
 
             graph_nodes.append((
                 node_id,
@@ -67,21 +69,22 @@ async def add_cognitive_layer_graphs(
                     layer_id = layer_id,
                     chunk_id = chunk_id,
                     chunk_collection = chunk_collection,
-                    name = node.entity_name,
-                    entity_type = node.entity_type.lower().capitalize(),
-                    description = node.entity_description,
-                    pos_tags = pos_tags,
-                    sentiment = sentiment,
-                    named_entities = named_entities,
+                    name = node.name,
+                    type = node.type.lower().capitalize(),
+                    description = node.description,
+                    # pos_tags = pos_tags,
+                    # sentiment = sentiment,
+                    # named_entities = named_entities,
                     created_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                     updated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+                    *node_properties,
                 )
             ))
 
             # Add relationship between entity type and entity itself: "Jake is Person"
             graph_edges.append((
                 node_id,
-                entity_type_node_id,
+                type_node_id,
                 "is",
                 dict(relationship_name = "is"),
             ))
@@ -96,8 +99,8 @@ async def add_cognitive_layer_graphs(
         # Add relationship that came from graphs.
         for edge in layer_graph.edges:
             graph_edges.append((
-                generate_proposition_node_id(edge.source_node_id),
-                generate_proposition_node_id(edge.target_node_id),
+                generate_node_id(edge.source_node_id),
+                generate_node_id(edge.target_node_id),
                 edge.relationship_name,
                 dict(relationship_name = edge.relationship_name),
             ))
@@ -129,8 +132,5 @@ async def add_cognitive_layer_graphs(
         await vector_client.create_data_points(layer_id, data_points)
 
-def generate_proposition_node_id(node_id: str) -> str:
-    return f"PROPOSITION_NODE__{node_id.upper().replace(' ', '_')}".replace("'", "")
-
-def generate_type_node_id(node_id: str) -> str:
-    return f"PROPOSITION_TYPE_NODE__{node_id.upper().replace(' ', '_')}".replace("'", "")
+def generate_node_id(node_id: str) -> str:
+    return node_id.upper().replace(' ', '_').replace("'", "")
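
For a feel of what the new plain ids look like, two hypothetical inputs:

    generate_node_id("Boris's laptop")  # "BORISS_LAPTOP"
    generate_node_id("Person")          # "PERSON"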

View file

@@ -21,4 +21,4 @@ async def add_cognitive_layers(graph_client, parent_node_id: str, cognitive_laye
     return cognitive_layer_nodes
 
 def generate_cognitive_layer_id(layer_id: str) -> str:
-    return f"COGNITIVE_LAYER__{layer_id.upper().replace(' ', '_')}".replace("'", "").replace("/", "_")
+    return layer_id.upper().replace(" ", "_").replace("'", "").replace("/", "_")

View file

@@ -13,7 +13,7 @@ async def add_document_node(graph_client: GraphDBInterface, parent_node_id, docu
         file_path = document_metadata["file_path"],
     ).model_dump()
 
-    document["entity_type"] = "Document"
+    document["type"] = "Document"
 
     await graph_client.add_node(document_id, document)

View file

@@ -19,7 +19,7 @@ async def add_label_nodes(graph_client, parent_node_id: str, chunk_id: str, keyw
             chunk_id = chunk_id,
             name = keyword.lower().capitalize(),
             keyword = keyword.lower(),
-            entity_type = "Keyword",
+            type = "Keyword",
             created_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
             updated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
         ),

View file

@@ -20,7 +20,7 @@ async def add_summary_nodes(graph_client, document_id, summary):
         description_node_id,
         dict(
             name = "Description",
-            summary = summary["description"],
+            description = summary["description"],
         ),
     )

View file

@@ -7,18 +7,18 @@ from .extract_content_graph import extract_content_graph
 
 logger = logging.getLogger("extract_knowledge_graph(text: str)")
 
 async def extract_knowledge_graph(text: str, cognitive_layer, graph_model):
-    try:
-        compiled_extract_knowledge_graph = ExtractKnowledgeGraph()
-        compiled_extract_knowledge_graph.load(get_absolute_path("./programs/extract_knowledge_graph/extract_knowledge_graph.json"))
+    # try:
+    #     compiled_extract_knowledge_graph = ExtractKnowledgeGraph()
+    #     compiled_extract_knowledge_graph.load(get_absolute_path("./programs/extract_knowledge_graph/extract_knowledge_graph.json"))
 
-        event_loop = asyncio.get_event_loop()
+    #     event_loop = asyncio.get_event_loop()
 
-        def sync_extract_knowledge_graph():
-            return compiled_extract_knowledge_graph(context = text, question = "")
+    #     def sync_extract_knowledge_graph():
+    #         return compiled_extract_knowledge_graph(context = text, question = "")
 
-        return (await event_loop.run_in_executor(None, sync_extract_knowledge_graph)).graph
-        # return compiled_extract_knowledge_graph(text, question = "").graph
-    except Exception as error:
-        logger.error("Error extracting graph from content: %s", error, exc_info = True)
+    #     return (await event_loop.run_in_executor(None, sync_extract_knowledge_graph)).graph
+    #     # return compiled_extract_knowledge_graph(text, question = "").graph
+    # except Exception as error:
+    #     logger.error("Error extracting graph from content: %s", error, exc_info = True)
 
-        return await extract_content_graph(text, cognitive_layer, graph_model)
+    return await extract_content_graph(text, cognitive_layer, graph_model)
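
With the DSPy path commented out, the function is now a thin wrapper around extract_content_graph. A minimal usage sketch from within the same module, assuming KnowledgeGraph as the graph model and a plain layer name string (both are assumptions):

    import asyncio
    from cognee.shared.data_models import KnowledgeGraph

    async def main():
        graph = await extract_knowledge_graph(
            "Jake works at Acme.",  # text to extract a graph from
            "SUMMARY_LAYER",        # cognitive layer (assumed here to be a name string)
            KnowledgeGraph,         # graph model to parse the extraction into
        )
        print(graph.nodes)

    asyncio.run(main())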

View file

@@ -29,8 +29,8 @@ class GraphFromText(dspy.Signature):
 
 def are_all_nodes_and_edges_valid(graph: KnowledgeGraph) -> bool:
-    return all([getattr(node, "entity_type", "").strip() != "" for node in graph.nodes]) and \
-        all([getattr(node, "entity_name", "").strip() != "" for node in graph.nodes]) and \
+    return all([getattr(node, "type", "").strip() != "" for node in graph.nodes]) and \
+        all([getattr(node, "name", "").strip() != "" for node in graph.nodes]) and \
         all([getattr(edge, "relationship_name", "").strip() != "" for edge in graph.edges])
 
 def is_node_connected(node: Node, edges: List[Edge]) -> bool:
@@ -56,7 +56,7 @@ class ExtractKnowledgeGraph(dspy.Module):
         graph = self.generate_graph(text = context).graph
 
         not_valid_nodes_or_edges_message = """
-            All nodes must contain "entity_name".
+            All nodes must contain "name".
             All edges must contain "relationship_name".
             Please add mandatory fields to nodes and edges."""

View file

@@ -0,0 +1,84 @@
from typing import List, Union, Literal, Optional
from pydantic import BaseModel

class BaseClass(BaseModel):
    id: str
    name: str
    type: Literal["Class"] = "Class"
    description: str
    constructor_parameters: Optional[List[str]]

class Class(BaseModel):
    id: str
    name: str
    type: Literal["Class"] = "Class"
    description: str
    constructor_parameters: Optional[List[str]]
    from_class: Optional[BaseClass]

class ClassInstance(BaseModel):
    id: str
    name: str
    type: Literal["ClassInstance"] = "ClassInstance"
    description: str
    from_class: Class

class Function(BaseModel):
    id: str
    name: str
    type: Literal["Function"] = "Function"
    description: str
    parameters: Optional[List[str]]
    return_type: str
    is_static: Optional[bool] = False

class Variable(BaseModel):
    id: str
    name: str
    type: Literal["Variable"] = "Variable"
    description: str
    is_static: Optional[bool] = False
    default_value: Optional[str]

class Operator(BaseModel):
    id: str
    name: str
    type: Literal["Operator"] = "Operator"
    description: str
    return_type: str

class ExpressionPart(BaseModel):
    id: str
    name: str
    type: Literal["Expression"] = "Expression"
    description: str
    expression: str
    members: List[Union[Variable, Function, Operator]]

class Expression(BaseModel):
    id: str
    name: str
    type: Literal["Expression"] = "Expression"
    description: str
    expression: str
    members: List[Union[Variable, Function, Operator, ExpressionPart]]

class Edge(BaseModel):
    source_node_id: str
    target_node_id: str
    relationship_name: Literal["called in", "stored in", "defined in", "returned by", "instantiated in", "uses", "updates"]

class SourceCodeGraph(BaseModel):
    id: str
    name: str
    description: str
    language: str
    nodes: List[Union[
        Class,
        Function,
        Variable,
        Operator,
        Expression,
        ClassInstance,
    ]]
    edges: List[Edge]
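
For illustration, a minimal sketch of assembling a graph from these models (all node contents are made up):

    add_node = Function(
        id = "add_node",
        name = "add_node",
        description = "Adds a node to the graph.",
        parameters = ["node_id", "node_properties"],
        return_type = "None",
        is_static = False,
    )
    node_id = Variable(
        id = "node_id",
        name = "node_id",
        description = "Unique identifier of a graph node.",
        is_static = False,
        default_value = None,
    )
    graph = SourceCodeGraph(
        id = "example_graph",
        name = "Example graph",
        description = "A minimal source code graph.",
        language = "python",
        nodes = [add_node, node_id],
        edges = [Edge(
            source_node_id = "add_node",
            target_node_id = "node_id",
            relationship_name = "uses",
        )],
    )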

View file

@@ -7,9 +7,9 @@ from pydantic import BaseModel, Field
 
 class Node(BaseModel):
     """Node in a knowledge graph."""
     id: str
-    entity_name: str
-    entity_type: str
-    entity_description: str
+    name: str
+    type: str
+    description: str
 
 class Edge(BaseModel):
     """Edge in a knowledge graph."""
@@ -26,8 +26,6 @@ class GraphQLQuery(BaseModel):
     """GraphQL query."""
     query: str
 
-
-
 class Answer(BaseModel):
     """Answer."""
     answer: str

@@ -42,7 +40,6 @@ class MemorySummary(BaseModel):
     nodes: List[Node] = Field(..., default_factory=list)
     edges: List[Edge] = Field(..., default_factory=list)
 
-
 class TextSubclass(str, Enum):
     ARTICLES = "Articles, essays, and reports"
     BOOKS = "Books and manuscripts"

@@ -107,7 +104,6 @@ class ImageSubclass(str, Enum):
     SCREENSHOTS = "Screenshots and graphical user interfaces"
     OTHER_IMAGES = "Other types of images"
 
-
 class VideoSubclass(str, Enum):
     MOVIES = "Movies and short films"
     DOCUMENTARIES = "Documentaries and educational videos"

@@ -183,7 +179,6 @@ class DefaultContentPrediction(BaseModel):
         ProceduralContent,
     ]
 
-
 class SummarizedContent(BaseModel):
     """Class for a single class label summary and description."""
     summary: str

@@ -194,7 +189,6 @@ class LabeledContent(BaseModel):
     content_labels: str
 
-
 class CognitiveLayerSubgroup(BaseModel):
     """ CognitiveLayerSubgroup in a general layer """
     id: int

View file

@@ -11,6 +11,7 @@
     "import cognee\n",
     "import dspy\n",
     "from cognee.modules.cognify.dataset import HotPotQA\n",
+    "from cognee.shared.SourceCodeGraph import SourceCodeGraph\n",
     "\n",
     "data_directory_path = path.abspath(\"../.data\")\n",
     "cognee.config.data_root_directory(data_directory_path)\n",
@@ -18,6 +19,8 @@
     "cognee_directory_path = path.abspath(\"../.cognee_system\")\n",
     "cognee.config.system_root_directory(cognee_directory_path)\n",
     "\n",
+    "cognee.config.set_graph_model(SourceCodeGraph)\n",
+    "\n",
     "await cognee.prune.prune_system()\n",
     "\n",
     "colbertv2_wiki17_abstracts = dspy.ColBERTv2(url = \"http://20.102.90.50:2017/wiki17_abstracts\")\n",
@@ -39,7 +42,7 @@
     "\n",
     "# texts_to_add.append(train_case_text)\n",
     "\n",
-    "dataset_name = \"short_stories\"\n",
+    "dataset_name = \"code\"\n",
     "await cognee.add(\"data://\" + data_directory_path, dataset_name)\n"
    ]
   },
@@ -75,6 +78,9 @@
     "from os import path\n",
     "import logging\n",
     "import cognee\n",
+    "from cognee.shared.SourceCodeGraph import SourceCodeGraph\n",
     "\n",
+    "cognee.config.set_graph_model(SourceCodeGraph)\n",
+    "\n",
     "logging.basicConfig(level = logging.INFO)\n",
     "\n",
@@ -86,7 +92,7 @@
     "cognee_directory_path = path.abspath(\"../.cognee_system\")\n",
     "cognee.config.system_root_directory(cognee_directory_path)\n",
     "\n",
-    "await cognee.cognify('short_stories')"
+    "await cognee.cognify('code')"
    ]
   },
  {