Intermediate commit
This commit is contained in:
parent
1c2c72b8bf
commit
f655ee8194
12 changed files with 176 additions and 64 deletions
28
.data/code/example.txt
Normal file
28
.data/code/example.txt
Normal file
|
|
@ -0,0 +1,28 @@
|
||||||
|
'''
|
||||||
|
Given a string, find the length of the longest substring without repeating characters.
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
|
||||||
|
Given "abcabcbb", the answer is "abc", with the length of 3.
|
||||||
|
|
||||||
|
Given "bbbbb", the answer is "b", with the length of 1.
|
||||||
|
|
||||||
|
Given "pwwkew", the answer is "wke", with the length of 3. Note that the answer must be a substring, "pwke" is a subsequence and not a substring.
|
||||||
|
'''
|
||||||
|
|
||||||
|
class Solution(object):
|
||||||
|
def lengthOfLongestSubstring(self, s):
|
||||||
|
"""
|
||||||
|
:type s: str
|
||||||
|
:rtype: int
|
||||||
|
"""
|
||||||
|
mapSet = {}
|
||||||
|
start, result = 0, 0
|
||||||
|
|
||||||
|
for end in range(len(s)):
|
||||||
|
if s[end] in mapSet:
|
||||||
|
start = max(mapSet[s[end]], start)
|
||||||
|
result = max(result, end-start+1)
|
||||||
|
mapSet[s[end]] = end+1
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
@ -5,20 +5,20 @@ async def add_classification_nodes(graph_client, parent_node_id: str, categories
|
||||||
data_type = category["data_type"].upper().replace(' ', '_')
|
data_type = category["data_type"].upper().replace(' ', '_')
|
||||||
category_name = category["category_name"].upper().replace(' ', '_').replace("'", "").replace("/", "_")
|
category_name = category["category_name"].upper().replace(' ', '_').replace("'", "").replace("/", "_")
|
||||||
|
|
||||||
data_type_node_id = f"DATA_TYPE__{data_type}"
|
data_type_node_id = data_type
|
||||||
|
|
||||||
data_type_node = await graph_client.extract_node(data_type_node_id)
|
data_type_node = await graph_client.extract_node(data_type_node_id)
|
||||||
|
|
||||||
if not data_type_node:
|
if not data_type_node:
|
||||||
data_type_node = await graph_client.add_node(data_type_node_id, dict(name = data_type, entity_type = "DataType"))
|
data_type_node = await graph_client.add_node(data_type_node_id, dict(name = data_type, type = "DataType"))
|
||||||
|
|
||||||
await graph_client.add_edge(data_type_node_id, parent_node_id, "classified_as", dict(relationship_name = "classified_as"))
|
await graph_client.add_edge(data_type_node_id, parent_node_id, "classified_as", dict(relationship_name = "classified_as"))
|
||||||
|
|
||||||
category_node_id = f"DATA_CATEGORY__{category_name}"
|
category_node_id = category_name
|
||||||
|
|
||||||
category_node = await graph_client.extract_node(category_node_id)
|
category_node = await graph_client.extract_node(category_node_id)
|
||||||
|
|
||||||
if not category_node:
|
if not category_node:
|
||||||
category_node = await graph_client.add_node(category_node_id, dict(name = category_name, entity_type = "DataCategory"))
|
category_node = await graph_client.add_node(category_node_id, dict(name = category_name, type = "DataCategory"))
|
||||||
|
|
||||||
await graph_client.add_edge(category_node_id, parent_node_id, "classified_as", dict(relationship_name = "classified_as"))
|
await graph_client.add_edge(category_node_id, parent_node_id, "classified_as", dict(relationship_name = "classified_as"))
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,6 @@ from uuid import uuid4
|
||||||
from typing import List, Tuple, TypedDict
|
from typing import List, Tuple, TypedDict
|
||||||
from cognee.infrastructure import infrastructure_config
|
from cognee.infrastructure import infrastructure_config
|
||||||
from cognee.infrastructure.databases.vector import DataPoint
|
from cognee.infrastructure.databases.vector import DataPoint
|
||||||
from cognee.shared.data_models import KnowledgeGraph
|
|
||||||
from cognee.utils import extract_pos_tags, extract_named_entities, extract_sentiment_vader
|
from cognee.utils import extract_pos_tags, extract_named_entities, extract_sentiment_vader
|
||||||
|
|
||||||
class GraphLike(TypedDict):
|
class GraphLike(TypedDict):
|
||||||
|
|
@ -18,47 +17,50 @@ async def add_cognitive_layer_graphs(
|
||||||
layer_graphs: List[Tuple[str, GraphLike]],
|
layer_graphs: List[Tuple[str, GraphLike]],
|
||||||
):
|
):
|
||||||
vector_client = infrastructure_config.get_config("vector_engine")
|
vector_client = infrastructure_config.get_config("vector_engine")
|
||||||
|
graph_model = infrastructure_config.get_config("graph_model")
|
||||||
|
|
||||||
for (layer_id, layer_graph) in layer_graphs:
|
for (layer_id, layer_graph) in layer_graphs:
|
||||||
graph_nodes = []
|
graph_nodes = []
|
||||||
graph_edges = []
|
graph_edges = []
|
||||||
|
|
||||||
if not isinstance(layer_graph, KnowledgeGraph):
|
if not isinstance(layer_graph, graph_model):
|
||||||
layer_graph = KnowledgeGraph.parse_obj(layer_graph)
|
layer_graph = graph_model.parse_obj(layer_graph)
|
||||||
|
|
||||||
for node in layer_graph.nodes:
|
for node in layer_graph.nodes:
|
||||||
node_id = generate_proposition_node_id(node.id)
|
node_id = generate_node_id(node.id)
|
||||||
|
|
||||||
entity_type_node_id = generate_type_node_id(node.entity_type)
|
type_node_id = generate_node_id(node.type)
|
||||||
entity_type_node = await graph_client.extract_node(entity_type_node_id)
|
type_node = await graph_client.extract_node(type_node_id)
|
||||||
|
|
||||||
if not entity_type_node:
|
if not type_node:
|
||||||
node_name = node.entity_type.lower().capitalize()
|
node_name = node.type.lower().capitalize()
|
||||||
|
|
||||||
entity_type_node = (
|
type_node = (
|
||||||
entity_type_node_id,
|
type_node_id,
|
||||||
dict(
|
dict(
|
||||||
id = entity_type_node_id,
|
id = type_node_id,
|
||||||
name = node_name,
|
name = node_name,
|
||||||
entity_type = node_name,
|
type = node_name,
|
||||||
created_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
created_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||||
updated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
updated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
graph_nodes.append(entity_type_node)
|
graph_nodes.append(type_node)
|
||||||
|
|
||||||
# Add relationship between document and entity type: "Document contains Person"
|
# Add relationship between document and entity type: "Document contains Person"
|
||||||
graph_edges.append((
|
graph_edges.append((
|
||||||
layer_id,
|
layer_id,
|
||||||
entity_type_node_id,
|
type_node_id,
|
||||||
"contains",
|
"contains",
|
||||||
dict(relationship_name = "contains"),
|
dict(relationship_name = "contains"),
|
||||||
))
|
))
|
||||||
|
|
||||||
pos_tags = extract_pos_tags(node.entity_description)
|
# pos_tags = extract_pos_tags(node.description)
|
||||||
named_entities = extract_named_entities(node.entity_description)
|
# named_entities = extract_named_entities(node.description)
|
||||||
sentiment = extract_sentiment_vader(node.entity_description)
|
# sentiment = extract_sentiment_vader(node.description)
|
||||||
|
|
||||||
|
id, type, name, description, *node_properties = node
|
||||||
|
|
||||||
graph_nodes.append((
|
graph_nodes.append((
|
||||||
node_id,
|
node_id,
|
||||||
|
|
@ -67,21 +69,22 @@ async def add_cognitive_layer_graphs(
|
||||||
layer_id = layer_id,
|
layer_id = layer_id,
|
||||||
chunk_id = chunk_id,
|
chunk_id = chunk_id,
|
||||||
chunk_collection = chunk_collection,
|
chunk_collection = chunk_collection,
|
||||||
name = node.entity_name,
|
name = node.name,
|
||||||
entity_type = node.entity_type.lower().capitalize(),
|
type = node.type.lower().capitalize(),
|
||||||
description = node.entity_description,
|
description = node.description,
|
||||||
pos_tags = pos_tags,
|
# pos_tags = pos_tags,
|
||||||
sentiment = sentiment,
|
# sentiment = sentiment,
|
||||||
named_entities = named_entities,
|
# named_entities = named_entities,
|
||||||
created_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
created_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||||
updated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
updated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||||
|
*node_properties,
|
||||||
)
|
)
|
||||||
))
|
))
|
||||||
|
|
||||||
# Add relationship between entity type and entity itself: "Jake is Person"
|
# Add relationship between entity type and entity itself: "Jake is Person"
|
||||||
graph_edges.append((
|
graph_edges.append((
|
||||||
node_id,
|
node_id,
|
||||||
entity_type_node_id,
|
type_node_id,
|
||||||
"is",
|
"is",
|
||||||
dict(relationship_name = "is"),
|
dict(relationship_name = "is"),
|
||||||
))
|
))
|
||||||
|
|
@ -96,8 +99,8 @@ async def add_cognitive_layer_graphs(
|
||||||
# Add relationship that came from graphs.
|
# Add relationship that came from graphs.
|
||||||
for edge in layer_graph.edges:
|
for edge in layer_graph.edges:
|
||||||
graph_edges.append((
|
graph_edges.append((
|
||||||
generate_proposition_node_id(edge.source_node_id),
|
generate_node_id(edge.source_node_id),
|
||||||
generate_proposition_node_id(edge.target_node_id),
|
generate_node_id(edge.target_node_id),
|
||||||
edge.relationship_name,
|
edge.relationship_name,
|
||||||
dict(relationship_name = edge.relationship_name),
|
dict(relationship_name = edge.relationship_name),
|
||||||
))
|
))
|
||||||
|
|
@ -129,8 +132,5 @@ async def add_cognitive_layer_graphs(
|
||||||
await vector_client.create_data_points(layer_id, data_points)
|
await vector_client.create_data_points(layer_id, data_points)
|
||||||
|
|
||||||
|
|
||||||
def generate_proposition_node_id(node_id: str) -> str:
|
def generate_node_id(node_id: str) -> str:
|
||||||
return f"PROPOSITION_NODE__{node_id.upper().replace(' ', '_')}".replace("'", "")
|
return node_id.upper().replace(' ', '_').replace("'", "")
|
||||||
|
|
||||||
def generate_type_node_id(node_id: str) -> str:
|
|
||||||
return f"PROPOSITION_TYPE_NODE__{node_id.upper().replace(' ', '_')}".replace("'", "")
|
|
||||||
|
|
|
||||||
|
|
@ -21,4 +21,4 @@ async def add_cognitive_layers(graph_client, parent_node_id: str, cognitive_laye
|
||||||
return cognitive_layer_nodes
|
return cognitive_layer_nodes
|
||||||
|
|
||||||
def generate_cognitive_layer_id(layer_id: str) -> str:
|
def generate_cognitive_layer_id(layer_id: str) -> str:
|
||||||
return f"COGNITIVE_LAYER__{layer_id.upper().replace(' ', '_')}".replace("'", "").replace("/", "_")
|
return layer_id.upper().replace(" ", "_").replace("'", "").replace("/", "_")
|
||||||
|
|
|
||||||
|
|
@ -13,7 +13,7 @@ async def add_document_node(graph_client: GraphDBInterface, parent_node_id, docu
|
||||||
file_path = document_metadata["file_path"],
|
file_path = document_metadata["file_path"],
|
||||||
).model_dump()
|
).model_dump()
|
||||||
|
|
||||||
document["entity_type"] = "Document"
|
document["type"] = "Document"
|
||||||
|
|
||||||
await graph_client.add_node(document_id, document)
|
await graph_client.add_node(document_id, document)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -19,7 +19,7 @@ async def add_label_nodes(graph_client, parent_node_id: str, chunk_id: str, keyw
|
||||||
chunk_id = chunk_id,
|
chunk_id = chunk_id,
|
||||||
name = keyword.lower().capitalize(),
|
name = keyword.lower().capitalize(),
|
||||||
keyword = keyword.lower(),
|
keyword = keyword.lower(),
|
||||||
entity_type = "Keyword",
|
type = "Keyword",
|
||||||
created_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
created_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||||
updated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
updated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||||
),
|
),
|
||||||
|
|
|
||||||
|
|
@ -20,7 +20,7 @@ async def add_summary_nodes(graph_client, document_id, summary):
|
||||||
description_node_id,
|
description_node_id,
|
||||||
dict(
|
dict(
|
||||||
name = "Description",
|
name = "Description",
|
||||||
summary = summary["description"],
|
description = summary["description"],
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -7,18 +7,18 @@ from .extract_content_graph import extract_content_graph
|
||||||
logger = logging.getLogger("extract_knowledge_graph(text: str)")
|
logger = logging.getLogger("extract_knowledge_graph(text: str)")
|
||||||
|
|
||||||
async def extract_knowledge_graph(text: str, cognitive_layer, graph_model):
|
async def extract_knowledge_graph(text: str, cognitive_layer, graph_model):
|
||||||
try:
|
# try:
|
||||||
compiled_extract_knowledge_graph = ExtractKnowledgeGraph()
|
# compiled_extract_knowledge_graph = ExtractKnowledgeGraph()
|
||||||
compiled_extract_knowledge_graph.load(get_absolute_path("./programs/extract_knowledge_graph/extract_knowledge_graph.json"))
|
# compiled_extract_knowledge_graph.load(get_absolute_path("./programs/extract_knowledge_graph/extract_knowledge_graph.json"))
|
||||||
|
|
||||||
event_loop = asyncio.get_event_loop()
|
# event_loop = asyncio.get_event_loop()
|
||||||
|
|
||||||
def sync_extract_knowledge_graph():
|
# def sync_extract_knowledge_graph():
|
||||||
return compiled_extract_knowledge_graph(context = text, question = "")
|
# return compiled_extract_knowledge_graph(context = text, question = "")
|
||||||
|
|
||||||
return (await event_loop.run_in_executor(None, sync_extract_knowledge_graph)).graph
|
# return (await event_loop.run_in_executor(None, sync_extract_knowledge_graph)).graph
|
||||||
# return compiled_extract_knowledge_graph(text, question = "").graph
|
# # return compiled_extract_knowledge_graph(text, question = "").graph
|
||||||
except Exception as error:
|
# except Exception as error:
|
||||||
logger.error("Error extracting graph from content: %s", error, exc_info = True)
|
# logger.error("Error extracting graph from content: %s", error, exc_info = True)
|
||||||
|
|
||||||
return await extract_content_graph(text, cognitive_layer, graph_model)
|
return await extract_content_graph(text, cognitive_layer, graph_model)
|
||||||
|
|
|
||||||
|
|
@ -29,8 +29,8 @@ class GraphFromText(dspy.Signature):
|
||||||
|
|
||||||
|
|
||||||
def are_all_nodes_and_edges_valid(graph: KnowledgeGraph) -> bool:
|
def are_all_nodes_and_edges_valid(graph: KnowledgeGraph) -> bool:
|
||||||
return all([getattr(node, "entity_type", "").strip() != "" for node in graph.nodes]) and \
|
return all([getattr(node, "type", "").strip() != "" for node in graph.nodes]) and \
|
||||||
all([getattr(node, "entity_name", "").strip() != "" for node in graph.nodes]) and \
|
all([getattr(node, "name", "").strip() != "" for node in graph.nodes]) and \
|
||||||
all([getattr(edge, "relationship_name", "").strip() != "" for edge in graph.edges])
|
all([getattr(edge, "relationship_name", "").strip() != "" for edge in graph.edges])
|
||||||
|
|
||||||
def is_node_connected(node: Node, edges: List[Edge]) -> bool:
|
def is_node_connected(node: Node, edges: List[Edge]) -> bool:
|
||||||
|
|
@ -56,7 +56,7 @@ class ExtractKnowledgeGraph(dspy.Module):
|
||||||
graph = self.generate_graph(text = context).graph
|
graph = self.generate_graph(text = context).graph
|
||||||
|
|
||||||
not_valid_nodes_or_edges_message = """
|
not_valid_nodes_or_edges_message = """
|
||||||
All nodes must contain "entity_name".
|
All nodes must contain "name".
|
||||||
All edges must contain "relationship_name".
|
All edges must contain "relationship_name".
|
||||||
Please add mandatory fields to nodes and edges."""
|
Please add mandatory fields to nodes and edges."""
|
||||||
|
|
||||||
|
|
|
||||||
84
cognee/shared/SourceCodeGraph.py
Normal file
84
cognee/shared/SourceCodeGraph.py
Normal file
|
|
@ -0,0 +1,84 @@
|
||||||
|
from typing import List, Union, Literal, Optional
from pydantic import BaseModel


class BaseClass(BaseModel):
    """A class definition that other classes inherit from."""
    id: str
    name: str
    # NOTE(review): shares the "Class" tag with Class below — confirm the two
    # remain distinguishable if they ever appear in the same tagged union.
    type: Literal["Class"] = "Class"
    description: str
    constructor_parameters: Optional[List[str]]


class Class(BaseModel):
    """A class definition in the analyzed source code."""
    id: str
    name: str
    type: Literal["Class"] = "Class"
    description: str
    constructor_parameters: Optional[List[str]]
    from_class: Optional[BaseClass]  # parent class, if this one inherits


class ClassInstance(BaseModel):
    """An instantiated object of a Class."""
    id: str
    name: str
    type: Literal["ClassInstance"] = "ClassInstance"
    description: str
    from_class: Class  # the class this instance was created from


class Function(BaseModel):
    """A function or method definition."""
    id: str
    name: str
    type: Literal["Function"] = "Function"
    description: str
    parameters: Optional[List[str]]
    return_type: str
    is_static: Optional[bool] = False


class Variable(BaseModel):
    """A variable declaration."""
    id: str
    name: str
    type: Literal["Variable"] = "Variable"
    description: str
    is_static: Optional[bool] = False
    default_value: Optional[str]


class Operator(BaseModel):
    """An operator applied to operands inside an expression."""
    id: str
    name: str
    type: Literal["Operator"] = "Operator"
    description: str
    return_type: str


class ExpressionPart(BaseModel):
    """A sub-expression nested inside an Expression."""
    id: str
    name: str
    # NOTE(review): reuses the "Expression" tag of Expression below — confirm
    # this is intentional, since the tag alone cannot tell the two apart.
    type: Literal["Expression"] = "Expression"
    description: str
    expression: str
    members: List[Union[Variable, Function, Operator]]


class Expression(BaseModel):
    """A full expression composed of variables, functions, operators,
    and nested sub-expressions."""
    id: str
    name: str
    type: Literal["Expression"] = "Expression"
    description: str
    expression: str
    members: List[Union[Variable, Function, Operator, ExpressionPart]]


class Edge(BaseModel):
    """A directed relationship between two nodes of the code graph."""
    source_node_id: str
    target_node_id: str
    relationship_name: Literal["called in", "stored in", "defined in", "returned by", "instantiated in", "uses", "updates"]


class SourceCodeGraph(BaseModel):
    """The knowledge graph extracted from a piece of source code:
    typed nodes plus the directed edges connecting them."""
    id: str
    name: str
    description: str
    language: str  # programming language of the analyzed source
    nodes: List[Union[
        Class,
        Function,
        Variable,
        Operator,
        Expression,
        ClassInstance,
    ]]
    edges: List[Edge]
|
||||||
|
|
@ -7,9 +7,9 @@ from pydantic import BaseModel, Field
|
||||||
class Node(BaseModel):
|
class Node(BaseModel):
|
||||||
"""Node in a knowledge graph."""
|
"""Node in a knowledge graph."""
|
||||||
id: str
|
id: str
|
||||||
entity_name: str
|
name: str
|
||||||
entity_type: str
|
type: str
|
||||||
entity_description: str
|
description: str
|
||||||
|
|
||||||
class Edge(BaseModel):
|
class Edge(BaseModel):
|
||||||
"""Edge in a knowledge graph."""
|
"""Edge in a knowledge graph."""
|
||||||
|
|
@ -26,8 +26,6 @@ class GraphQLQuery(BaseModel):
|
||||||
"""GraphQL query."""
|
"""GraphQL query."""
|
||||||
query: str
|
query: str
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class Answer(BaseModel):
|
class Answer(BaseModel):
|
||||||
"""Answer."""
|
"""Answer."""
|
||||||
answer: str
|
answer: str
|
||||||
|
|
@ -42,7 +40,6 @@ class MemorySummary(BaseModel):
|
||||||
nodes: List[Node] = Field(..., default_factory=list)
|
nodes: List[Node] = Field(..., default_factory=list)
|
||||||
edges: List[Edge] = Field(..., default_factory=list)
|
edges: List[Edge] = Field(..., default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
class TextSubclass(str, Enum):
|
class TextSubclass(str, Enum):
|
||||||
ARTICLES = "Articles, essays, and reports"
|
ARTICLES = "Articles, essays, and reports"
|
||||||
BOOKS = "Books and manuscripts"
|
BOOKS = "Books and manuscripts"
|
||||||
|
|
@ -107,7 +104,6 @@ class ImageSubclass(str, Enum):
|
||||||
SCREENSHOTS = "Screenshots and graphical user interfaces"
|
SCREENSHOTS = "Screenshots and graphical user interfaces"
|
||||||
OTHER_IMAGES = "Other types of images"
|
OTHER_IMAGES = "Other types of images"
|
||||||
|
|
||||||
|
|
||||||
class VideoSubclass(str, Enum):
|
class VideoSubclass(str, Enum):
|
||||||
MOVIES = "Movies and short films"
|
MOVIES = "Movies and short films"
|
||||||
DOCUMENTARIES = "Documentaries and educational videos"
|
DOCUMENTARIES = "Documentaries and educational videos"
|
||||||
|
|
@ -183,7 +179,6 @@ class DefaultContentPrediction(BaseModel):
|
||||||
ProceduralContent,
|
ProceduralContent,
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
class SummarizedContent(BaseModel):
|
class SummarizedContent(BaseModel):
|
||||||
"""Class for a single class label summary and description."""
|
"""Class for a single class label summary and description."""
|
||||||
summary: str
|
summary: str
|
||||||
|
|
@ -194,7 +189,6 @@ class LabeledContent(BaseModel):
|
||||||
content_labels: str
|
content_labels: str
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class CognitiveLayerSubgroup(BaseModel):
|
class CognitiveLayerSubgroup(BaseModel):
|
||||||
""" CognitiveLayerSubgroup in a general layer """
|
""" CognitiveLayerSubgroup in a general layer """
|
||||||
id: int
|
id: int
|
||||||
|
|
|
||||||
|
|
@ -11,6 +11,7 @@
|
||||||
"import cognee\n",
|
"import cognee\n",
|
||||||
"import dspy\n",
|
"import dspy\n",
|
||||||
"from cognee.modules.cognify.dataset import HotPotQA\n",
|
"from cognee.modules.cognify.dataset import HotPotQA\n",
|
||||||
|
"from cognee.shared.SourceCodeGraph import SourceCodeGraph\n",
|
||||||
"\n",
|
"\n",
|
||||||
"data_directory_path = path.abspath(\"../.data\")\n",
|
"data_directory_path = path.abspath(\"../.data\")\n",
|
||||||
"cognee.config.data_root_directory(data_directory_path)\n",
|
"cognee.config.data_root_directory(data_directory_path)\n",
|
||||||
|
|
@ -18,6 +19,8 @@
|
||||||
"cognee_directory_path = path.abspath(\"../.cognee_system\")\n",
|
"cognee_directory_path = path.abspath(\"../.cognee_system\")\n",
|
||||||
"cognee.config.system_root_directory(cognee_directory_path)\n",
|
"cognee.config.system_root_directory(cognee_directory_path)\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
"cognee.config.set_graph_model(SourceCodeGraph)\n",
|
||||||
|
"\n",
|
||||||
"await cognee.prune.prune_system()\n",
|
"await cognee.prune.prune_system()\n",
|
||||||
"\n",
|
"\n",
|
||||||
"colbertv2_wiki17_abstracts = dspy.ColBERTv2(url = \"http://20.102.90.50:2017/wiki17_abstracts\")\n",
|
"colbertv2_wiki17_abstracts = dspy.ColBERTv2(url = \"http://20.102.90.50:2017/wiki17_abstracts\")\n",
|
||||||
|
|
@ -39,7 +42,7 @@
|
||||||
"\n",
|
"\n",
|
||||||
"# texts_to_add.append(train_case_text)\n",
|
"# texts_to_add.append(train_case_text)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"dataset_name = \"short_stories\"\n",
|
"dataset_name = \"code\"\n",
|
||||||
"await cognee.add(\"data://\" + data_directory_path, dataset_name)\n"
|
"await cognee.add(\"data://\" + data_directory_path, dataset_name)\n"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
|
@ -75,6 +78,9 @@
|
||||||
"from os import path\n",
|
"from os import path\n",
|
||||||
"import logging\n",
|
"import logging\n",
|
||||||
"import cognee\n",
|
"import cognee\n",
|
||||||
|
"from cognee.shared.SourceCodeGraph import SourceCodeGraph\n",
|
||||||
|
"\n",
|
||||||
|
"cognee.config.set_graph_model(SourceCodeGraph)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"logging.basicConfig(level = logging.INFO)\n",
|
"logging.basicConfig(level = logging.INFO)\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
|
@ -86,7 +92,7 @@
|
||||||
"cognee_directory_path = path.abspath(\"../.cognee_system\")\n",
|
"cognee_directory_path = path.abspath(\"../.cognee_system\")\n",
|
||||||
"cognee.config.system_root_directory(cognee_directory_path)\n",
|
"cognee.config.system_root_directory(cognee_directory_path)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"await cognee.cognify('short_stories')"
|
"await cognee.cognify('code')"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue