diff --git a/.data/code/example.txt b/.data/code/example.txt
new file mode 100644
index 000000000..4596a08eb
--- /dev/null
+++ b/.data/code/example.txt
@@ -0,0 +1,28 @@
+'''
+    Given a string, find the length of the longest substring without repeating characters.
+
+    Examples:
+
+    Given "abcabcbb", the answer is "abc", with the length of 3.
+
+    Given "bbbbb", the answer is "b", with the length of 1.
+
+    Given "pwwkew", the answer is "wke", with the length of 3. Note that the answer must be a substring, "pwke" is a subsequence and not a substring.
+'''
+
+class Solution(object):
+    def lengthOfLongestSubstring(self, s):
+        """
+        :type s: str
+        :rtype: int
+        """
+        mapSet = {}
+        start, result = 0, 0
+
+        for end in range(len(s)):
+            if s[end] in mapSet:
+                start = max(mapSet[s[end]], start)
+            result = max(result, end-start+1)
+            mapSet[s[end]] = end+1
+
+        return result
diff --git a/cognee/modules/cognify/graph/add_classification_nodes.py b/cognee/modules/cognify/graph/add_classification_nodes.py
index 67170b229..7a3a393df 100644
--- a/cognee/modules/cognify/graph/add_classification_nodes.py
+++ b/cognee/modules/cognify/graph/add_classification_nodes.py
@@ -5,20 +5,20 @@ async def add_classification_nodes(graph_client, parent_node_id: str, categories
         data_type = category["data_type"].upper().replace(' ', '_')
         category_name = category["category_name"].upper().replace(' ', '_').replace("'", "").replace("/", "_")
 
-        data_type_node_id = f"DATA_TYPE__{data_type}"
+        data_type_node_id = data_type
 
         data_type_node = await graph_client.extract_node(data_type_node_id)
 
         if not data_type_node:
-            data_type_node = await graph_client.add_node(data_type_node_id, dict(name = data_type, entity_type = "DataType"))
+            data_type_node = await graph_client.add_node(data_type_node_id, dict(name = data_type, type = "DataType"))
 
         await graph_client.add_edge(data_type_node_id, parent_node_id, "classified_as", dict(relationship_name = "classified_as"))
 
-        category_node_id = f"DATA_CATEGORY__{category_name}"
+        category_node_id = category_name
 
         category_node = await graph_client.extract_node(category_node_id)
 
         if not category_node:
-            category_node = await graph_client.add_node(category_node_id, dict(name = category_name, entity_type = "DataCategory"))
+            category_node = await graph_client.add_node(category_node_id, dict(name = category_name, type = "DataCategory"))
 
         await graph_client.add_edge(category_node_id, parent_node_id, "classified_as", dict(relationship_name = "classified_as"))
diff --git a/cognee/modules/cognify/graph/add_cognitive_layer_graphs.py b/cognee/modules/cognify/graph/add_cognitive_layer_graphs.py
index 11cc83a19..055362d8c 100644
--- a/cognee/modules/cognify/graph/add_cognitive_layer_graphs.py
+++ b/cognee/modules/cognify/graph/add_cognitive_layer_graphs.py
@@ -3,7 +3,6 @@ from uuid import uuid4
 from typing import List, Tuple, TypedDict
 from cognee.infrastructure import infrastructure_config
 from cognee.infrastructure.databases.vector import DataPoint
-from cognee.shared.data_models import KnowledgeGraph
 from cognee.utils import extract_pos_tags, extract_named_entities, extract_sentiment_vader
 
 class GraphLike(TypedDict):
@@ -18,47 +17,50 @@ async def add_cognitive_layer_graphs(
     layer_graphs: List[Tuple[str, GraphLike]],
 ):
     vector_client = infrastructure_config.get_config("vector_engine")
+    graph_model = infrastructure_config.get_config("graph_model")
 
     for (layer_id, layer_graph) in layer_graphs:
         graph_nodes = []
         graph_edges = []
 
-        if not isinstance(layer_graph, KnowledgeGraph):
-            layer_graph = KnowledgeGraph.parse_obj(layer_graph)
+        if not isinstance(layer_graph, graph_model):
+            layer_graph = graph_model.parse_obj(layer_graph)
 
         for node in layer_graph.nodes:
-            node_id = generate_proposition_node_id(node.id)
+            node_id = generate_node_id(node.id)
 
-            entity_type_node_id = generate_type_node_id(node.entity_type)
-            entity_type_node = await graph_client.extract_node(entity_type_node_id)
+            type_node_id = generate_node_id(node.type)
+            type_node = await graph_client.extract_node(type_node_id)
 
-            if not entity_type_node:
-                node_name = node.entity_type.lower().capitalize()
+            if not type_node:
+                node_name = node.type.lower().capitalize()
 
-                entity_type_node = (
-                    entity_type_node_id,
+                type_node = (
+                    type_node_id,
                     dict(
-                        id = entity_type_node_id,
+                        id = type_node_id,
                         name = node_name,
-                        entity_type = node_name,
+                        type = node_name,
                         created_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                         updated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                     )
                 )
 
-                graph_nodes.append(entity_type_node)
+                graph_nodes.append(type_node)
 
                 # Add relationship between document and entity type: "Document contains Person"
                 graph_edges.append((
                     layer_id,
-                    entity_type_node_id,
+                    type_node_id,
                     "contains",
                     dict(relationship_name = "contains"),
                 ))
 
-            pos_tags = extract_pos_tags(node.entity_description)
-            named_entities = extract_named_entities(node.entity_description)
-            sentiment = extract_sentiment_vader(node.entity_description)
+            # pos_tags = extract_pos_tags(node.description)
+            # named_entities = extract_named_entities(node.description)
+            # sentiment = extract_sentiment_vader(node.description)
+
+            node_properties = {key: value for (key, value) in node if key not in ("id", "name", "type", "description")}
 
             graph_nodes.append((
                 node_id,
@@ -67,21 +69,22 @@ async def add_cognitive_layer_graphs(
                     layer_id = layer_id,
                     chunk_id = chunk_id,
                     chunk_collection = chunk_collection,
-                    name = node.entity_name,
-                    entity_type = node.entity_type.lower().capitalize(),
-                    description = node.entity_description,
-                    pos_tags = pos_tags,
-                    sentiment = sentiment,
-                    named_entities = named_entities,
+                    name = node.name,
+                    type = node.type.lower().capitalize(),
+                    description = node.description,
+                    # pos_tags = pos_tags,
+                    # sentiment = sentiment,
+                    # named_entities = named_entities,
                     created_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                     updated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+                    **node_properties,
                 )
             ))
 
             # Add relationship between entity type and entity itself: "Jake is Person"
             graph_edges.append((
                 node_id,
-                entity_type_node_id,
+                type_node_id,
                 "is",
                 dict(relationship_name = "is"),
             ))
@@ -96,8 +99,8 @@ async def add_cognitive_layer_graphs(
         # Add relationship that came from graphs.
         for edge in layer_graph.edges:
             graph_edges.append((
-                generate_proposition_node_id(edge.source_node_id),
-                generate_proposition_node_id(edge.target_node_id),
+                generate_node_id(edge.source_node_id),
+                generate_node_id(edge.target_node_id),
                 edge.relationship_name,
                 dict(relationship_name = edge.relationship_name),
             ))
@@ -129,8 +132,5 @@ async def add_cognitive_layer_graphs(
 
         await vector_client.create_data_points(layer_id, data_points)
 
-def generate_proposition_node_id(node_id: str) -> str:
-    return f"PROPOSITION_NODE__{node_id.upper().replace(' ', '_')}".replace("'", "")
-
-def generate_type_node_id(node_id: str) -> str:
-    return f"PROPOSITION_TYPE_NODE__{node_id.upper().replace(' ', '_')}".replace("'", "")
\ No newline at end of file
+def generate_node_id(node_id: str) -> str:
+    return node_id.upper().replace(' ', '_').replace("'", "")
diff --git a/cognee/modules/cognify/graph/add_cognitive_layers.py b/cognee/modules/cognify/graph/add_cognitive_layers.py
index 6cd17c94a..50c87785e 100644
--- a/cognee/modules/cognify/graph/add_cognitive_layers.py
+++ b/cognee/modules/cognify/graph/add_cognitive_layers.py
@@ -21,4 +21,4 @@ async def add_cognitive_layers(graph_client, parent_node_id: str, cognitive_laye
     return cognitive_layer_nodes
 
 def generate_cognitive_layer_id(layer_id: str) -> str:
-    return f"COGNITIVE_LAYER__{layer_id.upper().replace(' ', '_')}".replace("'", "").replace("/", "_")
+    return layer_id.upper().replace(" ", "_").replace("'", "").replace("/", "_")
diff --git a/cognee/modules/cognify/graph/add_document_node.py b/cognee/modules/cognify/graph/add_document_node.py
index e1a6f7d52..027a5c241 100644
--- a/cognee/modules/cognify/graph/add_document_node.py
+++ b/cognee/modules/cognify/graph/add_document_node.py
@@ -13,7 +13,7 @@ async def add_document_node(graph_client: GraphDBInterface, parent_node_id, docu
         file_path = document_metadata["file_path"],
     ).model_dump()
 
-    document["entity_type"] = "Document"
+    document["type"] = "Document"
 
     await graph_client.add_node(document_id, document)
 
diff --git a/cognee/modules/cognify/graph/add_label_nodes.py b/cognee/modules/cognify/graph/add_label_nodes.py
index 572475060..245814f7c 100644
--- a/cognee/modules/cognify/graph/add_label_nodes.py
+++ b/cognee/modules/cognify/graph/add_label_nodes.py
@@ -19,7 +19,7 @@ async def add_label_nodes(graph_client, parent_node_id: str, chunk_id: str, keyw
                 chunk_id = chunk_id,
                 name = keyword.lower().capitalize(),
                 keyword = keyword.lower(),
-                entity_type = "Keyword",
+                type = "Keyword",
                 created_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                 updated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
             ),
diff --git a/cognee/modules/cognify/graph/add_summary_nodes.py b/cognee/modules/cognify/graph/add_summary_nodes.py
index 1424e64c5..0ea8dd7ca 100644
--- a/cognee/modules/cognify/graph/add_summary_nodes.py
+++ b/cognee/modules/cognify/graph/add_summary_nodes.py
@@ -20,7 +20,7 @@ async def add_summary_nodes(graph_client, document_id, summary):
         description_node_id,
         dict(
             name = "Description",
-            summary = summary["description"],
+            description = summary["description"],
         ),
     )
 
diff --git a/cognee/modules/data/extraction/knowledge_graph/extract_knowledge_graph.py b/cognee/modules/data/extraction/knowledge_graph/extract_knowledge_graph.py
index 24c57a40e..1988c23bc 100644
--- a/cognee/modules/data/extraction/knowledge_graph/extract_knowledge_graph.py
+++ b/cognee/modules/data/extraction/knowledge_graph/extract_knowledge_graph.py
@@ -7,18 +7,18 @@ from .extract_content_graph import extract_content_graph
 
 logger = logging.getLogger("extract_knowledge_graph(text: str)")
 
 async def extract_knowledge_graph(text: str, cognitive_layer, graph_model):
-    try:
-        compiled_extract_knowledge_graph = ExtractKnowledgeGraph()
-        compiled_extract_knowledge_graph.load(get_absolute_path("./programs/extract_knowledge_graph/extract_knowledge_graph.json"))
+    # try:
+    #     compiled_extract_knowledge_graph = ExtractKnowledgeGraph()
+    #     compiled_extract_knowledge_graph.load(get_absolute_path("./programs/extract_knowledge_graph/extract_knowledge_graph.json"))
 
-        event_loop = asyncio.get_event_loop()
+    #     event_loop = asyncio.get_event_loop()
 
-        def sync_extract_knowledge_graph():
-            return compiled_extract_knowledge_graph(context = text, question = "")
+    #     def sync_extract_knowledge_graph():
+    #         return compiled_extract_knowledge_graph(context = text, question = "")
 
-        return (await event_loop.run_in_executor(None, sync_extract_knowledge_graph)).graph
-        # return compiled_extract_knowledge_graph(text, question = "").graph
-    except Exception as error:
-        logger.error("Error extracting graph from content: %s", error, exc_info = True)
+    #     return (await event_loop.run_in_executor(None, sync_extract_knowledge_graph)).graph
+    #     # return compiled_extract_knowledge_graph(text, question = "").graph
+    # except Exception as error:
+    #     logger.error("Error extracting graph from content: %s", error, exc_info = True)
 
-        return await extract_content_graph(text, cognitive_layer, graph_model)
+    return await extract_content_graph(text, cognitive_layer, graph_model)
diff --git a/cognee/modules/data/extraction/knowledge_graph/extract_knowledge_graph_module.py b/cognee/modules/data/extraction/knowledge_graph/extract_knowledge_graph_module.py
index 04ff42c8a..48ce1fc10 100644
--- a/cognee/modules/data/extraction/knowledge_graph/extract_knowledge_graph_module.py
+++ b/cognee/modules/data/extraction/knowledge_graph/extract_knowledge_graph_module.py
@@ -29,8 +29,8 @@ class GraphFromText(dspy.Signature):
 
 
 def are_all_nodes_and_edges_valid(graph: KnowledgeGraph) -> bool:
-    return all([getattr(node, "entity_type", "").strip() != "" for node in graph.nodes]) and \
-        all([getattr(node, "entity_name", "").strip() != "" for node in graph.nodes]) and \
+    return all([getattr(node, "type", "").strip() != "" for node in graph.nodes]) and \
+        all([getattr(node, "name", "").strip() != "" for node in graph.nodes]) and \
         all([getattr(edge, "relationship_name", "").strip() != "" for edge in graph.edges])
 
 def is_node_connected(node: Node, edges: List[Edge]) -> bool:
@@ -56,7 +56,7 @@ class ExtractKnowledgeGraph(dspy.Module):
         graph = self.generate_graph(text = context).graph
 
         not_valid_nodes_or_edges_message = """
-            All nodes must contain "entity_name".
+            All nodes must contain "name".
             All edges must contain "relationship_name".
             Please add mandatory fields to nodes and edges."""
 
diff --git a/cognee/shared/SourceCodeGraph.py b/cognee/shared/SourceCodeGraph.py
new file mode 100644
index 000000000..279eb6870
--- /dev/null
+++ b/cognee/shared/SourceCodeGraph.py
@@ -0,0 +1,84 @@
+from typing import List, Union, Literal, Optional
+from pydantic import BaseModel
+
+class BaseClass(BaseModel):
+    id: str
+    name: str
+    type: Literal["Class"] = "Class"
+    description: str
+    constructor_parameters: Optional[List[str]]
+
+class Class(BaseModel):
+    id: str
+    name: str
+    type: Literal["Class"] = "Class"
+    description: str
+    constructor_parameters: Optional[List[str]]
+    from_class: Optional[BaseClass]
+
+class ClassInstance(BaseModel):
+    id: str
+    name: str
+    type: Literal["ClassInstance"] = "ClassInstance"
+    description: str
+    from_class: Class
+
+class Function(BaseModel):
+    id: str
+    name: str
+    type: Literal["Function"] = "Function"
+    description: str
+    parameters: Optional[List[str]]
+    return_type: str
+    is_static: Optional[bool] = False
+
+class Variable(BaseModel):
+    id: str
+    name: str
+    type: Literal["Variable"] = "Variable"
+    description: str
+    is_static: Optional[bool] = False
+    default_value: Optional[str]
+
+class Operator(BaseModel):
+    id: str
+    name: str
+    type: Literal["Operator"] = "Operator"
+    description: str
+    return_type: str
+
+class ExpressionPart(BaseModel):
+    id: str
+    name: str
+    type: Literal["Expression"] = "Expression"
+    description: str
+    expression: str
+    members: List[Union[Variable, Function, Operator]]
+
+class Expression(BaseModel):
+    id: str
+    name: str
+    type: Literal["Expression"] = "Expression"
+    description: str
+    expression: str
+    members: List[Union[Variable, Function, Operator, ExpressionPart]]
+
+class Edge(BaseModel):
+    source_node_id: str
+    target_node_id: str
+    relationship_name: Literal["called in", "stored in", "defined in", "returned by", "instantiated in", "uses", "updates"]
+
+class SourceCodeGraph(BaseModel):
+    id: str
+    name: str
+    description: str
+    language: str
+    nodes: List[Union[
+        Class,
+        Function,
+        Variable,
+        Operator,
+        Expression,
+        ClassInstance,
+    ]]
+    edges: List[Edge]
diff --git a/cognee/shared/data_models.py b/cognee/shared/data_models.py
index 8de86a291..034c97224 100644
--- a/cognee/shared/data_models.py
+++ b/cognee/shared/data_models.py
@@ -7,9 +7,9 @@ from pydantic import BaseModel, Field
 class Node(BaseModel):
     """Node in a knowledge graph."""
     id: str
-    entity_name: str
-    entity_type: str
-    entity_description: str
+    name: str
+    type: str
+    description: str
 
 class Edge(BaseModel):
     """Edge in a knowledge graph."""
@@ -26,8 +26,6 @@ class GraphQLQuery(BaseModel):
     """GraphQL query."""
     query: str
 
-
-
 class Answer(BaseModel):
     """Answer."""
     answer: str
@@ -42,7 +40,6 @@ class MemorySummary(BaseModel):
     nodes: List[Node] = Field(..., default_factory=list)
     edges: List[Edge] = Field(..., default_factory=list)
 
-
 class TextSubclass(str, Enum):
     ARTICLES = "Articles, essays, and reports"
     BOOKS = "Books and manuscripts"
@@ -107,7 +104,6 @@ class ImageSubclass(str, Enum):
     SCREENSHOTS = "Screenshots and graphical user interfaces"
     OTHER_IMAGES = "Other types of images"
 
-
 class VideoSubclass(str, Enum):
     MOVIES = "Movies and short films"
     DOCUMENTARIES = "Documentaries and educational videos"
@@ -183,7 +179,6 @@ class DefaultContentPrediction(BaseModel):
         ProceduralContent,
     ]
 
-
 class SummarizedContent(BaseModel):
     """Class for a single class label summary and description."""
     summary: str
@@ -194,7 +189,6 @@ class LabeledContent(BaseModel):
 
     content_labels: str
 
-
 class CognitiveLayerSubgroup(BaseModel):
     """ CognitiveLayerSubgroup in a general layer """
     id: int
diff --git a/notebooks/full_run.ipynb b/notebooks/full_run.ipynb
index 878346fdf..250993b38 100644
--- a/notebooks/full_run.ipynb
+++ b/notebooks/full_run.ipynb
@@ -11,6 +11,7 @@
     "import cognee\n",
     "import dspy\n",
     "from cognee.modules.cognify.dataset import HotPotQA\n",
+    "from cognee.shared.SourceCodeGraph import SourceCodeGraph\n",
     "\n",
     "data_directory_path = path.abspath(\"../.data\")\n",
     "cognee.config.data_root_directory(data_directory_path)\n",
@@ -18,6 +19,8 @@
     "cognee_directory_path = path.abspath(\"../.cognee_system\")\n",
     "cognee.config.system_root_directory(cognee_directory_path)\n",
     "\n",
+    "cognee.config.set_graph_model(SourceCodeGraph)\n",
+    "\n",
     "await cognee.prune.prune_system()\n",
     "\n",
     "colbertv2_wiki17_abstracts = dspy.ColBERTv2(url = \"http://20.102.90.50:2017/wiki17_abstracts\")\n",
@@ -39,7 +42,7 @@
     "\n",
     "# texts_to_add.append(train_case_text)\n",
     "\n",
-    "dataset_name = \"short_stories\"\n",
+    "dataset_name = \"code\"\n",
     "await cognee.add(\"data://\" + data_directory_path, dataset_name)\n"
    ]
   },
@@ -75,6 +78,9 @@
     "from os import path\n",
     "import logging\n",
     "import cognee\n",
+    "from cognee.shared.SourceCodeGraph import SourceCodeGraph\n",
+    "\n",
+    "cognee.config.set_graph_model(SourceCodeGraph)\n",
     "\n",
     "logging.basicConfig(level = logging.INFO)\n",
     "\n",
@@ -86,7 +92,7 @@
     "cognee_directory_path = path.abspath(\"../.cognee_system\")\n",
     "cognee.config.system_root_directory(cognee_directory_path)\n",
     "\n",
-    "await cognee.cognify('short_stories')"
+    "await cognee.cognify('code')"
    ]
   },
   {