Intermediate commit

This commit is contained in:
Boris Arzentar 2024-04-24 19:35:36 +02:00
parent 1c2c72b8bf
commit f655ee8194
12 changed files with 176 additions and 64 deletions

.data/code/example.txt Normal file
View file

@@ -0,0 +1,28 @@
'''
Given a string, find the length of the longest substring without repeating characters.
Examples:
Given "abcabcbb", the answer is "abc", with the length of 3.
Given "bbbbb", the answer is "b", with the length of 1.
Given "pwwkew", the answer is "wke", with the length of 3. Note that the answer must be a substring; "pwke" is a subsequence and not a substring.
'''
class Solution(object):
    def lengthOfLongestSubstring(self, s):
        """
        :type s: str
        :rtype: int
        """
        mapSet = {}
        start, result = 0, 0
        for end in range(len(s)):
            if s[end] in mapSet:
                start = max(mapSet[s[end]], start)
            result = max(result, end-start+1)
            mapSet[s[end]] = end+1
        return result
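
For illustration, a quick usage sketch of the class above (a hypothetical session, not part of the committed file):

    solution = Solution()
    print(solution.lengthOfLongestSubstring("abcabcbb"))  # 3 ("abc")
    print(solution.lengthOfLongestSubstring("bbbbb"))     # 1 ("b")
    print(solution.lengthOfLongestSubstring("pwwkew"))    # 3 ("wke")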

View file

@@ -5,20 +5,20 @@ async def add_classification_nodes(graph_client, parent_node_id: str, categories
         data_type = category["data_type"].upper().replace(' ', '_')
         category_name = category["category_name"].upper().replace(' ', '_').replace("'", "").replace("/", "_")
-        data_type_node_id = f"DATA_TYPE__{data_type}"
+        data_type_node_id = data_type
         data_type_node = await graph_client.extract_node(data_type_node_id)
         if not data_type_node:
-            data_type_node = await graph_client.add_node(data_type_node_id, dict(name = data_type, entity_type = "DataType"))
+            data_type_node = await graph_client.add_node(data_type_node_id, dict(name = data_type, type = "DataType"))
             await graph_client.add_edge(data_type_node_id, parent_node_id, "classified_as", dict(relationship_name = "classified_as"))
 
-        category_node_id = f"DATA_CATEGORY__{category_name}"
+        category_node_id = category_name
        category_node = await graph_client.extract_node(category_node_id)
         if not category_node:
-            category_node = await graph_client.add_node(category_node_id, dict(name = category_name, entity_type = "DataCategory"))
+            category_node = await graph_client.add_node(category_node_id, dict(name = category_name, type = "DataCategory"))
             await graph_client.add_edge(category_node_id, parent_node_id, "classified_as", dict(relationship_name = "classified_as"))
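
The net effect of this change on node ids, sketched for a hypothetical data type "TEXT":

    data_type = "TEXT"
    old_node_id = f"DATA_TYPE__{data_type}"  # before: "DATA_TYPE__TEXT"
    new_node_id = data_type                  # after:  "TEXT"

Category ids lose their "DATA_CATEGORY__" prefix the same way.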

View file

@@ -3,7 +3,6 @@ from uuid import uuid4
 from typing import List, Tuple, TypedDict
 from cognee.infrastructure import infrastructure_config
 from cognee.infrastructure.databases.vector import DataPoint
-from cognee.shared.data_models import KnowledgeGraph
 from cognee.utils import extract_pos_tags, extract_named_entities, extract_sentiment_vader
 
 class GraphLike(TypedDict):
@@ -18,47 +17,50 @@ async def add_cognitive_layer_graphs(
     layer_graphs: List[Tuple[str, GraphLike]],
 ):
     vector_client = infrastructure_config.get_config("vector_engine")
+    graph_model = infrastructure_config.get_config("graph_model")
 
     for (layer_id, layer_graph) in layer_graphs:
         graph_nodes = []
         graph_edges = []
 
-        if not isinstance(layer_graph, KnowledgeGraph):
-            layer_graph = KnowledgeGraph.parse_obj(layer_graph)
+        if not isinstance(layer_graph, graph_model):
+            layer_graph = graph_model.parse_obj(layer_graph)
 
         for node in layer_graph.nodes:
-            node_id = generate_proposition_node_id(node.id)
+            node_id = generate_node_id(node.id)
 
-            entity_type_node_id = generate_type_node_id(node.entity_type)
-            entity_type_node = await graph_client.extract_node(entity_type_node_id)
+            type_node_id = generate_node_id(node.type)
+            type_node = await graph_client.extract_node(type_node_id)
 
-            if not entity_type_node:
-                node_name = node.entity_type.lower().capitalize()
+            if not type_node:
+                node_name = node.type.lower().capitalize()
 
-                entity_type_node = (
-                    entity_type_node_id,
+                type_node = (
+                    type_node_id,
                     dict(
-                        id = entity_type_node_id,
+                        id = type_node_id,
                         name = node_name,
-                        entity_type = node_name,
+                        type = node_name,
                         created_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                         updated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                     )
                 )
 
-                graph_nodes.append(entity_type_node)
+                graph_nodes.append(type_node)
 
             # Add relationship between document and entity type: "Document contains Person"
             graph_edges.append((
                 layer_id,
-                entity_type_node_id,
+                type_node_id,
                 "contains",
                 dict(relationship_name = "contains"),
             ))
 
-            pos_tags = extract_pos_tags(node.entity_description)
-            named_entities = extract_named_entities(node.entity_description)
-            sentiment = extract_sentiment_vader(node.entity_description)
+            # pos_tags = extract_pos_tags(node.description)
+            # named_entities = extract_named_entities(node.description)
+            # sentiment = extract_sentiment_vader(node.description)
+
+            id, type, name, description, *node_properties = node
 
             graph_nodes.append((
                 node_id,
@@ -67,21 +69,22 @@ async def add_cognitive_layer_graphs(
                     layer_id = layer_id,
                     chunk_id = chunk_id,
                     chunk_collection = chunk_collection,
-                    name = node.entity_name,
-                    entity_type = node.entity_type.lower().capitalize(),
-                    description = node.entity_description,
-                    pos_tags = pos_tags,
-                    sentiment = sentiment,
-                    named_entities = named_entities,
+                    name = node.name,
+                    type = node.type.lower().capitalize(),
+                    description = node.description,
+                    # pos_tags = pos_tags,
+                    # sentiment = sentiment,
+                    # named_entities = named_entities,
                     created_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                     updated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+                    *node_properties,
                 )
             ))
 
             # Add relationship between entity type and entity itself: "Jake is Person"
             graph_edges.append((
                 node_id,
-                entity_type_node_id,
+                type_node_id,
                 "is",
                 dict(relationship_name = "is"),
             ))
@@ -96,8 +99,8 @@ async def add_cognitive_layer_graphs(
         # Add relationship that came from graphs.
         for edge in layer_graph.edges:
             graph_edges.append((
-                generate_proposition_node_id(edge.source_node_id),
-                generate_proposition_node_id(edge.target_node_id),
+                generate_node_id(edge.source_node_id),
+                generate_node_id(edge.target_node_id),
                 edge.relationship_name,
                 dict(relationship_name = edge.relationship_name),
             ))
@@ -129,8 +132,5 @@ async def add_cognitive_layer_graphs(
         await vector_client.create_data_points(layer_id, data_points)
 
-def generate_proposition_node_id(node_id: str) -> str:
-    return f"PROPOSITION_NODE__{node_id.upper().replace(' ', '_')}".replace("'", "")
-
-def generate_type_node_id(node_id: str) -> str:
-    return f"PROPOSITION_TYPE_NODE__{node_id.upper().replace(' ', '_')}".replace("'", "")
+def generate_node_id(node_id: str) -> str:
+    return node_id.upper().replace(' ', '_').replace("'", "")
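
For a feel of what the new plain ids look like, two hypothetical inputs:

    generate_node_id("Boris's laptop")  # "BORISS_LAPTOP"
    generate_node_id("Person")          # "PERSON"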

View file

@@ -21,4 +21,4 @@ async def add_cognitive_layers(graph_client, parent_node_id: str, cognitive_laye
     return cognitive_layer_nodes
 
 def generate_cognitive_layer_id(layer_id: str) -> str:
-    return f"COGNITIVE_LAYER__{layer_id.upper().replace(' ', '_')}".replace("'", "").replace("/", "_")
+    return layer_id.upper().replace(" ", "_").replace("'", "").replace("/", "_")

View file

@@ -13,7 +13,7 @@ async def add_document_node(graph_client: GraphDBInterface, parent_node_id, docu
         file_path = document_metadata["file_path"],
     ).model_dump()
 
-    document["entity_type"] = "Document"
+    document["type"] = "Document"
 
     await graph_client.add_node(document_id, document)

View file

@@ -19,7 +19,7 @@ async def add_label_nodes(graph_client, parent_node_id: str, chunk_id: str, keyw
             chunk_id = chunk_id,
             name = keyword.lower().capitalize(),
             keyword = keyword.lower(),
-            entity_type = "Keyword",
+            type = "Keyword",
             created_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
             updated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
         ),

View file

@@ -20,7 +20,7 @@ async def add_summary_nodes(graph_client, document_id, summary):
         description_node_id,
         dict(
             name = "Description",
-            summary = summary["description"],
+            description = summary["description"],
         ),
     )

View file

@@ -7,18 +7,18 @@ from .extract_content_graph import extract_content_graph
 
 logger = logging.getLogger("extract_knowledge_graph(text: str)")
 
 async def extract_knowledge_graph(text: str, cognitive_layer, graph_model):
-    try:
-        compiled_extract_knowledge_graph = ExtractKnowledgeGraph()
-        compiled_extract_knowledge_graph.load(get_absolute_path("./programs/extract_knowledge_graph/extract_knowledge_graph.json"))
+    # try:
+    #     compiled_extract_knowledge_graph = ExtractKnowledgeGraph()
+    #     compiled_extract_knowledge_graph.load(get_absolute_path("./programs/extract_knowledge_graph/extract_knowledge_graph.json"))
 
-        event_loop = asyncio.get_event_loop()
+    #     event_loop = asyncio.get_event_loop()
 
-        def sync_extract_knowledge_graph():
-            return compiled_extract_knowledge_graph(context = text, question = "")
+    #     def sync_extract_knowledge_graph():
+    #         return compiled_extract_knowledge_graph(context = text, question = "")
 
-        return (await event_loop.run_in_executor(None, sync_extract_knowledge_graph)).graph
-        # return compiled_extract_knowledge_graph(text, question = "").graph
-    except Exception as error:
-        logger.error("Error extracting graph from content: %s", error, exc_info = True)
+    #     return (await event_loop.run_in_executor(None, sync_extract_knowledge_graph)).graph
+    #     # return compiled_extract_knowledge_graph(text, question = "").graph
+    # except Exception as error:
+    #     logger.error("Error extracting graph from content: %s", error, exc_info = True)
 
-        return await extract_content_graph(text, cognitive_layer, graph_model)
+    return await extract_content_graph(text, cognitive_layer, graph_model)
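
With the DSPy path commented out, the function is now a thin wrapper around extract_content_graph. A minimal usage sketch from within the same module, assuming KnowledgeGraph as the graph model and a plain layer name string (both are assumptions):

    import asyncio
    from cognee.shared.data_models import KnowledgeGraph

    async def main():
        graph = await extract_knowledge_graph(
            "Jake works at Acme.",  # text to extract a graph from
            "SUMMARY_LAYER",        # cognitive layer (assumed here to be a name string)
            KnowledgeGraph,         # graph model to parse the extraction into
        )
        print(graph.nodes)

    asyncio.run(main())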

View file

@@ -29,8 +29,8 @@ class GraphFromText(dspy.Signature):
 
 def are_all_nodes_and_edges_valid(graph: KnowledgeGraph) -> bool:
-    return all([getattr(node, "entity_type", "").strip() != "" for node in graph.nodes]) and \
-        all([getattr(node, "entity_name", "").strip() != "" for node in graph.nodes]) and \
+    return all([getattr(node, "type", "").strip() != "" for node in graph.nodes]) and \
+        all([getattr(node, "name", "").strip() != "" for node in graph.nodes]) and \
         all([getattr(edge, "relationship_name", "").strip() != "" for edge in graph.edges])
 
 def is_node_connected(node: Node, edges: List[Edge]) -> bool:
@@ -56,7 +56,7 @@ class ExtractKnowledgeGraph(dspy.Module):
         graph = self.generate_graph(text = context).graph
 
         not_valid_nodes_or_edges_message = """
-            All nodes must contain "entity_name".
+            All nodes must contain "name".
             All edges must contain "relationship_name".
             Please add mandatory fields to nodes and edges."""

View file

@@ -0,0 +1,84 @@
from typing import List, Union, Literal, Optional
from pydantic import BaseModel

class BaseClass(BaseModel):
    id: str
    name: str
    type: Literal["Class"] = "Class"
    description: str
    constructor_parameters: Optional[List[str]]

class Class(BaseModel):
    id: str
    name: str
    type: Literal["Class"] = "Class"
    description: str
    constructor_parameters: Optional[List[str]]
    from_class: Optional[BaseClass]

class ClassInstance(BaseModel):
    id: str
    name: str
    type: Literal["ClassInstance"] = "ClassInstance"
    description: str
    from_class: Class

class Function(BaseModel):
    id: str
    name: str
    type: Literal["Function"] = "Function"
    description: str
    parameters: Optional[List[str]]
    return_type: str
    is_static: Optional[bool] = False

class Variable(BaseModel):
    id: str
    name: str
    type: Literal["Variable"] = "Variable"
    description: str
    is_static: Optional[bool] = False
    default_value: Optional[str]

class Operator(BaseModel):
    id: str
    name: str
    type: Literal["Operator"] = "Operator"
    description: str
    return_type: str

class ExpressionPart(BaseModel):
    id: str
    name: str
    type: Literal["Expression"] = "Expression"
    description: str
    expression: str
    members: List[Union[Variable, Function, Operator]]

class Expression(BaseModel):
    id: str
    name: str
    type: Literal["Expression"] = "Expression"
    description: str
    expression: str
    members: List[Union[Variable, Function, Operator, ExpressionPart]]

class Edge(BaseModel):
    source_node_id: str
    target_node_id: str
    relationship_name: Literal["called in", "stored in", "defined in", "returned by", "instantiated in", "uses", "updates"]

class SourceCodeGraph(BaseModel):
    id: str
    name: str
    description: str
    language: str
    nodes: List[Union[
        Class,
        Function,
        Variable,
        Operator,
        Expression,
        ClassInstance,
    ]]
    edges: List[Edge]
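
For illustration, a minimal sketch of assembling a graph from these models (all node contents are made up):

    add_node = Function(
        id = "add_node",
        name = "add_node",
        description = "Adds a node to the graph.",
        parameters = ["node_id", "node_properties"],
        return_type = "None",
        is_static = False,
    )
    node_id = Variable(
        id = "node_id",
        name = "node_id",
        description = "Unique identifier of a graph node.",
        is_static = False,
        default_value = None,
    )
    graph = SourceCodeGraph(
        id = "example_graph",
        name = "Example graph",
        description = "A minimal source code graph.",
        language = "python",
        nodes = [add_node, node_id],
        edges = [Edge(
            source_node_id = "add_node",
            target_node_id = "node_id",
            relationship_name = "uses",
        )],
    )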

View file

@@ -7,9 +7,9 @@ from pydantic import BaseModel, Field
 
 class Node(BaseModel):
     """Node in a knowledge graph."""
     id: str
-    entity_name: str
-    entity_type: str
-    entity_description: str
+    name: str
+    type: str
+    description: str
 
 class Edge(BaseModel):
     """Edge in a knowledge graph."""
@@ -26,8 +26,6 @@ class GraphQLQuery(BaseModel):
     """GraphQL query."""
     query: str
 
-
-
 class Answer(BaseModel):
     """Answer."""
     answer: str

@@ -42,7 +40,6 @@ class MemorySummary(BaseModel):
     nodes: List[Node] = Field(..., default_factory=list)
     edges: List[Edge] = Field(..., default_factory=list)
 
-
 class TextSubclass(str, Enum):
     ARTICLES = "Articles, essays, and reports"
     BOOKS = "Books and manuscripts"

@@ -107,7 +104,6 @@ class ImageSubclass(str, Enum):
     SCREENSHOTS = "Screenshots and graphical user interfaces"
     OTHER_IMAGES = "Other types of images"
 
-
 class VideoSubclass(str, Enum):
     MOVIES = "Movies and short films"
     DOCUMENTARIES = "Documentaries and educational videos"

@@ -183,7 +179,6 @@ class DefaultContentPrediction(BaseModel):
         ProceduralContent,
     ]
 
-
 class SummarizedContent(BaseModel):
     """Class for a single class label summary and description."""
     summary: str

@@ -194,7 +189,6 @@ class LabeledContent(BaseModel):
     content_labels: str
 
-
 class CognitiveLayerSubgroup(BaseModel):
     """ CognitiveLayerSubgroup in a general layer """
     id: int

View file

@@ -11,6 +11,7 @@
     "import cognee\n",
     "import dspy\n",
     "from cognee.modules.cognify.dataset import HotPotQA\n",
+    "from cognee.shared.SourceCodeGraph import SourceCodeGraph\n",
     "\n",
     "data_directory_path = path.abspath(\"../.data\")\n",
     "cognee.config.data_root_directory(data_directory_path)\n",
@@ -18,6 +19,8 @@
     "cognee_directory_path = path.abspath(\"../.cognee_system\")\n",
     "cognee.config.system_root_directory(cognee_directory_path)\n",
     "\n",
+    "cognee.config.set_graph_model(SourceCodeGraph)\n",
+    "\n",
     "await cognee.prune.prune_system()\n",
     "\n",
     "colbertv2_wiki17_abstracts = dspy.ColBERTv2(url = \"http://20.102.90.50:2017/wiki17_abstracts\")\n",
@@ -39,7 +42,7 @@
     "\n",
     "# texts_to_add.append(train_case_text)\n",
     "\n",
-    "dataset_name = \"short_stories\"\n",
+    "dataset_name = \"code\"\n",
     "await cognee.add(\"data://\" + data_directory_path, dataset_name)\n"
    ]
   },
@@ -75,6 +78,9 @@
     "from os import path\n",
     "import logging\n",
     "import cognee\n",
+    "from cognee.shared.SourceCodeGraph import SourceCodeGraph\n",
     "\n",
+    "cognee.config.set_graph_model(SourceCodeGraph)\n",
+    "\n",
     "logging.basicConfig(level = logging.INFO)\n",
     "\n",
@@ -86,7 +92,7 @@
     "cognee_directory_path = path.abspath(\"../.cognee_system\")\n",
     "cognee.config.system_root_directory(cognee_directory_path)\n",
     "\n",
-    "await cognee.cognify('short_stories')"
+    "await cognee.cognify('code')"
    ]
   },
  {