Intermediate commit
parent 1c2c72b8bf
commit f655ee8194
12 changed files with 176 additions and 64 deletions
28  .data/code/example.txt  Normal file
@@ -0,0 +1,28 @@
+'''
+Given a string, find the length of the longest substring without repeating characters.
+
+Examples:
+
+Given "abcabcbb", the answer is "abc", with the length of 3.
+
+Given "bbbbb", the answer is "b", with the length of 1.
+
+Given "pwwkew", the answer is "wke", with the length of 3. Note that the answer must be a substring; "pwke" is a subsequence, not a substring.
+'''
+
+class Solution(object):
+    def lengthOfLongestSubstring(self, s):
+        """
+        :type s: str
+        :rtype: int
+        """
+        mapSet = {}
+        start, result = 0, 0
+
+        for end in range(len(s)):
+            if s[end] in mapSet:
+                start = max(mapSet[s[end]], start)
+            result = max(result, end - start + 1)
+            mapSet[s[end]] = end + 1
+
+        return result
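The new example implements the standard sliding-window technique: mapSet remembers, for each character, the index just past its previous occurrence, and start is the left edge of the current window. A quick illustrative sanity check, with expected values taken from the docstring above:

    # Illustrative check of the sliding-window solution; expected values are from the docstring.
    solution = Solution()
    assert solution.lengthOfLongestSubstring("abcabcbb") == 3  # "abc"
    assert solution.lengthOfLongestSubstring("bbbbb") == 1     # "b"
    assert solution.lengthOfLongestSubstring("pwwkew") == 3    # "wke"
    assert solution.lengthOfLongestSubstring("") == 0          # assumed edge case: empty string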
@@ -5,20 +5,20 @@ async def add_classification_nodes(graph_client, parent_node_id: str, categories
         data_type = category["data_type"].upper().replace(' ', '_')
         category_name = category["category_name"].upper().replace(' ', '_').replace("'", "").replace("/", "_")

-        data_type_node_id = f"DATA_TYPE__{data_type}"
+        data_type_node_id = data_type

         data_type_node = await graph_client.extract_node(data_type_node_id)

         if not data_type_node:
-            data_type_node = await graph_client.add_node(data_type_node_id, dict(name = data_type, entity_type = "DataType"))
+            data_type_node = await graph_client.add_node(data_type_node_id, dict(name = data_type, type = "DataType"))

         await graph_client.add_edge(data_type_node_id, parent_node_id, "classified_as", dict(relationship_name = "classified_as"))

-        category_node_id = f"DATA_CATEGORY__{category_name}"
+        category_node_id = category_name

         category_node = await graph_client.extract_node(category_node_id)

         if not category_node:
-            category_node = await graph_client.add_node(category_node_id, dict(name = category_name, entity_type = "DataCategory"))
+            category_node = await graph_client.add_node(category_node_id, dict(name = category_name, type = "DataCategory"))

         await graph_client.add_edge(category_node_id, parent_node_id, "classified_as", dict(relationship_name = "classified_as"))
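The hunk above drops the DATA_TYPE__ and DATA_CATEGORY__ prefixes from node IDs and renames the entity_type property to type. A sketch of the effect on a sample category (the input values are assumptions, not from the diff):

    # Hypothetical input illustrating the new ID convention.
    category = {"data_type": "text", "category_name": "source code"}
    # Before this commit: node IDs were "DATA_TYPE__TEXT" and "DATA_CATEGORY__SOURCE_CODE".
    # After this commit:  node IDs are "TEXT" and "SOURCE_CODE".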
@@ -3,7 +3,6 @@ from uuid import uuid4
 from typing import List, Tuple, TypedDict
 from cognee.infrastructure import infrastructure_config
 from cognee.infrastructure.databases.vector import DataPoint
-from cognee.shared.data_models import KnowledgeGraph
 from cognee.utils import extract_pos_tags, extract_named_entities, extract_sentiment_vader

 class GraphLike(TypedDict):
@@ -18,47 +17,50 @@ async def add_cognitive_layer_graphs(
     layer_graphs: List[Tuple[str, GraphLike]],
 ):
     vector_client = infrastructure_config.get_config("vector_engine")
+    graph_model = infrastructure_config.get_config("graph_model")

     for (layer_id, layer_graph) in layer_graphs:
         graph_nodes = []
         graph_edges = []

-        if not isinstance(layer_graph, KnowledgeGraph):
-            layer_graph = KnowledgeGraph.parse_obj(layer_graph)
+        if not isinstance(layer_graph, graph_model):
+            layer_graph = graph_model.parse_obj(layer_graph)

         for node in layer_graph.nodes:
-            node_id = generate_proposition_node_id(node.id)
+            node_id = generate_node_id(node.id)

-            entity_type_node_id = generate_type_node_id(node.entity_type)
-            entity_type_node = await graph_client.extract_node(entity_type_node_id)
+            type_node_id = generate_node_id(node.type)
+            type_node = await graph_client.extract_node(type_node_id)

-            if not entity_type_node:
-                node_name = node.entity_type.lower().capitalize()
+            if not type_node:
+                node_name = node.type.lower().capitalize()

-                entity_type_node = (
-                    entity_type_node_id,
+                type_node = (
+                    type_node_id,
                     dict(
-                        id = entity_type_node_id,
+                        id = type_node_id,
                         name = node_name,
-                        entity_type = node_name,
+                        type = node_name,
                         created_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                         updated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                     )
                 )

-            graph_nodes.append(entity_type_node)
+            graph_nodes.append(type_node)

             # Add relationship between document and entity type: "Document contains Person"
             graph_edges.append((
                 layer_id,
-                entity_type_node_id,
+                type_node_id,
                 "contains",
                 dict(relationship_name = "contains"),
             ))

-            pos_tags = extract_pos_tags(node.entity_description)
-            named_entities = extract_named_entities(node.entity_description)
-            sentiment = extract_sentiment_vader(node.entity_description)
+            # pos_tags = extract_pos_tags(node.description)
+            # named_entities = extract_named_entities(node.description)
+            # sentiment = extract_sentiment_vader(node.description)

+            id, type, name, description, *node_properties = node
+
             graph_nodes.append((
                 node_id,
@@ -67,21 +69,22 @@ async def add_cognitive_layer_graphs(
                     layer_id = layer_id,
                     chunk_id = chunk_id,
                     chunk_collection = chunk_collection,
-                    name = node.entity_name,
-                    entity_type = node.entity_type.lower().capitalize(),
-                    description = node.entity_description,
-                    pos_tags = pos_tags,
-                    sentiment = sentiment,
-                    named_entities = named_entities,
+                    name = node.name,
+                    type = node.type.lower().capitalize(),
+                    description = node.description,
+                    # pos_tags = pos_tags,
+                    # sentiment = sentiment,
+                    # named_entities = named_entities,
                     created_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                     updated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+                    *node_properties,
                 )
             ))

             # Add relationship between entity type and entity itself: "Jake is Person"
             graph_edges.append((
                 node_id,
-                entity_type_node_id,
+                type_node_id,
                 "is",
                 dict(relationship_name = "is"),
             ))
@@ -96,8 +99,8 @@ async def add_cognitive_layer_graphs(
             # Add relationship that came from graphs.
             for edge in layer_graph.edges:
                 graph_edges.append((
-                    generate_proposition_node_id(edge.source_node_id),
-                    generate_proposition_node_id(edge.target_node_id),
+                    generate_node_id(edge.source_node_id),
+                    generate_node_id(edge.target_node_id),
                     edge.relationship_name,
                     dict(relationship_name = edge.relationship_name),
                 ))
@@ -129,8 +132,5 @@ async def add_cognitive_layer_graphs(
         await vector_client.create_data_points(layer_id, data_points)


-def generate_proposition_node_id(node_id: str) -> str:
-    return f"PROPOSITION_NODE__{node_id.upper().replace(' ', '_')}".replace("'", "")
-
-def generate_type_node_id(node_id: str) -> str:
-    return f"PROPOSITION_TYPE_NODE__{node_id.upper().replace(' ', '_')}".replace("'", "")
+def generate_node_id(node_id: str) -> str:
+    return node_id.upper().replace(' ', '_').replace("'", "")
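The two prefix-specific helpers are collapsed into a single generate_node_id that returns the bare normalized ID. For example, with a hypothetical input:

    # Hypothetical input showing the normalization rules (uppercase, spaces to underscores, apostrophes stripped).
    assert generate_node_id("John's car") == "JOHNS_CAR"
    # Previously: generate_proposition_node_id("John's car") == "PROPOSITION_NODE__JOHNS_CAR"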
@@ -21,4 +21,4 @@ async def add_cognitive_layers(graph_client, parent_node_id: str, cognitive_laye
     return cognitive_layer_nodes

 def generate_cognitive_layer_id(layer_id: str) -> str:
-    return f"COGNITIVE_LAYER__{layer_id.upper().replace(' ', '_')}".replace("'", "").replace("/", "_")
+    return layer_id.upper().replace(" ", "_").replace("'", "").replace("/", "_")
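Cognitive layer IDs likewise lose their COGNITIVE_LAYER__ prefix; a hypothetical example:

    # Hypothetical input; "/" is also normalized to "_".
    assert generate_cognitive_layer_id("text/plain layer") == "TEXT_PLAIN_LAYER"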
@@ -13,7 +13,7 @@ async def add_document_node(graph_client: GraphDBInterface, parent_node_id, docu
         file_path = document_metadata["file_path"],
     ).model_dump()

-    document["entity_type"] = "Document"
+    document["type"] = "Document"

     await graph_client.add_node(document_id, document)

@@ -19,7 +19,7 @@ async def add_label_nodes(graph_client, parent_node_id: str, chunk_id: str, keyw
             chunk_id = chunk_id,
             name = keyword.lower().capitalize(),
             keyword = keyword.lower(),
-            entity_type = "Keyword",
+            type = "Keyword",
             created_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
             updated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
         ),
@@ -20,7 +20,7 @@ async def add_summary_nodes(graph_client, document_id, summary):
         description_node_id,
         dict(
             name = "Description",
-            summary = summary["description"],
+            description = summary["description"],
         ),
     )

@@ -7,18 +7,18 @@ from .extract_content_graph import extract_content_graph
 logger = logging.getLogger("extract_knowledge_graph(text: str)")

 async def extract_knowledge_graph(text: str, cognitive_layer, graph_model):
-    try:
-        compiled_extract_knowledge_graph = ExtractKnowledgeGraph()
-        compiled_extract_knowledge_graph.load(get_absolute_path("./programs/extract_knowledge_graph/extract_knowledge_graph.json"))
+    # try:
+    #     compiled_extract_knowledge_graph = ExtractKnowledgeGraph()
+    #     compiled_extract_knowledge_graph.load(get_absolute_path("./programs/extract_knowledge_graph/extract_knowledge_graph.json"))

-        event_loop = asyncio.get_event_loop()
+    # event_loop = asyncio.get_event_loop()

-        def sync_extract_knowledge_graph():
-            return compiled_extract_knowledge_graph(context = text, question = "")
+    # def sync_extract_knowledge_graph():
+    #     return compiled_extract_knowledge_graph(context = text, question = "")

-        return (await event_loop.run_in_executor(None, sync_extract_knowledge_graph)).graph
-        # return compiled_extract_knowledge_graph(text, question = "").graph
-    except Exception as error:
-        logger.error("Error extracting graph from content: %s", error, exc_info = True)
+    # return (await event_loop.run_in_executor(None, sync_extract_knowledge_graph)).graph
+    # # return compiled_extract_knowledge_graph(text, question = "").graph
+    # except Exception as error:
+    #     logger.error("Error extracting graph from content: %s", error, exc_info = True)

-        return await extract_content_graph(text, cognitive_layer, graph_model)
+    return await extract_content_graph(text, cognitive_layer, graph_model)
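With the compiled-DSPy path commented out, extract_knowledge_graph is now a thin wrapper that forwards to extract_content_graph with the configured graph model. A usage sketch in which the argument values are assumptions, not taken from the diff:

    # Hypothetical call site; the text, cognitive_layer value, and model choice are assumptions.
    graph = await extract_knowledge_graph(
        text = "def add(a, b): return a + b",
        cognitive_layer = "code analysis",
        graph_model = SourceCodeGraph,
    )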
@@ -29,8 +29,8 @@ class GraphFromText(dspy.Signature):


 def are_all_nodes_and_edges_valid(graph: KnowledgeGraph) -> bool:
-    return all([getattr(node, "entity_type", "").strip() != "" for node in graph.nodes]) and \
-        all([getattr(node, "entity_name", "").strip() != "" for node in graph.nodes]) and \
+    return all([getattr(node, "type", "").strip() != "" for node in graph.nodes]) and \
+        all([getattr(node, "name", "").strip() != "" for node in graph.nodes]) and \
         all([getattr(edge, "relationship_name", "").strip() != "" for edge in graph.edges])

 def is_node_connected(node: Node, edges: List[Edge]) -> bool:
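After the rename, validation keys off the generic name and type fields. A minimal illustrative check against the updated Node and Edge models, assuming KnowledgeGraph only requires nodes and edges (field values are made up):

    # Illustrative graph; valid because every node has non-empty "name" and "type"
    # and every edge has a non-empty "relationship_name".
    graph = KnowledgeGraph(
        nodes = [Node(id = "jake", name = "Jake", type = "Person", description = "A person named Jake")],
        edges = [Edge(source_node_id = "document", target_node_id = "jake", relationship_name = "contains")],
    )
    assert are_all_nodes_and_edges_valid(graph)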
@@ -56,7 +56,7 @@ class ExtractKnowledgeGraph(dspy.Module):
         graph = self.generate_graph(text = context).graph

         not_valid_nodes_or_edges_message = """
-        All nodes must contain "entity_name".
+        All nodes must contain "name".
         All edges must contain "relationship_name".
         Please add mandatory fields to nodes and edges."""

84  cognee/shared/SourceCodeGraph.py  Normal file
@@ -0,0 +1,84 @@
+from typing import List, Union, Literal, Optional
+from pydantic import BaseModel
+
+class BaseClass(BaseModel):
+    id: str
+    name: str
+    type: Literal["Class"] = "Class"
+    description: str
+    constructor_parameters: Optional[List[str]]
+
+class Class(BaseModel):
+    id: str
+    name: str
+    type: Literal["Class"] = "Class"
+    description: str
+    constructor_parameters: Optional[List[str]]
+    from_class: Optional[BaseClass]
+
+class ClassInstance(BaseModel):
+    id: str
+    name: str
+    type: Literal["ClassInstance"] = "ClassInstance"
+    description: str
+    from_class: Class
+
+class Function(BaseModel):
+    id: str
+    name: str
+    type: Literal["Function"] = "Function"
+    description: str
+    parameters: Optional[List[str]]
+    return_type: str
+    is_static: Optional[bool] = False
+
+class Variable(BaseModel):
+    id: str
+    name: str
+    type: Literal["Variable"] = "Variable"
+    description: str
+    is_static: Optional[bool] = False
+    default_value: Optional[str]
+
+class Operator(BaseModel):
+    id: str
+    name: str
+    type: Literal["Operator"] = "Operator"
+    description: str
+    return_type: str
+
+class ExpressionPart(BaseModel):
+    id: str
+    name: str
+    type: Literal["Expression"] = "Expression"
+    description: str
+    expression: str
+    members: List[Union[Variable, Function, Operator]]
+
+class Expression(BaseModel):
+    id: str
+    name: str
+    type: Literal["Expression"] = "Expression"
+    description: str
+    expression: str
+    members: List[Union[Variable, Function, Operator, ExpressionPart]]
+
+class Edge(BaseModel):
+    source_node_id: str
+    target_node_id: str
+    relationship_name: Literal["called in", "stored in", "defined in", "returned by", "instantiated in", "uses", "updates"]
+
+class SourceCodeGraph(BaseModel):
+    id: str
+    name: str
+    description: str
+    language: str
+    nodes: List[Union[
+        Class,
+        Function,
+        Variable,
+        Operator,
+        Expression,
+        ClassInstance,
+    ]]
+    edges: List[Edge]
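A short construction sketch for the new SourceCodeGraph model; all names and descriptions below are made up for illustration:

    # Illustrative instance; every value is an assumption.
    add_function = Function(
        id = "add",
        name = "add",
        description = "Adds two numbers.",
        parameters = ["a", "b"],
        return_type = "int",
    )
    result_variable = Variable(
        id = "result",
        name = "result",
        description = "Holds the sum returned by add.",
        default_value = None,
    )
    code_graph = SourceCodeGraph(
        id = "example_module",
        name = "example module",
        description = "Toy graph describing a single function.",
        language = "python",
        nodes = [add_function, result_variable],
        edges = [Edge(source_node_id = "result", target_node_id = "add", relationship_name = "returned by")],
    )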
@@ -7,9 +7,9 @@ from pydantic import BaseModel, Field
 class Node(BaseModel):
     """Node in a knowledge graph."""
     id: str
-    entity_name: str
-    entity_type: str
-    entity_description: str
+    name: str
+    type: str
+    description: str

 class Edge(BaseModel):
     """Edge in a knowledge graph."""
@@ -26,8 +26,6 @@ class GraphQLQuery(BaseModel):
     """GraphQL query."""
     query: str

-
-
 class Answer(BaseModel):
     """Answer."""
     answer: str
@@ -42,7 +40,6 @@ class MemorySummary(BaseModel):
     nodes: List[Node] = Field(..., default_factory=list)
     edges: List[Edge] = Field(..., default_factory=list)

-
 class TextSubclass(str, Enum):
     ARTICLES = "Articles, essays, and reports"
     BOOKS = "Books and manuscripts"
@@ -107,7 +104,6 @@ class ImageSubclass(str, Enum):
     SCREENSHOTS = "Screenshots and graphical user interfaces"
     OTHER_IMAGES = "Other types of images"

-
 class VideoSubclass(str, Enum):
     MOVIES = "Movies and short films"
     DOCUMENTARIES = "Documentaries and educational videos"
@@ -183,7 +179,6 @@ class DefaultContentPrediction(BaseModel):
         ProceduralContent,
     ]

-
 class SummarizedContent(BaseModel):
     """Class for a single class label summary and description."""
     summary: str
@@ -194,7 +189,6 @@ class LabeledContent(BaseModel):
     content_labels: str


-
 class CognitiveLayerSubgroup(BaseModel):
     """ CognitiveLayerSubgroup in a general layer """
     id: int
@@ -11,6 +11,7 @@
 "import cognee\n",
 "import dspy\n",
 "from cognee.modules.cognify.dataset import HotPotQA\n",
+"from cognee.shared.SourceCodeGraph import SourceCodeGraph\n",
 "\n",
 "data_directory_path = path.abspath(\"../.data\")\n",
 "cognee.config.data_root_directory(data_directory_path)\n",
@@ -18,6 +19,8 @@
 "cognee_directory_path = path.abspath(\"../.cognee_system\")\n",
 "cognee.config.system_root_directory(cognee_directory_path)\n",
 "\n",
+"cognee.config.set_graph_model(SourceCodeGraph)\n",
+"\n",
 "await cognee.prune.prune_system()\n",
 "\n",
 "colbertv2_wiki17_abstracts = dspy.ColBERTv2(url = \"http://20.102.90.50:2017/wiki17_abstracts\")\n",
@@ -39,7 +42,7 @@
 "\n",
 "# texts_to_add.append(train_case_text)\n",
 "\n",
-"dataset_name = \"short_stories\"\n",
+"dataset_name = \"code\"\n",
 "await cognee.add(\"data://\" + data_directory_path, dataset_name)\n"
 ]
 },
@@ -75,6 +78,9 @@
 "from os import path\n",
 "import logging\n",
 "import cognee\n",
+"from cognee.shared.SourceCodeGraph import SourceCodeGraph\n",
+"\n",
+"cognee.config.set_graph_model(SourceCodeGraph)\n",
 "\n",
 "logging.basicConfig(level = logging.INFO)\n",
 "\n",
@@ -86,7 +92,7 @@
 "cognee_directory_path = path.abspath(\"../.cognee_system\")\n",
 "cognee.config.system_root_directory(cognee_directory_path)\n",
 "\n",
-"await cognee.cognify('short_stories')"
+"await cognee.cognify('code')"
 ]
 },
 {