Refactor of the tasks
This commit is contained in:
parent 2e367198cd
commit 85160da387
21 changed files with 392 additions and 690 deletions
@@ -9,14 +9,6 @@ from cognee.modules.data.processing.document_types.AudioDocument import AudioDoc
from cognee.modules.data.processing.document_types.ImageDocument import ImageDocument
from cognee.shared.data_models import KnowledgeGraph
from cognee.modules.data.processing.document_types import PdfDocument, TextDocument
# from cognee.modules.cognify.vector import save_data_chunks
# from cognee.modules.data.processing.process_documents import process_documents
# from cognee.modules.classification.classify_text_chunks import classify_text_chunks
# from cognee.modules.data.extraction.data_summary.summarize_text_chunks import summarize_text_chunks
# from cognee.modules.data.processing.filter_affected_chunks import filter_affected_chunks
# from cognee.modules.data.processing.remove_obsolete_chunks import remove_obsolete_chunks
# from cognee.modules.data.extraction.knowledge_graph.expand_knowledge_graph import expand_knowledge_graph
# from cognee.modules.data.extraction.knowledge_graph.establish_graph_topology import establish_graph_topology
from cognee.modules.data.models import Dataset, Data
from cognee.modules.data.operations.get_dataset_data import get_dataset_data
from cognee.modules.data.operations.retrieve_datasets import retrieve_datasets

@@ -31,6 +23,7 @@ from cognee.tasks.chunk_extract_summary.chunk_extract_summary import chunk_extra
from cognee.tasks.chunk_naive_llm_classifier.chunk_naive_llm_classifier import chunk_naive_llm_classifier_task
from cognee.tasks.chunk_remove_disconnected.chunk_remove_disconnected import chunk_remove_disconnected_task
from cognee.tasks.chunk_to_graph_decomposition.chunk_to_graph_decomposition import chunk_to_graph_decomposition_task
from cognee.tasks.document_to_ontology.document_to_ontology import document_to_ontology
from cognee.tasks.save_chunks_to_store.save_chunks_to_store import save_chunks_to_store_task
from cognee.tasks.chunk_update_check.chunk_update_check import chunk_update_check_task
from cognee.tasks.chunks_into_graph.chunks_into_graph import \

@@ -96,19 +89,20 @@ async def cognify(datasets: Union[str, list[str]] = None, user: User = None):
    cognee_config = get_cognify_config()
    graph_config = get_graph_config()
    root_node_id = None

    if graph_config.infer_graph_topology and graph_config.graph_topology_task:
        from cognee.modules.topology.topology import TopologyEngine
        topology_engine = TopologyEngine(infer=graph_config.infer_graph_topology)
        root_node_id = await topology_engine.add_graph_topology(files = data)
    elif not graph_config.infer_graph_topology and graph_config.graph_topology_task:
        from cognee.modules.topology.topology import TopologyEngine
        topology_engine = TopologyEngine(infer=graph_config.infer_graph_topology)
        await topology_engine.add_graph_topology(graph_config.topology_file_path)
    elif not graph_config.graph_topology_task:
        root_node_id = "ROOT"
    #
    # if graph_config.infer_graph_topology and graph_config.graph_topology_task:
    #     from cognee.modules.topology.topology import TopologyEngine
    #     topology_engine = TopologyEngine(infer=graph_config.infer_graph_topology)
    #     root_node_id = await topology_engine.add_graph_topology(files = data)
    # elif graph_config.infer_graph_topology and not graph_config.infer_graph_topology:
    #     from cognee.modules.topology.topology import TopologyEngine
    #     topology_engine = TopologyEngine(infer=graph_config.infer_graph_topology)
    #     await topology_engine.add_graph_topology(graph_config.topology_file_path)
    # elif not graph_config.graph_topology_task:
    #     root_node_id = "ROOT"

    tasks = [
        Task(document_to_ontology, root_node_id = root_node_id),
        Task(source_documents_to_chunks, parent_node_id = root_node_id), # Classify documents, save them as nodes in the graph db, and extract text chunks based on the document type
        Task(chunk_to_graph_decomposition_task, topology_model = KnowledgeGraph, task_config = { "batch_size": 10 }), # Set the graph topology for the document chunk data
        Task(chunks_into_graph_task, graph_model = KnowledgeGraph, collection_name = "entities"), # Generate knowledge graphs from the document chunks and attach them to chunk nodes
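The task list above is the heart of this refactor: each pipeline step now lives under `cognee.tasks.*` and is wrapped in a `Task` together with its own keyword arguments. A minimal sketch of a custom step written in the same style (the task name and its body below are illustrative assumptions, not part of this commit):

```python
from cognee.modules.data.processing.chunk_types.DocumentChunk import DocumentChunk

# Hypothetical task that follows the same convention as the refactored tasks:
# an async callable that receives the chunks produced by the previous step
# and returns them for the next step in the pipeline.
async def normalize_chunk_text(data_chunks: list[DocumentChunk]):
    for chunk in data_chunks:
        chunk.text = " ".join(chunk.text.split())  # collapse repeated whitespace
    return data_chunks

# It could then be appended to the pipeline alongside the existing steps, e.g.:
# tasks.append(Task(normalize_chunk_text, task_config = { "batch_size": 10 }))
```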
@@ -1,152 +0,0 @@

import asyncio
from uuid import uuid5, NAMESPACE_OID
from typing import Type
from pydantic import BaseModel
from cognee.infrastructure.databases.graph import get_graph_engine
from cognee.infrastructure.databases.vector import get_vector_engine, DataPoint
from cognee.modules.data.processing.chunk_types.DocumentChunk import DocumentChunk
from ..data.extraction.extract_categories import extract_categories

async def chunk_naive_llm_classifier(data_chunks: list[DocumentChunk], classification_model: Type[BaseModel]):
    if len(data_chunks) == 0:
        return data_chunks

    chunk_classifications = await asyncio.gather(
        *[extract_categories(chunk.text, classification_model) for chunk in data_chunks],
    )

    classification_data_points = []

    for chunk_index, chunk in enumerate(data_chunks):
        chunk_classification = chunk_classifications[chunk_index]
        classification_data_points.append(uuid5(NAMESPACE_OID, chunk_classification.label.type))
        classification_data_points.append(uuid5(NAMESPACE_OID, chunk_classification.label.type))

        for classification_subclass in chunk_classification.label.subclass:
            classification_data_points.append(uuid5(NAMESPACE_OID, classification_subclass.value))

    vector_engine = get_vector_engine()

    class Keyword(BaseModel):
        uuid: str
        text: str
        chunk_id: str
        document_id: str

    collection_name = "classification"

    if await vector_engine.has_collection(collection_name):
        existing_data_points = await vector_engine.retrieve(
            collection_name,
            list(set(classification_data_points)),
        ) if len(classification_data_points) > 0 else []

        existing_points_map = {point.id: True for point in existing_data_points}
    else:
        existing_points_map = {}
        await vector_engine.create_collection(collection_name, payload_schema = Keyword)

    data_points = []
    nodes = []
    edges = []

    for (chunk_index, data_chunk) in enumerate(data_chunks):
        chunk_classification = chunk_classifications[chunk_index]
        classification_type_label = chunk_classification.label.type
        classification_type_id = uuid5(NAMESPACE_OID, classification_type_label)

        if classification_type_id not in existing_points_map:
            data_points.append(
                DataPoint[Keyword](
                    id = str(classification_type_id),
                    payload = Keyword.parse_obj({
                        "uuid": str(classification_type_id),
                        "text": classification_type_label,
                        "chunk_id": str(data_chunk.chunk_id),
                        "document_id": str(data_chunk.document_id),
                    }),
                    embed_field = "text",
                )
            )

            nodes.append((
                str(classification_type_id),
                dict(
                    id = str(classification_type_id),
                    name = classification_type_label,
                    type = classification_type_label,
                )
            ))
            existing_points_map[classification_type_id] = True

        edges.append((
            str(data_chunk.chunk_id),
            str(classification_type_id),
            "is_media_type",
            dict(
                relationship_name = "is_media_type",
                source_node_id = str(data_chunk.chunk_id),
                target_node_id = str(classification_type_id),
            ),
        ))

        for classification_subclass in chunk_classification.label.subclass:
            classification_subtype_label = classification_subclass.value
            classification_subtype_id = uuid5(NAMESPACE_OID, classification_subtype_label)

            if classification_subtype_id not in existing_points_map:
                data_points.append(
                    DataPoint[Keyword](
                        id = str(classification_subtype_id),
                        payload = Keyword.parse_obj({
                            "uuid": str(classification_subtype_id),
                            "text": classification_subtype_label,
                            "chunk_id": str(data_chunk.chunk_id),
                            "document_id": str(data_chunk.document_id),
                        }),
                        embed_field = "text",
                    )
                )

                nodes.append((
                    str(classification_subtype_id),
                    dict(
                        id = str(classification_subtype_id),
                        name = classification_subtype_label,
                        type = classification_subtype_label,
                    )
                ))
                edges.append((
                    str(classification_subtype_id),
                    str(classification_type_id),
                    "is_subtype_of",
                    dict(
                        relationship_name = "contains",
                        source_node_id = str(classification_type_id),
                        target_node_id = str(classification_subtype_id),
                    ),
                ))

                existing_points_map[classification_subtype_id] = True

            edges.append((
                str(data_chunk.chunk_id),
                str(classification_subtype_id),
                "is_classified_as",
                dict(
                    relationship_name = "is_classified_as",
                    source_node_id = str(data_chunk.chunk_id),
                    target_node_id = str(classification_subtype_id),
                ),
            ))

    if len(nodes) > 0 or len(edges) > 0:
        await vector_engine.create_data_points(collection_name, data_points)

        graph_engine = await get_graph_engine()

        await graph_engine.add_nodes(nodes)
        await graph_engine.add_edges(edges)

    return data_chunks
@@ -1 +0,0 @@
from .save_data_chunks import save_data_chunks
@@ -1,97 +0,0 @@
from cognee.infrastructure.databases.vector import DataPoint, get_vector_engine
from cognee.infrastructure.databases.graph import get_graph_engine
from cognee.modules.data.processing.chunk_types.DocumentChunk import DocumentChunk

async def save_data_chunks(data_chunks: list[DocumentChunk], collection_name: str):
    if len(data_chunks) == 0:
        return data_chunks

    vector_engine = get_vector_engine()
    graph_engine = await get_graph_engine()

    # Remove and unlink existing chunks
    if await vector_engine.has_collection(collection_name):
        existing_chunks = [DocumentChunk.parse_obj(chunk.payload) for chunk in (await vector_engine.retrieve(
            collection_name,
            [str(chunk.chunk_id) for chunk in data_chunks],
        ))]

        if len(existing_chunks) > 0:
            await vector_engine.delete_data_points(collection_name, [str(chunk.chunk_id) for chunk in existing_chunks])

            await graph_engine.remove_connection_to_successors_of([chunk.chunk_id for chunk in existing_chunks], "next_chunk")
            await graph_engine.remove_connection_to_predecessors_of([chunk.chunk_id for chunk in existing_chunks], "has_chunk")
    else:
        await vector_engine.create_collection(collection_name, payload_schema = DocumentChunk)

    # Add to vector storage
    await vector_engine.create_data_points(
        collection_name,
        [
            DataPoint[DocumentChunk](
                id = str(chunk.chunk_id),
                payload = chunk,
                embed_field = "text",
            ) for chunk in data_chunks
        ],
    )

    # Add to graph storage
    chunk_nodes = []
    chunk_edges = []

    for chunk in data_chunks:
        chunk_nodes.append((
            str(chunk.chunk_id),
            dict(
                id = str(chunk.chunk_id),
                chunk_id = str(chunk.chunk_id),
                document_id = str(chunk.document_id),
                word_count = chunk.word_count,
                chunk_index = chunk.chunk_index,
                cut_type = chunk.cut_type,
                pages = chunk.pages,
            )
        ))

        chunk_edges.append((
            str(chunk.document_id),
            str(chunk.chunk_id),
            "has_chunk",
            dict(
                relationship_name = "has_chunk",
                source_node_id = str(chunk.document_id),
                target_node_id = str(chunk.chunk_id),
            ),
        ))

        previous_chunk_id = get_previous_chunk_id(data_chunks, chunk)

        if previous_chunk_id is not None:
            chunk_edges.append((
                str(previous_chunk_id),
                str(chunk.chunk_id),
                "next_chunk",
                dict(
                    relationship_name = "next_chunk",
                    source_node_id = str(previous_chunk_id),
                    target_node_id = str(chunk.chunk_id),
                ),
            ))

    await graph_engine.add_nodes(chunk_nodes)
    await graph_engine.add_edges(chunk_edges)

    return data_chunks


def get_previous_chunk_id(document_chunks: list[DocumentChunk], current_chunk: DocumentChunk) -> DocumentChunk:
    if current_chunk.chunk_index == 0:
        return current_chunk.document_id

    for chunk in document_chunks:
        if str(chunk.document_id) == str(current_chunk.document_id) \
            and chunk.chunk_index == current_chunk.chunk_index - 1:
            return chunk.chunk_id

    return None
@@ -1,20 +0,0 @@
from typing import Type
from pydantic import BaseModel
from cognee.shared.data_models import KnowledgeGraph
from cognee.infrastructure.databases.graph import get_graph_engine
from ...processing.chunk_types.DocumentChunk import DocumentChunk
from .add_model_class_to_graph import add_model_class_to_graph

async def chunk_to_graph_decomposition(data_chunks: list[DocumentChunk], topology_model: Type[BaseModel]):
    if topology_model == KnowledgeGraph:
        return data_chunks

    graph_engine = await get_graph_engine()

    await add_model_class_to_graph(topology_model, graph_engine)

    return data_chunks


def generate_node_id(node_id: str) -> str:
    return node_id.upper().replace(" ", "_").replace("'", "")
@@ -1,218 +0,0 @@
import json
import asyncio
from uuid import uuid5, NAMESPACE_OID
from datetime import datetime, timezone
from typing import Type
from pydantic import BaseModel
from cognee.infrastructure.databases.graph import get_graph_engine
from cognee.infrastructure.databases.vector import DataPoint, get_vector_engine
from ...processing.chunk_types.DocumentChunk import DocumentChunk
from .extract_knowledge_graph import extract_content_graph

class EntityNode(BaseModel):
    uuid: str
    name: str
    type: str
    description: str
    created_at: datetime
    updated_at: datetime

async def expand_knowledge_graph(data_chunks: list[DocumentChunk], graph_model: Type[BaseModel], collection_name: str):
    chunk_graphs = await asyncio.gather(
        *[extract_content_graph(chunk.text, graph_model) for chunk in data_chunks]
    )

    vector_engine = get_vector_engine()
    graph_engine = await get_graph_engine()

    has_collection = await vector_engine.has_collection(collection_name)

    if not has_collection:
        await vector_engine.create_collection(collection_name, payload_schema = EntityNode)

    processed_nodes = {}
    type_node_edges = []
    entity_node_edges = []
    type_entity_edges = []

    for (chunk_index, chunk) in enumerate(data_chunks):
        chunk_graph = chunk_graphs[chunk_index]
        for node in chunk_graph.nodes:
            type_node_id = generate_node_id(node.type)
            entity_node_id = generate_node_id(node.id)

            if type_node_id not in processed_nodes:
                type_node_edges.append((str(chunk.chunk_id), type_node_id, "contains_entity_type"))
                processed_nodes[type_node_id] = True

            if entity_node_id not in processed_nodes:
                entity_node_edges.append((str(chunk.chunk_id), entity_node_id, "contains_entity"))
                type_entity_edges.append((entity_node_id, type_node_id, "is_entity_type"))
                processed_nodes[entity_node_id] = True

        graph_node_edges = [
            (edge.source_node_id, edge.target_node_id, edge.relationship_name) \
                for edge in chunk_graph.edges
        ]

    existing_edges = await graph_engine.has_edges([
        *type_node_edges,
        *entity_node_edges,
        *type_entity_edges,
        *graph_node_edges,
    ])

    existing_edges_map = {}
    existing_nodes_map = {}

    for edge in existing_edges:
        existing_edges_map[edge[0] + edge[1] + edge[2]] = True
        existing_nodes_map[edge[0]] = True

    graph_nodes = []
    graph_edges = []
    data_points = []

    for (chunk_index, chunk) in enumerate(data_chunks):
        graph = chunk_graphs[chunk_index]
        if graph is None:
            continue

        for node in graph.nodes:
            node_id = generate_node_id(node.id)
            node_name = generate_name(node.name)

            type_node_id = generate_node_id(node.type)
            type_node_name = generate_name(node.type)

            if node_id not in existing_nodes_map:
                node_data = dict(
                    uuid = node_id,
                    name = node_name,
                    type = node_name,
                    description = node.description,
                    created_at = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"),
                    updated_at = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"),
                )

                graph_nodes.append((
                    node_id,
                    dict(
                        **node_data,
                        properties = json.dumps(node.properties),
                    )
                ))

                data_points.append(DataPoint[EntityNode](
                    id = str(uuid5(NAMESPACE_OID, node_id)),
                    payload = node_data,
                    embed_field = "name",
                ))

                existing_nodes_map[node_id] = True

            edge_key = str(chunk.chunk_id) + node_id + "contains_entity"

            if edge_key not in existing_edges_map:
                graph_edges.append((
                    str(chunk.chunk_id),
                    node_id,
                    "contains_entity",
                    dict(
                        relationship_name = "contains_entity",
                        source_node_id = str(chunk.chunk_id),
                        target_node_id = node_id,
                    ),
                ))

                # Add relationship between entity type and entity itself: "Jake is Person"
                graph_edges.append((
                    node_id,
                    type_node_id,
                    "is_entity_type",
                    dict(
                        relationship_name = "is_entity_type",
                        source_node_id = type_node_id,
                        target_node_id = node_id,
                    ),
                ))

                existing_edges_map[edge_key] = True

            if type_node_id not in existing_nodes_map:
                type_node_data = dict(
                    uuid = type_node_id,
                    name = type_node_name,
                    type = type_node_id,
                    description = type_node_name,
                    created_at = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"),
                    updated_at = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"),
                )

                graph_nodes.append((type_node_id, dict(
                    **type_node_data,
                    properties = json.dumps(node.properties)
                )))

                data_points.append(DataPoint[EntityNode](
                    id = str(uuid5(NAMESPACE_OID, type_node_id)),
                    payload = type_node_data,
                    embed_field = "name",
                ))

                existing_nodes_map[type_node_id] = True

            edge_key = str(chunk.chunk_id) + type_node_id + "contains_entity_type"

            if edge_key not in existing_edges_map:
                graph_edges.append((
                    str(chunk.chunk_id),
                    type_node_id,
                    "contains_entity_type",
                    dict(
                        relationship_name = "contains_entity_type",
                        source_node_id = str(chunk.chunk_id),
                        target_node_id = type_node_id,
                    ),
                ))

                existing_edges_map[edge_key] = True

        # Add relationship that came from graphs.
        for edge in graph.edges:
            source_node_id = generate_node_id(edge.source_node_id)
            target_node_id = generate_node_id(edge.target_node_id)
            relationship_name = generate_name(edge.relationship_name)
            edge_key = source_node_id + target_node_id + relationship_name

            if edge_key not in existing_edges_map:
                graph_edges.append((
                    generate_node_id(edge.source_node_id),
                    generate_node_id(edge.target_node_id),
                    edge.relationship_name,
                    dict(
                        relationship_name = generate_name(edge.relationship_name),
                        source_node_id = generate_node_id(edge.source_node_id),
                        target_node_id = generate_node_id(edge.target_node_id),
                        properties = json.dumps(edge.properties),
                    ),
                ))
                existing_edges_map[edge_key] = True

    if len(data_points) > 0:
        await vector_engine.create_data_points(collection_name, data_points)

    if len(graph_nodes) > 0:
        await graph_engine.add_nodes(graph_nodes)

    if len(graph_edges) > 0:
        await graph_engine.add_edges(graph_edges)

    return data_chunks


def generate_name(name: str) -> str:
    return name.lower().replace(" ", "_").replace("'", "")

def generate_node_id(node_id: str) -> str:
    return node_id.lower().replace(" ", "_").replace("'", "")
@@ -1,4 +0,0 @@
from .extract_content_graph import extract_content_graph

async def extract_knowledge_graph(text: str, cognitive_layer, graph_model):
    return await extract_content_graph(text, cognitive_layer, graph_model)
@@ -1,91 +0,0 @@
from typing import List
import dspy
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from cognee.infrastructure.llm import get_llm_config
from cognee.shared.data_models import KnowledgeGraph, Node, Edge
from cognee.shared.utils import trim_text_to_max_tokens

# """Instructions:
# You are a top-tier algorithm designed for extracting information from text in structured formats to build a knowledge graph.
# - **Nodes** represent entities and concepts. They're akin to Wikipedia nodes.
# - **Edges** represent relationships between concepts. They're akin to Wikipedia links.
# Extract as much information as you can from the text and build a detailed knowledge graph.
# If question is provided, make sure that the information to answer the question is present in the graph."""

class GraphFromText(dspy.Signature):
    """Instructions:
    You are a top-tier algorithm designed for extracting information from text in structured formats to build a knowledge graph.
    - **Nodes** represent entities and concepts, akin to Wikipedia nodes.
    - **Edges** represent relationships between entities and concepts, akin to Wikipedia hyperlinks.
    Extract information from the text and build a detailed knowledge graph."""

    text: str = dspy.InputField()
    graph: KnowledgeGraph = dspy.OutputField()


def are_all_nodes_and_edges_valid(graph: KnowledgeGraph) -> bool:
    return all([getattr(node, "type", "").strip() != "" for node in graph.nodes]) and \
        all([getattr(node, "name", "").strip() != "" for node in graph.nodes]) and \
        all([getattr(edge, "relationship_name", "").strip() != "" for edge in graph.edges])

def is_node_connected(node: Node, edges: List[Edge]) -> bool:
    return any([(edge.source_node_id == node.id or edge.target_node_id == node.id) for edge in edges])

def are_all_nodes_connected(graph: KnowledgeGraph) -> bool:
    return all([is_node_connected(node, graph.edges) for node in graph.nodes])


class ExtractKnowledgeGraph(dspy.Module):
    llm_config = get_llm_config()

    def __init__(self, lm = dspy.OpenAI(model = llm_config.llm_model, api_key = llm_config.llm_api_key, model_type = "chat", max_tokens = 4096)):
        super().__init__()
        self.lm = lm
        dspy.settings.configure(lm=self.lm)
        self.generate_graph = dspy.TypedChainOfThought(GraphFromText)
        nltk.download("stopwords", quiet = True)

    def forward(self, context: str, question: str):
        context = remove_stop_words(context)
        context = trim_text_to_max_tokens(context, 1500, self.llm_config.llm_model)

        with dspy.context(lm = self.lm):
            graph = self.generate_graph(text = context).graph

            not_valid_nodes_or_edges_message = """
                All nodes must contain "name".
                All edges must contain "relationship_name".
                Please add mandatory fields to nodes and edges."""

            dspy.Suggest(are_all_nodes_and_edges_valid(graph), not_valid_nodes_or_edges_message)

            # not_connected_graph_message = """
            #     Output must be a graph that has all nodes connected to it.
            #     Please find a relation and connect nodes or remove them."""

            # dspy.Suggest(are_all_nodes_connected(graph), not_connected_graph_message)

        return dspy.Prediction(context = context, graph = graph)


def remove_stop_words(text):
    stop_words = set(stopwords.words("english"))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word.lower() not in stop_words]
    return " ".join(filtered_text)

#
# if __name__ == "__main__":
#     gpt_4_turbo = dspy.OpenAI(model="gpt-4", max_tokens=4000, api_key=config.llm_api_key, model_type="chat")
#     dspy.settings.configure(lm=gpt_4_turbo)

#     extract_knowledge_graph = ExtractKnowledgeGraph(lm=gpt_4_turbo)
#     # graph_text = extract_knowledge_graph("cognitive_layer", "text")
#     graph = extract_knowledge_graph("analysis_layer", """A large language model (LLM) is a language model notable for its ability to achieve general-purpose language generation and other natural language processing tasks such as classification. LLMs acquire these abilities by learning statistical relationships from text documents during a computationally intensive self-supervised and semi-supervised training process. LLMs can be used for text generation, a form of generative AI, by taking an input text and repeatedly predicting the next token or word.
#     LLMs are artificial neural networks. The largest and most capable, as of March 2024""", question="What is a large language model?")
#     print("GPT4 History:", gpt_4_turbo.inspect_history(n=1))
#     print(graph)
#
@@ -1,25 +0,0 @@
[
    {
        "node_id": "062c22dfd99b599f90cd2d325c8bcf69",
        "name": "062c22df-d99b-599f-90cd-2d325c8bcf69",
        "default_relationship": {
            "type": "related_to",
            "source": "062c22dfd99b599f90cd2d325c8bcf69",
            "target": "6dfe01b607d25b7783c81d6c11ce2aa7"
        },
        "children": [
            {
                "node_id": "6dfe01b607d25b7783c81d6c11ce2aa7",
                "name": "6dfe01b6-07d2-5b77-83c8-1d6c11ce2aa7",
                "default_relationship": {
                    "type": "related_to",
                    "source": "6dfe01b6-07d2-5b77-83c8-1d6c11ce2aa7",
                    "target": "a27bb4fa897e53a594cab446e1d33dbf"
                },
                "children": []
            }
        ]
    }
]
@@ -1,6 +0,0 @@
A quantum computer is a computer that takes advantage of quantum mechanical phenomena.
At small scales, physical matter exhibits properties of both particles and waves, and quantum computing leverages this behavior, specifically quantum superposition and entanglement, using specialized hardware that supports the preparation and manipulation of quantum states.
Classical physics cannot explain the operation of these quantum devices, and a scalable quantum computer could perform some calculations exponentially faster (with respect to input size scaling) than any modern "classical" computer. In particular, a large-scale quantum computer could break widely used encryption schemes and aid physicists in performing physical simulations; however, the current state of the technology is largely experimental and impractical, with several obstacles to useful applications. Moreover, scalable quantum computers do not hold promise for many practical tasks, and for many important tasks quantum speedups are proven impossible.
The basic unit of information in quantum computing is the qubit, similar to the bit in traditional digital electronics. Unlike a classical bit, a qubit can exist in a superposition of its two "basis" states. When measuring a qubit, the result is a probabilistic output of a classical bit, therefore making quantum computers nondeterministic in general. If a quantum computer manipulates the qubit in a particular way, wave interference effects can amplify the desired measurement results. The design of quantum algorithms involves creating procedures that allow a quantum computer to perform calculations efficiently and quickly.
Physically engineering high-quality qubits has proven challenging. If a physical qubit is not sufficiently isolated from its environment, it suffers from quantum decoherence, introducing noise into calculations. Paradoxically, perfectly isolating qubits is also undesirable because quantum computations typically need to initialize qubits, perform controlled qubit interactions, and measure the resulting quantum states. Each of those operations introduces errors and suffers from noise, and such inaccuracies accumulate.
In principle, a non-quantum (classical) computer can solve the same computational problems as a quantum computer, given enough time. Quantum advantage comes in the form of time complexity rather than computability, and quantum complexity theory shows that some quantum algorithms for carefully selected tasks require exponentially fewer computational steps than the best known non-quantum algorithms. Such tasks can in theory be solved on a large-scale quantum computer whereas classical computers would not finish computations in any reasonable amount of time. However, quantum speedup is not universal or even typical across computational tasks, since basic tasks such as sorting are proven to not allow any asymptotic quantum speedup. Claims of quantum supremacy have drawn significant attention to the discipline, but are demonstrated on contrived tasks, while near-term practical use cases remain limited.
@@ -1,16 +0,0 @@
U analizi komunikacionih sistema kod životinja zadržaćemo se samo na semiotičkim problemima – postoje li u pojedinim sistemima njihove komunikacije ZNACI, semiotički SISTEMI i neke semiotičke OPERACIJE, u onom smislu kako su ti pojmovi definisani i utvrđeni kod ljudi. Analiziraćemo sličnosti i razlike između komunikacije kod životinja i kod ljudi, posebno semiotičke komunikacije kod čoveka.
Kada se ima u vidu bogatstvo oblika komunikativnih veza među životinjama: sva raznolikost signala u pogledu fizičkih svojstava – hemijski, oflaktivni (mirisni), akustički (uključiv i ultrazvukove), električni, motorički (kinezički), proksemički (položaj u prostoru), vizuelni i drugi, zatim – raznovrsnost kanala (sredina) kroz koje se ostvaruje veza, kao i raznovrsnost funkcija koje imaju komunikativni sistemi, pitanje je koliko je uopšte opravdano govoriti o komunikaciji životinja u celini.
Međutim, kada se pristupi semiotičkoj analizi sistema komunikacije među životinjama, iza raznolikosti nalazi se prilična jednoličnost, čak tolika da se ne može utvrditi postoji li nekakvo usavršavanje sistema komunikacije duž evolucione lestvice.
Pogledajmo najpre kakve FUNKCIJE opslužuju sistemi komunikacija kod životinja. Poznati istraživač ovih problema, Marler, ovako rezimira analizu komunikacije među nižim i višim majmunima: „U velikoj većini, celokupni sistem komunikacije izgleda postoji radi organizacije socijalnog ponašanja grupe, regulacije dominantnosti i subordinacije, održanja mira i kohezije u grupi, kao i radi reprodukcije i brige o mladima (Marleu, 1967). Pomenute funkcije mogle bi se, nešto raščlanjenije, ovako opisati:
Postoje, najpre, kod nekih vrsta signali za identifikaciju pojedinaca (npr. parovi mužjaka i ženki kod ptica pevačica mogu ostati u stalnoj vezi tokom cele jedne godine i međusobno se identifikuju pomoću pevanja, ponekada u vidu dueta koji može izvesti samo određeni par) ili za identifikaciju vrste (npr. pčele-stražari ubijaju na ulazu u košnicu svaku jedinku koja se na osnovu signala ne može identifikovati kao član tog pčelinjeg društva).
Najbrojniji i najraznovrsniji su signali koji saopštavaju o motivacionim i afektivnim stanjima jedinke i o promenama tih stanja, a često i o suptilnim nijansama raspoloženja. Ta vrsta signala kazuje o gladi, seksualnim potrebama, ugroženostistrahu, boli, uzbuđenju, naklonosti, neprijateljstvu i agresivnosti, o zadovoljstvu i o svim varijacijama ovakvih motivacionih stanja i raspoloženja.
Izuzetnu biološku vrednost imaju signali koji regulišu međusobne odnose jedinki ili odnose u grupi kao celini. Podsticanje, dozivanje i približavanje partnera, privlačenje i parenje, međusobno prepoznavanje i saradnja između roditelja i mladunčadi – nemogući su bez nekog sistema signalizacije. Određivanje zauzete teritorije, okupljanje grupe, vođstvo i određivanje statusa u grupi, dominacija i potčinjavanje, organizovanje kolonija – to su samo neke od socijalnih funkcija koje poslužuju sistemi komunikacije.
U svim sistemima komunikacije među životinjama upadljivo je najmanje onih poruka koje govore o okolini u kojoj životinja živi; samo kod pojedinih vrsta postoje signali koji saopštavaju o postojanju ili lokalizaciji napadača, o hrani ili nalazištu hrane, o lokaciji staništa.
Kada se ima u vidu samo ono što je ovde pobrojano, stiče se utisak o bogatstvu informacija koje mogu preneti komunikativni sistemi životinja. Međutim, za pun uvid u prirodu tih sistema potrebno je videti na koji način sistemi komuniciranja kod životinja obavljaju te funkcije. Obično se kaže da ovi sistemi imaju, pre svega ili isključivo socijalne funkcije. To je tačno, ali pod uslovom da se prethodno razjasni šta ovde znači – socijalno. Funkcije tih sistema su socijalne najpre su smislu da signali UTIČU NA DRUGE jedinke. I upravo tako, utiču na druge jedinke, a ne upućeni su drugim jedinkama. U stvari, u razvoju odnosa među jedinkama tokom evolucije izgrađuje se svojevrsna socijalna simbioza, u kojoj neki vidljivi pokazatelji ponašanja jedne jedinke postaju obaveštenja (signali) o njenim motivacionim stanjima. Dakle, signali su (za razliku od SIMBOLA) samo pokazatelji i sastavni delovi motivacionih, afektivnih ili nekih drugih unutrašnjih stanja jedinke. I baš zbog toga se ne može reći da jedinka upućuje drugoj signale, ona prosto doživljava to što doživljava. Neke komponente doživljaja dostupne su opažanju drugih jedinki i u toku zajedničkog života postaju signali određenih stanja. Tokom evolucije ti signali se stabilizuju, stilizuju (ritualizuju) i prerađuju u određeni sistem komunikacije.
U tako stvorenoj socijalnoj simbiozi, signali koje upućuju životinje jedne drugima pre su nalozi za izvođenje određenih radnji, tj. pokretači ili inhibitori radnji nego saopštene informacije. Hormonalne i druge promene u organizmu koje dovode do pojave signala u jednoj jedinki skoro automatski u određenim uslovima pokreću lanac hormonalnih i ostalih promena u jedinki koja prima signale, a te promene kod njih izazivaju određene radnje. Dakle, komunikativni sistemi imaju socijalne funkcije zato što menjaju ponašanje drugih jedinki. A to dalje znači da jedina jedinka kojoj životinja ne upućuje signale jeste ona sama, i to je jedna od suštinskih razlika semiotičke komunikacije čoveka i komunikacije među žvotinjama.
Iz prethodno opisanog sledi i ovo: signal retko kada ima isto značenje za jedinku koja ga emituje i jedinku koja ga prima. U komunikaciji među životinjama više se radi o odnosima komplementarnosti nego o odnosima recipročnosti (baš kao u nekim neverbalnim vidovima komunikacije kod ljudi: onaj koji pokazuje gnev i onaj ko opaža gnev imaju različite doživljaje). U ovom pogledu signali više služe za socijalnu facilitaciju ponašanja i sinhronizaciju fizioloških stanja i motoričkih radnji u toku interindividualnih aktivnosti (nrp. parenja) ili grupnih aktivnosti (npr. u komunikaciji pčela) nego za socijalnu razmenu.
Potrebno je ukazati na još jednu odliku komunikativnih sistema životinja: postoji uska specijalizacija signala za određene funkcije. Svaka od ranije pobrojanih funkcija ima specifične signale koji joj služe (npr. krici kod ptica su signali opasnosti, a pesma ptica-pevačica je ljubavni zov). Ovo, naravno, ne znači da za obavljanje jedne vrste funkcija postoji samo jedna vrsta signala, jer je i u sistemima veze između životinja pravilo da postoji redundansa (npr. za saopštavanje o mestu nalazišta paše pčele koriste istovremeno i „jezik“ igre telom i akustičke signale).
U pogledu SEMANTIKE (značenja), sistemi komunikacija među životinjama poseduju dva osnovna svojstva:
Postoji konačan i obično veoma mali broj poruka koje stoje svakoj vrsti na raspolaganju – značajnija je karakteristika da je broj signala konačan. Naime, životinja svake vrste dobija nasleđem, ili stiče uz izvesno učenje, određen broj signala i taj repertoar ostaje zatvoren, nepromenljiv. Za razliku od toga, ljudski govor je otvoreni sistem, koji po svojim pravilima stvara nove jedinice sistema.
Jedva da poneki istraživač saopštava da je zapazio stvaranje novih signala kod životinja. Tamo gde je signale moguće tehnički pobrojati, nalazi se da njihov broj nije veliki. Tako kod pojedinih vrsta majmuna istraživači redovno utvrđuju da se broj akustičkih signala kreće od 10-15, dok se kod šimpanzi može razlikovati 9 facijanih ekspresija. Ni kod drugih životinja taj broj nije mnogo veći: kod nekih jedva da postoje dva signala u istom kanalu veze, a jedino se kod nekih ptica-pevačica sreće i do nekoliko stotina različitih „motiva“ u pesmama. Pošto većinu komunikativnih sredstava kojima se služe životinje dobijaju nasleđem, ta nasledna određenost vrlo je striktna i u pogledu funkcije i forme signala, tako da je životinja sposobna za komunikaciju koja je karakteristična za njenu vrstu, čak i kada se razvija u izolaciji. Kod nekih vrsta ptica učenje ima značajniju ulogu. Poznato je da mladi nekih vrsta ptica mogu da nauče i pesmu drugih vrsta ptica, u čijoj zajednici odrastaju, dok kod nekih vrsta ptica određene grupacije jedinki stvaraju svoje „dijalekte“. Izgleda da je funkcija tih dijalekata da iz nekih razloga ograniče parenje među pripadnicima različitih grupacija, jer ptice mogu da se pare samo na osnovu ljubavne pesme onog „dijalekta“ kojim se služe. Ovakav način sticanja komunikativnih sistema veoma podseća na usvajanje govora kod dece.
Signali nemaju denotativna značenja, tj. ne označavaju neki određeni segment realnosti (denotat), ne saopštavaju nešto o tom denotatu, već samo predstavljaju vid ekspresije stanja organizma. Za semiotičku analizu posebno je značajno da li signali koje životinje koriste zaista označavaju nešto različito od sebe samih, da li kazuju nešto o denotatu, ili su puka ekspresija fizioloških i afektivnih stanja životinje.
@@ -1,2 +0,0 @@
Natural language processing (NLP) is an interdisciplinary subfield of computer science and information retrieval. It is primarily concerned with giving computers the ability to support and manipulate human language. It involves processing natural language datasets, such as text corpora or speech corpora, using either rule-based or probabilistic (i.e. statistical and, most recently, neural network-based) machine learning approaches. The goal is a computer capable of "understanding"[citation needed] the contents of documents, including the contextual nuances of the language within them. To this end, natural language processing often borrows ideas from theoretical linguistics. The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.
Challenges in natural language processing frequently involve speech recognition, natural-language understanding, and natural-language generation.
@@ -1 +0,0 @@
German novels are fun to read and talk about nature
@@ -1,14 +0,0 @@
from typing import Type
from pydantic import BaseModel
from cognee.infrastructure.llm.prompts import read_query_prompt
from cognee.infrastructure.llm.get_llm_client import get_llm_client


async def extract_topology(content: str, response_model: Type[BaseModel]):
    llm_client = get_llm_client()

    system_prompt = read_query_prompt("extract_topology.txt")

    llm_output = await llm_client.acreate_structured_output(content, system_prompt, response_model)

    return llm_output.model_dump()
@@ -1,20 +0,0 @@
import logging
from cognee.modules.topology.extraction.extract_topology import extract_topology
from cognee.infrastructure.databases.graph.config import get_graph_config

logger = logging.getLogger(__name__)

async def infer_data_topology(content: str, graph_topology=None):
    if graph_topology is None:
        graph_config = get_graph_config()
        graph_topology = graph_config.graph_topology

    print("content: ", type(content))
    try:
        return (await extract_topology(
            content,
            graph_topology
        ))
    except Exception as error:
        logger.error("Error extracting topology from content: %s", error, exc_info = True)
        raise error
@@ -1,5 +1,7 @@
""" This module contains the TopologyEngine class which is responsible for adding graph topology from a JSON or CSV file. """

from cognee.infrastructure.databases.graph import get_graph_config
from cognee.modules.cognify.config import get_cognify_config
import csv
import json
import logging

@@ -14,10 +16,42 @@ from cognee.infrastructure.data.chunking.get_chunking_engine import get_chunk_en
from cognee.infrastructure.databases.graph.get_graph_engine import get_graph_engine
from cognee.infrastructure.files.utils.extract_text_from_file import extract_text_from_file
from cognee.infrastructure.files.utils.guess_file_type import guess_file_type, FileTypeException
from cognee.modules.topology.topology_data_models import NodeModel
from cognee.tasks.document_to_ontology.models.models import NodeModel

logger = logging.getLogger("topology")

from cognee.infrastructure.databases.graph.config import get_graph_config


from typing import Type
from pydantic import BaseModel
from cognee.infrastructure.llm.prompts import read_query_prompt
from cognee.infrastructure.llm.get_llm_client import get_llm_client


async def extract_topology(content: str, response_model: Type[BaseModel]):
    llm_client = get_llm_client()

    system_prompt = read_query_prompt("extract_topology.txt")

    llm_output = await llm_client.acreate_structured_output(content, system_prompt, response_model)

    return llm_output.model_dump()


async def infer_data_topology(content: str, graph_topology=None):
    if graph_topology is None:
        graph_config = get_graph_config()
        graph_topology = graph_config.graph_topology
    try:
        return (await extract_topology(
            content,
            graph_topology
        ))
    except Exception as error:
        logger.error("Error extracting topology from content: %s", error, exc_info = True)
        raise error

class TopologyEngine:
    def __init__(self, infer:bool) -> None:
        self.models: Dict[str, Type[BaseModel]] = {}

@@ -69,8 +103,6 @@ class TopologyEngine:
    async def add_graph_topology(self, file_path: str = None, files: list = None):
        """Add graph topology from a JSON or CSV file."""
        if self.infer:
            from cognee.modules.topology.infer_data_topology import infer_data_topology

            initial_chunks_and_ids = []

            chunk_config = get_chunk_config()

@@ -128,3 +160,23 @@ class TopologyEngine:
            return
        except Exception as e:
            raise RuntimeError(f"Failed to add graph topology from {file_path}: {e}") from e


async def document_to_ontology(data, root_node_id):
    cognee_config = get_cognify_config()
    graph_config = get_graph_config()
    root_node_id = None
    if graph_config.infer_graph_topology and graph_config.graph_topology_task:
        topology_engine = TopologyEngine(infer=graph_config.infer_graph_topology)
        root_node_id = await topology_engine.add_graph_topology(files=data)
    elif not graph_config.infer_graph_topology and graph_config.graph_topology_task:
        topology_engine = TopologyEngine(infer=graph_config.infer_graph_topology)
        await topology_engine.add_graph_topology(graph_config.topology_file_path)
    elif not graph_config.graph_topology_task:
        root_node_id = "ROOT"

    yield (data, root_node_id)
docs/index.md (325 changed lines)
@@ -1,58 +1,381 @@
# cognee

#### Deterministic LLM Outputs for AI Engineers

_Open-source framework for loading and structuring LLM context to create accurate and explainable AI solutions using knowledge graphs and vector stores_

---

[](https://twitter.com/tricalt)

[](https://pypi.python.org/pypi/cognee)

[](https://github.com/topoteretes/cognee)

### Let's learn about cogneeHub!

cogneeHub is a free, open-source learning platform for those interested in creating deterministic LLM outputs. We help developers by using graphs, LLMs, and adding vector retrieval to their Machine Learning stack.

- **Get started** — [Get started with cognee quickly and try it out for yourself.](quickstart.md)
- **Conceptual Overview** — Learn about the [core concepts](conceptual_overview.md) of cognee and how it fits into your projects.
- **Data Engineering and LLMOps** — Learn about some [data engineering and llmops](data_engineering_llm_ops.md) core concepts that will help you build better AI apps.
- **RAGs** — We provide easy-to-follow [learning materials](rags.md) to help you learn about RAGs.
- **Research** — A list of resources to help you learn more about [cognee and LLM memory research](research.md)
- **Blog** — A blog where you can read about the [latest news and updates](blog/index.md) about cognee.
- **Support** — [Book time](https://www.cognee.ai/#bookTime) with our team.

[//]: # (- **Case Studies** — Read about [case studies](case_studies.md) that show how cognee can be used in real-world applications.)

### Vision

### Architecture

### Why use cognee?

The question of whether to use cognee is fundamentally the question of why you want deterministic outputs from your LLM workflows.

1. **Cost-effective** — cognee extends the capabilities of your LLMs without the need for expensive data processing tools.
2. **Self-contained** — cognee runs as a library and is simple to use.
3. **Interpretable** — Navigate graphs instead of embeddings to understand your data.
4. **User Guided** — cognee lets you control your input and provide your own Pydantic data models (see the sketch below).
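A minimal sketch of such a user-provided model (the `Person` and `Organization` classes below are illustrative assumptions, not a schema shipped with cognee):

```python
from pydantic import BaseModel

# Illustrative custom graph model in the spirit of cognee's KnowledgeGraph.
class Person(BaseModel):
    name: str
    role: str

class Organization(BaseModel):
    name: str
    people: list[Person] = []

# A model like this could be passed wherever cognee accepts a graph_model, e.g.
# Task(chunks_into_graph_task, graph_model = Organization, collection_name = "entities").
```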
## License

This project is licensed under the terms of the Apache License 2.0.

[//]: # (<style>)
[//]: # (.container {)
[//]: # ( display: flex;)
[//]: # ( justify-content: space-around;)
[//]: # ( margin-top: 20px;)
[//]: # (})
[//]: # (.container div {)
[//]: # ( width: 28%;)
[//]: # ( padding: 20px;)
[//]: # ( box-sizing: border-box;)
[//]: # ( border: 1px solid #e0e0e0;)
[//]: # ( border-radius: 8px;)
[//]: # ( background-color: #f9f9f9;)
[//]: # (})
[//]: # (.container h2 {)
[//]: # ( font-size: 1.25em;)
[//]: # ( margin-bottom: 10px;)
[//]: # (})
[//]: # (.container p {)
[//]: # ( margin-bottom: 20px;)
[//]: # ( line-height: 1.6;)
[//]: # (})
[//]: # (.button-container {)
[//]: # ( text-align: center;)
[//]: # ( margin: 30px 0;)
[//]: # (})
[//]: # (.button-container a {)
[//]: # ( display: inline-block;)
[//]: # ( padding: 15px 25px;)
[//]: # ( background-color: #007bff;)
[//]: # ( color: white;)
[//]: # ( text-decoration: none;)
[//]: # ( border-radius: 5px;)
[//]: # ( font-size: 1em;)
[//]: # (})
[//]: # (.button-container a:hover {)
[//]: # ( background-color: #0056b3;)
[//]: # (})
[//]: # (.resources {)
[//]: # ( margin-top: 40px;)
[//]: # (})
[//]: # (.resources h2 {)
[//]: # ( font-size: 1.5em;)
[//]: # ( margin-bottom: 20px;)
[//]: # (})
[//]: # (.resources ul {)
[//]: # ( list-style: none;)
[//]: # ( padding: 0;)
[//]: # (})
[//]: # (.resources li {)
[//]: # ( margin-bottom: 10px;)
[//]: # (})
[//]: # (.resources a {)
[//]: # ( color: #007bff;)
[//]: # ( text-decoration: none;)
[//]: # (})
[//]: # (.resources a:hover {)
[//]: # ( text-decoration: underline;)
[//]: # (})
[//]: # (</style>)

[//]: # (# New to cognee?)
[//]: # (The getting started guide covers adding a GraphRAG data store to your AI app, sending events, identifying users, extracting actions and insights, and interconnecting separate datasets.)

[//]: # (<div class="button-container">)
[//]: # ( <a href="./quickstart.md">Get started</a>)
[//]: # (</div>)

[//]: # (<div class="container">)
[//]: # ( <div>)
[//]: # ( <h2>Ingest Data</h2>)
[//]: # ( <p>Learn how to manage ingestion of events, customer data or third party data for use with cognee.</p>)
[//]: # ( <a href="#">Explore</a>)
[//]: # ( </div>)
[//]: # ( <div>)
[//]: # ( <h2>Templates</h2>)
[//]: # ( <p>Analyze and enrich your data and improve LLM answers with a series of templates using cognee tasks and pipelines.</p>)
[//]: # ( <a href="#">Browse templates</a>)
[//]: # ( </div>)
[//]: # ( <div>)
[//]: # ( <h2>API</h2>)
[//]: # ( <p>Push or pull data to build custom functionality or create bespoke views for your business needs.</p>)
[//]: # ( <a href="#">Explore</a>)
[//]: # ( </div>)
[//]: # (</div>)

[//]: # (<div class="resources">)
[//]: # ( <h2>Resources</h2>)
[//]: # ( <ul>)
[//]: # ( <li><a href="#">What is GraphRAG</a></li>)
[//]: # ( <li><a href="#">Research</a></li>)
[//]: # ( <li><a href="#">Community</a></li>)
[//]: # ( <li><a href="#">API Reference</a></li>)
[//]: # ( <li><a href="#">Support</a></li>)
[//]: # ( </ul>)
[//]: # (</div>)