Refactor of the tasks

This commit is contained in:
Vasilije 2024-08-08 13:37:55 +02:00
parent 2e367198cd
commit 85160da387
21 changed files with 392 additions and 690 deletions

View file

@ -9,14 +9,6 @@ from cognee.modules.data.processing.document_types.AudioDocument import AudioDoc
from cognee.modules.data.processing.document_types.ImageDocument import ImageDocument
from cognee.shared.data_models import KnowledgeGraph
from cognee.modules.data.processing.document_types import PdfDocument, TextDocument
# from cognee.modules.cognify.vector import save_data_chunks
# from cognee.modules.data.processing.process_documents import process_documents
# from cognee.modules.classification.classify_text_chunks import classify_text_chunks
# from cognee.modules.data.extraction.data_summary.summarize_text_chunks import summarize_text_chunks
# from cognee.modules.data.processing.filter_affected_chunks import filter_affected_chunks
# from cognee.modules.data.processing.remove_obsolete_chunks import remove_obsolete_chunks
# from cognee.modules.data.extraction.knowledge_graph.expand_knowledge_graph import expand_knowledge_graph
# from cognee.modules.data.extraction.knowledge_graph.establish_graph_topology import establish_graph_topology
from cognee.modules.data.models import Dataset, Data
from cognee.modules.data.operations.get_dataset_data import get_dataset_data
from cognee.modules.data.operations.retrieve_datasets import retrieve_datasets
@ -31,6 +23,7 @@ from cognee.tasks.chunk_extract_summary.chunk_extract_summary import chunk_extra
from cognee.tasks.chunk_naive_llm_classifier.chunk_naive_llm_classifier import chunk_naive_llm_classifier_task
from cognee.tasks.chunk_remove_disconnected.chunk_remove_disconnected import chunk_remove_disconnected_task
from cognee.tasks.chunk_to_graph_decomposition.chunk_to_graph_decomposition import chunk_to_graph_decomposition_task
from cognee.tasks.document_to_ontology.document_to_ontology import document_to_ontology
from cognee.tasks.save_chunks_to_store.save_chunks_to_store import save_chunks_to_store_task
from cognee.tasks.chunk_update_check.chunk_update_check import chunk_update_check_task
from cognee.tasks.chunks_into_graph.chunks_into_graph import \
@ -96,19 +89,20 @@ async def cognify(datasets: Union[str, list[str]] = None, user: User = None):
cognee_config = get_cognify_config()
graph_config = get_graph_config()
root_node_id = None
if graph_config.infer_graph_topology and graph_config.graph_topology_task:
from cognee.modules.topology.topology import TopologyEngine
topology_engine = TopologyEngine(infer=graph_config.infer_graph_topology)
root_node_id = await topology_engine.add_graph_topology(files = data)
elif graph_config.infer_graph_topology and not graph_config.infer_graph_topology:
from cognee.modules.topology.topology import TopologyEngine
topology_engine = TopologyEngine(infer=graph_config.infer_graph_topology)
await topology_engine.add_graph_topology(graph_config.topology_file_path)
elif not graph_config.graph_topology_task:
root_node_id = "ROOT"
#
# if graph_config.infer_graph_topology and graph_config.graph_topology_task:
# from cognee.modules.topology.topology import TopologyEngine
# topology_engine = TopologyEngine(infer=graph_config.infer_graph_topology)
# root_node_id = await topology_engine.add_graph_topology(files = data)
# elif graph_config.infer_graph_topology and not graph_config.infer_graph_topology:
# from cognee.modules.topology.topology import TopologyEngine
# topology_engine = TopologyEngine(infer=graph_config.infer_graph_topology)
# await topology_engine.add_graph_topology(graph_config.topology_file_path)
# elif not graph_config.graph_topology_task:
# root_node_id = "ROOT"
tasks = [
Task(document_to_ontology, root_node_id = root_node_id),
Task(source_documents_to_chunks, parent_node_id = root_node_id), # Classify documents and save them as a nodes in graph db, extract text chunks based on the document type
Task(chunk_to_graph_decomposition_task, topology_model = KnowledgeGraph, task_config = { "batch_size": 10 }), # Set the graph topology for the document chunk data
Task(chunks_into_graph_task, graph_model = KnowledgeGraph, collection_name = "entities"), # Generate knowledge graphs from the document chunks and attach it to chunk nodes

View file

@ -1,152 +0,0 @@
import asyncio
from uuid import uuid5, NAMESPACE_OID
from typing import Type
from pydantic import BaseModel
from cognee.infrastructure.databases.graph import get_graph_engine
from cognee.infrastructure.databases.vector import get_vector_engine, DataPoint
from cognee.modules.data.processing.chunk_types.DocumentChunk import DocumentChunk
from ..data.extraction.extract_categories import extract_categories
async def chunk_naive_llm_classifier(data_chunks: list[DocumentChunk], classification_model: Type[BaseModel]):
    """Classify each chunk with the LLM and persist the labels.

    For every chunk, extract_categories returns a classification whose label has
    a ``type`` and a list of ``subclass`` values. Each distinct label becomes a
    ``Keyword`` data point in the "classification" vector collection and a node
    in the graph, linked to the chunk via ``is_media_type`` /
    ``is_classified_as`` edges. Returns *data_chunks* unchanged so the task can
    be chained in a pipeline.
    """
    if len(data_chunks) == 0:
        return data_chunks

    chunk_classifications = await asyncio.gather(
        *[extract_categories(chunk.text, classification_model) for chunk in data_chunks],
    )

    # Collect the deterministic UUIDs of every label (type + subclasses) so we
    # can ask the vector store in one round-trip which already exist.
    classification_data_points = []

    for chunk_index, chunk in enumerate(data_chunks):
        chunk_classification = chunk_classifications[chunk_index]
        # Fix: the original appended the type UUID twice in a row; appending it
        # once is enough — the retrieve() call below deduplicates via set() anyway.
        classification_data_points.append(uuid5(NAMESPACE_OID, chunk_classification.label.type))

        for classification_subclass in chunk_classification.label.subclass:
            classification_data_points.append(uuid5(NAMESPACE_OID, classification_subclass.value))

    vector_engine = get_vector_engine()

    class Keyword(BaseModel):
        # Payload schema for one classification label stored in the vector store.
        uuid: str
        text: str
        chunk_id: str
        document_id: str

    collection_name = "classification"

    if await vector_engine.has_collection(collection_name):
        existing_data_points = await vector_engine.retrieve(
            collection_name,
            list(set(classification_data_points)),
        ) if len(classification_data_points) > 0 else []

        existing_points_map = {point.id: True for point in existing_data_points}
    else:
        existing_points_map = {}
        await vector_engine.create_collection(collection_name, payload_schema = Keyword)

    data_points = []
    nodes = []
    edges = []

    for (chunk_index, data_chunk) in enumerate(data_chunks):
        chunk_classification = chunk_classifications[chunk_index]
        classification_type_label = chunk_classification.label.type
        classification_type_id = uuid5(NAMESPACE_OID, classification_type_label)

        # Create the media-type node/data point only the first time we see it.
        if classification_type_id not in existing_points_map:
            data_points.append(
                DataPoint[Keyword](
                    id = str(classification_type_id),
                    payload = Keyword.parse_obj({
                        "uuid": str(classification_type_id),
                        "text": classification_type_label,
                        "chunk_id": str(data_chunk.chunk_id),
                        "document_id": str(data_chunk.document_id),
                    }),
                    embed_field = "text",
                )
            )

            nodes.append((
                str(classification_type_id),
                dict(
                    id = str(classification_type_id),
                    name = classification_type_label,
                    type = classification_type_label,
                )
            ))
            existing_points_map[classification_type_id] = True

        # Link every chunk to its media-type node (not only when the node is new).
        edges.append((
            str(data_chunk.chunk_id),
            str(classification_type_id),
            "is_media_type",
            dict(
                relationship_name = "is_media_type",
                source_node_id = str(data_chunk.chunk_id),
                target_node_id = str(classification_type_id),
            ),
        ))

        for classification_subclass in chunk_classification.label.subclass:
            classification_subtype_label = classification_subclass.value
            classification_subtype_id = uuid5(NAMESPACE_OID, classification_subtype_label)

            if classification_subtype_id not in existing_points_map:
                data_points.append(
                    DataPoint[Keyword](
                        id = str(classification_subtype_id),
                        payload = Keyword.parse_obj({
                            "uuid": str(classification_subtype_id),
                            "text": classification_subtype_label,
                            "chunk_id": str(data_chunk.chunk_id),
                            "document_id": str(data_chunk.document_id),
                        }),
                        embed_field = "text",
                    )
                )

                nodes.append((
                    str(classification_subtype_id),
                    dict(
                        id = str(classification_subtype_id),
                        name = classification_subtype_label,
                        type = classification_subtype_label,
                    )
                ))

                # NOTE(review): the edge label is "is_subtype_of" (subtype -> type)
                # but the payload says relationship_name = "contains" with the
                # reverse direction — confirm which one the graph engine uses.
                edges.append((
                    str(classification_subtype_id),
                    str(classification_type_id),
                    "is_subtype_of",
                    dict(
                        relationship_name = "contains",
                        source_node_id = str(classification_type_id),
                        target_node_id = str(classification_subtype_id),
                    ),
                ))

                existing_points_map[classification_subtype_id] = True

            edges.append((
                str(data_chunk.chunk_id),
                str(classification_subtype_id),
                "is_classified_as",
                dict(
                    relationship_name = "is_classified_as",
                    source_node_id = str(data_chunk.chunk_id),
                    target_node_id = str(classification_subtype_id),
                ),
            ))

    if len(nodes) > 0 or len(edges) > 0:
        await vector_engine.create_data_points(collection_name, data_points)

        graph_engine = await get_graph_engine()
        await graph_engine.add_nodes(nodes)
        await graph_engine.add_edges(edges)

    return data_chunks

View file

@ -1 +0,0 @@
from .save_data_chunks import save_data_chunks

View file

@ -1,97 +0,0 @@
from cognee.infrastructure.databases.vector import DataPoint, get_vector_engine
from cognee.infrastructure.databases.graph import get_graph_engine
from cognee.modules.data.processing.chunk_types.DocumentChunk import DocumentChunk
async def save_data_chunks(data_chunks: list[DocumentChunk], collection_name: str):
    """Persist document chunks to the vector store and the graph store.

    Existing versions of the incoming chunks are first deleted from the vector
    collection and unlinked in the graph; every chunk is then (re)inserted as a
    vector data point and as a graph node, wired to its document ("has_chunk")
    and to its predecessor ("next_chunk"). Returns *data_chunks* for chaining.
    """
    if len(data_chunks) == 0:
        return data_chunks

    vector_engine = get_vector_engine()
    graph_engine = await get_graph_engine()

    # Remove and unlink existing chunks
    if await vector_engine.has_collection(collection_name):
        # parse_obj is the pydantic v1 API; stored payloads are DocumentChunk dicts.
        existing_chunks = [DocumentChunk.parse_obj(chunk.payload) for chunk in (await vector_engine.retrieve(
            collection_name,
            [str(chunk.chunk_id) for chunk in data_chunks],
        ))]

        if len(existing_chunks) > 0:
            await vector_engine.delete_data_points(collection_name, [str(chunk.chunk_id) for chunk in existing_chunks])

            # Drop chain/containment edges so the re-inserted chunks start clean.
            await graph_engine.remove_connection_to_successors_of([chunk.chunk_id for chunk in existing_chunks], "next_chunk")
            await graph_engine.remove_connection_to_predecessors_of([chunk.chunk_id for chunk in existing_chunks], "has_chunk")
    else:
        await vector_engine.create_collection(collection_name, payload_schema = DocumentChunk)

    # Add to vector storage
    await vector_engine.create_data_points(
        collection_name,
        [
            DataPoint[DocumentChunk](
                id = str(chunk.chunk_id),
                payload = chunk,
                embed_field = "text",
            ) for chunk in data_chunks
        ],
    )

    # Add to graph storage
    chunk_nodes = []
    chunk_edges = []

    for chunk in data_chunks:
        chunk_nodes.append((
            str(chunk.chunk_id),
            dict(
                id = str(chunk.chunk_id),
                chunk_id = str(chunk.chunk_id),
                document_id = str(chunk.document_id),
                word_count = chunk.word_count,
                chunk_index = chunk.chunk_index,
                cut_type = chunk.cut_type,
                pages = chunk.pages,
            )
        ))

        chunk_edges.append((
            str(chunk.document_id),
            str(chunk.chunk_id),
            "has_chunk",
            dict(
                relationship_name = "has_chunk",
                source_node_id = str(chunk.document_id),
                target_node_id = str(chunk.chunk_id),
            ),
        ))

        # First chunk links back to the document; later chunks to their predecessor.
        previous_chunk_id = get_previous_chunk_id(data_chunks, chunk)

        if previous_chunk_id is not None:
            chunk_edges.append((
                str(previous_chunk_id),
                str(chunk.chunk_id),
                "next_chunk",
                dict(
                    relationship_name = "next_chunk",
                    source_node_id = str(previous_chunk_id),
                    target_node_id = str(chunk.chunk_id),
                ),
            ))

    await graph_engine.add_nodes(chunk_nodes)
    await graph_engine.add_edges(chunk_edges)

    return data_chunks
def get_previous_chunk_id(document_chunks: list[DocumentChunk], current_chunk: DocumentChunk):
    """Return the id of the node preceding *current_chunk* in its document chain.

    Fix: the original annotated the return type as ``DocumentChunk``, but the
    function actually returns an id:
      - the chunk's ``document_id`` when it is the first chunk (chunk_index == 0),
      - the previous chunk's ``chunk_id`` when a chunk with the same document_id
        and chunk_index - 1 is present in *document_chunks*,
      - ``None`` when no predecessor can be found.
    """
    if current_chunk.chunk_index == 0:
        # The first chunk links back to its parent document node.
        return current_chunk.document_id

    for chunk in document_chunks:
        if str(chunk.document_id) == str(current_chunk.document_id) \
                and chunk.chunk_index == current_chunk.chunk_index - 1:
            return chunk.chunk_id

    return None

View file

@ -1,20 +0,0 @@
from typing import Type
from pydantic import BaseModel
from cognee.shared.data_models import KnowledgeGraph
from cognee.infrastructure.databases.graph import get_graph_engine
from ...processing.chunk_types.DocumentChunk import DocumentChunk
from .add_model_class_to_graph import add_model_class_to_graph
async def chunk_to_graph_decomposition(data_chunks: list[DocumentChunk], topology_model: Type[BaseModel]):
    """Install *topology_model*'s class structure in the graph store.

    The default KnowledgeGraph topology needs no preparation, so it is skipped;
    any other pydantic model is decomposed into graph nodes via
    add_model_class_to_graph. The chunks pass through untouched.
    """
    if topology_model != KnowledgeGraph:
        engine = await get_graph_engine()
        await add_model_class_to_graph(topology_model, engine)

    return data_chunks
def generate_node_id(node_id: str) -> str:
    """Normalize an identifier: uppercase, spaces to underscores, apostrophes removed."""
    normalization = str.maketrans({" ": "_", "'": None})
    return node_id.translate(normalization).upper()

View file

@ -1,218 +0,0 @@
import json
import asyncio
from uuid import uuid5, NAMESPACE_OID
from datetime import datetime, timezone
from typing import Type
from pydantic import BaseModel
from cognee.infrastructure.databases.graph import get_graph_engine
from cognee.infrastructure.databases.vector import DataPoint, get_vector_engine
from ...processing.chunk_types.DocumentChunk import DocumentChunk
from .extract_knowledge_graph import extract_content_graph
class EntityNode(BaseModel):
    """Vector-store payload schema for entity and entity-type nodes.

    Instances are built in expand_knowledge_graph; created_at/updated_at are
    supplied as "%Y-%m-%d %H:%M:%S" UTC strings and coerced by pydantic.
    """
    uuid: str  # normalized node id (lowercased, spaces -> "_", apostrophes dropped)
    name: str  # normalized display name
    type: str  # entity type label (for type nodes this holds the type node id)
    description: str
    created_at: datetime
    updated_at: datetime
async def expand_knowledge_graph(data_chunks: list[DocumentChunk], graph_model: Type[BaseModel], collection_name: str):
    """Extract a knowledge graph from every chunk and persist it.

    Runs extract_content_graph over each chunk, then writes the resulting
    entity / entity-type nodes into the graph store and (as EntityNode payloads)
    into the vector collection *collection_name*, deduplicating against nodes
    and edges already present. Returns *data_chunks* unchanged for chaining.
    """
    chunk_graphs = await asyncio.gather(
        *[extract_content_graph(chunk.text, graph_model) for chunk in data_chunks]
    )

    vector_engine = get_vector_engine()
    graph_engine = await get_graph_engine()

    has_collection = await vector_engine.has_collection(collection_name)

    if not has_collection:
        await vector_engine.create_collection(collection_name, payload_schema = EntityNode)

    # First pass: gather candidate edges so their existence can be checked in bulk.
    processed_nodes = {}
    type_node_edges = []
    entity_node_edges = []
    type_entity_edges = []

    for (chunk_index, chunk) in enumerate(data_chunks):
        chunk_graph = chunk_graphs[chunk_index]
        for node in chunk_graph.nodes:
            type_node_id = generate_node_id(node.type)
            entity_node_id = generate_node_id(node.id)

            if type_node_id not in processed_nodes:
                type_node_edges.append((str(chunk.chunk_id), type_node_id, "contains_entity_type"))
                processed_nodes[type_node_id] = True

            if entity_node_id not in processed_nodes:
                entity_node_edges.append((str(chunk.chunk_id), entity_node_id, "contains_entity"))
                type_entity_edges.append((entity_node_id, type_node_id, "is_entity_type"))
                processed_nodes[entity_node_id] = True

        # NOTE(review): rebound on every iteration, so only the LAST chunk's
        # graph edges reach the has_edges() check below — looks unintentional;
        # confirm whether these should accumulate across chunks.
        graph_node_edges = [
            (edge.source_node_id, edge.target_node_id, edge.relationship_name) \
            for edge in chunk_graph.edges
        ]

    # Ask the graph store which candidate edges already exist.
    existing_edges = await graph_engine.has_edges([
        *type_node_edges,
        *entity_node_edges,
        *type_entity_edges,
        *graph_node_edges,
    ])

    existing_edges_map = {}
    existing_nodes_map = {}

    for edge in existing_edges:
        # Dedup key is the plain concatenation: source + target + relationship.
        existing_edges_map[edge[0] + edge[1] + edge[2]] = True
        existing_nodes_map[edge[0]] = True

    # Second pass: build only the nodes/edges/data points that are missing.
    graph_nodes = []
    graph_edges = []
    data_points = []

    for (chunk_index, chunk) in enumerate(data_chunks):
        graph = chunk_graphs[chunk_index]
        if graph is None:
            continue

        for node in graph.nodes:
            node_id = generate_node_id(node.id)
            node_name = generate_name(node.name)

            type_node_id = generate_node_id(node.type)
            type_node_name = generate_name(node.type)

            if node_id not in existing_nodes_map:
                node_data = dict(
                    uuid = node_id,
                    name = node_name,
                    # NOTE(review): type is set to node_name, not type_node_name —
                    # possibly a bug; confirm intended value.
                    type = node_name,
                    description = node.description,
                    created_at = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"),
                    updated_at = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"),
                )

                graph_nodes.append((
                    node_id,
                    dict(
                        **node_data,
                        properties = json.dumps(node.properties),
                    )
                ))

                data_points.append(DataPoint[EntityNode](
                    id = str(uuid5(NAMESPACE_OID, node_id)),
                    payload = node_data,
                    embed_field = "name",
                ))

                existing_nodes_map[node_id] = True

            edge_key = str(chunk.chunk_id) + node_id + "contains_entity"

            if edge_key not in existing_edges_map:
                graph_edges.append((
                    str(chunk.chunk_id),
                    node_id,
                    "contains_entity",
                    dict(
                        relationship_name = "contains_entity",
                        source_node_id = str(chunk.chunk_id),
                        target_node_id = node_id,
                    ),
                ))

                # Add relationship between entity type and entity itself: "Jake is Person"
                # NOTE(review): tuple order is (node_id, type_node_id) but the
                # payload swaps source/target — confirm which one the graph
                # engine trusts.
                graph_edges.append((
                    node_id,
                    type_node_id,
                    "is_entity_type",
                    dict(
                        relationship_name = "is_entity_type",
                        source_node_id = type_node_id,
                        target_node_id = node_id,
                    ),
                ))

                existing_edges_map[edge_key] = True

            if type_node_id not in existing_nodes_map:
                type_node_data = dict(
                    uuid = type_node_id,
                    name = type_node_name,
                    type = type_node_id,
                    description = type_node_name,
                    created_at = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"),
                    updated_at = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"),
                )

                graph_nodes.append((type_node_id, dict(
                    **type_node_data,
                    properties = json.dumps(node.properties)
                )))

                data_points.append(DataPoint[EntityNode](
                    id = str(uuid5(NAMESPACE_OID, type_node_id)),
                    payload = type_node_data,
                    embed_field = "name",
                ))

                existing_nodes_map[type_node_id] = True

            edge_key = str(chunk.chunk_id) + type_node_id + "contains_entity_type"

            if edge_key not in existing_edges_map:
                graph_edges.append((
                    str(chunk.chunk_id),
                    type_node_id,
                    "contains_entity_type",
                    dict(
                        relationship_name = "contains_entity_type",
                        source_node_id = str(chunk.chunk_id),
                        target_node_id = type_node_id,
                    ),
                ))

                existing_edges_map[edge_key] = True

        # Add relationship that came from graphs.
        for edge in graph.edges:
            source_node_id = generate_node_id(edge.source_node_id)
            target_node_id = generate_node_id(edge.target_node_id)
            relationship_name = generate_name(edge.relationship_name)
            edge_key = source_node_id + target_node_id + relationship_name

            if edge_key not in existing_edges_map:
                graph_edges.append((
                    generate_node_id(edge.source_node_id),
                    generate_node_id(edge.target_node_id),
                    edge.relationship_name,
                    dict(
                        relationship_name = generate_name(edge.relationship_name),
                        source_node_id = generate_node_id(edge.source_node_id),
                        target_node_id = generate_node_id(edge.target_node_id),
                        properties = json.dumps(edge.properties),
                    ),
                ))
                existing_edges_map[edge_key] = True

    if len(data_points) > 0:
        await vector_engine.create_data_points(collection_name, data_points)

    if len(graph_nodes) > 0:
        await graph_engine.add_nodes(graph_nodes)

    if len(graph_edges) > 0:
        await graph_engine.add_edges(graph_edges)

    return data_chunks
def generate_name(name: str) -> str:
    """Normalize a display name: lowercase, spaces to underscores, apostrophes removed."""
    return name.lower().translate(str.maketrans({" ": "_", "'": ""}))
def generate_node_id(node_id: str) -> str:
    """Canonicalize a node id: lowercase, spaces to underscores, apostrophes stripped."""
    cleaned = node_id.replace("'", "").replace(" ", "_")
    return cleaned.lower()

View file

@ -1,4 +0,0 @@
from .extract_content_graph import extract_content_graph
async def extract_knowledge_graph(text: str, cognitive_layer, graph_model):
    """Thin async wrapper delegating knowledge-graph extraction to extract_content_graph.

    NOTE(review): this forwards three positional arguments; elsewhere in this
    commit extract_content_graph is called with two (text, graph_model) —
    confirm the signature of the imported implementation.
    """
    return await extract_content_graph(text, cognitive_layer, graph_model)

View file

@ -1,91 +0,0 @@
from typing import List
import dspy
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from cognee.infrastructure.llm import get_llm_config
from cognee.shared.data_models import KnowledgeGraph, Node, Edge
from cognee.shared.utils import trim_text_to_max_tokens
# """Instructions:
# You are a top-tier algorithm designed for extracting information from text in structured formats to build a knowledge graph.
# - **Nodes** represent entities and concepts. They're akin to Wikipedia nodes.
# - **Edges** represent relationships between concepts. They're akin to Wikipedia links.
# Extract as much information as you can from the text and build a detailed knowledge graph.
# If question is provided, make sure that the information to answer the question is present in the graph."""
class GraphFromText(dspy.Signature):
    # NOTE: in dspy, a Signature's docstring is the instruction prompt sent to
    # the LLM — the text below is runtime behavior, not documentation.
    """Instructions:
    You are a top-tier algorithm designed for extracting information from text in structured formats to build a knowledge graph.
    - **Nodes** represent entities and concepts, akin to Wikipedia nodes.
    - **Edges** represent relationships between entities and concepts, akin to Wikipedia hyperlinks.
    Extract information from the text and build a detailed knowledge graph."""

    text: str = dspy.InputField()  # raw input text to mine for entities/relations
    graph: KnowledgeGraph = dspy.OutputField()  # structured graph extracted from `text`
def are_all_nodes_and_edges_valid(graph: KnowledgeGraph) -> bool:
    """Return True when every node has a non-blank type and name and every
    edge has a non-blank relationship_name (missing attributes count as blank)."""
    for node in graph.nodes:
        if getattr(node, "type", "").strip() == "" or getattr(node, "name", "").strip() == "":
            return False

    for edge in graph.edges:
        if getattr(edge, "relationship_name", "").strip() == "":
            return False

    return True
def is_node_connected(node: Node, edges: List[Edge]) -> bool:
    """Return True when *node* appears as the source or the target of any edge."""
    for edge in edges:
        if node.id in (edge.source_node_id, edge.target_node_id):
            return True
    return False
def are_all_nodes_connected(graph: KnowledgeGraph) -> bool:
    """Return True when every node participates in at least one edge of the graph."""
    for node in graph.nodes:
        if not is_node_connected(node, graph.edges):
            return False
    return True
class ExtractKnowledgeGraph(dspy.Module):
    """dspy module that turns free text into a KnowledgeGraph prediction.

    Fix: the default language model was previously built inside the parameter
    default (``lm = dspy.OpenAI(...)``), which ran once at class-definition
    (i.e. import) time and instantiated an OpenAI client even when a custom
    ``lm`` was always passed. The default is now constructed lazily in
    ``__init__``; passing ``lm`` explicitly behaves exactly as before.
    """

    # NOTE: evaluated once at class-creation time, when the module is imported.
    llm_config = get_llm_config()

    def __init__(self, lm = None):
        """Configure dspy with *lm* (or a default OpenAI chat client) and
        prepare the typed chain-of-thought graph generator."""
        super().__init__()

        if lm is None:
            lm = dspy.OpenAI(
                model = self.llm_config.llm_model,
                api_key = self.llm_config.llm_api_key,
                model_type = "chat",
                max_tokens = 4096,
            )

        self.lm = lm
        dspy.settings.configure(lm = self.lm)
        self.generate_graph = dspy.TypedChainOfThought(GraphFromText)
        # Needed by remove_stop_words; quiet download is a no-op when cached.
        nltk.download("stopwords", quiet = True)

    def forward(self, context: str, question: str):
        """Extract a knowledge graph from *context* and return it as a Prediction.

        The context is stripped of stop words and trimmed to 1500 tokens before
        being handed to the LLM; a dspy.Suggest nudges the model to fill the
        mandatory node/edge fields.
        """
        context = remove_stop_words(context)
        context = trim_text_to_max_tokens(context, 1500, self.llm_config.llm_model)

        with dspy.context(lm = self.lm):
            graph = self.generate_graph(text = context).graph

            not_valid_nodes_or_edges_message = """
            All nodes must contain "name".
            All edges must contain "relationship_name".
            Please add mandatory fields to nodes and edges."""

            dspy.Suggest(are_all_nodes_and_edges_valid(graph), not_valid_nodes_or_edges_message)

            # Disabled check kept for reference: require a fully connected graph.
            # not_connected_graph_message = """
            # Output must be a graph that has all nodes connected to it.
            # Please find a relation and connect nodes or remove them."""
            # dspy.Suggest(are_all_nodes_connected(graph), not_connected_graph_message)

            return dspy.Prediction(context = context, graph = graph)
def remove_stop_words(text):
    """Drop English stop words from *text* and return the remaining tokens joined by spaces."""
    english_stop_words = set(stopwords.words("english"))
    kept_tokens = (token for token in word_tokenize(text) if token.lower() not in english_stop_words)
    return " ".join(kept_tokens)
#
# if __name__ == "__main__":
# gpt_4_turbo = dspy.OpenAI(model="gpt-4", max_tokens=4000, api_key=config.llm_api_key, model_type="chat")
# dspy.settings.configure(lm=gpt_4_turbo)
# extract_knowledge_graph = ExtractKnowledgeGraph(lm=gpt_4_turbo)
# # graph_text = extract_knowledge_graph("cognitive_layer", "text")
# graph = extract_knowledge_graph("analysis_layer", """A large language model (LLM) is a language model notable for its ability to achieve general-purpose language generation and other natural language processing tasks such as classification. LLMs acquire these abilities by learning statistical relationships from text documents during a computationally intensive self-supervised and semi-supervised training process. LLMs can be used for text generation, a form of generative AI, by taking an input text and repeatedly predicting the next token or word.
# LLMs are artificial neural networks. The largest and most capable, as of March 2024""", question="What is a large language model?")
# print("GPT4 History:", gpt_4_turbo.inspect_history(n=1))
# print(graph)
#

View file

@ -1,25 +0,0 @@
[
{
"node_id": "062c22dfd99b599f90cd2d325c8bcf69",
"name": "062c22df-d99b-599f-90cd-2d325c8bcf69",
"default_relationship": {
"type": "related_to",
"source": "062c22dfd99b599f90cd2d325c8bcf69",
"target": "6dfe01b607d25b7783c81d6c11ce2aa7"
},
"children": [
{
"node_id": "6dfe01b607d25b7783c81d6c11ce2aa7",
"name": "6dfe01b6-07d2-5b77-83c8-1d6c11ce2aa7",
"default_relationship": {
"type": "related_to",
"source": "6dfe01b6-07d2-5b77-83c8-1d6c11ce2aa7",
"target": "a27bb4fa897e53a594cab446e1d33dbf"
},
"children": []
}
]
}
]

View file

@ -1,6 +0,0 @@
A quantum computer is a computer that takes advantage of quantum mechanical phenomena.
At small scales, physical matter exhibits properties of both particles and waves, and quantum computing leverages this behavior, specifically quantum superposition and entanglement, using specialized hardware that supports the preparation and manipulation of quantum states.
Classical physics cannot explain the operation of these quantum devices, and a scalable quantum computer could perform some calculations exponentially faster (with respect to input size scaling) than any modern "classical" computer. In particular, a large-scale quantum computer could break widely used encryption schemes and aid physicists in performing physical simulations; however, the current state of the technology is largely experimental and impractical, with several obstacles to useful applications. Moreover, scalable quantum computers do not hold promise for many practical tasks, and for many important tasks quantum speedups are proven impossible.
The basic unit of information in quantum computing is the qubit, similar to the bit in traditional digital electronics. Unlike a classical bit, a qubit can exist in a superposition of its two "basis" states. When measuring a qubit, the result is a probabilistic output of a classical bit, therefore making quantum computers nondeterministic in general. If a quantum computer manipulates the qubit in a particular way, wave interference effects can amplify the desired measurement results. The design of quantum algorithms involves creating procedures that allow a quantum computer to perform calculations efficiently and quickly.
Physically engineering high-quality qubits has proven challenging. If a physical qubit is not sufficiently isolated from its environment, it suffers from quantum decoherence, introducing noise into calculations. Paradoxically, perfectly isolating qubits is also undesirable because quantum computations typically need to initialize qubits, perform controlled qubit interactions, and measure the resulting quantum states. Each of those operations introduces errors and suffers from noise, and such inaccuracies accumulate.
In principle, a non-quantum (classical) computer can solve the same computational problems as a quantum computer, given enough time. Quantum advantage comes in the form of time complexity rather than computability, and quantum complexity theory shows that some quantum algorithms for carefully selected tasks require exponentially fewer computational steps than the best known non-quantum algorithms. Such tasks can in theory be solved on a large-scale quantum computer whereas classical computers would not finish computations in any reasonable amount of time. However, quantum speedup is not universal or even typical across computational tasks, since basic tasks such as sorting are proven to not allow any asymptotic quantum speedup. Claims of quantum supremacy have drawn significant attention to the discipline, but are demonstrated on contrived tasks, while near-term practical use cases remain limited.

View file

@ -1,16 +0,0 @@
U analizi komunikacionih sistema kod životinja zadržaćemo se samo na semiotičkim problemima postoje li u pojedinim sistemima njihove komunikacije ZNACI, semiotički SISTEMI i neke semiotičke OPERACIJE, u onom smislu kako su ti pojmovi definisani i utvrđeni kod ljudi. Analiziraćemo sličnosti i razlike između komunikacije kod životinja i kod ljudi, posebno semiotičke komunikacije kod čoveka.
Kada se ima u vidu bogatstvo oblika komunikativnih veza među životinjama: sva raznolikost signala u pogledu fizičkih svojstava – hemijski, olfaktivni (mirisni), akustički (uključiv i ultrazvukove), električni, motorički (kinezički), proksemički (položaj u prostoru), vizuelni i drugi, zatim raznovrsnost kanala (sredina) kroz koje se ostvaruje veza, kao i raznovrsnost funkcija koje imaju komunikativni sistemi, pitanje je koliko je uopšte opravdano govoriti o komunikaciji životinja u celini.
Međutim, kada se pristupi semiotičkoj analizi sistema komunikacije među životinjama, iza raznolikosti nalazi se prilična jednoličnost, čak tolika da se ne može utvrditi postoji li nekakvo usavršavanje sistema komunikacije duž evolucione lestvice.
Pogledajmo najpre kakve FUNKCIJE opslužuju sistemi komunikacija kod životinja. Poznati istraživač ovih problema, Marler, ovako rezimira analizu komunikacije među nižim i višim majmunima: „U velikoj većini, celokupni sistem komunikacije izgleda postoji radi organizacije socijalnog ponašanja grupe, regulacije dominantnosti i subordinacije, održanja mira i kohezije u grupi, kao i radi reprodukcije i brige o mladima (Marleu, 1967). Pomenute funkcije mogle bi se, nešto raščlanjenije, ovako opisati:
Postoje, najpre, kod nekih vrsta signali za identifikaciju pojedinaca (npr. parovi mužjaka i ženki kod ptica pevačica mogu ostati u stalnoj vezi tokom cele jedne godine i međusobno se identifikuju pomoću pevanja, ponekada u vidu dueta koji može izvesti samo određeni par) ili za identifikaciju vrste (npr. pčele-stražari ubijaju na ulazu u košnicu svaku jedinku koja se na osnovu signala ne može identifikovati kao član tog pčelinjeg društva).
Najbrojniji i najraznovrsniji su signali koji saopštavaju o motivacionim i afektivnim stanjima jedinke i o promenama tih stanja, a često i o suptilnim nijansama raspoloženja. Ta vrsta signala kazuje o gladi, seksualnim potrebama, ugroženosti, strahu, boli, uzbuđenju, naklonosti, neprijateljstvu i agresivnosti, o zadovoljstvu i o svim varijacijama ovakvih motivacionih stanja i raspoloženja.
Izuzetnu biološku vrednost imaju signali koji regulišu međusobne odnose jedinki ili odnose u grupi kao celini. Podsticanje, dozivanje i približavanje partnera, privlačenje i parenje, međusobno prepoznavanje i saradnja između roditelja i mladunčadi nemogući su bez nekog sistema signalizacije. Određivanje zauzete teritorije, okupljanje grupe, vođstvo i određivanje statusa u grupi, dominacija i potčinjavanje, organizovanje kolonija to su samo neke od socijalnih funkcija koje poslužuju sistemi komunikacije.
U svim sistemima komunikacije među životinjama upadljivo je najmanje onih poruka koje govore o okolini u kojoj životinja živi; samo kod pojedinih vrsta postoje signali koji saopštavaju o postojanju ili lokalizaciji napadača, o hrani ili nalazištu hrane, o lokaciji staništa.
Kada se ima u vidu samo ono što je ovde pobrojano, stiče se utisak o bogatstvu informacija koje mogu preneti komunikativni sistemi životinja. Međutim, za pun uvid u prirodu tih sistema potrebno je videti na koji način sistemi komuniciranja kod životinja obavljaju te funkcije. Obično se kaže da ovi sistemi imaju, pre svega ili isključivo socijalne funkcije. To je tačno, ali pod uslovom da se prethodno razjasni šta ovde znači socijalno. Funkcije tih sistema su socijalne najpre su smislu da signali UTIČU NA DRUGE jedinke. I upravo tako, utiču na druge jedinke, a ne upućeni su drugim jedinkama. U stvari, u razvoju odnosa među jedinkama tokom evolucije izgrađuje se svojevrsna socijalna simbioza, u kojoj neki vidljivi pokazatelji ponašanja jedne jedinke postaju obaveštenja (signali) o njenim motivacionim stanjima. Dakle, signali su (za razliku od SIMBOLA) samo pokazatelji i sastavni delovi motivacionih, afektivnih ili nekih drugih unutrašnjih stanja jedinke. I baš zbog toga se ne može reći da jedinka upućuje drugoj signale, ona prosto doživljava to što doživljava. Neke komponente doživljaja dostupne su opažanju drugih jedinki i u toku zajedničkog života postaju signali određenih stanja. Tokom evolucije ti signali se stabilizuju, stilizuju (ritualizuju) i prerađuju u određeni sistem komunikacije.
U tako stvorenoj socijalnoj simbiozi, signali koje upućuju životinje jedne drugima pre su nalozi za izvođenje određenih radnji, tj. pokretači ili inhibitori radnji nego saopštene informacije. Hormonalne i druge promene u organizmu koje dovode do pojave signala u jednoj jedinki skoro automatski u određenim uslovima pokreću lanac hormonalnih i ostalih promena u jedinki koja prima signale, a te promene kod njih izazivaju određene radnje. Dakle, komunikativni sistemi imaju socijalne funkcije zato što menjaju ponašanje drugih jedinki. A to dalje znači da jedina jedinka kojoj životinja ne upućuje signale jeste ona sama, i to je jedna od suštinskih razlika semiotičke komunikacije čoveka i komunikacije među žvotinjama.
Iz prethodno opisanog sledi i ovo: signal retko kada ima isto značenje za jedinku koja ga emituje i jedinku koja ga prima. U komunikaciji među životinjama više se radi o odnosima komplementarnosti nego o odnosima recipročnosti (baš kao u nekim neverbalnim vidovima komunikacije kod ljudi: onaj koji pokazuje gnev i onaj ko opaža gnev imaju različite doživljaje). U ovom pogledu signali više služe za socijalnu facilitaciju ponašanja i sinhronizaciju fizioloških stanja i motoričkih radnji u toku interindividualnih aktivnosti (npr. parenja) ili grupnih aktivnosti (npr. u komunikaciji pčela) nego za socijalnu razmenu.
Potrebno je ukazati na još jednu odliku komunikativnih sistema životinja: postoji uska specijalizacija signala za određene funkcije. Svaka od ranije pobrojanih funkcija ima specifične signale koji joj služe (npr. krici kod ptica su signali opasnosti, a pesma ptica-pevačica je ljubavni zov). Ovo, naravno, ne znači da za obavljanje jedne vrste funkcija postoji samo jedna vrsta signala, jer je i u sistemima veze između životinja pravilo da postoji redundansa (npr. za saopštavanje o mestu nalazišta paše pčele koriste istovremeno i „jezik“ igre telom i akustičke signale).
U pogledu SEMANTIKE (značenja), sistemi komunikacija među životinjama poseduju dva osnovna svojstva:
Postoji konačan i obično veoma mali broj poruka koje stoje svakoj vrsti na raspolaganju značajnija je karakteristika da je broj signala konačan. Naime, životinja svake vrste dobija nasleđem, ili stiče uz izvesno učenje, određen broj signala i taj repertoar ostaje zatvoren, nepromenljiv. Za razliku od toga, ljudski govor je otvoreni sistem, koji po svojim pravilima stvara nove jedinice sistema.
Jedva da poneki istraživač saopštava da je zapazio stvaranje novih signala kod životinja. Tamo gde je signale moguće tehnički pobrojati, nalazi se da njihov broj nije veliki. Tako kod pojedinih vrsta majmuna istraživači redovno utvrđuju da se broj akustičkih signala kreće od 10-15, dok se kod šimpanzi može razlikovati 9 facijanih ekspresija. Ni kod drugih životinja taj broj nije mnogo veći: kod nekih jedva da postoje dva signala u istom kanalu veze, a jedino se kod nekih ptica-pevačica sreće i do nekoliko stotina različitih „motiva“ u pesmama. Pošto većinu komunikativnih sredstava kojima se služe životinje dobijaju nasleđem, ta nasledna određenost vrlo je striktna i u pogledu funkcije i forme signala, tako da je životinja sposobna za komunikaciju koja je karakteristična za njenu vrstu, čak i kada se razvija u izolaciji. Kod nekih vrsta ptica učenje ima značajniju ulogu. Poznato je da mladi nekih vrsta ptica mogu da nauče i pesmu drugih vrsta ptica, u čijoj zajednici odrastaju, dok kod nekih vrsta ptica određene grupacije jedinki stvaraju svoje „dijalekte“. Izgleda da je funkcija tih dijalekata da iz nekih razloga ograniče parenje među pripadnicima različitih grupacija, jer ptice mogu da se pare samo na osnovu ljubavne pesme onog „dijalekta“ kojim se služe. Ovakav način sticanja komunikativnih sistema veoma podseća na usvajanje govora kod dece.
Signali nemaju denotativna značenja, tj. ne označavaju neki određeni segment realnosti (denotat), ne saopštavaju nešto o tom denotatu, već samo predstavljaju vid ekspresije stanja organizma. Za semiotičku analizu posebno je značajno da li signali koje životinje koriste zaista označavaju nešto različito od sebe samih, da li kazuju nešto o denotatu, ili su puka ekspresija fizioloških i afektivnih stanja životinje.

View file

@ -1,2 +0,0 @@
Natural language processing (NLP) is an interdisciplinary subfield of computer science and information retrieval. It is primarily concerned with giving computers the ability to support and manipulate human language. It involves processing natural language datasets, such as text corpora or speech corpora, using either rule-based or probabilistic (i.e. statistical and, most recently, neural network-based) machine learning approaches. The goal is a computer capable of "understanding" the contents of documents, including the contextual nuances of the language within them. To this end, natural language processing often borrows ideas from theoretical linguistics. The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.
Challenges in natural language processing frequently involve speech recognition, natural-language understanding, and natural-language generation.

View file

@ -1 +0,0 @@
German novels are fun to read and talk about nature

View file

@ -1,14 +0,0 @@
from typing import Type
from pydantic import BaseModel
from cognee.infrastructure.llm.prompts import read_query_prompt
from cognee.infrastructure.llm.get_llm_client import get_llm_client
async def extract_topology(content: str, response_model: Type[BaseModel]):
    """Extract a topology structure from *content* using the LLM.

    Loads the extraction system prompt, requests structured output shaped by
    ``response_model``, and returns the result as a plain dict.
    """
    prompt = read_query_prompt("extract_topology.txt")
    structured_output = await get_llm_client().acreate_structured_output(
        content, prompt, response_model
    )
    return structured_output.model_dump()

View file

@ -1,20 +0,0 @@
import logging
from cognee.modules.topology.extraction.extract_topology import extract_topology
from cognee.infrastructure.databases.graph.config import get_graph_config
logger = logging.getLogger(__name__)
async def infer_data_topology(content: str, graph_topology=None):
    """Infer the graph topology model for *content*.

    Falls back to the topology configured in the graph config when
    ``graph_topology`` is not supplied.

    Raises:
        Exception: whatever ``extract_topology`` raised, after logging it.
    """
    if graph_topology is None:
        graph_config = get_graph_config()
        graph_topology = graph_config.graph_topology
    # Was a stray debug print() to stdout; route through the module logger instead.
    logger.debug("content type: %s", type(content))
    try:
        return await extract_topology(content, graph_topology)
    except Exception as error:
        logger.error("Error extracting topology from content: %s", error, exc_info = True)
        # Bare re-raise preserves the original traceback ("raise error" would
        # append this frame to it).
        raise

View file

@ -1,5 +1,7 @@
""" This module contains the TopologyEngine class which is responsible for adding graph topology from a JSON or CSV file. """
from cognee.infrastructure.databases.graph import get_graph_config
from cognee.modules.cognify.config import get_cognify_config
import csv
import json
import logging
@ -14,10 +16,42 @@ from cognee.infrastructure.data.chunking.get_chunking_engine import get_chunk_en
from cognee.infrastructure.databases.graph.get_graph_engine import get_graph_engine
from cognee.infrastructure.files.utils.extract_text_from_file import extract_text_from_file
from cognee.infrastructure.files.utils.guess_file_type import guess_file_type, FileTypeException
from cognee.modules.topology.topology_data_models import NodeModel
from cognee.tasks.document_to_ontology.models.models import NodeModel
logger = logging.getLogger("topology")
from cognee.infrastructure.databases.graph.config import get_graph_config
from typing import Type
from pydantic import BaseModel
from cognee.infrastructure.llm.prompts import read_query_prompt
from cognee.infrastructure.llm.get_llm_client import get_llm_client
async def extract_topology(content: str, response_model: Type[BaseModel]):
    """Request a structured topology from the LLM and return it as a dict."""
    client = get_llm_client()
    system_prompt = read_query_prompt("extract_topology.txt")
    result = await client.acreate_structured_output(content, system_prompt, response_model)
    return result.model_dump()
async def infer_data_topology(content: str, graph_topology=None):
    """Derive the topology for *content*, defaulting to the configured model."""
    if graph_topology is None:
        graph_topology = get_graph_config().graph_topology
    try:
        return await extract_topology(content, graph_topology)
    except Exception as error:
        logger.error("Error extracting topology from content: %s", error, exc_info = True)
        raise error
class TopologyEngine:
def __init__(self, infer:bool) -> None:
self.models: Dict[str, Type[BaseModel]] = {}
@ -69,8 +103,6 @@ class TopologyEngine:
async def add_graph_topology(self, file_path: str = None, files: list = None):
"""Add graph topology from a JSON or CSV file."""
if self.infer:
from cognee.modules.topology.infer_data_topology import infer_data_topology
initial_chunks_and_ids = []
chunk_config = get_chunk_config()
@ -128,3 +160,23 @@ class TopologyEngine:
return
except Exception as e:
raise RuntimeError(f"Failed to add graph topology from {file_path}: {e}") from e
async def document_to_ontology(data, root_node_id):
    """Resolve the graph topology root for *data* and yield both onward.

    Depending on the graph configuration this either infers the topology from
    the documents, loads it from a predefined topology file, or falls back to
    a literal "ROOT" node id.

    Yields:
        tuple: ``(data, root_node_id)`` for the next task in the pipeline.
    """
    # NOTE(review): cognee_config is fetched but never used here — confirm before removing.
    cognee_config = get_cognify_config()
    graph_config = get_graph_config()
    # NOTE(review): the incoming root_node_id argument is discarded — confirm callers expect this.
    root_node_id = None
    if graph_config.infer_graph_topology and graph_config.graph_topology_task:
        topology_engine = TopologyEngine(infer=graph_config.infer_graph_topology)
        root_node_id = await topology_engine.add_graph_topology(files=data)
    elif not graph_config.infer_graph_topology and graph_config.graph_topology_task:
        # The original condition was self-contradictory
        # ("infer_graph_topology and not infer_graph_topology"), so this branch
        # was unreachable. Load the topology from the configured file when
        # inference is disabled but a topology task is requested.
        topology_engine = TopologyEngine(infer=graph_config.infer_graph_topology)
        await topology_engine.add_graph_topology(graph_config.topology_file_path)
    elif not graph_config.graph_topology_task:
        root_node_id = "ROOT"

    yield (data, root_node_id)

View file

@ -1,58 +1,381 @@
# cognee
#### Deterministic LLMs Outputs for AI Engineers
_Open-source framework for loading and structuring LLM context to create accurate and explainable AI solutions using knowledge graphs and vector stores_
---
[![Twitter Follow](https://img.shields.io/twitter/follow/tricalt?style=social)](https://twitter.com/tricalt)
[![Downloads](https://img.shields.io/pypi/dm/cognee.svg)](https://pypi.python.org/pypi/cognee)
[![Star on GitHub](https://img.shields.io/github/stars/topoteretes/cognee.svg?style=social)](https://github.com/topoteretes/cognee)
### Let's learn about cogneeHub!
cogneeHub is a free, open-source learning platform for those interested in creating deterministic LLM outputs. We help developers by using graphs and LLMs, and by adding vector retrieval to their Machine Learning stack.
- **Get started** — [Get started with cognee quickly and try it out for yourself.](quickstart.md)
- **Conceptual Overview** — Learn about the [core concepts](conceptual_overview.md) of cognee and how it fits into your projects.
- **Data Engineering and LLMOps** — Learn about some [data engineering and llmops](data_engineering_llm_ops.md) core concepts that will help you build better AI apps.
- **RAGs** — We provide easy-to-follow [learning materials](rags.md) to help you learn about RAGs.
- **Research** — A list of resources to help you learn more about [cognee and LLM memory research](research.md)
- **Blog** — A blog where you can read about the [latest news and updates](blog/index.md) about cognee.
- **Support** — [Book time](https://www.cognee.ai/#bookTime) with our team.
[//]: # (- **Case Studies** — Read about [case studies](case_studies.md) that show how cognee can be used in real-world applications.)
[//]: # (- **Case Studies** — Read about [case studies](case_studies.md) that show how cognee can be used in real-world applications.)
### Vision
![Vision](img/roadmap.png)
### Architecture
![Architecture](img/architecture.png)
### Why use cognee?
The question of whether to use cognee is fundamentally a question of why you would want deterministic outputs for your LLM workflows.
1. **Cost-effective** — cognee extends the capabilities of your LLMs without the need for expensive data processing tools.
2. **Self-contained** — cognee runs as a library and is simple to use
3. **Interpretable** — Navigate graphs instead of embeddings to understand your data.
4. **User Guided** — cognee lets you control your input and provide your own Pydantic data models
## License
This project is licensed under the terms of the Apache License 2.0.
[//]: # (<style>)
[//]: # ()
[//]: # (.container {)
[//]: # ()
[//]: # ( display: flex;)
[//]: # ()
[//]: # ( justify-content: space-around;)
[//]: # ()
[//]: # ( margin-top: 20px;)
[//]: # ()
[//]: # (})
[//]: # ()
[//]: # ()
[//]: # (.container div {)
[//]: # ()
[//]: # ( width: 28%;)
[//]: # ()
[//]: # ( padding: 20px;)
[//]: # ()
[//]: # ( box-sizing: border-box;)
[//]: # ()
[//]: # ( border: 1px solid #e0e0e0;)
[//]: # ()
[//]: # ( border-radius: 8px;)
[//]: # ()
[//]: # ( background-color: #f9f9f9;)
[//]: # ()
[//]: # (})
[//]: # ()
[//]: # ()
[//]: # (.container h2 {)
[//]: # ()
[//]: # ( font-size: 1.25em;)
[//]: # ()
[//]: # ( margin-bottom: 10px;)
[//]: # ()
[//]: # (})
[//]: # ()
[//]: # ()
[//]: # (.container p {)
[//]: # ()
[//]: # ( margin-bottom: 20px;)
[//]: # ()
[//]: # ( line-height: 1.6;)
[//]: # ()
[//]: # (})
[//]: # ()
[//]: # ()
[//]: # (.button-container {)
[//]: # ()
[//]: # ( text-align: center;)
[//]: # ()
[//]: # ( margin: 30px 0;)
[//]: # ()
[//]: # (})
[//]: # ()
[//]: # ()
[//]: # (.button-container a {)
[//]: # ()
[//]: # ( display: inline-block;)
[//]: # ()
[//]: # ( padding: 15px 25px;)
[//]: # ()
[//]: # ( background-color: #007bff;)
[//]: # ()
[//]: # ( color: white;)
[//]: # ()
[//]: # ( text-decoration: none;)
[//]: # ()
[//]: # ( border-radius: 5px;)
[//]: # ()
[//]: # ( font-size: 1em;)
[//]: # ()
[//]: # (})
[//]: # ()
[//]: # ()
[//]: # (.button-container a:hover {)
[//]: # ()
[//]: # ( background-color: #0056b3;)
[//]: # ()
[//]: # (})
[//]: # ()
[//]: # ()
[//]: # (.resources {)
[//]: # ()
[//]: # ( margin-top: 40px;)
[//]: # ()
[//]: # (})
[//]: # ()
[//]: # ()
[//]: # (.resources h2 {)
[//]: # ()
[//]: # ( font-size: 1.5em;)
[//]: # ()
[//]: # ( margin-bottom: 20px;)
[//]: # ()
[//]: # (})
[//]: # ()
[//]: # ()
[//]: # (.resources ul {)
[//]: # ()
[//]: # ( list-style: none;)
[//]: # ()
[//]: # ( padding: 0;)
[//]: # ()
[//]: # (})
[//]: # ()
[//]: # ()
[//]: # (.resources li {)
[//]: # ()
[//]: # ( margin-bottom: 10px;)
[//]: # ()
[//]: # (})
[//]: # ()
[//]: # ()
[//]: # (.resources a {)
[//]: # ()
[//]: # ( color: #007bff;)
[//]: # ()
[//]: # ( text-decoration: none;)
[//]: # ()
[//]: # (})
[//]: # ()
[//]: # ()
[//]: # (.resources a:hover {)
[//]: # ()
[//]: # ( text-decoration: underline;)
[//]: # ()
[//]: # (})
[//]: # ()
[//]: # (</style>)
[//]: # ()
[//]: # ()
[//]: # (# New to cognee?)
[//]: # ()
[//]: # ()
[//]: # (The getting started guide covers adding a GraphRAG data store to your AI app, sending events, identifying users, extracting actions and insights, and interconnecting separate datasets.)
[//]: # ()
[//]: # ()
[//]: # (<div class="button-container">)
[//]: # ()
[//]: # ( <a href="./quickstart.md">Get started</a>)
[//]: # ()
[//]: # (</div>)
[//]: # ()
[//]: # ()
[//]: # (<div class="container">)
[//]: # ()
[//]: # ( <div>)
[//]: # ()
[//]: # ( <h2>Ingest Data</h2>)
[//]: # ()
[//]: # ( <p>Learn how to manage ingestion of events, customer data or third party data for use with cognee.</p>)
[//]: # ()
[//]: # ( <a href="#">Explore</a>)
[//]: # ()
[//]: # ( </div>)
[//]: # ()
[//]: # ( <div>)
[//]: # ()
[//]: # ( <h2>Templates</h2>)
[//]: # ()
[//]: # ( <p>Analyze and enrich your data and improve LLM answers with a series of templates using cognee tasks and pipelines.</p>)
[//]: # ()
[//]: # ( <a href="#">Browse templates</a>)
[//]: # ()
[//]: # ( </div>)
[//]: # ()
[//]: # ( <div>)
[//]: # ()
[//]: # ( <h2>API</h2>)
[//]: # ()
[//]: # ( <p>Push or pull data to build custom functionality or create bespoke views for your business needs.</p>)
[//]: # ()
[//]: # ( <a href="#">Explore</a>)
[//]: # ()
[//]: # ( </div>)
[//]: # ()
[//]: # (</div>)
[//]: # ()
[//]: # ()
[//]: # (<div class="resources">)
[//]: # ()
[//]: # ( <h2>Resources</h2>)
[//]: # ()
[//]: # ( <ul>)
[//]: # ()
[//]: # ( <li><a href="#">What is GraphRAG</a></li>)
[//]: # ()
[//]: # ( <li><a href="#">Research</a></li>)
[//]: # ()
[//]: # ( <li><a href="#">Community</a></li>)
[//]: # ()
[//]: # ( <li><a href="#">Community</a></li>)
[//]: # ()
[//]: # ( <li><a href="#">API Reference</a></li>)
[//]: # ()
[//]: # ( <li><a href="#">Support</a></li>)
[//]: # ()
[//]: # ( </ul>)
[//]: # ()
[//]: # (</div>)