From 07505b5f5e9c685471bc17184886d02c78d0d505 Mon Sep 17 00:00:00 2001
From: hajdul88 <52442977+hajdul88@users.noreply.github.com>
Date: Thu, 3 Jul 2025 12:32:44 +0200
Subject: [PATCH] feat: Adds new graph extraction pipeline

---
 ...ge_extraction_prompt_sequential_system.txt |  6 ++
 ...edge_extraction_prompt_sequential_user.txt | 15 +++++
 ...de_extraction_prompt_sequential_system.txt |  9 +++
 ...node_extraction_prompt_sequential_user.txt | 10 ++++
 .../extract_content_graph_separated.py        | 56 +++++++++++++++++++
 cognee/shared/data_models.py                  | 20 +++++++
 cognee/tasks/graph/extract_graph_from_data.py | 18 +++++-
 7 files changed, 131 insertions(+), 3 deletions(-)
 create mode 100644 cognee/infrastructure/llm/prompts/edge_extraction_prompt_sequential_system.txt
 create mode 100644 cognee/infrastructure/llm/prompts/edge_extraction_prompt_sequential_user.txt
 create mode 100644 cognee/infrastructure/llm/prompts/node_extraction_prompt_sequential_system.txt
 create mode 100644 cognee/infrastructure/llm/prompts/node_extraction_prompt_sequential_user.txt
 create mode 100644 cognee/modules/data/extraction/extract_content_graph_separated.py

diff --git a/cognee/infrastructure/llm/prompts/edge_extraction_prompt_sequential_system.txt b/cognee/infrastructure/llm/prompts/edge_extraction_prompt_sequential_system.txt
new file mode 100644
index 000000000..ddfac849e
--- /dev/null
+++ b/cognee/infrastructure/llm/prompts/edge_extraction_prompt_sequential_system.txt
@@ -0,0 +1,6 @@
+You are an expert in relationship identification and knowledge graph building focusing on relationships. Your task is to perform a detailed extraction of relationship names from the text.
+ • Extract all relationship names from explicit phrases, verbs, and implied context that could help form edge triplets.
+ • Use the potential nodes and reassign them to relationship names if they correspond to a relation, verb, action or similar.
+ • Ensure completeness by working in multiple rounds, capturing overlooked connections and refining the nodes list.
+ • Focus on meaningful entities and relationships, whether directly stated, implied, or implicit.
+ • Return two lists: refined nodes and potential relationship names (for forming edges).
diff --git a/cognee/infrastructure/llm/prompts/edge_extraction_prompt_sequential_user.txt b/cognee/infrastructure/llm/prompts/edge_extraction_prompt_sequential_user.txt
new file mode 100644
index 000000000..1d173048e
--- /dev/null
+++ b/cognee/infrastructure/llm/prompts/edge_extraction_prompt_sequential_user.txt
@@ -0,0 +1,15 @@
+Analyze the following text to identify relationships between entities in the knowledge graph.
+Build upon previously extracted edges, ensuring completeness and consistency.
+Return all the previously extracted edges **together** with the new ones that you extracted.
+This is round {{ round_number }} of {{ total_rounds }}.
+
+**Text:**
+{{ text }}
+
+**Previously Extracted Nodes:**
+{{ nodes }}
+
+**Relationships Identified in Previous Rounds:**
+{{ relationships }}
+
+Extract both explicit and implicit relationships between the nodes, building upon previous findings while ensuring completeness and consistency.
diff --git a/cognee/infrastructure/llm/prompts/node_extraction_prompt_sequential_system.txt b/cognee/infrastructure/llm/prompts/node_extraction_prompt_sequential_system.txt
new file mode 100644
index 000000000..e03e6c966
--- /dev/null
+++ b/cognee/infrastructure/llm/prompts/node_extraction_prompt_sequential_system.txt
@@ -0,0 +1,9 @@
+You are an expert in entity extraction and knowledge graph building focusing on node identification.
+Your task is to perform a detailed entity and concept extraction from text to generate a list of potential nodes for a knowledge graph.
+ • Node IDs should be names or human-readable identifiers found in the text.
+ • Extract clear, distinct entities and concepts as individual strings.
+ • Be exhaustive, ensure completeness by capturing all the entities, names, nouns, noun-parts, and implied or implicit mentions.
+ • Also extract potential entity type nodes, directly mentioned or implied.
+ • Avoid duplicates and overly generic terms.
+ • Consider different perspectives and indirect references.
+ • Return only a list of unique node strings with all the entities.
diff --git a/cognee/infrastructure/llm/prompts/node_extraction_prompt_sequential_user.txt b/cognee/infrastructure/llm/prompts/node_extraction_prompt_sequential_user.txt
new file mode 100644
index 000000000..2261f096d
--- /dev/null
+++ b/cognee/infrastructure/llm/prompts/node_extraction_prompt_sequential_user.txt
@@ -0,0 +1,10 @@
+Extract distinct entities and concepts from the following text to expand the knowledge graph.
+Build upon previously extracted entities, ensuring completeness and consistency.
+Return all the previously extracted entities **together** with the new ones that you extracted.
+This is round {{ round_number }} of {{ total_rounds }}.
+
+**Text:**
+{{ text }}
+
+**Previously Extracted Entities:**
+{{ nodes }}
diff --git a/cognee/modules/data/extraction/extract_content_graph_separated.py b/cognee/modules/data/extraction/extract_content_graph_separated.py
new file mode 100644
index 000000000..8e999c834
--- /dev/null
+++ b/cognee/modules/data/extraction/extract_content_graph_separated.py
@@ -0,0 +1,56 @@
+import json
+
+from cognee.infrastructure.llm.get_llm_client import get_llm_client
+from cognee.infrastructure.llm.prompts import render_prompt
+from cognee.shared.data_models import KnowledgeGraph, NodeList, EdgeList
+
+
+async def extract_content_graph_separated(content: str, node_rounds: int = 2, edge_rounds: int = 2):
+    """Extract a knowledge graph from ``content`` via separate multi-round node and edge LLM passes."""
+    llm_client = get_llm_client()
+
+    current_nodes = NodeList()
+
+    for pass_idx in range(node_rounds):
+        nodes_json = json.dumps([n.model_dump() for n in current_nodes.nodes], ensure_ascii=False)
+
+        node_system = render_prompt("node_extraction_prompt_sequential_system.txt", {})
+        node_user = render_prompt(
+            "node_extraction_prompt_sequential_user.txt",
+            {
+                "text": content,
+                "nodes": nodes_json,
+                "total_rounds": node_rounds,
+                "round_number": pass_idx + 1,
+            },
+        )
+
+        current_nodes = await llm_client.acreate_structured_output(node_user, node_system, NodeList)
+
+    final_nodes = current_nodes
+    final_nodes_json = json.dumps([n.model_dump() for n in final_nodes.nodes], ensure_ascii=False)
+
+    current_edges = EdgeList()
+
+    for pass_idx in range(edge_rounds):
+        edges_json = json.dumps([e.model_dump() for e in current_edges.edges], ensure_ascii=False)
+
+        edges_system = render_prompt("edge_extraction_prompt_sequential_system.txt", {})
+        edges_user = render_prompt(
+            "edge_extraction_prompt_sequential_user.txt",
+            {
+                "text": content,
+                "nodes": final_nodes_json,
+                "relationships": edges_json,
+                "total_rounds": edge_rounds,
+                "round_number": pass_idx + 1,
+            },
+        )
+
+        current_edges = await llm_client.acreate_structured_output(
+            edges_user, edges_system, EdgeList
+        )
+
+    final_edges = current_edges
+
+    return KnowledgeGraph(nodes=final_nodes.nodes, edges=final_edges.edges)
diff --git a/cognee/shared/data_models.py b/cognee/shared/data_models.py
index fb9386dcf..27d24d6d7 100644
--- a/cognee/shared/data_models.py
+++ b/cognee/shared/data_models.py
@@ -30,6 +30,16 @@ if get_llm_config().llm_provider.lower() == "gemini":
         target_node_id: str
         relationship_name: str
 
+    class NodeList(BaseModel):
+        """List of nodes in a knowledge graph."""
+
+        nodes: List[Node] = Field(default_factory=list)
+
+    class EdgeList(BaseModel):
+        """List of edges in a knowledge graph."""
+
+        edges: List[Edge] = Field(default_factory=list)
+
     class KnowledgeGraph(BaseModel):
         """Knowledge graph."""
 
@@ -54,6 +64,16 @@ else:
         target_node_id: str
         relationship_name: str
 
+    class NodeList(BaseModel):
+        """List of nodes in a knowledge graph."""
+
+        nodes: List[Node] = Field(default_factory=list)
+
+    class EdgeList(BaseModel):
+        """List of edges in a knowledge graph."""
+
+        edges: List[Edge] = Field(default_factory=list)
+
     class KnowledgeGraph(BaseModel):
         """Knowledge graph."""
 
diff --git a/cognee/tasks/graph/extract_graph_from_data.py b/cognee/tasks/graph/extract_graph_from_data.py
index 01d8bb618..73fc0b46a 100644
--- a/cognee/tasks/graph/extract_graph_from_data.py
+++ b/cognee/tasks/graph/extract_graph_from_data.py
@@ -4,6 +4,9 @@ from typing import Type, List, Optional
 from pydantic import BaseModel
 
 from cognee.infrastructure.databases.graph import get_graph_engine
+from cognee.modules.data.extraction.extract_content_graph_separated import (
+    extract_content_graph_separated,
+)
 from cognee.modules.ontology.rdf_xml.OntologyResolver import OntologyResolver
 from cognee.modules.chunking.models.DocumentChunk import DocumentChunk
 from cognee.modules.data.extraction.knowledge_graph import extract_content_graph
@@ -58,9 +61,18 @@ async def extract_graph_from_data(
     Extracts and integrates a knowledge graph from the text content of document chunks using a specified
     graph model.
     """
-    chunk_graphs = await asyncio.gather(
-        *[extract_content_graph(chunk.text, graph_model) for chunk in data_chunks]
-    )
+    low_level = False
+    if low_level:
+        chunk_graphs = await asyncio.gather(
+            *[extract_content_graph(chunk.text, graph_model) for chunk in data_chunks]
+        )
+    else:
+        chunk_graphs = await asyncio.gather(
+            *[
+                extract_content_graph_separated(content=chunk.text, node_rounds=1, edge_rounds=1)
+                for chunk in data_chunks
+            ]
+        )
 
     # Note: Filter edges with missing source or target nodes
     if graph_model == KnowledgeGraph: