From 07505b5f5e9c685471bc17184886d02c78d0d505 Mon Sep 17 00:00:00 2001
From: hajdul88 <52442977+hajdul88@users.noreply.github.com>
Date: Thu, 3 Jul 2025 12:32:44 +0200
Subject: [PATCH] feat: Adds new graph extraction pipeline

---
 ...ge_extraction_prompt_sequential_system.txt |  6 ++
 ...edge_extraction_prompt_sequential_user.txt | 15 +++++
 ...de_extraction_prompt_sequential_system.txt |  9 +++
 ...node_extraction_prompt_sequential_user.txt | 10 ++++
 .../extract_content_graph_separated.py        | 56 +++++++++++++++++++
 cognee/shared/data_models.py                  | 20 +++++++
 cognee/tasks/graph/extract_graph_from_data.py | 18 +++++-
 7 files changed, 131 insertions(+), 3 deletions(-)
 create mode 100644 cognee/infrastructure/llm/prompts/edge_extraction_prompt_sequential_system.txt
 create mode 100644 cognee/infrastructure/llm/prompts/edge_extraction_prompt_sequential_user.txt
 create mode 100644 cognee/infrastructure/llm/prompts/node_extraction_prompt_sequential_system.txt
 create mode 100644 cognee/infrastructure/llm/prompts/node_extraction_prompt_sequential_user.txt
 create mode 100644 cognee/modules/data/extraction/extract_content_graph_separated.py

diff --git a/cognee/infrastructure/llm/prompts/edge_extraction_prompt_sequential_system.txt b/cognee/infrastructure/llm/prompts/edge_extraction_prompt_sequential_system.txt
new file mode 100644
index 000000000..ddfac849e
--- /dev/null
+++ b/cognee/infrastructure/llm/prompts/edge_extraction_prompt_sequential_system.txt
@@ -0,0 +1,6 @@
+You are an expert in relationship identification and knowledge graph building focusing on relationships. Your task is to perform a detailed extraction of relationship names from the text.
+ • Extract all relationship names from explicit phrases, verbs, and implied context that could help form edge triplets.
+ • Use the potential nodes and reassign them to relationship names if they correspond to a relation, verb, action or similar.
+ • Ensure completeness by working in multiple rounds, capturing overlooked connections and refining the nodes list.
+ • Focus on meaningful entities and relationships, whether directly stated, implied, or implicit.
+ • Return two lists: refined nodes and potential relationship names (for forming edges).
diff --git a/cognee/infrastructure/llm/prompts/edge_extraction_prompt_sequential_user.txt b/cognee/infrastructure/llm/prompts/edge_extraction_prompt_sequential_user.txt
new file mode 100644
index 000000000..1d173048e
--- /dev/null
+++ b/cognee/infrastructure/llm/prompts/edge_extraction_prompt_sequential_user.txt
@@ -0,0 +1,15 @@
+Analyze the following text to identify relationships between entities in the knowledge graph.
+Build upon previously extracted edges, ensuring completeness and consistency.
+Return all the previously extracted edges **together** with the new ones that you extracted.
+This is round {{ round_number }} of {{ total_rounds }}.
+
+**Text:**
+{{ text }}
+
+**Previously Extracted Nodes:**
+{{ nodes }}
+
+**Relationships Identified in Previous Rounds:**
+{{ relationships }}
+
+Extract both explicit and implicit relationships between the nodes, building upon previous findings while ensuring completeness and consistency.
diff --git a/cognee/infrastructure/llm/prompts/node_extraction_prompt_sequential_system.txt b/cognee/infrastructure/llm/prompts/node_extraction_prompt_sequential_system.txt
new file mode 100644
index 000000000..e03e6c966
--- /dev/null
+++ b/cognee/infrastructure/llm/prompts/node_extraction_prompt_sequential_system.txt
@@ -0,0 +1,9 @@
+You are an expert in entity extraction and knowledge graph building focusing on node identification.
+Your task is to perform a detailed entity and concept extraction from text to generate a list of potential nodes for a knowledge graph.
+ • Node IDs should be names or human-readable identifiers found in the text.
+ • Extract clear, distinct entities and concepts as individual strings.
+ • Be exhaustive, ensure completeness by capturing all the entities, names, nouns, noun-parts, and implied or implicit mentions.
+ • Also extract potential entity type nodes, directly mentioned or implied.
+ • Avoid duplicates and overly generic terms.
+ • Consider different perspectives and indirect references.
+ • Return only a list of unique node strings with all the entities.
diff --git a/cognee/infrastructure/llm/prompts/node_extraction_prompt_sequential_user.txt b/cognee/infrastructure/llm/prompts/node_extraction_prompt_sequential_user.txt
new file mode 100644
index 000000000..2261f096d
--- /dev/null
+++ b/cognee/infrastructure/llm/prompts/node_extraction_prompt_sequential_user.txt
@@ -0,0 +1,10 @@
+Extract distinct entities and concepts from the following text to expand the knowledge graph.
+Build upon previously extracted entities, ensuring completeness and consistency.
+Return all the previously extracted entities **together** with the new ones that you extracted.
+This is round {{ round_number }} of {{ total_rounds }}.
+
+**Text:**
+{{ text }}
+
+**Previously Extracted Entities:**
+{{ nodes }}
diff --git a/cognee/modules/data/extraction/extract_content_graph_separated.py b/cognee/modules/data/extraction/extract_content_graph_separated.py
new file mode 100644
index 000000000..8e999c834
--- /dev/null
+++ b/cognee/modules/data/extraction/extract_content_graph_separated.py
@@ -0,0 +1,56 @@
+import json
+
+from cognee.infrastructure.llm.get_llm_client import get_llm_client
+from cognee.infrastructure.llm.prompts import render_prompt
+from cognee.shared.data_models import KnowledgeGraph, NodeList, EdgeList
+
+
+async def extract_content_graph_separated(content: str, node_rounds: int = 2, edge_rounds: int = 2):
+    """Extract a knowledge graph from ``content`` via separate multi-round node and edge LLM passes."""
+    llm_client = get_llm_client()
+
+    current_nodes = NodeList()
+
+    for pass_idx in range(node_rounds):
+        nodes_json = json.dumps([n.model_dump() for n in current_nodes.nodes], ensure_ascii=False)
+
+        node_system = render_prompt("node_extraction_prompt_sequential_system.txt", {})
+        node_user = render_prompt(
+            "node_extraction_prompt_sequential_user.txt",
+            {
+                "text": content,
+                "nodes": nodes_json,
+                "total_rounds": node_rounds,
+                "round_number": pass_idx + 1,
+            },
+        )
+
+        current_nodes = await llm_client.acreate_structured_output(node_user, node_system, NodeList)
+
+    final_nodes = current_nodes
+    final_nodes_json = json.dumps([n.model_dump() for n in final_nodes.nodes], ensure_ascii=False)
+
+    current_edges = EdgeList()
+
+    for pass_idx in range(edge_rounds):
+        edges_json = json.dumps([e.model_dump() for e in current_edges.edges], ensure_ascii=False)
+
+        edges_system = render_prompt("edge_extraction_prompt_sequential_system.txt", {})
+        edges_user = render_prompt(
+            "edge_extraction_prompt_sequential_user.txt",
+            {
+                "text": content,
+                "nodes": final_nodes_json,
+                "relationships": edges_json,
+                "total_rounds": edge_rounds,
+                "round_number": pass_idx + 1,
+            },
+        )
+
+        current_edges = await llm_client.acreate_structured_output(
+            edges_user, edges_system, EdgeList
+        )
+
+    final_edges = current_edges
+
+    return KnowledgeGraph(nodes=final_nodes.nodes, edges=final_edges.edges)
diff --git a/cognee/shared/data_models.py b/cognee/shared/data_models.py
index fb9386dcf..27d24d6d7 100644
--- a/cognee/shared/data_models.py
+++ b/cognee/shared/data_models.py
@@ -30,6 +30,16 @@ if get_llm_config().llm_provider.lower() == "gemini":
         target_node_id: str
         relationship_name: str
 
+    class NodeList(BaseModel):
+        """List of nodes in a knowledge graph."""
+
+        nodes: List[Node] = Field(default_factory=list)
+
+    class EdgeList(BaseModel):
+        """List of edges in a knowledge graph."""
+
+        edges: List[Edge] = Field(default_factory=list)
+
     class KnowledgeGraph(BaseModel):
         """Knowledge graph."""
 
@@ -54,6 +64,16 @@ else:
         target_node_id: str
         relationship_name: str
 
+    class NodeList(BaseModel):
+        """List of nodes in a knowledge graph."""
+
+        nodes: List[Node] = Field(default_factory=list)
+
+    class EdgeList(BaseModel):
+        """List of edges in a knowledge graph."""
+
+        edges: List[Edge] = Field(default_factory=list)
+
     class KnowledgeGraph(BaseModel):
         """Knowledge graph."""
 
diff --git a/cognee/tasks/graph/extract_graph_from_data.py b/cognee/tasks/graph/extract_graph_from_data.py
index 01d8bb618..73fc0b46a 100644
--- a/cognee/tasks/graph/extract_graph_from_data.py
+++ b/cognee/tasks/graph/extract_graph_from_data.py
@@ -4,6 +4,9 @@ from typing import Type, List, Optional
 from pydantic import BaseModel
 
 from cognee.infrastructure.databases.graph import get_graph_engine
+from cognee.modules.data.extraction.extract_content_graph_separated import (
+    extract_content_graph_separated,
+)
 from cognee.modules.ontology.rdf_xml.OntologyResolver import OntologyResolver
 from cognee.modules.chunking.models.DocumentChunk import DocumentChunk
 from cognee.modules.data.extraction.knowledge_graph import extract_content_graph
@@ -58,9 +61,18 @@ async def extract_graph_from_data(
     Extracts and integrates a knowledge graph from the text content of document chunks using a specified
     graph model.
     """
-    chunk_graphs = await asyncio.gather(
-        *[extract_content_graph(chunk.text, graph_model) for chunk in data_chunks]
-    )
+    low_level = False
+    if low_level:
+        chunk_graphs = await asyncio.gather(
+            *[extract_content_graph(chunk.text, graph_model) for chunk in data_chunks]
+        )
+    else:
+        chunk_graphs = await asyncio.gather(
+            *[
+                extract_content_graph_separated(content=chunk.text, node_rounds=1, edge_rounds=1)
+                for chunk in data_chunks
+            ]
+        )
 
     # Note: Filter edges with missing source or target nodes
     if graph_model == KnowledgeGraph: