feat: Adds new graph extraction pipeline
This commit is contained in:
parent
b8ea699abe
commit
07505b5f5e
7 changed files with 130 additions and 3 deletions
|
|
@ -0,0 +1,6 @@
|
||||||
|
You are an expert in relationship identification and knowledge graph building, with a focus on relationships. Your task is to perform a detailed extraction of relationship names from the text.
|
||||||
|
• Extract all relationship names from explicit phrases, verbs, and implied context that could help form edge triplets.
|
||||||
|
• Use the potential nodes and reassign them to relationship names if they correspond to a relation, verb, action or similar.
|
||||||
|
• Ensure completeness by working in multiple rounds, capturing overlooked connections and refining the nodes list.
|
||||||
|
• Focus on meaningful entities and relationships, whether directly stated, implied, or implicit.
|
||||||
|
• Return two lists: refined nodes and potential relationship names (for forming edges).
|
||||||
|
|
@ -0,0 +1,15 @@
|
||||||
|
Analyze the following text to identify relationships between entities in the knowledge graph.
|
||||||
|
Build upon previously extracted edges, ensuring completeness and consistency.
|
||||||
|
Return all the previously extracted edges **together** with the new ones that you extracted.
|
||||||
|
This is round {{ round_number }} of {{ total_rounds }}.
|
||||||
|
|
||||||
|
**Text:**
|
||||||
|
{{ text }}
|
||||||
|
|
||||||
|
**Previously Extracted Nodes:**
|
||||||
|
{{ nodes }}
|
||||||
|
|
||||||
|
**Relationships Identified in Previous Rounds:**
|
||||||
|
{{ relationships }}
|
||||||
|
|
||||||
|
Extract both explicit and implicit relationships between the nodes, building upon previous findings while ensuring completeness and consistency.
|
||||||
|
|
@ -0,0 +1,9 @@
|
||||||
|
You are an expert in entity extraction and knowledge graph building, with a focus on node identification.
|
||||||
|
Your task is to perform a detailed entity and concept extraction from text to generate a list of potential nodes for a knowledge graph.
|
||||||
|
• Node IDs should be names or human-readable identifiers found in the text.
|
||||||
|
• Extract clear, distinct entities and concepts as individual strings.
|
||||||
|
• Be exhaustive, ensure completeness by capturing all the entities, names, nouns, noun-parts, and implied or implicit mentions.
|
||||||
|
• Also extract potential entity type nodes, directly mentioned or implied.
|
||||||
|
• Avoid duplicates and overly generic terms.
|
||||||
|
• Consider different perspectives and indirect references.
|
||||||
|
• Return only a list of unique node strings with all the entities.
|
||||||
|
|
@ -0,0 +1,10 @@
|
||||||
|
Extract distinct entities and concepts from the following text to expand the knowledge graph.
|
||||||
|
Build upon previously extracted entities, ensuring completeness and consistency.
|
||||||
|
Return all the previously extracted entities **together** with the new ones that you extracted.
|
||||||
|
This is round {{ round_number }} of {{ total_rounds }}.
|
||||||
|
|
||||||
|
**Text:**
|
||||||
|
{{ text }}
|
||||||
|
|
||||||
|
**Previously Extracted Entities:**
|
||||||
|
{{ nodes }}
|
||||||
|
|
@ -0,0 +1,55 @@
|
||||||
|
import json
|
||||||
|
|
||||||
|
from cognee.infrastructure.llm.get_llm_client import get_llm_client
|
||||||
|
from cognee.infrastructure.llm.prompts import render_prompt
|
||||||
|
from cognee.shared.data_models import KnowledgeGraph, NodeList, EdgeList
|
||||||
|
|
||||||
|
|
||||||
|
async def extract_content_graph_separated(
    content: str, node_rounds: int = 2, edge_rounds: int = 2
):
    """Extract a knowledge graph from ``content`` in two separated LLM phases.

    Phase 1 runs ``node_rounds`` sequential passes that extract and refine the
    node list, feeding each pass the nodes accumulated so far. Phase 2 runs
    ``edge_rounds`` sequential passes that extract edges between the final
    nodes, feeding each pass the edges accumulated so far.

    Args:
        content: Raw text to extract the graph from.
        node_rounds: Number of node-extraction passes (>= 1).
        edge_rounds: Number of edge-extraction passes (>= 1).

    Returns:
        KnowledgeGraph built from the final node and edge lists.
    """
    llm_client = get_llm_client()

    current_nodes = NodeList()

    # Phase 1: iterative node extraction/refinement.
    for pass_idx in range(node_rounds):
        nodes_json = json.dumps(
            [node.model_dump() for node in current_nodes.nodes], ensure_ascii=False
        )

        node_system = render_prompt("node_extraction_prompt_sequential_system.txt", {})
        node_user = render_prompt(
            "node_extraction_prompt_sequential_user.txt",
            {
                "text": content,
                # Bug fix: values were wrapped in set literals ({nodes_json},
                # {node_rounds}, {pass_idx}), so the template renderer received
                # Python sets instead of the JSON string / ints.
                "nodes": nodes_json,
                "total_rounds": node_rounds,
                # 1-based so the prompt reads "round 1 of N", not "round 0 of N".
                "round_number": pass_idx + 1,
            },
        )

        current_nodes = await llm_client.acreate_structured_output(
            node_user, node_system, NodeList
        )

    final_nodes = current_nodes
    final_nodes_json = json.dumps(
        [node.model_dump() for node in final_nodes.nodes], ensure_ascii=False
    )

    current_edges = EdgeList()

    # Phase 2: iterative edge extraction over the final node set.
    for pass_idx in range(edge_rounds):
        edges_json = json.dumps(
            [edge.model_dump() for edge in current_edges.edges], ensure_ascii=False
        )

        edges_system = render_prompt("edge_extraction_prompt_sequential_system.txt", {})
        edges_user = render_prompt(
            "edge_extraction_prompt_sequential_user.txt",
            {
                "text": content,
                "nodes": final_nodes_json,
                # The user template reads {{ relationships }}; keep "edges" too
                # for backward compatibility with any template still using it.
                "edges": edges_json,
                "relationships": edges_json,
                # Bug fix: was node_rounds; this loop runs edge_rounds times.
                "total_rounds": edge_rounds,
                "round_number": pass_idx + 1,
            },
        )

        current_edges = await llm_client.acreate_structured_output(
            edges_user, edges_system, EdgeList
        )

    final_edges = current_edges

    return KnowledgeGraph(nodes=final_nodes.nodes, edges=final_edges.edges)
|
||||||
|
|
@ -30,6 +30,16 @@ if get_llm_config().llm_provider.lower() == "gemini":
|
||||||
target_node_id: str
|
target_node_id: str
|
||||||
relationship_name: str
|
relationship_name: str
|
||||||
|
|
||||||
|
class NodeList(BaseModel):
    """List of nodes in a knowledge graph."""

    # Bug fix: Field(..., default_factory=list) is invalid in pydantic —
    # ``...`` marks the field required while default_factory supplies a
    # default, and pydantic rejects specifying both. default_factory alone
    # gives each instance a fresh empty list.
    nodes: List[Node] = Field(default_factory=list)
||||||
|
|
||||||
|
class EdgeList(BaseModel):
    """List of edges in a knowledge graph."""

    # Bug fix: Field(..., default_factory=list) is invalid in pydantic —
    # ``...`` marks the field required while default_factory supplies a
    # default, and pydantic rejects specifying both. default_factory alone
    # gives each instance a fresh empty list.
    edges: List[Edge] = Field(default_factory=list)
|
||||||
|
|
||||||
class KnowledgeGraph(BaseModel):
|
class KnowledgeGraph(BaseModel):
|
||||||
"""Knowledge graph."""
|
"""Knowledge graph."""
|
||||||
|
|
||||||
|
|
@ -54,6 +64,16 @@ else:
|
||||||
target_node_id: str
|
target_node_id: str
|
||||||
relationship_name: str
|
relationship_name: str
|
||||||
|
|
||||||
|
class NodeList(BaseModel):
    """List of nodes in a knowledge graph."""

    # Bug fix: Field(..., default_factory=list) is invalid in pydantic —
    # ``...`` marks the field required while default_factory supplies a
    # default, and pydantic rejects specifying both. default_factory alone
    # gives each instance a fresh empty list.
    nodes: List[Node] = Field(default_factory=list)
||||||
|
|
||||||
|
class EdgeList(BaseModel):
    """List of edges in a knowledge graph."""

    # Bug fix: Field(..., default_factory=list) is invalid in pydantic —
    # ``...`` marks the field required while default_factory supplies a
    # default, and pydantic rejects specifying both. default_factory alone
    # gives each instance a fresh empty list.
    edges: List[Edge] = Field(default_factory=list)
|
||||||
|
|
||||||
class KnowledgeGraph(BaseModel):
|
class KnowledgeGraph(BaseModel):
|
||||||
"""Knowledge graph."""
|
"""Knowledge graph."""
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -4,6 +4,9 @@ from typing import Type, List, Optional
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
from cognee.infrastructure.databases.graph import get_graph_engine
|
from cognee.infrastructure.databases.graph import get_graph_engine
|
||||||
|
from cognee.modules.data.extraction.extract_content_graph_separated import (
|
||||||
|
extract_content_graph_separated,
|
||||||
|
)
|
||||||
from cognee.modules.ontology.rdf_xml.OntologyResolver import OntologyResolver
|
from cognee.modules.ontology.rdf_xml.OntologyResolver import OntologyResolver
|
||||||
from cognee.modules.chunking.models.DocumentChunk import DocumentChunk
|
from cognee.modules.chunking.models.DocumentChunk import DocumentChunk
|
||||||
from cognee.modules.data.extraction.knowledge_graph import extract_content_graph
|
from cognee.modules.data.extraction.knowledge_graph import extract_content_graph
|
||||||
|
|
@ -58,9 +61,18 @@ async def extract_graph_from_data(
|
||||||
"""
|
"""
|
||||||
Extracts and integrates a knowledge graph from the text content of document chunks using a specified graph model.
|
Extracts and integrates a knowledge graph from the text content of document chunks using a specified graph model.
|
||||||
"""
|
"""
|
||||||
chunk_graphs = await asyncio.gather(
|
low_level = False
|
||||||
*[extract_content_graph(chunk.text, graph_model) for chunk in data_chunks]
|
if low_level:
|
||||||
)
|
chunk_graphs = await asyncio.gather(
|
||||||
|
*[extract_content_graph(chunk.text, graph_model) for chunk in data_chunks]
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
chunk_graphs = await asyncio.gather(
|
||||||
|
*[
|
||||||
|
extract_content_graph_separated(content=chunk.text, node_rounds=1, edge_rounds=1)
|
||||||
|
for chunk in data_chunks
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
# Note: Filter edges with missing source or target nodes
|
# Note: Filter edges with missing source or target nodes
|
||||||
if graph_model == KnowledgeGraph:
|
if graph_model == KnowledgeGraph:
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue