feat: cog-1320 Minimal LLM-Based Entity Extraction (#590)
<!-- .github/pull_request_template.md --> ## Description <!-- Provide a clear description of the changes in this PR --> ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit - **New Features** - Introduced an expert entity extraction feature that extracts significant named entities from text and provides structured output with essential details. - Rolled out customizable prompt templates for both system instructions and user input to standardize the extraction process. - Integrated a robust language model–based extractor with comprehensive error handling to ensure reliable and consistent results. <!-- end of auto-generated comment: release notes by coderabbit.ai --> --------- Co-authored-by: lxobr <122801072+lxobr@users.noreply.github.com>
This commit is contained in:
parent
2323fd0c94
commit
8874ddad2e
3 changed files with 118 additions and 0 deletions
|
|
@ -0,0 +1,42 @@
|
||||||
|
You are an expert entity extraction system. Your task is to identify and extract important named entities from the provided text.
|
||||||
|
|
||||||
|
Extract only distinct, meaningful entities that are central to understanding the text. Avoid extracting common nouns, pronouns, or generic terms.
|
||||||
|
|
||||||
|
For each entity, provide:
|
||||||
|
1. name: The entity name
|
||||||
|
2. is_a: An EntityType object with:
|
||||||
|
- name: The type name (in uppercase)
|
||||||
|
- description: A brief description of the type
|
||||||
|
3. description: A brief description of the entity (1-2 sentences)
|
||||||
|
|
||||||
|
Your response MUST be a valid JSON object with a single field "entities" containing an array of entity objects. Do not include any explanatory text, markdown formatting, or code blocks outside of the JSON.
|
||||||
|
|
||||||
|
Example response format:
|
||||||
|
{
|
||||||
|
"entities": [
|
||||||
|
{
|
||||||
|
"name": "Albert Einstein",
|
||||||
|
"is_a": {
|
||||||
|
"name": "PERSON",
|
||||||
|
"description": "Entity type for person entities"
|
||||||
|
},
|
||||||
|
"description": "A theoretical physicist who developed the theory of relativity."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Theory of Relativity",
|
||||||
|
"is_a": {
|
||||||
|
"name": "CONCEPT",
|
||||||
|
"description": "Entity type for concept entities"
|
||||||
|
},
|
||||||
|
"description": "A physics theory describing the relationship between space and time."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Princeton University",
|
||||||
|
"is_a": {
|
||||||
|
"name": "ORGANIZATION",
|
||||||
|
"description": "Entity type for organization entities"
|
||||||
|
},
|
||||||
|
"description": "An Ivy League research university in Princeton, New Jersey."
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,3 @@
|
||||||
|
Extract key entities from this text:
|
||||||
|
|
||||||
|
{{ text }}
|
||||||
|
|
@ -0,0 +1,73 @@
|
||||||
|
import logging
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
from cognee.infrastructure.entities.BaseEntityExtractor import BaseEntityExtractor
|
||||||
|
from cognee.modules.engine.models import Entity
|
||||||
|
from cognee.modules.engine.models.EntityType import EntityType
|
||||||
|
from cognee.infrastructure.llm.prompts import read_query_prompt, render_prompt
|
||||||
|
from cognee.infrastructure.llm.get_llm_client import get_llm_client
|
||||||
|
|
||||||
|
logger = logging.getLogger("llm_entity_extractor")
|
||||||
|
|
||||||
|
|
||||||
|
class EntityList(BaseModel):
    """Structured response wrapper for LLM entity extraction.

    Matches the JSON schema requested in the system prompt: a single
    ``entities`` field holding the extracted entity objects.
    """

    # Entities parsed from the model's JSON output, in model order.
    entities: List[Entity]
|
||||||
|
|
||||||
|
|
||||||
|
class LLMEntityExtractor(BaseEntityExtractor):
    """Entity extractor that uses an LLM to identify entities in text.

    Renders a user prompt containing the input text, pairs it with a system
    prompt describing the expected JSON schema, and asks the LLM client for a
    structured ``EntityList`` response. Extraction is best-effort by design:
    any failure is logged and an empty list is returned, so callers never
    have to handle extraction exceptions themselves.
    """

    def __init__(
        self,
        system_prompt_template: str = "extract_entities_system.txt",
        user_prompt_template: str = "extract_entities_user.txt",
    ):
        """Initialize the LLM entity extractor.

        Args:
            system_prompt_template: Filename of the system prompt template
                passed to ``read_query_prompt``.
            user_prompt_template: Filename of the user prompt template,
                rendered by ``render_prompt`` with a ``{"text": ...}`` context.
        """
        self.system_prompt_template = system_prompt_template
        self.user_prompt_template = user_prompt_template
        # Cache of EntityType objects keyed by uppercase type name, so
        # repeated types across entities reuse a single object.
        self._entity_type_cache: dict = {}

    def _get_entity_type(self, type_name: str) -> EntityType:
        """Return a cached EntityType for *type_name*, creating it on first use.

        Type names are normalized to uppercase so e.g. "person" and "PERSON"
        resolve to the same cached EntityType instance.
        """
        type_name = type_name.upper()

        if type_name not in self._entity_type_cache:
            self._entity_type_cache[type_name] = EntityType(
                name=type_name, description=f"Entity type for {type_name.lower()} entities"
            )

        return self._entity_type_cache[type_name]

    async def extract_entities(self, text: str) -> List[Entity]:
        """Extract entities from *text* using an LLM.

        Args:
            text: The text to analyze; must be a non-empty string.

        Returns:
            The entities the LLM identified, or an empty list on invalid
            input, an empty model response, or any extraction error.
        """
        # Check the type before truthiness: isinstance first makes the guard
        # safe for any non-string input (None, bytes, numbers, ...).
        if not isinstance(text, str) or not text:
            logger.warning("Invalid input text for entity extraction")
            return []

        try:
            # Lazy %-style args: the preview string is only built if this
            # log level is actually enabled. %.100s truncates to 100 chars.
            logger.info("Extracting entities from text: %.100s...", text)

            llm_client = get_llm_client()
            user_prompt = render_prompt(self.user_prompt_template, {"text": text})
            system_prompt = read_query_prompt(self.system_prompt_template)

            response = await llm_client.acreate_structured_output(
                text_input=user_prompt,
                system_prompt=system_prompt,
                response_model=EntityList,
            )

            if not response.entities:
                logger.warning("No entities were extracted from the text")
                return []

            logger.info("Extracted %d entities", len(response.entities))
            return response.entities

        except Exception:
            # Best-effort contract: log the full traceback (logger.exception
            # preserves it, unlike logger.error with a formatted message) and
            # degrade to an empty result rather than propagating the failure.
            logger.exception("Entity extraction failed")
            return []
|
||||||
Loading…
Add table
Reference in a new issue