feat: cog-1320 Minimal LLM-Based Entity Extraction (#590)
<!-- .github/pull_request_template.md -->

## Description
<!-- Provide a clear description of the changes in this PR -->

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin

<!-- This is an auto-generated comment: release notes by coderabbit.ai -->
## Summary by CodeRabbit

- **New Features**
  - Introduced an expert entity extraction feature that extracts significant named entities from text and provides structured output with essential details.
  - Rolled out customizable prompt templates for both system instructions and user input to standardize the extraction process.
  - Integrated a robust language model–based extractor with comprehensive error handling to ensure reliable and consistent results.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Co-authored-by: lxobr <122801072+lxobr@users.noreply.github.com>
parent 2323fd0c94
commit 8874ddad2e
3 changed files with 118 additions and 0 deletions
@@ -0,0 +1,42 @@
You are an expert entity extraction system. Your task is to identify and extract important named entities from the provided text.

Extract only distinct, meaningful entities that are central to understanding the text. Avoid extracting common nouns, pronouns, or generic terms.

For each entity, provide:
1. name: The entity name
2. is_a: An EntityType object with:
   - name: The type name (in uppercase)
   - description: A brief description of the type
3. description: A brief description of the entity (1-2 sentences)

Your response MUST be a valid JSON object with a single field "entities" containing an array of entity objects. Do not include any explanatory text, markdown formatting, or code blocks outside of the JSON.

Example response format:
{
  "entities": [
    {
      "name": "Albert Einstein",
      "is_a": {
        "name": "PERSON",
        "description": "Entity type for person entities"
      },
      "description": "A theoretical physicist who developed the theory of relativity."
    },
    {
      "name": "Theory of Relativity",
      "is_a": {
        "name": "CONCEPT",
        "description": "Entity type for concept entities"
      },
      "description": "A physics theory describing the relationship between space and time."
    },
    {
      "name": "Princeton University",
      "is_a": {
        "name": "ORGANIZATION",
        "description": "Entity type for organization entities"
      },
      "description": "An Ivy League research university in Princeton, New Jersey."
    }
  ]
}
@@ -0,0 +1,3 @@
Extract key entities from this text:

{{ text }}
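For context, the template above is rendered at call time by the extractor below: `read_query_prompt` loads the static system prompt, while `render_prompt` substitutes the `{{ text }}` placeholder with the input text. A minimal sketch of that flow, with an illustrative sample sentence:

```python
from cognee.infrastructure.llm.prompts import read_query_prompt, render_prompt

# Static system instructions are loaded verbatim from the prompts directory.
system_prompt = read_query_prompt("extract_entities_system.txt")

# The user template fills the {{ text }} placeholder with the text to analyze.
user_prompt = render_prompt(
    "extract_entities_user.txt",
    {"text": "Albert Einstein developed the theory of relativity at Princeton University."},
)

# user_prompt now reads roughly:
#   Extract key entities from this text:
#
#   Albert Einstein developed the theory of relativity at Princeton University.
```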
@@ -0,0 +1,73 @@
import logging
from typing import List

from pydantic import BaseModel

from cognee.infrastructure.entities.BaseEntityExtractor import BaseEntityExtractor
from cognee.modules.engine.models import Entity
from cognee.modules.engine.models.EntityType import EntityType
from cognee.infrastructure.llm.prompts import read_query_prompt, render_prompt
from cognee.infrastructure.llm.get_llm_client import get_llm_client

logger = logging.getLogger("llm_entity_extractor")


class EntityList(BaseModel):
    """Response model containing a list of extracted entities."""

    entities: List[Entity]


class LLMEntityExtractor(BaseEntityExtractor):
    """Entity extractor that uses an LLM to identify entities in text."""

    def __init__(
        self,
        system_prompt_template: str = "extract_entities_system.txt",
        user_prompt_template: str = "extract_entities_user.txt",
    ):
        """Initialize the LLM entity extractor."""
        self.system_prompt_template = system_prompt_template
        self.user_prompt_template = user_prompt_template
        self._entity_type_cache = {}

    def _get_entity_type(self, type_name: str) -> EntityType:
        """Get or create an EntityType object."""
        type_name = type_name.upper()

        if type_name not in self._entity_type_cache:
            self._entity_type_cache[type_name] = EntityType(
                name=type_name, description=f"Entity type for {type_name.lower()} entities"
            )

        return self._entity_type_cache[type_name]

    async def extract_entities(self, text: str) -> List[Entity]:
        """Extract entities from text using an LLM."""
        if not text or not isinstance(text, str):
            logger.warning("Invalid input text for entity extraction")
            return []

        try:
            logger.info(f"Extracting entities from text: {text[:100]}...")

            llm_client = get_llm_client()
            user_prompt = render_prompt(self.user_prompt_template, {"text": text})
            system_prompt = read_query_prompt(self.system_prompt_template)

            response = await llm_client.acreate_structured_output(
                text_input=user_prompt,
                system_prompt=system_prompt,
                response_model=EntityList,
            )

            if not response.entities:
                logger.warning("No entities were extracted from the text")
                return []

            logger.info(f"Extracted {len(response.entities)} entities")
            return response.entities

        except Exception as e:
            logger.error(f"Entity extraction failed: {str(e)}")
            return []
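To show how the new extractor might be exercised end to end, here is a hedged usage sketch. The import path and the `asyncio` driver are assumptions for illustration (the diff does not show where the module lives), and a configured LLM provider is required because `extract_entities` calls `get_llm_client()`:

```python
import asyncio

# Assumed import path: BaseEntityExtractor lives in cognee.infrastructure.entities,
# so a sibling module is assumed here purely for illustration.
from cognee.infrastructure.entities.LLMEntityExtractor import LLMEntityExtractor


async def main():
    # Uses the two default prompt templates shown above.
    extractor = LLMEntityExtractor()

    entities = await extractor.extract_entities(
        "Albert Einstein developed the theory of relativity at Princeton University."
    )

    # Each result follows the schema requested in the system prompt:
    # a name, an is_a EntityType, and a short description.
    for entity in entities:
        print(f"{entity.name} ({entity.is_a.name}): {entity.description}")


if __name__ == "__main__":
    asyncio.run(main())
```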