feat: add regex entity extractor (#605)

## Description

- Created a new RegexEntityExtractor that uses regex patterns to identify entities like emails, URLs, and dates in text
- Implemented a JSON-based configuration system to add or modify entity types without changing code
- Built a separate RegexEntityConfig class to handle loading and processing of entity configurations
- Added a test suite covering all entity types and edge cases

## DCO Affirmation

I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.

## Summary by CodeRabbit

- **New Features**
  - Introduced a new regex-based extraction capability that uses configurable patterns and description templates to identify common entities such as emails, phone numbers, URLs, dates, and more.
- **Tests**
  - Added comprehensive tests to validate the extraction functionality across standard scenarios and edge cases for reliable text analysis.
2025-03-06 12:13:59 +01:00 · 2025-03-06 12:13:59 +01:00 · ea5b11a3b4
commit ea5b11a3b4
parent 9d783675e0
4 changed files with 506 additions and 0 deletions
--- a/cognee/tasks/entity_completion/entity_extractors/regex_entity_config.json
+++ b/cognee/tasks/entity_completion/entity_extractors/regex_entity_config.json
@ -0,0 +1,62 @@
 [
    {
        "entity_name": "EMAIL",
        "entity_description": "Entity type for email entities",
        "regex": "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}",
        "description_template": "Email address: {}"
    },
    {
        "entity_name": "PHONE",
        "entity_description": "Entity type for phone entities",
        "regex": "\\+?\\d{1,4}[\\s-]?\\(?\\d{2,4}\\)?[\\s-]?\\d{3,4}[\\s-]?\\d{3,4}",
        "description_template": "Phone number: {}"
    },
    {
        "entity_name": "URL",
        "entity_description": "Entity type for url entities",
        "regex": "https?:\\/\\/(www\\.)?[a-zA-Z0-9-]+(\\.[a-zA-Z]{2,})+(\\/\\S*)?",
        "description_template": "URL: {}"
    },
    {
        "entity_name": "DATE",
        "entity_description": "Entity type for date entities",
        "regex": "(\\d{4}[-/]\\d{2}[-/]\\d{2})|(\\d{2}[-/]\\d{2}[-/]\\d{4})",
        "description_template": "Date: {}"
    },
    {
        "entity_name": "TIME",
        "entity_description": "Entity type for time entities",
        "regex": "(1[0-2]|0?[1-9]):[0-5][0-9](\\s?[APap][Mm])?|([01]?[0-9]|2[0-3]):[0-5][0-9]",
        "description_template": "Time: {}"
    },
    {
        "entity_name": "MONEY",
        "entity_description": "Entity type for money entities",
        "regex": "\\$?\\d{1,3}(,\\d{3})*(\\.[0-9]{2})?|\\€?\\d{1,3}(\\.\\d{3})*(,[0-9]{2})?",
        "description_template": "Monetary amount: {}"
    },
    {
        "entity_name": "PERSON",
        "entity_description": "Entity type for person entities",
        "regex": "\\b(?:(?:Dr|Prof|Mr|Mrs|Ms)\\.?\\s+)?[A-Z][a-z]+(?:\\s+(?:[A-Z][a-z]+|[A-Z]\\.?|(?:van|de|la|del|von|der|le)))+\\b",
        "description_template": "Person name: {}"
    },
    {
        "entity_name": "HASHTAG",
        "entity_description": "Entity type for hashtag entities",
        "regex": "\\#[A-Za-z0-9_]+",
        "description_template": "Hashtag: {}"
    },
    {
        "entity_name": "MENTION",
        "entity_description": "Entity type for mention entities",
        "regex": "\\@[A-Za-z0-9_]+",
        "description_template": "Mention: {}"
    },
    {
        "entity_name": "IP_ADDRESS",
        "entity_description": "Entity type for ip_address entities",
        "regex": "(?<!\\d\\.)(?:(?:25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]\\d?|0)\\.){3}(?:25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]\\d?|0)(?!\\.\\d)",
        "description_template": "IP address: {}"
    }
 ]
--- a/cognee/tasks/entity_completion/entity_extractors/regex_entity_config.py
+++ b/cognee/tasks/entity_completion/entity_extractors/regex_entity_config.py
@ -0,0 +1,91 @@
 import json
 import logging
 import os
 import re
 from typing import Dict, List, Pattern, Any
 from cognee.modules.engine.models.EntityType import EntityType
 from cognee.root_dir import get_absolute_path
 # Module-level logger, registered under the fixed name "regex_entity_config".
 logger = logging.getLogger("regex_entity_config")
 class RegexEntityConfig:
    """Load, validate and expose regex entity-extraction rules from a JSON file.

    The file must hold a JSON array of objects. Each object has to define
    ``entity_name``, ``entity_description``, ``regex`` and
    ``description_template``. Every entry is validated, its pattern is compiled
    once, and the result is stored in ``entity_configs`` keyed by entity name.
    """

    def __init__(self, config_path: str):
        """Read and process the configuration located at ``config_path``.

        Args:
            config_path: Path of the JSON configuration file.

        Raises:
            FileNotFoundError: If ``config_path`` does not exist.
            ValueError: If the file is not valid JSON, is not a list of
                objects, misses a required field, or holds an invalid regex.
        """
        self.config_path = config_path
        self.entity_configs = {}
        self._load_config()

    def _validate_config_fields(self, config: Dict[str, Any]) -> None:
        """Ensure one configuration entry is an object with all required keys.

        Raises:
            ValueError: If ``config`` is not a dict or lacks a required field.
        """
        # Reject non-objects up front: ``in`` on a str would silently perform a
        # substring test and on an int would raise an unrelated TypeError.
        if not isinstance(config, dict):
            raise ValueError(
                f"Entity configuration entries must be JSON objects, got {type(config).__name__}"
            )
        required_fields = ["entity_name", "entity_description", "regex", "description_template"]
        missing_fields = [field for field in required_fields if field not in config]
        if missing_fields:
            raise ValueError(
                f"Missing required fields in entity configuration: {', '.join(missing_fields)}"
            )

    def _compile_regex(self, pattern: str, entity_name: str) -> Pattern:
        """Compile ``pattern``, converting ``re.error`` into ``ValueError``.

        Args:
            pattern: Regular expression source taken from the configuration.
            entity_name: Name of the owning entity, used in error messages.

        Raises:
            ValueError: If ``pattern`` is not a valid regular expression.
        """
        try:
            return re.compile(pattern)
        except re.error as e:
            logger.error(f"Invalid regex pattern for entity '{entity_name}': {str(e)}")
            # Chain the original error so the traceback keeps the regex details.
            raise ValueError(f"Invalid regex pattern for entity '{entity_name}': {str(e)}") from e

    def _load_config(self) -> None:
        """Parse the JSON file and populate ``self.entity_configs``.

        Raises:
            FileNotFoundError: If the configuration file does not exist.
            ValueError: If the JSON is malformed, is not a list, or any entry
                fails validation or regex compilation.
        """
        try:
            # JSON is UTF-8 by specification and the bundled configuration
            # contains non-ASCII characters (the euro sign in the MONEY
            # pattern), so do not depend on the platform's default encoding.
            with open(self.config_path, "r", encoding="utf-8") as f:
                config_list = json.load(f)
        except FileNotFoundError:
            logger.error(f"Config file not found: {self.config_path}")
            raise
        except json.JSONDecodeError as e:
            logger.error(f"Invalid JSON in config file {self.config_path}: {str(e)}")
            raise ValueError(f"Invalid JSON in config file: {str(e)}") from e

        if not isinstance(config_list, list):
            raise ValueError(
                f"Entity configuration must be a JSON list, got {type(config_list).__name__}"
            )

        for config in config_list:
            self._validate_config_fields(config)
            entity_name = config["entity_name"]
            if entity_name in self.entity_configs:
                # Later entries still win, as before; the clash is now visible.
                logger.warning(f"Duplicate entity configuration for '{entity_name}'; overriding")
            entity_type = EntityType(name=entity_name, description=config["entity_description"])
            compiled_pattern = self._compile_regex(config["regex"], entity_name)
            self.entity_configs[entity_name] = {
                "entity_type": entity_type,
                "regex": config["regex"],
                "compiled_pattern": compiled_pattern,
                "description_template": config["description_template"],
            }
        logger.info(
            f"Loaded {len(self.entity_configs)} entity configurations from {self.config_path}"
        )

    def get_entity_names(self) -> List[str]:
        """Return the configured entity names in insertion order."""
        return list(self.entity_configs.keys())

    def get_entity_config(self, entity_name: str) -> Dict[str, Any]:
        """Return the stored configuration dict for ``entity_name``.

        Raises:
            KeyError: If ``entity_name`` was not present in the configuration.
        """
        if entity_name not in self.entity_configs:
            raise KeyError(f"Unknown entity type: {entity_name}")
        return self.entity_configs[entity_name]

    def get_entity_type(self, entity_name: str) -> EntityType:
        """Return the ``EntityType`` built for ``entity_name`` (KeyError if unknown)."""
        return self.get_entity_config(entity_name)["entity_type"]

    def get_compiled_pattern(self, entity_name: str) -> Pattern:
        """Return the compiled regex for ``entity_name`` (KeyError if unknown)."""
        return self.get_entity_config(entity_name)["compiled_pattern"]

    def get_description_template(self, entity_name: str) -> str:
        """Return the description template for ``entity_name`` (KeyError if unknown)."""
        return self.get_entity_config(entity_name)["description_template"]
--- a/cognee/tasks/entity_completion/entity_extractors/regex_entity_extractor.py
+++ b/cognee/tasks/entity_completion/entity_extractors/regex_entity_extractor.py
@ -0,0 +1,72 @@
 import logging
 from typing import List, Optional
 from cognee.infrastructure.entities.BaseEntityExtractor import BaseEntityExtractor
 from cognee.modules.engine.models import Entity
 from cognee.root_dir import get_absolute_path
 from cognee.tasks.entity_completion.entity_extractors.regex_entity_config import RegexEntityConfig
 # Module-level logger, registered under the fixed name "regex_entity_extractor".
 logger = logging.getLogger("regex_entity_extractor")
 class RegexEntityExtractor(BaseEntityExtractor):
    """Extractor that finds entities by running configured regular expressions."""

    def __init__(self, config_path: Optional[str] = None):
        """Set up the extractor from a JSON config.

        Args:
            config_path: Path to a JSON entity configuration. When ``None``,
                the path is resolved through ``get_absolute_path`` to the
                ``regex_entity_config.json`` file of this package.
        """
        resolved_path = config_path
        if resolved_path is None:
            resolved_path = get_absolute_path(
                "tasks/entity_completion/entity_extractors/regex_entity_config.json"
            )
        self.config = RegexEntityConfig(resolved_path)
        logger.info(
            f"Initialized RegexEntityExtractor with {len(self.config.get_entity_names())} entity types"
        )

    def _create_entity(self, match_text: str, entity_type_obj, description_template: str) -> Entity:
        """Wrap one matched string in an ``Entity`` of the given type."""
        rendered_description = description_template.format(match_text)
        return Entity(
            name=match_text,
            is_a=entity_type_obj,
            description=rendered_description,
        )

    def _extract_entities_by_type(self, entity_type: str, text: str) -> List[Entity]:
        """Return one entity per regex match of ``entity_type`` in ``text``.

        A ``KeyError`` raised while processing is logged as an unknown entity
        type and results in an empty list.
        """
        found: List[Entity] = []
        try:
            compiled = self.config.get_compiled_pattern(entity_type)
            template = self.config.get_description_template(entity_type)
            type_obj = self.config.get_entity_type(entity_type)
            for hit in compiled.finditer(text):
                found.append(self._create_entity(hit.group(0), type_obj, template))
        except KeyError:
            logger.warning(f"Unknown entity type: {entity_type}")
            return []
        return found

    def _text_to_entities(self, text: str) -> List[Entity]:
        """Run every configured entity type over ``text`` and merge the results."""
        all_entities = [
            entity
            for type_name in self.config.get_entity_names()
            for entity in self._extract_entities_by_type(type_name, text)
        ]
        logger.info(f"Extracted {len(all_entities)} entities")
        return all_entities

    async def extract_entities(self, text: str) -> List[Entity]:
        """Extract all configured entity types from ``text``.

        Falsy or non-string input, as well as any exception raised during
        extraction, is logged and yields an empty list.
        """
        if not text or not isinstance(text, str):
            logger.warning("Invalid input text for entity extraction")
            return []
        try:
            logger.info(f"Extracting entities from text: {text[:100]}...")
            result = self._text_to_entities(text)
        except Exception as e:
            logger.error(f"Entity extraction failed: {str(e)}")
            result = []
        return result
--- a/cognee/tests/unit/entity_extraction/regex_entity_extraction_test.py
+++ b/cognee/tests/unit/entity_extraction/regex_entity_extraction_test.py
@ -0,0 +1,281 @@
 import pytest
 from cognee.tasks.entity_completion.entity_extractors.regex_entity_extractor import (
    RegexEntityExtractor,
 )
@pytest.fixture
def regex_extractor():
    """Provide a RegexEntityExtractor built with its default configuration."""
    extractor = RegexEntityExtractor()
    return extractor
@pytest.mark.asyncio
async def test_extract_emails(regex_extractor):
    """Both addresses in the sample sentence come back as EMAIL entities."""
    sample = "Contact us at support@example.com or sales@company.co.uk for more information."
    extracted = await regex_extractor.extract_entities(sample)

    # Keep only the names of entities typed as EMAIL.
    email_names = [item.name for item in extracted if item.is_a.name == "EMAIL"]

    assert len(email_names) == 2
    assert "support@example.com" in email_names
    assert "sales@company.co.uk" in email_names
@pytest.mark.asyncio
async def test_extract_phone_numbers(regex_extractor):
    """Both phone numbers in the sample sentence come back as PHONE entities."""
    sample = "Call us at +1-555-123-4567 or 020 7946 0958 for support."
    extracted = await regex_extractor.extract_entities(sample)

    # Keep only the names of entities typed as PHONE.
    phone_names = [item.name for item in extracted if item.is_a.name == "PHONE"]

    assert len(phone_names) == 2
    assert "+1-555-123-4567" in phone_names
    assert "020 7946 0958" in phone_names
@pytest.mark.asyncio
async def test_extract_urls(regex_extractor):
    """Both links in the sample sentence come back as URL entities."""
    sample = "Visit our website at https://www.example.com or http://docs.example.org/help for more information."
    extracted = await regex_extractor.extract_entities(sample)

    # Keep only the names of entities typed as URL.
    url_names = [item.name for item in extracted if item.is_a.name == "URL"]

    assert len(url_names) == 2
    assert "https://www.example.com" in url_names
    assert "http://docs.example.org/help" in url_names
@pytest.mark.asyncio
async def test_extract_dates(regex_extractor):
    """A year-first and a year-last date are both returned as DATE entities."""
    sample = "The event is scheduled for 2023-05-15 and ends on 06/30/2023."
    extracted = await regex_extractor.extract_entities(sample)

    # Keep only the names of entities typed as DATE.
    date_names = [item.name for item in extracted if item.is_a.name == "DATE"]

    assert len(date_names) == 2
    assert "2023-05-15" in date_names
    assert "06/30/2023" in date_names
@pytest.mark.asyncio
async def test_extract_times(regex_extractor):
    """A 12-hour time with AM suffix and a 24-hour time are both returned."""
    sample = "The meeting starts at 09:30 AM and ends at 14:45."
    extracted = await regex_extractor.extract_entities(sample)

    # Keep only the names of entities typed as TIME.
    time_names = [item.name for item in extracted if item.is_a.name == "TIME"]

    assert len(time_names) == 2
    assert "09:30 AM" in time_names
    assert "14:45" in time_names
@pytest.mark.asyncio
async def test_extract_money(regex_extractor):
    """A dollar amount and a euro amount are both returned as MONEY entities."""
    sample = "The product costs $1,299.99 or €1.045,00 depending on your region."
    extracted = await regex_extractor.extract_entities(sample)

    # Keep only the names of entities typed as MONEY.
    money_names = [item.name for item in extracted if item.is_a.name == "MONEY"]

    assert len(money_names) == 2
    assert "$1,299.99" in money_names
    assert "€1.045,00" in money_names
@pytest.mark.asyncio
async def test_extract_person_names(regex_extractor):
    """Multi-part person names in several formats are found; single names are not."""
    text = """
    Standard names: John Smith and Sarah Johnson will be attending.
    Names with titles: Dr. Jane Wilson and Prof Michael Brown will present.
    Names with middle initials: James T. Kirk and William H Gates are invited.
    Names with prefixes: Jean de la Fontaine and Ludwig van Beethoven are famous.
    Single names like Mary or Robert should not be extracted as they could be
    confused with regular capitalized words at the beginning of sentences.
    """
    entities = await regex_extractor.extract_entities(text)

    # Keep only the PERSON entities and collect their names.
    person_entities = [entity for entity in entities if entity.is_a.name == "PERSON"]
    entity_names = [entity.name for entity in person_entities]

    expected_names = (
        # Standard two-part names
        "John Smith",
        "Sarah Johnson",
        # Names with titles
        "Dr. Jane Wilson",
        "Prof Michael Brown",
        # Names with middle initials
        "James T. Kirk",
        "William H Gates",
        # Names with prefixes
        "Jean de la Fontaine",
        "Ludwig van Beethoven",
    )
    for full_name in expected_names:
        assert full_name in entity_names

    # Single names must not be reported on their own.
    for single_name in ("Mary", "Robert"):
        assert single_name not in entity_names

    # Exactly the eight expected names are found.
    assert len(person_entities) == 8
@pytest.mark.asyncio
async def test_extract_hashtags(regex_extractor):
    """Both hashtags in the sample sentence come back as HASHTAG entities."""
    sample = "Check out our latest post #Python #MachineLearning"
    extracted = await regex_extractor.extract_entities(sample)

    # Keep only the names of entities typed as HASHTAG.
    hashtag_names = [item.name for item in extracted if item.is_a.name == "HASHTAG"]

    assert len(hashtag_names) == 2
    assert "#Python" in hashtag_names
    assert "#MachineLearning" in hashtag_names
@pytest.mark.asyncio
async def test_extract_mentions(regex_extractor):
    """Both @-handles in the sample sentence come back as MENTION entities."""
    sample = "Thanks to @johndoe and @jane_smith for their contributions."
    extracted = await regex_extractor.extract_entities(sample)

    # Keep only the names of entities typed as MENTION.
    mention_names = [item.name for item in extracted if item.is_a.name == "MENTION"]

    assert len(mention_names) == 2
    assert "@johndoe" in mention_names
    assert "@jane_smith" in mention_names
@pytest.mark.asyncio
async def test_extract_ip_addresses(regex_extractor):
    """Four well-formed IPv4 addresses are all returned as IP_ADDRESS entities."""
    # Sample covers ordinary addresses plus the all-255 and all-zero extremes.
    sample = "The server IPs are 192.168.1.1, 10.0.0.1, 255.255.255.255, and 0.0.0.0."
    extracted = await regex_extractor.extract_entities(sample)

    # Keep only the names of entities typed as IP_ADDRESS.
    ip_names = [item.name for item in extracted if item.is_a.name == "IP_ADDRESS"]

    assert len(ip_names) == 4
    for address in ("192.168.1.1", "10.0.0.1", "255.255.255.255", "0.0.0.0"):
        assert address in ip_names
@pytest.mark.asyncio
async def test_invalid_ip_addresses(regex_extractor):
    """Test that malformed IP addresses are not extracted verbatim.

    Out-of-range octets and a five-part address yield no match. The
    zero-padded "01.102.103.104" is not matched as written either, but the
    pattern does match its tail "1.102.103.104", so exactly one IP_ADDRESS
    entity is expected.
    """
    text = "Invalid IPs: 999.999.999.999, 256.256.256.256, 1.2.3.4.5, 01.102.103.104"
    entities = await regex_extractor.extract_entities(text)

    # Filter only IP_ADDRESS entities and collect their names once.
    ip_entities = [e for e in entities if e.is_a.name == "IP_ADDRESS"]
    ip_names = [e.name for e in ip_entities]

    # None of the malformed inputs may be returned as written.
    assert len(ip_entities) == 1
    assert "999.999.999.999" not in ip_names
    assert "256.256.256.256" not in ip_names
    assert "1.2.3.4.5" not in ip_names
    assert "01.102.103.104" not in ip_names

    # The single match is the valid tail of the zero-padded address.
    assert "1.102.103.104" in ip_names
@pytest.mark.asyncio
async def test_extract_multiple_entity_types(regex_extractor):
    """A text mixing all kinds of entities yields at least one of every type."""
    text = """
    Contact John Doe at john.doe@example.com or +1-555-123-4567.
    Visit our website at https://www.example.com.
    The meeting is scheduled for 2023-05-15 at 09:30 AM.
    The project budget is $10,000.00.
    Follow us on social media with #Python and mention @pythonorg.
    Our server IP is 192.168.1.1.
    """
    entities = await regex_extractor.extract_entities(text)

    # Distinct entity type names present in the result.
    found_types = {entity.is_a.name for entity in entities}

    expected_types = (
        "EMAIL",
        "PHONE",
        "URL",
        "DATE",
        "TIME",
        "MONEY",
        "PERSON",
        "HASHTAG",
        "MENTION",
        "IP_ADDRESS",
    )
    for type_name in expected_types:
        assert type_name in found_types
@pytest.mark.asyncio
async def test_empty_text(regex_extractor):
    """An empty string produces no entities."""
    result = await regex_extractor.extract_entities("")

    assert len(result) == 0
@pytest.mark.asyncio
async def test_none_text(regex_extractor):
    """Passing ``None`` instead of text produces no entities."""
    result = await regex_extractor.extract_entities(None)

    assert len(result) == 0
@pytest.mark.asyncio
async def test_text_without_entities(regex_extractor):
    """A plain sentence with nothing extractable produces no entities."""
    sample = "This text does not contain any extractable entities."
    result = await regex_extractor.extract_entities(sample)

    assert len(result) == 0
@pytest.mark.asyncio
async def test_custom_config_path(tmp_path):
    """An extractor built from a custom JSON config uses that config's pattern."""
    # Minimal configuration declaring a single TEST_ENTITY type.
    config_content = """[
        {
            "entity_name": "TEST_ENTITY",
            "entity_description": "Test entity type",
            "regex": "TEST\\\\d+",
            "description_template": "Test entity: {}"
        }
    ]"""
    config_path = tmp_path / "test_config.json"
    with open(config_path, "w") as handle:
        handle.write(config_content)

    # Build the extractor against the temporary configuration file.
    custom_extractor = RegexEntityExtractor(str(config_path))

    sample = "This contains TEST123 and TEST456."
    found = await custom_extractor.extract_entities(sample)
    found_names = [entity.name for entity in found]

    assert len(found) == 2
    assert all(entity.is_a.name == "TEST_ENTITY" for entity in found)
    assert "TEST123" in found_names
    assert "TEST456" in found_names