From 93a383b56a4e774a863a84847b4eb62ce61789cf Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Wed, 17 Sep 2025 12:23:30 +0200 Subject: [PATCH] feat: adds matching strategies and moves resolver --- cognee/api/v1/cognify/cognify.py | 2 +- .../get_default_tasks_by_indices.py | 2 +- .../utils/expand_with_nodes_and_edges.py | 2 +- .../ontology/base_ontology_resolver.py | 10 ++++ .../modules/ontology/matching_strategies.py | 55 +++++++++++++++++++ ...yResolver.py => RDFLibOntologyResolver.py} | 13 ++--- cognee/tasks/graph/extract_graph_from_data.py | 2 +- .../tasks/graph/extract_graph_from_data_v2.py | 2 +- .../modules/ontology/test_ontology_adapter.py | 2 +- 9 files changed, 76 insertions(+), 14 deletions(-) create mode 100644 cognee/modules/ontology/matching_strategies.py rename cognee/modules/ontology/rdf_xml/{OntologyResolver.py => RDFLibOntologyResolver.py} (95%) diff --git a/cognee/api/v1/cognify/cognify.py b/cognee/api/v1/cognify/cognify.py index a35658691..e933bafd8 100644 --- a/cognee/api/v1/cognify/cognify.py +++ b/cognee/api/v1/cognify/cognify.py @@ -10,7 +10,7 @@ from cognee.infrastructure.llm import get_max_chunk_tokens from cognee.modules.pipelines import run_pipeline from cognee.modules.pipelines.tasks.task import Task from cognee.modules.chunking.TextChunker import TextChunker -from cognee.modules.ontology.rdf_xml.OntologyResolver import RDFLibOntologyResolver +from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver from cognee.modules.users.models import User from cognee.tasks.documents import ( diff --git a/cognee/eval_framework/corpus_builder/task_getters/get_default_tasks_by_indices.py b/cognee/eval_framework/corpus_builder/task_getters/get_default_tasks_by_indices.py index 677090a58..fb10c7eed 100644 --- a/cognee/eval_framework/corpus_builder/task_getters/get_default_tasks_by_indices.py +++ b/cognee/eval_framework/corpus_builder/task_getters/get_default_tasks_by_indices.py @@ -5,7 +5,7 @@ from cognee.modules.chunking.TextChunker import TextChunker from cognee.tasks.graph import extract_graph_from_data from cognee.tasks.storage import add_data_points from cognee.shared.data_models import KnowledgeGraph -from cognee.modules.ontology.rdf_xml.OntologyResolver import RDFLibOntologyResolver +from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver async def get_default_tasks_by_indices( diff --git a/cognee/modules/graph/utils/expand_with_nodes_and_edges.py b/cognee/modules/graph/utils/expand_with_nodes_and_edges.py index 3bd62e6e0..bc6205d41 100644 --- a/cognee/modules/graph/utils/expand_with_nodes_and_edges.py +++ b/cognee/modules/graph/utils/expand_with_nodes_and_edges.py @@ -8,7 +8,7 @@ from cognee.modules.engine.utils import ( generate_node_name, ) from cognee.shared.data_models import KnowledgeGraph -from cognee.modules.ontology.rdf_xml.OntologyResolver import RDFLibOntologyResolver +from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver def _create_node_key(node_id: str, category: str) -> str: diff --git a/cognee/modules/ontology/base_ontology_resolver.py b/cognee/modules/ontology/base_ontology_resolver.py index 55826bfb0..86f51fcb7 100644 --- a/cognee/modules/ontology/base_ontology_resolver.py +++ b/cognee/modules/ontology/base_ontology_resolver.py @@ -2,10 +2,20 @@ from abc import ABC, abstractmethod from typing import List, Tuple, Optional from cognee.modules.ontology.models import AttachedOntologyNode +from cognee.modules.ontology.matching_strategies import MatchingStrategy, FuzzyMatchingStrategy class BaseOntologyResolver(ABC): """Abstract base class for ontology resolvers.""" + + def __init__(self, matching_strategy: Optional[MatchingStrategy] = None): + """Initialize the ontology resolver with a matching strategy. + + Args: + matching_strategy: The strategy to use for entity matching. + Defaults to FuzzyMatchingStrategy if None. + """ + self.matching_strategy = matching_strategy or FuzzyMatchingStrategy() @abstractmethod def build_lookup(self) -> None: diff --git a/cognee/modules/ontology/matching_strategies.py b/cognee/modules/ontology/matching_strategies.py new file mode 100644 index 000000000..c576bf6e2 --- /dev/null +++ b/cognee/modules/ontology/matching_strategies.py @@ -0,0 +1,55 @@ +import difflib +from abc import ABC, abstractmethod +from typing import List, Optional + + +class MatchingStrategy(ABC): + """Abstract base class for ontology entity matching strategies.""" + + @abstractmethod + def find_match(self, name: str, candidates: List[str]) -> Optional[str]: + """Find the best match for a given name from a list of candidates. + + Args: + name: The name to match + candidates: List of candidate names to match against + + Returns: + The best matching candidate name, or None if no match found + """ + pass + + +class FuzzyMatchingStrategy(MatchingStrategy): + """Fuzzy matching strategy using difflib for approximate string matching.""" + + def __init__(self, cutoff: float = 0.8): + """Initialize fuzzy matching strategy. + + Args: + cutoff: Minimum similarity score (0.0 to 1.0) for a match to be considered valid + """ + self.cutoff = cutoff + + def find_match(self, name: str, candidates: List[str]) -> Optional[str]: + """Find the closest fuzzy match for a given name. + + Args: + name: The normalized name to match + candidates: List of normalized candidate names + + Returns: + The best matching candidate name, or None if no match meets the cutoff + """ + if not candidates: + return None + + # Check for exact match first + if name in candidates: + return name + + # Find fuzzy match + best_match = difflib.get_close_matches( + name, candidates, n=1, cutoff=self.cutoff + ) + return best_match[0] if best_match else None diff --git a/cognee/modules/ontology/rdf_xml/OntologyResolver.py b/cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py similarity index 95% rename from cognee/modules/ontology/rdf_xml/OntologyResolver.py rename to cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py index 3c1a55b5a..d8de5794a 100644 --- a/cognee/modules/ontology/rdf_xml/OntologyResolver.py +++ b/cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py @@ -12,6 +12,7 @@ from cognee.modules.ontology.exceptions import ( ) from cognee.modules.ontology.base_ontology_resolver import BaseOntologyResolver from cognee.modules.ontology.models import AttachedOntologyNode +from cognee.modules.ontology.matching_strategies import MatchingStrategy, FuzzyMatchingStrategy logger = get_logger("OntologyAdapter") @@ -23,7 +24,8 @@ class RDFLibOntologyResolver(BaseOntologyResolver): It provides fuzzy matching and subgraph extraction capabilities for ontology entities. """ - def __init__(self, ontology_file: Optional[str] = None): + def __init__(self, ontology_file: Optional[str] = None, matching_strategy: Optional[MatchingStrategy] = None): + super().__init__(matching_strategy) self.ontology_file = ontology_file try: if ontology_file and os.path.exists(ontology_file): @@ -94,13 +96,8 @@ class RDFLibOntologyResolver(BaseOntologyResolver): try: normalized_name = name.lower().replace(" ", "_").strip() possible_matches = list(self.lookup.get(category, {}).keys()) - if normalized_name in possible_matches: - return normalized_name - - best_match = difflib.get_close_matches( - normalized_name, possible_matches, n=1, cutoff=0.8 - ) - return best_match[0] if best_match else None + + return self.matching_strategy.find_match(normalized_name, possible_matches) except Exception as e: logger.error("Error in find_closest_match: %s", str(e)) raise FindClosestMatchError() from e diff --git a/cognee/tasks/graph/extract_graph_from_data.py b/cognee/tasks/graph/extract_graph_from_data.py index 2ad32f308..22cbc70fe 100644 --- a/cognee/tasks/graph/extract_graph_from_data.py +++ b/cognee/tasks/graph/extract_graph_from_data.py @@ -4,7 +4,7 @@ from pydantic import BaseModel from cognee.infrastructure.databases.graph import get_graph_engine from cognee.tasks.storage.add_data_points import add_data_points -from cognee.modules.ontology.rdf_xml.OntologyResolver import RDFLibOntologyResolver +from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver from cognee.modules.chunking.models.DocumentChunk import DocumentChunk from cognee.modules.graph.utils import ( expand_with_nodes_and_edges, diff --git a/cognee/tasks/graph/extract_graph_from_data_v2.py b/cognee/tasks/graph/extract_graph_from_data_v2.py index ce69f9b0e..d2b4924c7 100644 --- a/cognee/tasks/graph/extract_graph_from_data_v2.py +++ b/cognee/tasks/graph/extract_graph_from_data_v2.py @@ -3,7 +3,7 @@ from typing import List from cognee.modules.chunking.models.DocumentChunk import DocumentChunk from cognee.shared.data_models import KnowledgeGraph -from cognee.modules.ontology.rdf_xml.OntologyResolver import RDFLibOntologyResolver +from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver from cognee.tasks.graph.cascade_extract.utils.extract_nodes import extract_nodes from cognee.tasks.graph.cascade_extract.utils.extract_content_nodes_and_relationship_names import ( extract_content_nodes_and_relationship_names, diff --git a/cognee/tests/unit/modules/ontology/test_ontology_adapter.py b/cognee/tests/unit/modules/ontology/test_ontology_adapter.py index e0a6f1402..051cb3556 100644 --- a/cognee/tests/unit/modules/ontology/test_ontology_adapter.py +++ b/cognee/tests/unit/modules/ontology/test_ontology_adapter.py @@ -1,6 +1,6 @@ import pytest from rdflib import Graph, Namespace, RDF, OWL, RDFS -from cognee.modules.ontology.rdf_xml.OntologyResolver import RDFLibOntologyResolver +from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver from cognee.modules.ontology.models import AttachedOntologyNode