feature: adds multifile ontology support (#1674)
<!-- .github/pull_request_template.md -->

## Description
Adds multi-file ontology support: the resolver now accepts either a single ontology file path or several paths (comma-separated when supplied through the environment).

## Type of Change
<!-- Please check the relevant option -->
- [ ] Bug fix (non-breaking change that fixes an issue)
- [x] New feature (non-breaking change that adds functionality)
- [ ] Breaking change (fix or feature that would cause existing functionality to change)
- [ ] Documentation update
- [x] Code refactoring
- [ ] Performance improvement
- [ ] Other (please specify):

## Screenshots/Videos (if applicable)
None

## Pre-submission Checklist
<!-- Please check all boxes that apply before submitting your PR -->
- [ ] **I have tested my changes thoroughly before submitting this PR**
- [ ] **This PR contains minimal changes necessary to address the issue/feature**
- [ ] My code follows the project's coding standards and style guidelines
- [ ] I have added tests that prove my fix is effective or that my feature works
- [ ] I have added necessary documentation (if applicable)
- [ ] All new and existing tests pass
- [x] I have searched existing PRs to ensure this change hasn't been submitted already
- [x] I have linked any relevant issues in the description
- [x] My commits have clear and descriptive messages

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.
This commit is contained in:
commit
abb1aba517
3 changed files with 194 additions and 9 deletions
|
|
@ -21,7 +21,8 @@ def get_ontology_resolver_from_env(
|
|||
Supported value: "rdflib".
|
||||
matching_strategy (str): The matching strategy to apply.
|
||||
Supported value: "fuzzy".
|
||||
ontology_file_path (str): Path to the ontology file required for the resolver.
|
||||
ontology_file_path (str): Path to the ontology file(s) required for the resolver.
|
||||
Can be a single path or comma-separated paths for multiple files.
|
||||
|
||||
Returns:
|
||||
BaseOntologyResolver: An instance of the requested ontology resolver.
|
||||
|
|
@ -31,8 +32,13 @@ def get_ontology_resolver_from_env(
|
|||
or if required parameters are missing.
|
||||
"""
|
||||
if ontology_resolver == "rdflib" and matching_strategy == "fuzzy" and ontology_file_path:
|
||||
if "," in ontology_file_path:
|
||||
file_paths = [path.strip() for path in ontology_file_path.split(",")]
|
||||
else:
|
||||
file_paths = ontology_file_path
|
||||
|
||||
return RDFLibOntologyResolver(
|
||||
matching_strategy=FuzzyMatchingStrategy(), ontology_file=ontology_file_path
|
||||
matching_strategy=FuzzyMatchingStrategy(), ontology_file=file_paths
|
||||
)
|
||||
else:
|
||||
raise EnvironmentError(
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@ import os
|
|||
import difflib
|
||||
from cognee.shared.logging_utils import get_logger
|
||||
from collections import deque
|
||||
from typing import List, Tuple, Dict, Optional, Any
|
||||
from typing import List, Tuple, Dict, Optional, Any, Union
|
||||
from rdflib import Graph, URIRef, RDF, RDFS, OWL
|
||||
|
||||
from cognee.modules.ontology.exceptions import (
|
||||
|
|
@ -26,22 +26,50 @@ class RDFLibOntologyResolver(BaseOntologyResolver):
|
|||
|
||||
def __init__(
|
||||
self,
|
||||
ontology_file: Optional[str] = None,
|
||||
ontology_file: Optional[Union[str, List[str]]] = None,
|
||||
matching_strategy: Optional[MatchingStrategy] = None,
|
||||
) -> None:
|
||||
super().__init__(matching_strategy)
|
||||
self.ontology_file = ontology_file
|
||||
try:
|
||||
if ontology_file and os.path.exists(ontology_file):
|
||||
files_to_load = []
|
||||
if ontology_file is not None:
|
||||
if isinstance(ontology_file, str):
|
||||
files_to_load = [ontology_file]
|
||||
elif isinstance(ontology_file, list):
|
||||
files_to_load = ontology_file
|
||||
else:
|
||||
raise ValueError(
|
||||
f"ontology_file must be a string, list of strings, or None. Got: {type(ontology_file)}"
|
||||
)
|
||||
|
||||
if files_to_load:
|
||||
self.graph = Graph()
|
||||
self.graph.parse(ontology_file)
|
||||
logger.info("Ontology loaded successfully from file: %s", ontology_file)
|
||||
loaded_files = []
|
||||
for file_path in files_to_load:
|
||||
if os.path.exists(file_path):
|
||||
self.graph.parse(file_path)
|
||||
loaded_files.append(file_path)
|
||||
logger.info("Ontology loaded successfully from file: %s", file_path)
|
||||
else:
|
||||
logger.warning(
|
||||
"Ontology file '%s' not found. Skipping this file.",
|
||||
file_path,
|
||||
)
|
||||
|
||||
if not loaded_files:
|
||||
logger.info(
|
||||
"No valid ontology files found. No owl ontology will be attached to the graph."
|
||||
)
|
||||
self.graph = None
|
||||
else:
|
||||
logger.info("Total ontology files loaded: %d", len(loaded_files))
|
||||
else:
|
||||
logger.info(
|
||||
"Ontology file '%s' not found. No owl ontology will be attached to the graph.",
|
||||
ontology_file,
|
||||
"No ontology file provided. No owl ontology will be attached to the graph."
|
||||
)
|
||||
self.graph = None
|
||||
|
||||
self.build_lookup()
|
||||
except Exception as e:
|
||||
logger.error("Failed to load ontology", exc_info=e)
|
||||
|
|
|
|||
|
|
@ -489,3 +489,154 @@ def test_get_ontology_resolver_from_env_resolver_functionality():
|
|||
assert nodes == []
|
||||
assert relationships == []
|
||||
assert start_node is None
|
||||
|
||||
|
||||
def test_multifile_ontology_loading_success():
    """Test successful loading of multiple ontology files.

    Two unrelated ontologies (cars and tech companies) are serialized to
    separate files and passed to the resolver as a list. Classes and
    individuals from both files must be present in the resulting lookup.
    """
    import os
    import tempfile

    ns1 = Namespace("http://example.org/cars#")
    ns2 = Namespace("http://example.org/tech#")

    g1 = Graph()
    g1.add((ns1.Vehicle, RDF.type, OWL.Class))
    g1.add((ns1.Car, RDF.type, OWL.Class))
    g1.add((ns1.Car, RDFS.subClassOf, ns1.Vehicle))
    g1.add((ns1.Audi, RDF.type, ns1.Car))
    g1.add((ns1.BMW, RDF.type, ns1.Car))

    g2 = Graph()
    g2.add((ns2.Company, RDF.type, OWL.Class))
    g2.add((ns2.TechCompany, RDF.type, OWL.Class))
    g2.add((ns2.TechCompany, RDFS.subClassOf, ns2.Company))
    g2.add((ns2.Apple, RDF.type, ns2.TechCompany))
    g2.add((ns2.Google, RDF.type, ns2.TechCompany))

    # TemporaryDirectory removes every file on exit, even when serialization
    # or an assertion fails. It also avoids writing to a path whose
    # NamedTemporaryFile handle is still open, which is not portable to Windows.
    with tempfile.TemporaryDirectory() as tmp_dir:
        file1_path = os.path.join(tmp_dir, "cars.owl")
        file2_path = os.path.join(tmp_dir, "tech.owl")
        g1.serialize(file1_path, format="xml")
        g2.serialize(file2_path, format="xml")

        resolver = RDFLibOntologyResolver(ontology_file=[file1_path, file2_path])

        assert resolver.graph is not None

        # Classes from both files are merged into a single lookup.
        assert "car" in resolver.lookup["classes"]
        assert "vehicle" in resolver.lookup["classes"]
        assert "company" in resolver.lookup["classes"]
        assert "techcompany" in resolver.lookup["classes"]

        # Individuals from both files are merged as well.
        assert "audi" in resolver.lookup["individuals"]
        assert "bmw" in resolver.lookup["individuals"]
        assert "apple" in resolver.lookup["individuals"]
        assert "google" in resolver.lookup["individuals"]

        car_match = resolver.find_closest_match("Audi", "individuals")
        assert car_match == "audi"

        tech_match = resolver.find_closest_match("Google", "individuals")
        assert tech_match == "google"
|
||||
|
||||
|
||||
def test_multifile_ontology_with_missing_files():
    """Test loading multiple ontology files where some don't exist.

    One valid ontology file is listed between two paths that do not exist.
    The resolver is expected to still build a graph and lookup from the
    valid file.
    """
    import os
    import tempfile

    ns = Namespace("http://example.org/test#")
    g = Graph()
    g.add((ns.Car, RDF.type, OWL.Class))
    g.add((ns.Audi, RDF.type, ns.Car))

    # TemporaryDirectory guarantees cleanup on any failure and avoids
    # serializing to a path whose NamedTemporaryFile handle is still open.
    with tempfile.TemporaryDirectory() as tmp_dir:
        valid_file = os.path.join(tmp_dir, "valid.owl")
        g.serialize(valid_file, format="xml")

        # Placing the missing paths inside the fresh temp directory guarantees
        # they do not exist, regardless of the current working directory.
        missing_first = os.path.join(tmp_dir, "nonexistent_file_1.owl")
        missing_second = os.path.join(tmp_dir, "nonexistent_file_2.owl")

        resolver = RDFLibOntologyResolver(
            ontology_file=[missing_first, valid_file, missing_second]
        )

        assert resolver.graph is not None

        assert "car" in resolver.lookup["classes"]
        assert "audi" in resolver.lookup["individuals"]

        match = resolver.find_closest_match("Audi", "individuals")
        assert match == "audi"
|
||||
|
||||
|
||||
def test_multifile_ontology_all_files_missing():
    """Check that listing only non-existent ontology files yields no graph and empty lookups."""
    missing_paths = [
        "nonexistent_file_1.owl",
        "nonexistent_file_2.owl",
        "nonexistent_file_3.owl",
    ]
    resolver = RDFLibOntologyResolver(ontology_file=missing_paths)

    # Nothing could be parsed, so no graph is attached.
    assert resolver.graph is None

    # Both lookup categories are present but empty.
    for category in ("classes", "individuals"):
        assert resolver.lookup[category] == {}
|
||||
|
||||
|
||||
def test_multifile_ontology_with_overlapping_entities():
    """Test loading multiple ontology files with overlapping/related entities.

    The second file extends a class hierarchy declared in the first one
    (LuxuryCar -> Car -> Vehicle, same namespace). The subgraph of an
    individual from the second file must include the ancestor classes
    defined in the first file.
    """
    import os
    import tempfile

    ns = Namespace("http://example.org/automotive#")

    g1 = Graph()
    g1.add((ns.Vehicle, RDF.type, OWL.Class))
    g1.add((ns.Car, RDF.type, OWL.Class))
    g1.add((ns.Car, RDFS.subClassOf, ns.Vehicle))

    g2 = Graph()
    g2.add((ns.LuxuryCar, RDF.type, OWL.Class))
    g2.add((ns.LuxuryCar, RDFS.subClassOf, ns.Car))
    g2.add((ns.Mercedes, RDF.type, ns.LuxuryCar))
    g2.add((ns.BMW, RDF.type, ns.LuxuryCar))

    # TemporaryDirectory removes both files on exit, even when serialization
    # or an assertion fails. It also avoids writing to a path whose
    # NamedTemporaryFile handle is still open, which is not portable to Windows.
    with tempfile.TemporaryDirectory() as tmp_dir:
        file1_path = os.path.join(tmp_dir, "base.owl")
        file2_path = os.path.join(tmp_dir, "luxury.owl")
        g1.serialize(file1_path, format="xml")
        g2.serialize(file2_path, format="xml")

        resolver = RDFLibOntologyResolver(ontology_file=[file1_path, file2_path])

        assert "vehicle" in resolver.lookup["classes"]
        assert "car" in resolver.lookup["classes"]
        assert "luxurycar" in resolver.lookup["classes"]

        assert "mercedes" in resolver.lookup["individuals"]
        assert "bmw" in resolver.lookup["individuals"]

        nodes, relationships, start_node = resolver.get_subgraph("Mercedes", "individuals")

        # The hierarchy spans both files: Mercedes and LuxuryCar come from the
        # second file, Car and Vehicle from the first.
        uri_labels = {resolver._uri_to_key(n.uri) for n in nodes}
        assert "mercedes" in uri_labels
        assert "luxurycar" in uri_labels
        assert "car" in uri_labels
        assert "vehicle" in uri_labels
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue