feature: adds multifile ontology support (#1674)

<!-- .github/pull_request_template.md -->

## Description
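Adds multifile ontology support: `ontology_file_path` may be a single path or a comma-separated list of paths, and `RDFLibOntologyResolver` accepts either a string or a list of strings. Every listed file that exists is parsed into one shared graph; missing files are skipped with a warning.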

## Type of Change
<!-- Please check the relevant option -->
- [ ] Bug fix (non-breaking change that fixes an issue)
- [x] New feature (non-breaking change that adds functionality)
- [ ] Breaking change (fix or feature that would cause existing
functionality to change)
- [ ] Documentation update
- [x] Code refactoring
- [ ] Performance improvement
- [ ] Other (please specify):

## Screenshots/Videos (if applicable)
None

## Pre-submission Checklist
<!-- Please check all boxes that apply before submitting your PR -->
- [ ] **I have tested my changes thoroughly before submitting this PR**
- [ ] **This PR contains minimal changes necessary to address the
issue/feature**
- [ ] My code follows the project's coding standards and style
guidelines
- [ ] I have added tests that prove my fix is effective or that my
feature works
- [ ] I have added necessary documentation (if applicable)
- [ ] All new and existing tests pass
- [x] I have searched existing PRs to ensure this change hasn't been
submitted already
- [x] I have linked any relevant issues in the description
- [x] My commits have clear and descriptive messages

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.
This commit is contained in:
Vasilije 2025-10-28 13:29:13 +01:00 committed by GitHub
commit abb1aba517
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 194 additions and 9 deletions

View file

@ -21,7 +21,8 @@ def get_ontology_resolver_from_env(
Supported value: "rdflib".
matching_strategy (str): The matching strategy to apply.
Supported value: "fuzzy".
ontology_file_path (str): Path to the ontology file required for the resolver.
ontology_file_path (str): Path to the ontology file(s) required for the resolver.
Can be a single path or comma-separated paths for multiple files.
Returns:
BaseOntologyResolver: An instance of the requested ontology resolver.
@ -31,8 +32,13 @@ def get_ontology_resolver_from_env(
or if required parameters are missing.
"""
if ontology_resolver == "rdflib" and matching_strategy == "fuzzy" and ontology_file_path:
if "," in ontology_file_path:
file_paths = [path.strip() for path in ontology_file_path.split(",")]
else:
file_paths = ontology_file_path
return RDFLibOntologyResolver(
matching_strategy=FuzzyMatchingStrategy(), ontology_file=ontology_file_path
matching_strategy=FuzzyMatchingStrategy(), ontology_file=file_paths
)
else:
raise EnvironmentError(

View file

@ -2,7 +2,7 @@ import os
import difflib
from cognee.shared.logging_utils import get_logger
from collections import deque
from typing import List, Tuple, Dict, Optional, Any
from typing import List, Tuple, Dict, Optional, Any, Union
from rdflib import Graph, URIRef, RDF, RDFS, OWL
from cognee.modules.ontology.exceptions import (
@ -26,22 +26,50 @@ class RDFLibOntologyResolver(BaseOntologyResolver):
def __init__(
self,
ontology_file: Optional[str] = None,
ontology_file: Optional[Union[str, List[str]]] = None,
matching_strategy: Optional[MatchingStrategy] = None,
) -> None:
super().__init__(matching_strategy)
self.ontology_file = ontology_file
try:
if ontology_file and os.path.exists(ontology_file):
files_to_load = []
if ontology_file is not None:
if isinstance(ontology_file, str):
files_to_load = [ontology_file]
elif isinstance(ontology_file, list):
files_to_load = ontology_file
else:
raise ValueError(
f"ontology_file must be a string, list of strings, or None. Got: {type(ontology_file)}"
)
if files_to_load:
self.graph = Graph()
self.graph.parse(ontology_file)
logger.info("Ontology loaded successfully from file: %s", ontology_file)
loaded_files = []
for file_path in files_to_load:
if os.path.exists(file_path):
self.graph.parse(file_path)
loaded_files.append(file_path)
logger.info("Ontology loaded successfully from file: %s", file_path)
else:
logger.warning(
"Ontology file '%s' not found. Skipping this file.",
file_path,
)
if not loaded_files:
logger.info(
"No valid ontology files found. No owl ontology will be attached to the graph."
)
self.graph = None
else:
logger.info("Total ontology files loaded: %d", len(loaded_files))
else:
logger.info(
"Ontology file '%s' not found. No owl ontology will be attached to the graph.",
ontology_file,
"No ontology file provided. No owl ontology will be attached to the graph."
)
self.graph = None
self.build_lookup()
except Exception as e:
logger.error("Failed to load ontology", exc_info=e)

View file

@ -489,3 +489,154 @@ def test_get_ontology_resolver_from_env_resolver_functionality():
assert nodes == []
assert relationships == []
assert start_node is None
def test_multifile_ontology_loading_success():
    """Load two unrelated ontology files and check both land in one lookup.

    Two small graphs in different namespaces (cars, tech) are serialized to
    separate RDF/XML files. The resolver is given both paths as a list and
    must expose the classes and individuals of each file, and must be able
    to match individuals originating from either file.
    """
    import os
    import tempfile

    ns1 = Namespace("http://example.org/cars#")
    ns2 = Namespace("http://example.org/tech#")

    g1 = Graph()
    g1.add((ns1.Vehicle, RDF.type, OWL.Class))
    g1.add((ns1.Car, RDF.type, OWL.Class))
    g1.add((ns1.Car, RDFS.subClassOf, ns1.Vehicle))
    g1.add((ns1.Audi, RDF.type, ns1.Car))
    g1.add((ns1.BMW, RDF.type, ns1.Car))

    g2 = Graph()
    g2.add((ns2.Company, RDF.type, OWL.Class))
    g2.add((ns2.TechCompany, RDF.type, OWL.Class))
    g2.add((ns2.TechCompany, RDFS.subClassOf, ns2.Company))
    g2.add((ns2.Apple, RDF.type, ns2.TechCompany))
    g2.add((ns2.Google, RDF.type, ns2.TechCompany))

    # A temporary directory removes every fixture file on exit, even when
    # serialization or an assertion fails part-way through, and avoids
    # writing by name into a file handle that is still open.
    with tempfile.TemporaryDirectory() as tmp_dir:
        file1_path = os.path.join(tmp_dir, "cars.owl")
        file2_path = os.path.join(tmp_dir, "tech.owl")
        g1.serialize(file1_path, format="xml")
        g2.serialize(file2_path, format="xml")

        resolver = RDFLibOntologyResolver(ontology_file=[file1_path, file2_path])

        assert resolver.graph is not None

        # Classes from both files.
        assert "car" in resolver.lookup["classes"]
        assert "vehicle" in resolver.lookup["classes"]
        assert "company" in resolver.lookup["classes"]
        assert "techcompany" in resolver.lookup["classes"]

        # Individuals from both files.
        assert "audi" in resolver.lookup["individuals"]
        assert "bmw" in resolver.lookup["individuals"]
        assert "apple" in resolver.lookup["individuals"]
        assert "google" in resolver.lookup["individuals"]

        car_match = resolver.find_closest_match("Audi", "individuals")
        assert car_match == "audi"

        tech_match = resolver.find_closest_match("Google", "individuals")
        assert tech_match == "google"
def test_multifile_ontology_with_missing_files():
    """Load a list of ontology paths in which only one file exists.

    The resolver is expected to skip the paths that do not exist and still
    build its graph and lookup from the one valid file.
    """
    import os
    import tempfile

    ns = Namespace("http://example.org/test#")
    g = Graph()
    g.add((ns.Car, RDF.type, OWL.Class))
    g.add((ns.Audi, RDF.type, ns.Car))

    # Everything lives in a fresh temporary directory: the valid file is
    # cleaned up automatically, and the "missing" paths are guaranteed not to
    # exist regardless of what the current working directory contains.
    with tempfile.TemporaryDirectory() as tmp_dir:
        valid_file = os.path.join(tmp_dir, "valid.owl")
        g.serialize(valid_file, format="xml")

        resolver = RDFLibOntologyResolver(
            ontology_file=[
                os.path.join(tmp_dir, "nonexistent_file_1.owl"),
                valid_file,
                os.path.join(tmp_dir, "nonexistent_file_2.owl"),
            ]
        )

        assert resolver.graph is not None

        assert "car" in resolver.lookup["classes"]
        assert "audi" in resolver.lookup["individuals"]

        match = resolver.find_closest_match("Audi", "individuals")
        assert match == "audi"
def test_multifile_ontology_all_files_missing():
    """Check the resolver state when none of the listed ontology files exist.

    With no file loaded, the resolver should have no graph and both lookup
    sections should be empty.
    """
    missing_paths = [
        "nonexistent_file_1.owl",
        "nonexistent_file_2.owl",
        "nonexistent_file_3.owl",
    ]

    resolver = RDFLibOntologyResolver(ontology_file=missing_paths)

    assert resolver.graph is None
    lookup = resolver.lookup
    assert lookup["classes"] == {}
    assert lookup["individuals"] == {}
def test_multifile_ontology_with_overlapping_entities():
    """Load two ontology files whose entities reference each other.

    The first file defines Vehicle and Car; the second defines LuxuryCar as a
    subclass of Car (declared only in the first file) plus two individuals.
    After loading both, the subgraph for "Mercedes" must include nodes for
    Mercedes, LuxuryCar, Car and Vehicle, which is only possible if the two
    files were merged into a single graph.
    """
    import os
    import tempfile

    ns = Namespace("http://example.org/automotive#")

    g1 = Graph()
    g1.add((ns.Vehicle, RDF.type, OWL.Class))
    g1.add((ns.Car, RDF.type, OWL.Class))
    g1.add((ns.Car, RDFS.subClassOf, ns.Vehicle))

    g2 = Graph()
    g2.add((ns.LuxuryCar, RDF.type, OWL.Class))
    g2.add((ns.LuxuryCar, RDFS.subClassOf, ns.Car))
    g2.add((ns.Mercedes, RDF.type, ns.LuxuryCar))
    g2.add((ns.BMW, RDF.type, ns.LuxuryCar))

    # The temporary directory guarantees both fixture files are removed even
    # if serialization or an assertion fails before the end of the test.
    with tempfile.TemporaryDirectory() as tmp_dir:
        file1_path = os.path.join(tmp_dir, "base.owl")
        file2_path = os.path.join(tmp_dir, "luxury.owl")
        g1.serialize(file1_path, format="xml")
        g2.serialize(file2_path, format="xml")

        resolver = RDFLibOntologyResolver(ontology_file=[file1_path, file2_path])

        assert "vehicle" in resolver.lookup["classes"]
        assert "car" in resolver.lookup["classes"]
        assert "luxurycar" in resolver.lookup["classes"]

        assert "mercedes" in resolver.lookup["individuals"]
        assert "bmw" in resolver.lookup["individuals"]

        nodes, relationships, start_node = resolver.get_subgraph("Mercedes", "individuals")

        # Normalize node URIs to lookup keys so they can be compared by name.
        uri_labels = {resolver._uri_to_key(n.uri) for n in nodes}
        assert "mercedes" in uri_labels
        assert "luxurycar" in uri_labels
        assert "car" in uri_labels
        assert "vehicle" in uri_labels