feat: Add OWL/RDF loader, semantic extraction, and embedding generation to OntologyEngine

This commit is contained in:
y-sudharshan 2025-09-10 13:43:18 +05:30
parent 8b6aaff554
commit ce82a16299
2 changed files with 101 additions and 3 deletions

View file

@ -139,16 +139,14 @@ class OntologyEngine:
async def load_data(self, file_path: str) -> Union[List[Dict[str, Any]], Dict[str, Any]]:
"""
Load data from a specified JSON or CSV file and return it in a structured format.
Load data from a specified JSON, CSV, or OWL/RDF file and return it in a structured format.
Parameters:
-----------
- file_path (str): The path to the file to load data from.
Returns:
--------
- Union[List[Dict[str, Any]], Dict[str, Any]]: Parsed data from the file as either a
list of dictionaries or a single dictionary depending on content type.
"""
@ -162,6 +160,44 @@ class OntologyEngine:
content = await f.read()
reader = csv.DictReader(content.splitlines())
return list(reader)
elif file_path.endswith(".owl") or file_path.endswith(".rdf"):
from cognee.modules.ontology.rdf_xml.OntologyResolver import OntologyResolver
from cognee.infrastructure.llm.LLMGateway import LLMGateway
resolver = OntologyResolver(ontology_file=file_path)
nodes = []
edges = []
embeddings = {}
llm = LLMGateway()
for category in ["classes", "individuals"]:
for key, uri in resolver.lookup.get(category, {}).items():
node_info = {"id": key, "uri": str(uri), "category": category}
# Semantic extraction: get label and description if available
node_info["label"] = key
node_info["description"] = str(uri)
# Generate embedding for node
try:
embedding = llm.generate_embedding(text=node_info["label"] + " " + node_info["description"])
except Exception:
embedding = None
node_info["embedding"] = embedding
embeddings[key] = embedding
nodes.append(node_info)
for node in nodes:
_, node_edges, _ = resolver.get_subgraph(node_name=node["id"], node_type=node["category"])
for edge in node_edges:
edge_info = {"source": edge[0], "relation": edge[1], "target": edge[2]}
# Generate embedding for edge relation
try:
edge_embedding = llm.generate_embedding(text=edge[1])
except Exception:
edge_embedding = None
edge_info["embedding"] = edge_embedding
edges.append(edge_info)
# Store ontology data for search integration
self.ontology_nodes = nodes
self.ontology_edges = edges
self.ontology_embeddings = embeddings
return {"nodes": nodes, "edges": edges, "embeddings": embeddings}
else:
raise IngestionError(message="Unsupported file format")
except Exception as e:

View file

@ -1,3 +1,65 @@
import asyncio
from cognee.tasks.graph.infer_data_ontology import OntologyEngine
def test_load_owl_rdf_file(tmp_path):
    """Load a minimal OWL file and check the shape of the returned mapping.

    The fixture ontology declares one class (``Car``) and one named
    individual (``Audi``). The loader is expected to return a mapping with
    ``nodes``, ``edges`` and ``embeddings`` entries, and the node ids are
    expected to be the lower-cased local names.
    """
    # Minimal OWL document: one class and one individual typed by that class.
    owl_content = '''<?xml version="1.0"?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:owl="http://www.w3.org/2002/07/owl#"
xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#">
<owl:Class rdf:about="http://example.org/test#Car"/>
<owl:NamedIndividual rdf:about="http://example.org/test#Audi">
<rdf:type rdf:resource="http://example.org/test#Car"/>
</owl:NamedIndividual>
</rdf:RDF>'''
    ontology_path = tmp_path.joinpath("test.owl")
    ontology_path.write_text(owl_content)

    loader = OntologyEngine()
    data = asyncio.run(loader.load_data(str(ontology_path)))

    # The result must expose all three top-level sections.
    for section in ("nodes", "edges", "embeddings"):
        assert section in data

    # Both the class and the individual must show up as nodes.
    for expected_id in ("car", "audi"):
        assert any(n["id"] == expected_id for n in data["nodes"])
def test_embeddings_are_generated(tmp_path):
    """Check that every loaded node carries an ``embedding`` entry.

    Only the presence of the key is verified, not that its value is a
    usable vector; the loader stores ``None`` under this key when embedding
    generation raises.
    """
    owl_content = '''<?xml version="1.0"?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:owl="http://www.w3.org/2002/07/owl#"
xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#">
<owl:Class rdf:about="http://example.org/test#Car"/>
<owl:NamedIndividual rdf:about="http://example.org/test#Audi">
<rdf:type rdf:resource="http://example.org/test#Car"/>
</owl:NamedIndividual>
</rdf:RDF>'''
    ontology_path = tmp_path.joinpath("test.owl")
    ontology_path.write_text(owl_content)

    loader = OntologyEngine()
    data = asyncio.run(loader.load_data(str(ontology_path)))

    # Every node dict must have the key, whatever its value is.
    assert all("embedding" in node for node in data["nodes"])
def test_search_integration(tmp_path):
    """Check that loading an ontology caches its data on the engine.

    This test assumes search integration reads the ``ontology_nodes``,
    ``ontology_edges`` and ``ontology_embeddings`` attributes that the
    loader sets on the engine instance.
    """
    owl_content = '''<?xml version="1.0"?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:owl="http://www.w3.org/2002/07/owl#"
xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#">
<owl:Class rdf:about="http://example.org/test#Car"/>
<owl:NamedIndividual rdf:about="http://example.org/test#Audi">
<rdf:type rdf:resource="http://example.org/test#Car"/>
</owl:NamedIndividual>
</rdf:RDF>'''
    ontology_path = tmp_path.joinpath("test.owl")
    ontology_path.write_text(owl_content)

    loader = OntologyEngine()
    # The return value is irrelevant here; only the side effects are checked.
    asyncio.run(loader.load_data(str(ontology_path)))

    for attribute in ("ontology_nodes", "ontology_edges", "ontology_embeddings"):
        assert hasattr(loader, attribute)
import pytest
from rdflib import Graph, Namespace, RDF, OWL, RDFS
from cognee.modules.ontology.rdf_xml.OntologyResolver import OntologyResolver, AttachedOntologyNode