<!-- .github/pull_request_template.md -->
## Description
This pull request introduces edge-centered payloads to the ingestion
process. Payloads are stored in the Triplet_text collection, which is
compatible with the triplet_embedding memify pipeline.
Changes in This PR:
- Refactored custom edge handling: custom edges can now be passed to the
add_data_points method, so ingestion is centralized and happens in one
place.
- Added private methods to handle edge-centered payload creation inside
add_data_points.py
- Added unit tests to cover the new functionality
- Added integration tests
- Added e2e tests
Acceptance Criteria and Testing
Scenario 1:
- Set the TRIPLET_EMBEDDING env var to True
- Run prune, add, cognify
- Verify that the vector DB contains a non-empty Triplet_text collection
and that the number of triplets matches the number of edges in the graph
database
- Use the new triplet_completion search type and confirm it works
correctly.
Scenario 2:
- Set the TRIPLET_EMBEDDING env var to False
- Run prune, add, cognify
- Verify that the vector DB does not have the Triplet_text collection
- You should receive an error indicating that the Triplet_text collection
is not available
## Type of Change
<!-- Please check the relevant option -->
- [ ] Bug fix (non-breaking change that fixes an issue)
- [x] New feature (non-breaking change that adds functionality)
- [ ] Breaking change (fix or feature that would cause existing
functionality to change)
- [ ] Documentation update
- [ ] Code refactoring
- [ ] Performance improvement
- [ ] Other (please specify):
## Screenshots/Videos (if applicable)
<!-- Add screenshots or videos to help explain your changes -->
## Pre-submission Checklist
<!-- Please check all boxes that apply before submitting your PR -->
- [x] **I have tested my changes thoroughly before submitting this PR**
- [x] **This PR contains minimal changes necessary to address the
issue/feature**
- [x] My code follows the project's coding standards and style
guidelines
- [x] I have added tests that prove my fix is effective or that my
feature works
- [x] I have added necessary documentation (if applicable)
- [x] All new and existing tests pass
- [x] I have searched existing PRs to ensure this change hasn't been
submitted already
- [x] I have linked any relevant issues in the description
- [x] My commits have clear and descriptive messages
## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.
<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit
* **New Features**
* Triplet embeddings supported—embeddings created from graph edges plus
connected node text
* Ability to supply custom edges when adding data points
* New configuration toggle to enable/disable triplet embedding
* **Tests**
* Added comprehensive unit and end-to-end tests for edge-centered
payloads and triplet embedding
* New CI job to run the edge-centered payload e2e test
* **Bug Fixes**
* Adjusted server start behavior to surface process output in parent
logs
<sub>✏️ Tip: You can customize this high-level summary in your review
settings.</sub>
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
---------
Co-authored-by: Pavel Zorin <pazonec@yandex.ru>
170 lines
5.3 KiB
Python
170 lines
5.3 KiB
Python
"""
End-to-end integration test for edge-centered payload and triplet embeddings.
"""
|
|
|
|
import os
|
|
import pathlib
|
|
import cognee
|
|
from cognee.infrastructure.databases.graph import get_graph_engine
|
|
from cognee.infrastructure.databases.vector import get_vector_engine
|
|
from cognee.modules.search.types import SearchType
|
|
from cognee.shared.logging_utils import get_logger
|
|
from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver
|
|
from cognee.modules.ontology.ontology_config import Config
|
|
|
|
# Module-level logger obtained from cognee's shared logging utilities.
logger = get_logger()
|
|
|
|
# Sample corpus added to the dataset by main(): three short paragraphs, one
# each about Apple, Microsoft and Google, separated by blank lines.
text_data = """
Apple is a technology company that produces the iPhone, iPad, and Mac computers.
The company is known for its innovative products and ecosystem integration.

Microsoft develops the Windows operating system and Office productivity suite.
They are also major players in cloud computing with Azure.

Google created the Android operating system and provides search engine services.
The company is a leader in artificial intelligence and machine learning.
"""
|
|
|
|
# RDF/XML ontology written to a temporary .owl file by main(). It declares six
# OWL classes (Company, TechnologyCompany, Product, Software, Hardware,
# Service), three subclass links, and six labelled individuals.
# NOTE(review): indentation inside the literal was reconstructed; it is
# insignificant XML whitespace, but confirm against upstream if exact bytes
# matter.
ontology_content = """<?xml version="1.0"?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
         xmlns:owl="http://www.w3.org/2002/07/owl#"
         xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
         xmlns="http://example.org/tech#"
         xml:base="http://example.org/tech">

    <owl:Ontology rdf:about="http://example.org/tech"/>

    <!-- Classes -->
    <owl:Class rdf:ID="Company"/>
    <owl:Class rdf:ID="TechnologyCompany"/>
    <owl:Class rdf:ID="Product"/>
    <owl:Class rdf:ID="Software"/>
    <owl:Class rdf:ID="Hardware"/>
    <owl:Class rdf:ID="Service"/>

    <rdf:Description rdf:about="#TechnologyCompany">
        <rdfs:subClassOf rdf:resource="#Company"/>
        <rdfs:comment>A company operating in the technology sector.</rdfs:comment>
    </rdf:Description>

    <rdf:Description rdf:about="#Software">
        <rdfs:subClassOf rdf:resource="#Product"/>
        <rdfs:comment>Software products and applications.</rdfs:comment>
    </rdf:Description>

    <rdf:Description rdf:about="#Hardware">
        <rdfs:subClassOf rdf:resource="#Product"/>
        <rdfs:comment>Physical hardware products.</rdfs:comment>
    </rdf:Description>

    <!-- Individuals -->
    <TechnologyCompany rdf:ID="apple">
        <rdfs:label>Apple</rdfs:label>
    </TechnologyCompany>

    <TechnologyCompany rdf:ID="microsoft">
        <rdfs:label>Microsoft</rdfs:label>
    </TechnologyCompany>

    <TechnologyCompany rdf:ID="google">
        <rdfs:label>Google</rdfs:label>
    </TechnologyCompany>

    <Hardware rdf:ID="iphone">
        <rdfs:label>iPhone</rdfs:label>
    </Hardware>

    <Software rdf:ID="windows">
        <rdfs:label>Windows</rdfs:label>
    </Software>

    <Software rdf:ID="android">
        <rdfs:label>Android</rdfs:label>
    </Software>

</rdf:RDF>"""
|
|
|
|
|
|
async def main():
    """Run the edge-centered payload scenario end to end.

    Steps, in order:
      1. Point cognee's data and system root directories at test-local
         folders next to this file, then call ``prune_data`` and
         ``prune_system(metadata=True)``.
      2. Add ``text_data`` to the ``tech_companies`` dataset.
      3. Write ``ontology_content`` to a temporary ``.owl`` file and run
         ``cognify`` with an ``RDFLibOntologyResolver`` built from that file.
      4. Assert that a search over the ``Triplet_text`` vector collection
         returns as many items as the graph engine reports edges.
      5. Assert that a ``TRIPLET_COMPLETION`` search does not return ``None``.

    The temporary ontology file is removed in a ``finally`` clause, so it is
    cleaned up even when writing it or any later step fails.

    Raises:
        AssertionError: If the triplet count differs from the edge count, or
            if the triplet-completion search returns ``None``.
    """
    # Function-scope imports: only this coroutine needs these helpers.
    import contextlib
    import tempfile

    # NOTE(review): presumably TRIPLET_EMBEDDING must be enabled in the
    # environment for the Triplet_text collection to be populated — confirm
    # against the CI job that runs this script.
    test_dir = pathlib.Path(__file__).parent
    data_directory_path = str((test_dir / ".data_storage/test_edge_centered_payload").resolve())
    cognee_directory_path = str((test_dir / ".cognee_system/test_edge_centered_payload").resolve())

    cognee.config.data_root_directory(data_directory_path)
    cognee.config.system_root_directory(cognee_directory_path)

    dataset_name = "tech_companies"

    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    await cognee.add(data=text_data, dataset_name=dataset_name)

    # delete=False keeps the file on disk after it is closed so the resolver
    # can open it by path; the finally clause below is responsible for removal.
    # The XML declaration carries no encoding, which means UTF-8, so write it
    # as UTF-8 rather than the platform default.
    ontology_file = tempfile.NamedTemporaryFile(
        mode="w", suffix=".owl", delete=False, encoding="utf-8"
    )
    ontology_file_path = ontology_file.name

    try:
        # The write happens inside the try so a failed write still cleans up.
        with ontology_file:
            ontology_file.write(ontology_content)

        logger.info(f"Loading ontology from: {ontology_file_path}")
        config: Config = {
            "ontology_config": {
                "ontology_resolver": RDFLibOntologyResolver(ontology_file=ontology_file_path)
            }
        }

        await cognee.cognify(datasets=[dataset_name], config=config)

        graph_engine = await get_graph_engine()
        # Only the edges are needed for the count comparison below.
        _, edges_phase2 = await graph_engine.get_graph_data()

        vector_engine = get_vector_engine()
        triplets_phase2 = await vector_engine.search(
            query_text="technology", limit=None, collection_name="Triplet_text"
        )

        assert len(triplets_phase2) == len(edges_phase2), (
            f"Triplet embeddings and number of edges do not match. Vector db contains {len(triplets_phase2)} edge triplets while graph db contains {len(edges_phase2)} edges."
        )

        search_results_phase2 = await cognee.search(
            query_type=SearchType.TRIPLET_COMPLETION,
            query_text="What products does Apple make?",
        )

        assert search_results_phase2 is not None, (
            "Search should return results for triplet embeddings in simple ontology use case."
        )
    finally:
        # EAFP removal: a file that is already gone is not an error.
        with contextlib.suppress(FileNotFoundError):
            os.unlink(ontology_file_path)
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: configure logging, then drive main() to completion
    # on a freshly created event loop.
    import asyncio

    from cognee.shared.logging_utils import setup_logging

    setup_logging()

    event_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(event_loop)
    try:
        event_loop.run_until_complete(main())
    finally:
        # Finalize any pending async generators before closing the loop.
        event_loop.run_until_complete(event_loop.shutdown_asyncgens())
        event_loop.close()
|