feat: Adds edge centered payload and embedding structure during ingestion (#1853)
<!-- .github/pull_request_template.md -->
## Description
This pull request introduces edge‑centered payloads to the ingestion
process. Payloads are stored in the Triplet_text collection which is
compatible with the triplet_embedding memify pipeline.
Changes in This PR:
- Refactored custom edge handling, from now on they can be passed to the
add_data_points method so the ingestion is centralized and is happening
in one place.
- Added private methods to handle edge centered payload creation inside
the add_data_points.py
- Added unit tests to cover the new functionality
- Added integration tests
- Added e2e tests
Acceptance Criteria and Testing
Scenario 1:
-Set TRIPLET_EMBEDDING env var to True
-Run prune, add, cognify
-Verify the vector DB contains a non empty Triplet_text collection and
the number of triplets are matching with the number of edges in the
graph database
-Use the new triplet_completion search type and confirm it works
correctly.
Scenario 2:
- Set TRIPLET_EMBEDDING env var to False
-Run prune, add, cognify
-Verify the vector DB does not have the Triplet_text collection
-You should receive an error indicating that the Triplet_text is not
available
## Type of Change
<!-- Please check the relevant option -->
- [ ] Bug fix (non-breaking change that fixes an issue)
- [x] New feature (non-breaking change that adds functionality)
- [ ] Breaking change (fix or feature that would cause existing
functionality to change)
- [ ] Documentation update
- [ ] Code refactoring
- [ ] Performance improvement
- [ ] Other (please specify):
## Screenshots/Videos (if applicable)
<!-- Add screenshots or videos to help explain your changes -->
## Pre-submission Checklist
<!-- Please check all boxes that apply before submitting your PR -->
- [x] **I have tested my changes thoroughly before submitting this PR**
- [x] **This PR contains minimal changes necessary to address the
issue/feature**
- [x] My code follows the project's coding standards and style
guidelines
- [x] I have added tests that prove my fix is effective or that my
feature works
- [x] I have added necessary documentation (if applicable)
- [x] All new and existing tests pass
- [x] I have searched existing PRs to ensure this change hasn't been
submitted already
- [x] I have linked any relevant issues in the description
- [x] My commits have clear and descriptive messages
## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.
<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit
* **New Features**
* Triplet embeddings supported—embeddings created from graph edges plus
connected node text
* Ability to supply custom edges when adding data points
* New configuration toggle to enable/disable triplet embedding
* **Tests**
* Added comprehensive unit and end-to-end tests for edge-centered
payloads and triplet embedding
* New CI job to run the edge-centered payload e2e test
* **Bug Fixes**
* Adjusted server start behavior to surface process output in parent
logs
<sub>✏️ Tip: You can customize this high-level summary in your review
settings.</sub>
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
---------
Co-authored-by: Pavel Zorin <pazonec@yandex.ru>
This commit is contained in:
parent
49f7c5188c
commit
001fbe699e
9 changed files with 786 additions and 14 deletions
29
.github/workflows/e2e_tests.yml
vendored
29
.github/workflows/e2e_tests.yml
vendored
|
|
@ -412,6 +412,35 @@ jobs:
|
||||||
EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
|
EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
|
||||||
run: uv run python ./cognee/tests/test_feedback_enrichment.py
|
run: uv run python ./cognee/tests/test_feedback_enrichment.py
|
||||||
|
|
||||||
|
test-edge-centered-payload:
|
||||||
|
name: Test Cognify - Edge Centered Payload
|
||||||
|
runs-on: ubuntu-22.04
|
||||||
|
steps:
|
||||||
|
- name: Check out repository
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Cognee Setup
|
||||||
|
uses: ./.github/actions/cognee_setup
|
||||||
|
with:
|
||||||
|
python-version: '3.11.x'
|
||||||
|
|
||||||
|
- name: Dependencies already installed
|
||||||
|
run: echo "Dependencies already installed in setup"
|
||||||
|
|
||||||
|
- name: Run Edge Centered Payload Test
|
||||||
|
env:
|
||||||
|
ENV: 'dev'
|
||||||
|
TRIPLET_EMBEDDING: True
|
||||||
|
LLM_MODEL: ${{ secrets.LLM_MODEL }}
|
||||||
|
LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
|
||||||
|
LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
|
||||||
|
LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
|
||||||
|
EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
|
||||||
|
EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
|
||||||
|
EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
|
||||||
|
EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
|
||||||
|
run: uv run python ./cognee/tests/test_edge_centered_payload.py
|
||||||
|
|
||||||
run_conversation_sessions_test_redis:
|
run_conversation_sessions_test_redis:
|
||||||
name: Conversation sessions test (Redis)
|
name: Conversation sessions test (Redis)
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,7 @@ from pydantic import BaseModel
|
||||||
from typing import Union, Optional
|
from typing import Union, Optional
|
||||||
from uuid import UUID
|
from uuid import UUID
|
||||||
|
|
||||||
|
from cognee.modules.cognify.config import get_cognify_config
|
||||||
from cognee.modules.ontology.ontology_env_config import get_ontology_env_config
|
from cognee.modules.ontology.ontology_env_config import get_ontology_env_config
|
||||||
from cognee.shared.logging_utils import get_logger
|
from cognee.shared.logging_utils import get_logger
|
||||||
from cognee.shared.data_models import KnowledgeGraph
|
from cognee.shared.data_models import KnowledgeGraph
|
||||||
|
|
@ -272,6 +273,9 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's
|
||||||
if chunks_per_batch is None:
|
if chunks_per_batch is None:
|
||||||
chunks_per_batch = 100
|
chunks_per_batch = 100
|
||||||
|
|
||||||
|
cognify_config = get_cognify_config()
|
||||||
|
embed_triplets = cognify_config.triplet_embedding
|
||||||
|
|
||||||
default_tasks = [
|
default_tasks = [
|
||||||
Task(classify_documents),
|
Task(classify_documents),
|
||||||
Task(check_permissions_on_dataset, user=user, permissions=["write"]),
|
Task(check_permissions_on_dataset, user=user, permissions=["write"]),
|
||||||
|
|
@ -291,7 +295,11 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's
|
||||||
summarize_text,
|
summarize_text,
|
||||||
task_config={"batch_size": chunks_per_batch},
|
task_config={"batch_size": chunks_per_batch},
|
||||||
),
|
),
|
||||||
Task(add_data_points, task_config={"batch_size": chunks_per_batch}),
|
Task(
|
||||||
|
add_data_points,
|
||||||
|
embed_triplets=embed_triplets,
|
||||||
|
task_config={"batch_size": chunks_per_batch},
|
||||||
|
),
|
||||||
]
|
]
|
||||||
|
|
||||||
return default_tasks
|
return default_tasks
|
||||||
|
|
|
||||||
|
|
@ -8,12 +8,14 @@ import os
|
||||||
class CognifyConfig(BaseSettings):
|
class CognifyConfig(BaseSettings):
|
||||||
classification_model: object = DefaultContentPrediction
|
classification_model: object = DefaultContentPrediction
|
||||||
summarization_model: object = SummarizedContent
|
summarization_model: object = SummarizedContent
|
||||||
|
triplet_embedding: bool = False
|
||||||
model_config = SettingsConfigDict(env_file=".env", extra="allow")
|
model_config = SettingsConfigDict(env_file=".env", extra="allow")
|
||||||
|
|
||||||
def to_dict(self) -> dict:
|
def to_dict(self) -> dict:
|
||||||
return {
|
return {
|
||||||
"classification_model": self.classification_model,
|
"classification_model": self.classification_model,
|
||||||
"summarization_model": self.summarization_model,
|
"summarization_model": self.summarization_model,
|
||||||
|
"triplet_embedding": self.triplet_embedding,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -2,9 +2,7 @@ import asyncio
|
||||||
from typing import Type, List, Optional
|
from typing import Type, List, Optional
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
from cognee.infrastructure.databases.graph import get_graph_engine
|
|
||||||
from cognee.modules.ontology.ontology_env_config import get_ontology_env_config
|
from cognee.modules.ontology.ontology_env_config import get_ontology_env_config
|
||||||
from cognee.tasks.storage import index_graph_edges
|
|
||||||
from cognee.tasks.storage.add_data_points import add_data_points
|
from cognee.tasks.storage.add_data_points import add_data_points
|
||||||
from cognee.modules.ontology.ontology_config import Config
|
from cognee.modules.ontology.ontology_config import Config
|
||||||
from cognee.modules.ontology.get_default_ontology_resolver import (
|
from cognee.modules.ontology.get_default_ontology_resolver import (
|
||||||
|
|
@ -25,6 +23,7 @@ from cognee.tasks.graph.exceptions import (
|
||||||
InvalidChunkGraphInputError,
|
InvalidChunkGraphInputError,
|
||||||
InvalidOntologyAdapterError,
|
InvalidOntologyAdapterError,
|
||||||
)
|
)
|
||||||
|
from cognee.modules.cognify.config import get_cognify_config
|
||||||
|
|
||||||
|
|
||||||
async def integrate_chunk_graphs(
|
async def integrate_chunk_graphs(
|
||||||
|
|
@ -67,8 +66,6 @@ async def integrate_chunk_graphs(
|
||||||
type(ontology_resolver).__name__ if ontology_resolver else "None"
|
type(ontology_resolver).__name__ if ontology_resolver else "None"
|
||||||
)
|
)
|
||||||
|
|
||||||
graph_engine = await get_graph_engine()
|
|
||||||
|
|
||||||
if graph_model is not KnowledgeGraph:
|
if graph_model is not KnowledgeGraph:
|
||||||
for chunk_index, chunk_graph in enumerate(chunk_graphs):
|
for chunk_index, chunk_graph in enumerate(chunk_graphs):
|
||||||
data_chunks[chunk_index].contains = chunk_graph
|
data_chunks[chunk_index].contains = chunk_graph
|
||||||
|
|
@ -84,12 +81,13 @@ async def integrate_chunk_graphs(
|
||||||
data_chunks, chunk_graphs, ontology_resolver, existing_edges_map
|
data_chunks, chunk_graphs, ontology_resolver, existing_edges_map
|
||||||
)
|
)
|
||||||
|
|
||||||
if len(graph_nodes) > 0:
|
cognify_config = get_cognify_config()
|
||||||
await add_data_points(graph_nodes)
|
embed_triplets = cognify_config.triplet_embedding
|
||||||
|
|
||||||
if len(graph_edges) > 0:
|
if len(graph_nodes) > 0:
|
||||||
await graph_engine.add_edges(graph_edges)
|
await add_data_points(
|
||||||
await index_graph_edges(graph_edges)
|
data_points=graph_nodes, custom_edges=graph_edges, embed_triplets=embed_triplets
|
||||||
|
)
|
||||||
|
|
||||||
return data_chunks
|
return data_chunks
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,16 +1,23 @@
|
||||||
import asyncio
|
import asyncio
|
||||||
from typing import List
|
from typing import List, Dict, Optional
|
||||||
from cognee.infrastructure.engine import DataPoint
|
from cognee.infrastructure.engine import DataPoint
|
||||||
from cognee.infrastructure.databases.graph import get_graph_engine
|
from cognee.infrastructure.databases.graph import get_graph_engine
|
||||||
from cognee.modules.graph.utils import deduplicate_nodes_and_edges, get_graph_from_model
|
from cognee.modules.graph.utils import deduplicate_nodes_and_edges, get_graph_from_model
|
||||||
from .index_data_points import index_data_points
|
from .index_data_points import index_data_points
|
||||||
from .index_graph_edges import index_graph_edges
|
from .index_graph_edges import index_graph_edges
|
||||||
|
from cognee.modules.engine.models import Triplet
|
||||||
|
from cognee.shared.logging_utils import get_logger
|
||||||
from cognee.tasks.storage.exceptions import (
|
from cognee.tasks.storage.exceptions import (
|
||||||
InvalidDataPointsInAddDataPointsError,
|
InvalidDataPointsInAddDataPointsError,
|
||||||
)
|
)
|
||||||
|
from ...modules.engine.utils import generate_node_id
|
||||||
|
|
||||||
|
logger = get_logger("add_data_points")
|
||||||
|
|
||||||
|
|
||||||
async def add_data_points(data_points: List[DataPoint]) -> List[DataPoint]:
|
async def add_data_points(
|
||||||
|
data_points: List[DataPoint], custom_edges: Optional[List] = None, embed_triplets: bool = False
|
||||||
|
) -> List[DataPoint]:
|
||||||
"""
|
"""
|
||||||
Add a batch of data points to the graph database by extracting nodes and edges,
|
Add a batch of data points to the graph database by extracting nodes and edges,
|
||||||
deduplicating them, and indexing them for retrieval.
|
deduplicating them, and indexing them for retrieval.
|
||||||
|
|
@ -23,6 +30,10 @@ async def add_data_points(data_points: List[DataPoint]) -> List[DataPoint]:
|
||||||
Args:
|
Args:
|
||||||
data_points (List[DataPoint]):
|
data_points (List[DataPoint]):
|
||||||
A list of data points to process and insert into the graph.
|
A list of data points to process and insert into the graph.
|
||||||
|
custom_edges (List[tuple]): Custom edges between datapoints.
|
||||||
|
embed_triplets (bool):
|
||||||
|
If True, creates and indexes triplet embeddings from the graph structure.
|
||||||
|
Defaults to False.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List[DataPoint]:
|
List[DataPoint]:
|
||||||
|
|
@ -34,6 +45,7 @@ async def add_data_points(data_points: List[DataPoint]) -> List[DataPoint]:
|
||||||
- Updates the node index via `index_data_points`.
|
- Updates the node index via `index_data_points`.
|
||||||
- Inserts nodes and edges into the graph engine.
|
- Inserts nodes and edges into the graph engine.
|
||||||
- Optionally updates the edge index via `index_graph_edges`.
|
- Optionally updates the edge index via `index_graph_edges`.
|
||||||
|
- Optionally creates and indexes triplet embeddings if embed_triplets is True.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if not isinstance(data_points, list):
|
if not isinstance(data_points, list):
|
||||||
|
|
@ -74,4 +86,132 @@ async def add_data_points(data_points: List[DataPoint]) -> List[DataPoint]:
|
||||||
await graph_engine.add_edges(edges)
|
await graph_engine.add_edges(edges)
|
||||||
await index_graph_edges(edges)
|
await index_graph_edges(edges)
|
||||||
|
|
||||||
|
if isinstance(custom_edges, list) and custom_edges:
|
||||||
|
# This must be handled separately from datapoint edges, created a task in linear to dig deeper but (COG-3488)
|
||||||
|
await graph_engine.add_edges(custom_edges)
|
||||||
|
await index_graph_edges(custom_edges)
|
||||||
|
edges.extend(custom_edges)
|
||||||
|
|
||||||
|
if embed_triplets:
|
||||||
|
triplets = _create_triplets_from_graph(nodes, edges)
|
||||||
|
if triplets:
|
||||||
|
await index_data_points(triplets)
|
||||||
|
logger.info(f"Created and indexed {len(triplets)} triplets from graph structure")
|
||||||
|
|
||||||
return data_points
|
return data_points
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_embeddable_text_from_datapoint(data_point: DataPoint) -> str:
    """
    Build the embeddable text for a single DataPoint.

    Mirrors the field selection used by index_data_points: the point's
    metadata["index_fields"] names the attributes whose stringified values
    are joined (space separated) into one embeddable string.

    Parameters:
    -----------
    - data_point (DataPoint): The data point to read indexed fields from.

    Returns:
    --------
    - str: Space-joined values of the indexed fields, or an empty string when
      the point has no metadata, no index_fields, or only empty values.
    """
    if not data_point or not hasattr(data_point, "metadata"):
        return ""

    field_names = data_point.metadata.get("index_fields", [])
    if not field_names:
        return ""

    fragments = []
    for name in field_names:
        raw_value = getattr(data_point, name, None)
        if raw_value is None:
            continue
        cleaned = str(raw_value).strip()
        if cleaned:
            fragments.append(cleaned)

    # " ".join returns "" for an empty list, matching the documented fallback.
    return " ".join(fragments)
|
||||||
|
|
||||||
|
|
||||||
|
def _create_triplets_from_graph(nodes: List[DataPoint], edges: List[tuple]) -> List[Triplet]:
    """
    Create Triplet objects from graph nodes and edges.

    Processes graph edges together with their endpoint nodes to produce
    triplet datapoints carrying embeddable text, matching the format used by
    the triplet embeddings pipeline.

    Parameters:
    -----------
    - nodes (List[DataPoint]): Graph nodes extracted from data points.
    - edges (List[tuple]): Edge tuples shaped as
      (source_node_id, target_node_id, relationship_name, properties_dict).
      Note: all edges, including those from DocumentChunk.contains, are
      already extracted by get_graph_from_model and included in this list.

    Returns:
    --------
    - List[Triplet]: Deduplicated Triplet objects ready for indexing. Edges
      whose endpoints are missing from `nodes`, whose relationship name is
      None, or that yield no embeddable text are skipped (and the skip count
      is logged at debug level).
    """
    # Map node id -> node, keeping the first occurrence of each id.
    node_map: Dict[str, DataPoint] = {}
    for node in nodes:
        if hasattr(node, "id"):
            node_id = str(node.id)
            if node_id not in node_map:
                node_map[node_id] = node

    triplets = []
    skipped_count = 0
    seen_ids = set()

    for edge_tuple in edges:
        # Malformed edges (fewer than 4 members) cannot form a triplet.
        if len(edge_tuple) < 4:
            continue

        source_node_id, target_node_id, relationship_name, edge_properties = (
            edge_tuple[0],
            edge_tuple[1],
            edge_tuple[2],
            edge_tuple[3],
        )

        source_node = node_map.get(str(source_node_id))
        target_node = node_map.get(str(target_node_id))

        if not source_node or not target_node or relationship_name is None:
            skipped_count += 1
            continue

        source_node_text = _extract_embeddable_text_from_datapoint(source_node)
        target_node_text = _extract_embeddable_text_from_datapoint(target_node)

        # Prefer an explicit edge_text property; fall back to the relationship name.
        relationship_text = ""
        if isinstance(edge_properties, dict):
            edge_text = edge_properties.get("edge_text")
            if edge_text and isinstance(edge_text, str) and edge_text.strip():
                relationship_text = edge_text.strip()

        if not relationship_text and relationship_name:
            relationship_text = relationship_name

        # Nothing embeddable on the source side or the relationship: skip.
        if not source_node_text and not relationship_text and not relationship_name:
            skipped_count += 1
            continue

        # NOTE(review): separator spacing is asymmetric (" -› " vs "-›") —
        # kept byte-identical here; confirm the intended triplet text format.
        embeddable_text = f"{source_node_text} -› {relationship_text}-›{target_node_text}".strip()

        # Deterministic id so the same (source, relationship, target) always
        # maps to the same triplet, enabling dedup across batches.
        triplet_id = generate_node_id(str(source_node_id) + relationship_name + str(target_node_id))

        if triplet_id in seen_ids:
            continue
        seen_ids.add(triplet_id)

        triplets.append(
            Triplet(
                id=triplet_id,
                from_node_id=str(source_node_id),
                to_node_id=str(target_node_id),
                text=embeddable_text,
            )
        )

    # Fix: skipped_count was tracked but never surfaced anywhere.
    if skipped_count:
        logger.debug(
            "Skipped %d edge(s) while building triplets (missing endpoint nodes, "
            "missing relationship, or no embeddable text).",
            skipped_count,
        )

    return triplets
|
||||||
|
|
|
||||||
139
cognee/tests/integration/tasks/test_add_data_points.py
Normal file
139
cognee/tests/integration/tasks/test_add_data_points.py
Normal file
|
|
@ -0,0 +1,139 @@
|
||||||
|
import pathlib
|
||||||
|
import pytest
|
||||||
|
import pytest_asyncio
|
||||||
|
|
||||||
|
import cognee
|
||||||
|
from cognee.low_level import setup
|
||||||
|
from cognee.infrastructure.engine import DataPoint
|
||||||
|
from cognee.tasks.storage.add_data_points import add_data_points
|
||||||
|
from cognee.tasks.storage.exceptions import InvalidDataPointsInAddDataPointsError
|
||||||
|
from cognee.infrastructure.databases.graph import get_graph_engine
|
||||||
|
|
||||||
|
|
||||||
|
class Person(DataPoint):
|
||||||
|
name: str
|
||||||
|
age: int
|
||||||
|
metadata: dict = {"index_fields": ["name"]}
|
||||||
|
|
||||||
|
|
||||||
|
class Company(DataPoint):
|
||||||
|
name: str
|
||||||
|
industry: str
|
||||||
|
metadata: dict = {"index_fields": ["name", "industry"]}
|
||||||
|
|
||||||
|
|
||||||
|
@pytest_asyncio.fixture
async def clean_test_environment():
    """Provision an isolated cognee storage area, yield to the test, then clean up best-effort."""
    repo_root = pathlib.Path(__file__).parent.parent.parent.parent

    cognee.config.system_root_directory(
        str(repo_root / ".cognee_system/test_add_data_points_integration")
    )
    cognee.config.data_root_directory(
        str(repo_root / ".data_storage/test_add_data_points_integration")
    )

    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)
    await setup()

    yield

    # Teardown is best effort: a failed cleanup must not fail the test itself.
    try:
        await cognee.prune.prune_data()
        await cognee.prune.prune_system(metadata=True)
    except Exception:
        pass
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_add_data_points_comprehensive(clean_test_environment):
|
||||||
|
"""Comprehensive integration test for add_data_points functionality."""
|
||||||
|
|
||||||
|
person1 = Person(name="Alice", age=30)
|
||||||
|
person2 = Person(name="Bob", age=25)
|
||||||
|
result = await add_data_points([person1, person2])
|
||||||
|
|
||||||
|
assert result == [person1, person2]
|
||||||
|
assert len(result) == 2
|
||||||
|
|
||||||
|
graph_engine = await get_graph_engine()
|
||||||
|
nodes, edges = await graph_engine.get_graph_data()
|
||||||
|
assert len(nodes) >= 2
|
||||||
|
|
||||||
|
result_empty = await add_data_points([])
|
||||||
|
assert result_empty == []
|
||||||
|
|
||||||
|
person3 = Person(name="Charlie", age=35)
|
||||||
|
person4 = Person(name="Diana", age=32)
|
||||||
|
custom_edge = (str(person3.id), str(person4.id), "knows", {"edge_text": "friends with"})
|
||||||
|
|
||||||
|
result_custom = await add_data_points([person3, person4], custom_edges=[custom_edge])
|
||||||
|
assert len(result_custom) == 2
|
||||||
|
|
||||||
|
nodes, edges = await graph_engine.get_graph_data()
|
||||||
|
assert len(edges) == 1
|
||||||
|
assert len(nodes) == 4
|
||||||
|
|
||||||
|
class Employee(DataPoint):
|
||||||
|
name: str
|
||||||
|
works_at: Company
|
||||||
|
metadata: dict = {"index_fields": ["name"]}
|
||||||
|
|
||||||
|
company = Company(name="TechCorp", industry="Technology")
|
||||||
|
employee = Employee(name="Eve", works_at=company)
|
||||||
|
|
||||||
|
result_rel = await add_data_points([employee])
|
||||||
|
assert len(result_rel) == 1
|
||||||
|
|
||||||
|
nodes, edges = await graph_engine.get_graph_data()
|
||||||
|
assert len(nodes) == 6
|
||||||
|
assert len(edges) == 2
|
||||||
|
|
||||||
|
person5 = Person(name="Frank", age=40)
|
||||||
|
person6 = Person(name="Grace", age=38)
|
||||||
|
triplet_edge = (str(person5.id), str(person6.id), "married_to", {"edge_text": "is married to"})
|
||||||
|
|
||||||
|
result_triplet = await add_data_points(
|
||||||
|
[person5, person6], custom_edges=[triplet_edge], embed_triplets=True
|
||||||
|
)
|
||||||
|
assert len(result_triplet) == 2
|
||||||
|
|
||||||
|
nodes, edges = await graph_engine.get_graph_data()
|
||||||
|
assert len(nodes) == 8
|
||||||
|
assert len(edges) == 3
|
||||||
|
|
||||||
|
batch1 = [Person(name="Leo", age=25), Person(name="Mia", age=30)]
|
||||||
|
batch2 = [Person(name="Noah", age=35), Person(name="Olivia", age=40)]
|
||||||
|
|
||||||
|
result_batch1 = await add_data_points(batch1)
|
||||||
|
result_batch2 = await add_data_points(batch2)
|
||||||
|
|
||||||
|
assert len(result_batch1) == 2
|
||||||
|
assert len(result_batch2) == 2
|
||||||
|
|
||||||
|
nodes, edges = await graph_engine.get_graph_data()
|
||||||
|
assert len(nodes) == 12
|
||||||
|
assert len(edges) == 3
|
||||||
|
|
||||||
|
person7 = Person(name="Paul", age=33)
|
||||||
|
person8 = Person(name="Quinn", age=31)
|
||||||
|
edge1 = (str(person7.id), str(person8.id), "colleague_of", {"edge_text": "works with"})
|
||||||
|
edge2 = (str(person8.id), str(person7.id), "colleague_of", {"edge_text": "works with"})
|
||||||
|
|
||||||
|
result_bi = await add_data_points([person7, person8], custom_edges=[edge1, edge2])
|
||||||
|
assert len(result_bi) == 2
|
||||||
|
|
||||||
|
nodes, edges = await graph_engine.get_graph_data()
|
||||||
|
assert len(nodes) == 14
|
||||||
|
assert len(edges) == 5
|
||||||
|
|
||||||
|
person_invalid = Person(name="Invalid", age=50)
|
||||||
|
with pytest.raises(InvalidDataPointsInAddDataPointsError, match="must be a list"):
|
||||||
|
await add_data_points(person_invalid)
|
||||||
|
|
||||||
|
with pytest.raises(InvalidDataPointsInAddDataPointsError, match="must be a DataPoint"):
|
||||||
|
await add_data_points(["not", "datapoints"])
|
||||||
|
|
||||||
|
final_nodes, final_edges = await graph_engine.get_graph_data()
|
||||||
|
assert len(final_nodes) == 14
|
||||||
|
assert len(final_edges) == 5
|
||||||
|
|
@ -25,8 +25,6 @@ class TestCogneeServerStart(unittest.TestCase):
|
||||||
"--port",
|
"--port",
|
||||||
"8000",
|
"8000",
|
||||||
],
|
],
|
||||||
stdout=subprocess.PIPE,
|
|
||||||
stderr=subprocess.PIPE,
|
|
||||||
preexec_fn=os.setsid,
|
preexec_fn=os.setsid,
|
||||||
)
|
)
|
||||||
# Give the server some time to start
|
# Give the server some time to start
|
||||||
|
|
|
||||||
170
cognee/tests/test_edge_centered_payload.py
Normal file
170
cognee/tests/test_edge_centered_payload.py
Normal file
|
|
@ -0,0 +1,170 @@
|
||||||
|
"""
|
||||||
|
End-to-end integration test for edge-centered payload and triplet embeddings.
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import pathlib
|
||||||
|
import cognee
|
||||||
|
from cognee.infrastructure.databases.graph import get_graph_engine
|
||||||
|
from cognee.infrastructure.databases.vector import get_vector_engine
|
||||||
|
from cognee.modules.search.types import SearchType
|
||||||
|
from cognee.shared.logging_utils import get_logger
|
||||||
|
from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver
|
||||||
|
from cognee.modules.ontology.ontology_config import Config
|
||||||
|
|
||||||
|
logger = get_logger()
|
||||||
|
|
||||||
|
text_data = """
|
||||||
|
Apple is a technology company that produces the iPhone, iPad, and Mac computers.
|
||||||
|
The company is known for its innovative products and ecosystem integration.
|
||||||
|
|
||||||
|
Microsoft develops the Windows operating system and Office productivity suite.
|
||||||
|
They are also major players in cloud computing with Azure.
|
||||||
|
|
||||||
|
Google created the Android operating system and provides search engine services.
|
||||||
|
The company is a leader in artificial intelligence and machine learning.
|
||||||
|
"""
|
||||||
|
|
||||||
|
ontology_content = """<?xml version="1.0"?>
|
||||||
|
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
|
||||||
|
xmlns:owl="http://www.w3.org/2002/07/owl#"
|
||||||
|
xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
|
||||||
|
xmlns="http://example.org/tech#"
|
||||||
|
xml:base="http://example.org/tech">
|
||||||
|
|
||||||
|
<owl:Ontology rdf:about="http://example.org/tech"/>
|
||||||
|
|
||||||
|
<!-- Classes -->
|
||||||
|
<owl:Class rdf:ID="Company"/>
|
||||||
|
<owl:Class rdf:ID="TechnologyCompany"/>
|
||||||
|
<owl:Class rdf:ID="Product"/>
|
||||||
|
<owl:Class rdf:ID="Software"/>
|
||||||
|
<owl:Class rdf:ID="Hardware"/>
|
||||||
|
<owl:Class rdf:ID="Service"/>
|
||||||
|
|
||||||
|
<rdf:Description rdf:about="#TechnologyCompany">
|
||||||
|
<rdfs:subClassOf rdf:resource="#Company"/>
|
||||||
|
<rdfs:comment>A company operating in the technology sector.</rdfs:comment>
|
||||||
|
</rdf:Description>
|
||||||
|
|
||||||
|
<rdf:Description rdf:about="#Software">
|
||||||
|
<rdfs:subClassOf rdf:resource="#Product"/>
|
||||||
|
<rdfs:comment>Software products and applications.</rdfs:comment>
|
||||||
|
</rdf:Description>
|
||||||
|
|
||||||
|
<rdf:Description rdf:about="#Hardware">
|
||||||
|
<rdfs:subClassOf rdf:resource="#Product"/>
|
||||||
|
<rdfs:comment>Physical hardware products.</rdfs:comment>
|
||||||
|
</rdf:Description>
|
||||||
|
|
||||||
|
<!-- Individuals -->
|
||||||
|
<TechnologyCompany rdf:ID="apple">
|
||||||
|
<rdfs:label>Apple</rdfs:label>
|
||||||
|
</TechnologyCompany>
|
||||||
|
|
||||||
|
<TechnologyCompany rdf:ID="microsoft">
|
||||||
|
<rdfs:label>Microsoft</rdfs:label>
|
||||||
|
</TechnologyCompany>
|
||||||
|
|
||||||
|
<TechnologyCompany rdf:ID="google">
|
||||||
|
<rdfs:label>Google</rdfs:label>
|
||||||
|
</TechnologyCompany>
|
||||||
|
|
||||||
|
<Hardware rdf:ID="iphone">
|
||||||
|
<rdfs:label>iPhone</rdfs:label>
|
||||||
|
</Hardware>
|
||||||
|
|
||||||
|
<Software rdf:ID="windows">
|
||||||
|
<rdfs:label>Windows</rdfs:label>
|
||||||
|
</Software>
|
||||||
|
|
||||||
|
<Software rdf:ID="android">
|
||||||
|
<rdfs:label>Android</rdfs:label>
|
||||||
|
</Software>
|
||||||
|
|
||||||
|
</rdf:RDF>"""
|
||||||
|
|
||||||
|
|
||||||
|
async def main():
|
||||||
|
data_directory_path = str(
|
||||||
|
pathlib.Path(
|
||||||
|
os.path.join(
|
||||||
|
pathlib.Path(__file__).parent,
|
||||||
|
".data_storage/test_edge_centered_payload",
|
||||||
|
)
|
||||||
|
).resolve()
|
||||||
|
)
|
||||||
|
cognee_directory_path = str(
|
||||||
|
pathlib.Path(
|
||||||
|
os.path.join(
|
||||||
|
pathlib.Path(__file__).parent,
|
||||||
|
".cognee_system/test_edge_centered_payload",
|
||||||
|
)
|
||||||
|
).resolve()
|
||||||
|
)
|
||||||
|
|
||||||
|
cognee.config.data_root_directory(data_directory_path)
|
||||||
|
cognee.config.system_root_directory(cognee_directory_path)
|
||||||
|
|
||||||
|
dataset_name = "tech_companies"
|
||||||
|
|
||||||
|
await cognee.prune.prune_data()
|
||||||
|
await cognee.prune.prune_system(metadata=True)
|
||||||
|
|
||||||
|
await cognee.add(data=text_data, dataset_name=dataset_name)
|
||||||
|
|
||||||
|
import tempfile
|
||||||
|
|
||||||
|
with tempfile.NamedTemporaryFile(mode="w", suffix=".owl", delete=False) as f:
|
||||||
|
f.write(ontology_content)
|
||||||
|
ontology_file_path = f.name
|
||||||
|
|
||||||
|
try:
|
||||||
|
logger.info(f"Loading ontology from: {ontology_file_path}")
|
||||||
|
config: Config = {
|
||||||
|
"ontology_config": {
|
||||||
|
"ontology_resolver": RDFLibOntologyResolver(ontology_file=ontology_file_path)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
await cognee.cognify(datasets=[dataset_name], config=config)
|
||||||
|
graph_engine = await get_graph_engine()
|
||||||
|
nodes_phase2, edges_phase2 = await graph_engine.get_graph_data()
|
||||||
|
|
||||||
|
vector_engine = get_vector_engine()
|
||||||
|
triplets_phase2 = await vector_engine.search(
|
||||||
|
query_text="technology", limit=None, collection_name="Triplet_text"
|
||||||
|
)
|
||||||
|
|
||||||
|
assert len(triplets_phase2) == len(edges_phase2), (
|
||||||
|
f"Triplet embeddings and number of edges do not match. Vector db contains {len(triplets_phase2)} edge triplets while graph db contains {len(edges_phase2)} edges."
|
||||||
|
)
|
||||||
|
|
||||||
|
search_results_phase2 = await cognee.search(
|
||||||
|
query_type=SearchType.TRIPLET_COMPLETION,
|
||||||
|
query_text="What products does Apple make?",
|
||||||
|
)
|
||||||
|
|
||||||
|
assert search_results_phase2 is not None, (
|
||||||
|
"Search should return results for triplet embeddings in simple ontology use case."
|
||||||
|
)
|
||||||
|
|
||||||
|
finally:
|
||||||
|
if os.path.exists(ontology_file_path):
|
||||||
|
os.unlink(ontology_file_path)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    import asyncio

    from cognee.shared.logging_utils import setup_logging

    setup_logging()

    # Run on an explicitly created loop so async generators can be drained
    # before the loop is closed.
    event_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(event_loop)
    try:
        event_loop.run_until_complete(main())
    finally:
        event_loop.run_until_complete(event_loop.shutdown_asyncgens())
        event_loop.close()
|
||||||
288
cognee/tests/unit/tasks/storage/test_add_data_points.py
Normal file
288
cognee/tests/unit/tasks/storage/test_add_data_points.py
Normal file
|
|
@ -0,0 +1,288 @@
|
||||||
|
import pytest
from unittest.mock import AsyncMock, patch
import sys

from cognee.infrastructure.engine import DataPoint
from cognee.modules.engine.models import Triplet
from cognee.tasks.storage.add_data_points import (
    add_data_points,
    InvalidDataPointsInAddDataPointsError,
    _extract_embeddable_text_from_datapoint,
    _create_triplets_from_graph,
)

# Module object for cognee.tasks.storage.add_data_points, used as the
# patch.object target below so mocks replace the exact names that module
# looks up at call time (robust against "from x import y" rebinding).
adp_module = sys.modules["cognee.tasks.storage.add_data_points"]
|
||||||
|
|
||||||
|
|
||||||
|
class SimplePoint(DataPoint):
    """Minimal DataPoint fixture whose single "text" field is embeddable."""

    text: str
    # index_fields marks "text" as the field read by
    # _extract_embeddable_text_from_datapoint.
    metadata: dict = {"index_fields": ["text"]}
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
@pytest.mark.parametrize("bad_input", [None, ["not_datapoint"]])
async def test_add_data_points_validates_inputs(bad_input):
    # add_data_points must reject None and lists containing non-DataPoint
    # items with its dedicated validation error.
    with pytest.raises(InvalidDataPointsInAddDataPointsError):
        await add_data_points(bad_input)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
@patch.object(adp_module, "index_graph_edges")
@patch.object(adp_module, "index_data_points")
@patch.object(adp_module, "get_graph_engine")
@patch.object(adp_module, "deduplicate_nodes_and_edges")
@patch.object(adp_module, "get_graph_from_model")
async def test_add_data_points_indexes_nodes_and_edges(
    mock_get_graph, mock_dedup, mock_get_engine, mock_index_nodes, mock_index_edges
):
    # NOTE: decorators apply bottom-up, so the innermost patch
    # (get_graph_from_model) arrives as the first mock argument.
    dp1 = SimplePoint(text="first")
    dp2 = SimplePoint(text="second")

    edge1 = (str(dp1.id), str(dp2.id), "related_to", {"edge_text": "connects"})
    custom_edges = [(str(dp2.id), str(dp1.id), "custom_edge", {})]

    # One get_graph_from_model call per data point: dp1 contributes edge1,
    # dp2 contributes no edges.
    mock_get_graph.side_effect = [([dp1], [edge1]), ([dp2], [])]
    # Identity dedup so the assertions below see the exact inputs.
    mock_dedup.side_effect = lambda n, e: (n, e)
    graph_engine = AsyncMock()
    mock_get_engine.return_value = graph_engine

    result = await add_data_points([dp1, dp2], custom_edges=custom_edges)

    assert result == [dp1, dp2]
    graph_engine.add_nodes.assert_awaited_once()
    mock_index_nodes.assert_awaited_once()
    # Graph-derived edges and caller-supplied custom edges are persisted in
    # two separate add_edges calls, in that order, and each is indexed.
    assert graph_engine.add_edges.await_count == 2
    assert edge1 in graph_engine.add_edges.await_args_list[0].args[0]
    assert graph_engine.add_edges.await_args_list[1].args[0] == custom_edges
    assert mock_index_edges.await_count == 2
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
@patch.object(adp_module, "index_graph_edges")
@patch.object(adp_module, "index_data_points")
@patch.object(adp_module, "get_graph_engine")
@patch.object(adp_module, "deduplicate_nodes_and_edges")
@patch.object(adp_module, "get_graph_from_model")
async def test_add_data_points_indexes_triplets_when_enabled(
    mock_get_graph, mock_dedup, mock_get_engine, mock_index_nodes, mock_index_edges
):
    # NOTE: decorators apply bottom-up; get_graph_from_model is the first
    # mock argument.
    dp1 = SimplePoint(text="source")
    dp2 = SimplePoint(text="target")

    edge1 = (str(dp1.id), str(dp2.id), "relates", {"edge_text": "describes"})

    # One get_graph_from_model call per data point; only dp1 yields an edge.
    mock_get_graph.side_effect = [([dp1], [edge1]), ([dp2], [])]
    # Identity dedup keeps inputs unchanged for the assertions below.
    mock_dedup.side_effect = lambda n, e: (n, e)
    graph_engine = AsyncMock()
    mock_get_engine.return_value = graph_engine

    await add_data_points([dp1, dp2], embed_triplets=True)

    # With embed_triplets=True, index_data_points is awaited twice: first
    # with the data points themselves, then with the derived Triplet payloads.
    assert mock_index_nodes.await_count == 2
    nodes_arg = mock_index_nodes.await_args_list[0].args[0]
    triplets_arg = mock_index_nodes.await_args_list[1].args[0]
    assert nodes_arg == [dp1, dp2]
    assert len(triplets_arg) == 1
    assert isinstance(triplets_arg[0], Triplet)
    mock_index_edges.assert_awaited_once()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
@patch.object(adp_module, "index_graph_edges")
@patch.object(adp_module, "index_data_points")
@patch.object(adp_module, "get_graph_engine")
@patch.object(adp_module, "deduplicate_nodes_and_edges")
@patch.object(adp_module, "get_graph_from_model")
async def test_add_data_points_with_empty_list(
    mock_get_graph, mock_dedup, mock_get_engine, mock_index_nodes, mock_index_edges
):
    # An empty (but valid) list must be accepted, not raise.
    mock_dedup.side_effect = lambda n, e: (n, e)
    graph_engine = AsyncMock()
    mock_get_engine.return_value = graph_engine

    result = await add_data_points([])

    assert result == []
    # No data points means graph expansion never runs...
    mock_get_graph.assert_not_called()
    # ...but the engine is still asked to add an empty node batch.
    graph_engine.add_nodes.assert_awaited_once_with([])
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
@patch.object(adp_module, "index_graph_edges")
@patch.object(adp_module, "index_data_points")
@patch.object(adp_module, "get_graph_engine")
@patch.object(adp_module, "deduplicate_nodes_and_edges")
@patch.object(adp_module, "get_graph_from_model")
async def test_add_data_points_with_single_datapoint(
    mock_get_graph, mock_dedup, mock_get_engine, mock_index_nodes, mock_index_edges
):
    # Minimal happy path: one data point, no edges.
    dp = SimplePoint(text="single")
    mock_get_graph.side_effect = [([dp], [])]
    mock_dedup.side_effect = lambda n, e: (n, e)
    graph_engine = AsyncMock()
    mock_get_engine.return_value = graph_engine

    result = await add_data_points([dp])

    assert result == [dp]
    # Graph expansion runs exactly once (once per data point) and the
    # resulting nodes are indexed exactly once.
    mock_get_graph.assert_called_once()
    mock_index_nodes.assert_awaited_once()
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_embeddable_text_from_datapoint():
    """The single indexed field's value is returned verbatim."""
    point = SimplePoint(text="hello world")
    assert _extract_embeddable_text_from_datapoint(point) == "hello world"
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_embeddable_text_with_multiple_fields():
    """Values of all indexed fields are joined with a single space."""

    class MultiField(DataPoint):
        title: str
        description: str
        metadata: dict = {"index_fields": ["title", "description"]}

    point = MultiField(title="Test", description="Description")
    extracted = _extract_embeddable_text_from_datapoint(point)
    assert extracted == "Test Description"
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_embeddable_text_with_no_index_fields():
    """An empty index_fields list yields an empty embeddable string."""

    class NoIndex(DataPoint):
        text: str
        metadata: dict = {"index_fields": []}

    point = NoIndex(text="ignored")
    assert _extract_embeddable_text_from_datapoint(point) == ""
|
||||||
|
|
||||||
|
|
||||||
|
def test_create_triplets_from_graph():
    """A valid edge between two known nodes yields exactly one Triplet."""
    source = SimplePoint(text="source node")
    target = SimplePoint(text="target node")
    edge = (str(source.id), str(target.id), "connects_to", {"edge_text": "links"})

    result = _create_triplets_from_graph([source, target], [edge])

    assert len(result) == 1
    triplet = result[0]
    assert isinstance(triplet, Triplet)
    assert triplet.from_node_id == str(source.id)
    assert triplet.to_node_id == str(target.id)
    assert "source node" in triplet.text
    assert "target node" in triplet.text
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_embeddable_text_with_none_datapoint():
    """None instead of a DataPoint falls back to an empty string."""
    assert _extract_embeddable_text_from_datapoint(None) == ""
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_embeddable_text_without_metadata():
    """A data point whose metadata attribute is removed yields an empty string."""

    class NoMetadata(DataPoint):
        text: str

    point = NoMetadata(text="test")
    delattr(point, "metadata")
    assert _extract_embeddable_text_from_datapoint(point) == ""
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_embeddable_text_with_whitespace_only():
    """Whitespace-only field values are treated as empty."""

    class WhitespaceField(DataPoint):
        text: str
        metadata: dict = {"index_fields": ["text"]}

    point = WhitespaceField(text=" ")
    assert _extract_embeddable_text_from_datapoint(point) == ""
|
||||||
|
|
||||||
|
|
||||||
|
def test_create_triplets_skips_short_edge_tuples():
    """Edge tuples with too few elements are silently skipped."""
    node = SimplePoint(text="node")
    malformed_edge = (str(node.id), str(node.id))

    result = _create_triplets_from_graph([node], [malformed_edge])

    assert len(result) == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_create_triplets_skips_missing_source_node():
    """An edge whose source id matches no node produces no triplet."""
    target = SimplePoint(text="target")
    dangling_edge = ("missing_id", str(target.id), "relates", {})

    result = _create_triplets_from_graph([target], [dangling_edge])

    assert len(result) == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_create_triplets_skips_missing_target_node():
    """An edge whose target id matches no node produces no triplet."""
    source = SimplePoint(text="source")
    dangling_edge = (str(source.id), "missing_id", "relates", {})

    result = _create_triplets_from_graph([source], [dangling_edge])

    assert len(result) == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_create_triplets_skips_none_relationship():
    """An edge with a None relationship name produces no triplet."""
    source = SimplePoint(text="source")
    target = SimplePoint(text="target")
    nameless_edge = (str(source.id), str(target.id), None, {})

    result = _create_triplets_from_graph([source, target], [nameless_edge])

    assert len(result) == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_create_triplets_uses_relationship_name_when_no_edge_text():
    """Without an edge_text property the relationship name is embedded instead."""
    source = SimplePoint(text="source")
    target = SimplePoint(text="target")
    edge = (str(source.id), str(target.id), "connects_to", {})

    result = _create_triplets_from_graph([source, target], [edge])

    assert len(result) == 1
    assert "connects_to" in result[0].text
|
||||||
|
|
||||||
|
|
||||||
|
def test_create_triplets_prevents_duplicates():
    """The same edge listed twice produces only one Triplet."""
    source = SimplePoint(text="source")
    target = SimplePoint(text="target")
    edge = (str(source.id), str(target.id), "relates", {"edge_text": "links"})

    result = _create_triplets_from_graph([source, target], [edge, edge])

    assert len(result) == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_create_triplets_skips_nodes_without_id():
    """Nodes lacking an id attribute cannot anchor a triplet."""

    class NodeNoId:
        pass

    valid_node = SimplePoint(text="valid")
    edge = (str(valid_node.id), "some_id", "relates", {})

    result = _create_triplets_from_graph([valid_node, NodeNoId()], [edge])

    assert len(result) == 0
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
@patch.object(adp_module, "index_graph_edges")
@patch.object(adp_module, "index_data_points")
@patch.object(adp_module, "get_graph_engine")
@patch.object(adp_module, "deduplicate_nodes_and_edges")
@patch.object(adp_module, "get_graph_from_model")
async def test_add_data_points_with_empty_custom_edges(
    mock_get_graph, mock_dedup, mock_get_engine, mock_index_nodes, mock_index_edges
):
    # Passing an explicit empty custom_edges list must behave like omitting
    # it: only the single graph-derived add_edges call happens.
    dp = SimplePoint(text="test")
    mock_get_graph.side_effect = [([dp], [])]
    mock_dedup.side_effect = lambda n, e: (n, e)
    graph_engine = AsyncMock()
    mock_get_engine.return_value = graph_engine

    result = await add_data_points([dp], custom_edges=[])

    assert result == [dp]
    # No second add_edges call for the empty custom batch.
    assert graph_engine.add_edges.await_count == 1
|
||||||
Loading…
Add table
Reference in a new issue