feat: Adds core db tests for main search (#1006)

<!-- .github/pull_request_template.md -->

## Description
 Adds core db tests for main search

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.

---------

Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>
This commit is contained in:
hajdul88 2025-06-24 10:51:34 +02:00 committed by GitHub
parent 82e35374d0
commit 97d05f105e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 387 additions and 0 deletions

View file

@@ -1,5 +1,8 @@
name: Reusable Graph DB Tests
permissions:
contents: read
on:
workflow_call:
inputs:

230
.github/workflows/search_db_tests.yml vendored Normal file
View file

@@ -0,0 +1,230 @@
name: Reusable Search DB Tests

permissions:
  contents: read

on:
  workflow_call:
    inputs:
      databases:
        required: false
        type: string
        default: "all"
        description: "Which vector databases to test (comma-separated list or 'all')"
      # BUG FIX: every job below passes ${{ inputs.python-version }} to the
      # cognee_setup action, but this input was never declared, so callers
      # could not set it and the expression always evaluated to empty.
      # Declared optional with a default to stay backward compatible.
      python-version:
        required: false
        type: string
        default: "3.11.x"  # TODO confirm this matches the other reusable workflows' default
        description: "Python version to set up via the cognee_setup action"
    secrets:
      WEAVIATE_API_URL:
        required: false
      WEAVIATE_API_KEY:
        required: false
jobs:
  # Search tests on the all-embedded stack: Kuzu graph, LanceDB vectors,
  # SQLite relational store — no external service containers needed.
  run-kuzu-lance-sqlite-search-tests:
    name: Search test for Kuzu/LanceDB/Sqlite
    runs-on: ubuntu-22.04
    if: ${{ inputs.databases == 'all' || contains(inputs.databases, 'kuzu/lance/sqlite') }}
    steps:
      - name: Check out
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Cognee Setup
        uses: ./.github/actions/cognee_setup
        with:
          python-version: ${{ inputs.python-version }}

      - name: Install specific db dependency
        run: |
          poetry install -E kuzu

      - name: Run Kuzu search Tests
        env:
          ENV: 'dev'
          LLM_MODEL: ${{ secrets.LLM_MODEL }}
          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
          LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
          EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
          EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
          EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
          EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
          GRAPH_DATABASE_PROVIDER: 'kuzu'
          VECTOR_DB_PROVIDER: 'lancedb'
          DB_PROVIDER: 'sqlite'
        run: poetry run python ./cognee/tests/test_search_db.py
run-neo4j-lance-sqlite-search-tests:
name: Search test for Neo4j/LanceDB/Sqlite
runs-on: ubuntu-22.04
if: ${{ inputs.databases == 'all' || contains(inputs.databases, 'neo4j/lance/sqlite') }}
services:
neo4j:
image: neo4j:5.11
env:
NEO4J_AUTH: neo4j/pleaseletmein
NEO4J_PLUGINS: '["apoc","graph-data-science"]'
ports:
- 7474:7474
- 7687:7687
options: >-
--health-cmd="cypher-shell -u neo4j -p pleaseletmein 'RETURN 1'"
--health-interval=10s
--health-timeout=5s
--health-retries=5
steps:
- name: Check out
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Cognee Setup
uses: ./.github/actions/cognee_setup
with:
python-version: ${{ inputs.python-version }}
- name: Install specific db dependency
run: |
poetry install -E neo4j
- name: Run Neo4j search Tests
env:
ENV: 'dev'
LLM_MODEL: ${{ secrets.LLM_MODEL }}
LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
GRAPH_DATABASE_PROVIDER: 'neo4j'
VECTOR_DB_PROVIDER: 'lancedb'
DB_PROVIDER: 'sqlite'
GRAPH_DATABASE_URL: bolt://localhost:7687
GRAPH_DATABASE_USERNAME: neo4j
GRAPH_DATABASE_PASSWORD: pleaseletmein
run: poetry run python ./cognee/tests/test_search_db.py
run-kuzu-pgvector-postgres-search-tests:
name: Search test for Kuzu/PGVector/Postgres
runs-on: ubuntu-22.04
if: ${{ inputs.databases == 'all' || contains(inputs.databases, 'kuzu/pgvector/postgres') }}
services:
postgres:
image: pgvector/pgvector:pg17
env:
POSTGRES_USER: cognee
POSTGRES_PASSWORD: cognee
POSTGRES_DB: cognee_db
options: >-
--health-cmd pg_isready
--health-interval 10s
--health-timeout 5s
--health-retries 5
ports:
- 5432:5432
steps:
- name: Check out
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Cognee Setup
uses: ./.github/actions/cognee_setup
with:
python-version: ${{ inputs.python-version }}
- name: Install dependencies
run: poetry install -E kuzu -E postgres
- name: Run Kuzu/PGVector/Postgres Tests
env:
ENV: dev
LLM_MODEL: ${{ secrets.LLM_MODEL }}
LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
GRAPH_DATABASE_PROVIDER: 'kuzu'
VECTOR_DB_PROVIDER: 'pgvector'
DB_PROVIDER: 'postgres'
DB_NAME: 'cognee_db'
DB_HOST: '127.0.0.1'
DB_PORT: 5432
DB_USERNAME: cognee
DB_PASSWORD: cognee
run: poetry run python ./cognee/tests/test_search_db.py
run-neo4j-pgvector-postgres-search-tests:
name: Search test for Neo4j/PGVector/Postgres
runs-on: ubuntu-22.04
if: ${{ inputs.databases == 'all' || contains(inputs.databases, 'neo4j/pgvector/postgres') }}
services:
neo4j:
image: neo4j:5.11
env:
NEO4J_AUTH: neo4j/pleaseletmein
NEO4J_PLUGINS: '["apoc","graph-data-science"]'
ports:
- 7474:7474
- 7687:7687
options: >-
--health-cmd="cypher-shell -u neo4j -p pleaseletmein 'RETURN 1'"
--health-interval=10s
--health-timeout=5s
--health-retries=5
postgres:
image: pgvector/pgvector:pg17
env:
POSTGRES_USER: cognee
POSTGRES_PASSWORD: cognee
POSTGRES_DB: cognee_db
ports:
- 5432:5432
options: >-
--health-cmd pg_isready
--health-interval 10s
--health-timeout 5s
--health-retries=5
steps:
- name: Check out
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Cognee Setup
uses: ./.github/actions/cognee_setup
with:
python-version: ${{ inputs.python-version }}
- name: Install dependencies
run: |
poetry install -E neo4j -E postgres
- name: Run Neo4j + PGVector + Postgres search Tests
env:
ENV: dev
LLM_MODEL: ${{ secrets.LLM_MODEL }}
LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
GRAPH_DATABASE_PROVIDER: 'neo4j'
VECTOR_DB_PROVIDER: 'pgvector'
DB_PROVIDER: 'postgres'
GRAPH_DATABASE_URL: bolt://localhost:7687
GRAPH_DATABASE_USERNAME: neo4j
GRAPH_DATABASE_PASSWORD: pleaseletmein
DB_NAME: cognee_db
DB_HOST: 127.0.0.1
DB_PORT: 5432
DB_USERNAME: cognee
DB_PASSWORD: cognee
run: poetry run python ./cognee/tests/test_search_db.py

View file

@@ -45,6 +45,12 @@ jobs:
uses: ./.github/workflows/graph_db_tests.yml
secrets: inherit
search-db-tests:
name: Search Test on Different DBs
needs: [basic-tests, e2e-tests, graph-db-tests]
uses: ./.github/workflows/search_db_tests.yml
secrets: inherit
relational-db-migration-tests:
name: Relational DB Migration Tests
needs: [ basic-tests, e2e-tests, graph-db-tests]

View file

@@ -0,0 +1,148 @@
# Integration test for cognee's search flows; the DB backends it runs
# against are selected via environment variables set by CI.
# NOTE(review): `os` and `pathlib` appear unused in this file — confirm
# against the full file before removing.
import os
import pathlib
# NOTE(review): `query` is never used below, and `dns.e164` pulls in the
# third-party dnspython package — this looks like an accidental IDE
# auto-import; confirm and remove.
from dns.e164 import query
import cognee
from cognee.modules.graph.cognee_graph.CogneeGraphElements import Edge
from cognee.modules.retrieval.graph_completion_retriever import GraphCompletionRetriever
from cognee.modules.retrieval.graph_completion_context_extension_retriever import (
GraphCompletionContextExtensionRetriever,
)
from cognee.modules.retrieval.graph_completion_cot_retriever import GraphCompletionCotRetriever
from cognee.modules.retrieval.graph_summary_completion_retriever import (
GraphSummaryCompletionRetriever,
)
# NOTE(review): get_history, get_default_user and NodeSet are imported but
# not referenced anywhere visible in this file — confirm and prune.
from cognee.modules.search.operations import get_history
from cognee.modules.users.methods import get_default_user
from cognee.shared.logging_utils import get_logger
from cognee.modules.search.types import SearchType
from cognee.modules.engine.models import NodeSet

logger = get_logger()
async def main():
    """End-to-end search test for the configured database backends.

    Builds a tiny dataset, runs ``cognee.cognify``, then verifies that every
    graph-completion retriever flavor returns (1) a non-empty string context,
    (2) a non-empty list of scored ``Edge`` triplets with distances in
    ``[0, 1]``, and (3) that ``cognee.search`` answers the test question.

    The graph/vector/relational backends are chosen via environment
    variables (``GRAPH_DATABASE_PROVIDER``, ``VECTOR_DB_PROVIDER``,
    ``DB_PROVIDER``, ...), so CI can run this module against each stack.

    Raises:
        AssertionError: if any retriever or search result violates the
            expected contract (type, emptiness, content, distance range).
    """
    # This test runs for multiple db settings; to run this locally set the
    # corresponding db envs.
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    dataset_name = "test_dataset"

    text_1 = """Germany is located in europe right next to the Netherlands"""

    await cognee.add(text_1, dataset_name)

    text = """A quantum computer is a computer that takes advantage of quantum mechanical phenomena.
At small scales, physical matter exhibits properties of both particles and waves, and quantum computing leverages this behavior, specifically quantum superposition and entanglement, using specialized hardware that supports the preparation and manipulation of quantum states.
Classical physics cannot explain the operation of these quantum devices, and a scalable quantum computer could perform some calculations exponentially faster (with respect to input size scaling) than any modern "classical" computer. In particular, a large-scale quantum computer could break widely used encryption schemes and aid physicists in performing physical simulations; however, the current state of the technology is largely experimental and impractical, with several obstacles to useful applications. Moreover, scalable quantum computers do not hold promise for many practical tasks, and for many important tasks quantum speedups are proven impossible.
The basic unit of information in quantum computing is the qubit, similar to the bit in traditional digital electronics. Unlike a classical bit, a qubit can exist in a superposition of its two "basis" states. When measuring a qubit, the result is a probabilistic output of a classical bit, therefore making quantum computers nondeterministic in general. If a quantum computer manipulates the qubit in a particular way, wave interference effects can amplify the desired measurement results. The design of quantum algorithms involves creating procedures that allow a quantum computer to perform calculations efficiently and quickly.
Physically engineering high-quality qubits has proven challenging. If a physical qubit is not sufficiently isolated from its environment, it suffers from quantum decoherence, introducing noise into calculations. Paradoxically, perfectly isolating qubits is also undesirable because quantum computations typically need to initialize qubits, perform controlled qubit interactions, and measure the resulting quantum states. Each of those operations introduces errors and suffers from noise, and such inaccuracies accumulate.
In principle, a non-quantum (classical) computer can solve the same computational problems as a quantum computer, given enough time. Quantum advantage comes in the form of time complexity rather than computability, and quantum complexity theory shows that some quantum algorithms for carefully selected tasks require exponentially fewer computational steps than the best known non-quantum algorithms. Such tasks can in theory be solved on a large-scale quantum computer whereas classical computers would not finish computations in any reasonable amount of time. However, quantum speedup is not universal or even typical across computational tasks, since basic tasks such as sorting are proven to not allow any asymptotic quantum speedup. Claims of quantum supremacy have drawn significant attention to the discipline, but are demonstrated on contrived tasks, while near-term practical use cases remain limited.
"""

    await cognee.add([text], dataset_name)

    await cognee.cognify([dataset_name])

    # Single test question shared by all checks below (was repeated inline).
    question = "Next to which country is Germany located?"

    # 1) Context checks: each retriever flavor must produce a non-empty
    # string mentioning at least one of the two entities from text_1.
    context_gk = await GraphCompletionRetriever().get_context(query=question)
    context_gk_cot = await GraphCompletionCotRetriever().get_context(query=question)
    context_gk_ext = await GraphCompletionContextExtensionRetriever().get_context(query=question)
    context_gk_sum = await GraphSummaryCompletionRetriever().get_context(query=question)

    for name, context in [
        ("GraphCompletionRetriever", context_gk),
        ("GraphCompletionCotRetriever", context_gk_cot),
        ("GraphCompletionContextExtensionRetriever", context_gk_ext),
        ("GraphSummaryCompletionRetriever", context_gk_sum),
    ]:
        assert isinstance(context, str), f"{name}: Context should be a string"
        assert context.strip(), f"{name}: Context should not be empty"
        lower = context.lower()
        assert "germany" in lower or "netherlands" in lower, (
            f"{name}: Context did not contain 'germany' or 'netherlands'; got: {context!r}"
        )

    # 2) Triplet checks: each retriever must return scored Edge triplets,
    # with edge and endpoint vector distances in [0, 1].
    triplets_gk = await GraphCompletionRetriever().get_triplets(query=question)
    triplets_gk_cot = await GraphCompletionCotRetriever().get_triplets(query=question)
    triplets_gk_ext = await GraphCompletionContextExtensionRetriever().get_triplets(query=question)
    triplets_gk_sum = await GraphSummaryCompletionRetriever().get_triplets(query=question)

    for name, triplets in [
        ("GraphCompletionRetriever", triplets_gk),
        ("GraphCompletionCotRetriever", triplets_gk_cot),
        ("GraphCompletionContextExtensionRetriever", triplets_gk_ext),
        ("GraphSummaryCompletionRetriever", triplets_gk_sum),
    ]:
        assert isinstance(triplets, list), f"{name}: Triplets should be a list"
        assert triplets, f"{name}: Triplets list should not be empty"
        for edge in triplets:
            assert isinstance(edge, Edge), f"{name}: Elements should be Edge instances"
            distance = edge.attributes.get("vector_distance")
            node1_distance = edge.node1.attributes.get("vector_distance")
            node2_distance = edge.node2.attributes.get("vector_distance")
            assert isinstance(distance, float), (
                f"{name}: vector_distance should be float, got {type(distance)}"
            )
            assert 0 <= distance <= 1, (
                f"{name}: edge vector_distance {distance} out of [0,1], this shouldn't happen"
            )
            # BUG FIX: these two messages previously interpolated the edge's
            # `distance` instead of the node's own value, so failures
            # reported the wrong number.
            assert 0 <= node1_distance <= 1, (
                f"{name}: node_1 vector_distance {node1_distance} out of [0,1], this shouldn't happen"
            )
            assert 0 <= node2_distance <= 1, (
                f"{name}: node_2 vector_distance {node2_distance} out of [0,1], this shouldn't happen"
            )

    # 3) End-to-end search checks: each completion search type must return a
    # single non-empty string that names the expected neighbor country.
    completion_gk = await cognee.search(
        query_type=SearchType.GRAPH_COMPLETION,
        query_text=question,
    )
    completion_cot = await cognee.search(
        query_type=SearchType.GRAPH_COMPLETION_COT,
        query_text=question,
    )
    completion_ext = await cognee.search(
        query_type=SearchType.GRAPH_COMPLETION_CONTEXT_EXTENSION,
        query_text=question,
    )
    completion_sum = await cognee.search(
        query_type=SearchType.GRAPH_SUMMARY_COMPLETION,
        query_text=question,
    )

    for name, completion in [
        ("GRAPH_COMPLETION", completion_gk),
        ("GRAPH_COMPLETION_COT", completion_cot),
        ("GRAPH_COMPLETION_CONTEXT_EXTENSION", completion_ext),
        ("GRAPH_SUMMARY_COMPLETION", completion_sum),
    ]:
        assert isinstance(completion, list), f"{name}: should return a list"
        assert len(completion) == 1, f"{name}: expected single-element list, got {len(completion)}"
        # Renamed from `text` to avoid shadowing the dataset text above.
        completion_text = completion[0]
        assert isinstance(completion_text, str), f"{name}: element should be a string"
        assert completion_text.strip(), f"{name}: string should not be empty"
        assert "netherlands" in completion_text.lower(), (
            f"{name}: expected 'netherlands' in result, got: {completion_text!r}"
        )
if __name__ == "__main__":
    # Allow running this test module directly as a script.
    import asyncio

    asyncio.run(main())