feat: Adds core db tests for main search (#1006)

<!-- .github/pull_request_template.md -->

## Description
 Adds core db tests for main search

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.

---------

Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>
This commit is contained in:
hajdul88 2025-06-24 10:51:34 +02:00 committed by GitHub
parent 82e35374d0
commit 97d05f105e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 387 additions and 0 deletions

View file

@@ -1,5 +1,8 @@
name: Reusable Graph DB Tests
permissions:
contents: read
on:
workflow_call:
inputs:

230
.github/workflows/search_db_tests.yml vendored Normal file
View file

@@ -0,0 +1,230 @@
name: Reusable Search DB Tests

permissions:
  contents: read

on:
  workflow_call:
    inputs:
      databases:
        required: false
        type: string
        default: "all"
        description: "Which vector databases to test (comma-separated list or 'all')"
      # BUG FIX: every job below passes ${{ inputs.python-version }} to the
      # cognee_setup action, but this input was never declared, so callers
      # could not set it and the expression always evaluated to empty.
      # Declared optional with a default to stay backward compatible.
      python-version:
        required: false
        type: string
        default: "3.11.x"  # TODO confirm this matches the other reusable workflows' default
        description: "Python version to set up via the cognee_setup action"
    secrets:
      WEAVIATE_API_URL:
        required: false
      WEAVIATE_API_KEY:
        required: false
jobs:
  # Search tests on the all-embedded stack: Kuzu graph, LanceDB vectors,
  # SQLite relational store — no external service containers needed.
  run-kuzu-lance-sqlite-search-tests:
    name: Search test for Kuzu/LanceDB/Sqlite
    runs-on: ubuntu-22.04
    if: ${{ inputs.databases == 'all' || contains(inputs.databases, 'kuzu/lance/sqlite') }}
    steps:
      - name: Check out
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Cognee Setup
        uses: ./.github/actions/cognee_setup
        with:
          python-version: ${{ inputs.python-version }}

      - name: Install specific db dependency
        run: |
          poetry install -E kuzu

      - name: Run Kuzu search Tests
        env:
          ENV: 'dev'
          LLM_MODEL: ${{ secrets.LLM_MODEL }}
          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
          LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
          EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
          EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
          EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
          EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
          GRAPH_DATABASE_PROVIDER: 'kuzu'
          VECTOR_DB_PROVIDER: 'lancedb'
          DB_PROVIDER: 'sqlite'
        run: poetry run python ./cognee/tests/test_search_db.py
run-neo4j-lance-sqlite-search-tests:
name: Search test for Neo4j/LanceDB/Sqlite
runs-on: ubuntu-22.04
if: ${{ inputs.databases == 'all' || contains(inputs.databases, 'neo4j/lance/sqlite') }}
services:
neo4j:
image: neo4j:5.11
env:
NEO4J_AUTH: neo4j/pleaseletmein
NEO4J_PLUGINS: '["apoc","graph-data-science"]'
ports:
- 7474:7474
- 7687:7687
options: >-
--health-cmd="cypher-shell -u neo4j -p pleaseletmein 'RETURN 1'"
--health-interval=10s
--health-timeout=5s
--health-retries=5
steps:
- name: Check out
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Cognee Setup
uses: ./.github/actions/cognee_setup
with:
python-version: ${{ inputs.python-version }}
- name: Install specific db dependency
run: |
poetry install -E neo4j
- name: Run Neo4j search Tests
env:
ENV: 'dev'
LLM_MODEL: ${{ secrets.LLM_MODEL }}
LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
GRAPH_DATABASE_PROVIDER: 'neo4j'
VECTOR_DB_PROVIDER: 'lancedb'
DB_PROVIDER: 'sqlite'
GRAPH_DATABASE_URL: bolt://localhost:7687
GRAPH_DATABASE_USERNAME: neo4j
GRAPH_DATABASE_PASSWORD: pleaseletmein
run: poetry run python ./cognee/tests/test_search_db.py
run-kuzu-pgvector-postgres-search-tests:
name: Search test for Kuzu/PGVector/Postgres
runs-on: ubuntu-22.04
if: ${{ inputs.databases == 'all' || contains(inputs.databases, 'kuzu/pgvector/postgres') }}
services:
postgres:
image: pgvector/pgvector:pg17
env:
POSTGRES_USER: cognee
POSTGRES_PASSWORD: cognee
POSTGRES_DB: cognee_db
options: >-
--health-cmd pg_isready
--health-interval 10s
--health-timeout 5s
--health-retries 5
ports:
- 5432:5432
steps:
- name: Check out
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Cognee Setup
uses: ./.github/actions/cognee_setup
with:
python-version: ${{ inputs.python-version }}
- name: Install dependencies
run: poetry install -E kuzu -E postgres
- name: Run Kuzu/PGVector/Postgres Tests
env:
ENV: dev
LLM_MODEL: ${{ secrets.LLM_MODEL }}
LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
GRAPH_DATABASE_PROVIDER: 'kuzu'
VECTOR_DB_PROVIDER: 'pgvector'
DB_PROVIDER: 'postgres'
DB_NAME: 'cognee_db'
DB_HOST: '127.0.0.1'
DB_PORT: 5432
DB_USERNAME: cognee
DB_PASSWORD: cognee
run: poetry run python ./cognee/tests/test_search_db.py
run-neo4j-pgvector-postgres-search-tests:
name: Search test for Neo4j/PGVector/Postgres
runs-on: ubuntu-22.04
if: ${{ inputs.databases == 'all' || contains(inputs.databases, 'neo4j/pgvector/postgres') }}
services:
neo4j:
image: neo4j:5.11
env:
NEO4J_AUTH: neo4j/pleaseletmein
NEO4J_PLUGINS: '["apoc","graph-data-science"]'
ports:
- 7474:7474
- 7687:7687
options: >-
--health-cmd="cypher-shell -u neo4j -p pleaseletmein 'RETURN 1'"
--health-interval=10s
--health-timeout=5s
--health-retries=5
postgres:
image: pgvector/pgvector:pg17
env:
POSTGRES_USER: cognee
POSTGRES_PASSWORD: cognee
POSTGRES_DB: cognee_db
ports:
- 5432:5432
options: >-
--health-cmd pg_isready
--health-interval 10s
--health-timeout 5s
--health-retries=5
steps:
- name: Check out
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Cognee Setup
uses: ./.github/actions/cognee_setup
with:
python-version: ${{ inputs.python-version }}
- name: Install dependencies
run: |
poetry install -E neo4j -E postgres
- name: Run Neo4j + PGVector + Postgres search Tests
env:
ENV: dev
LLM_MODEL: ${{ secrets.LLM_MODEL }}
LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
GRAPH_DATABASE_PROVIDER: 'neo4j'
VECTOR_DB_PROVIDER: 'pgvector'
DB_PROVIDER: 'postgres'
GRAPH_DATABASE_URL: bolt://localhost:7687
GRAPH_DATABASE_USERNAME: neo4j
GRAPH_DATABASE_PASSWORD: pleaseletmein
DB_NAME: cognee_db
DB_HOST: 127.0.0.1
DB_PORT: 5432
DB_USERNAME: cognee
DB_PASSWORD: cognee
run: poetry run python ./cognee/tests/test_search_db.py

View file

@@ -45,6 +45,12 @@ jobs:
uses: ./.github/workflows/graph_db_tests.yml
secrets: inherit
search-db-tests:
name: Search Test on Different DBs
needs: [basic-tests, e2e-tests, graph-db-tests]
uses: ./.github/workflows/search_db_tests.yml
secrets: inherit
relational-db-migration-tests:
name: Relational DB Migration Tests
needs: [ basic-tests, e2e-tests, graph-db-tests]

View file

@@ -0,0 +1,148 @@
# Integration test for cognee's search flows; the DB backends it runs
# against are selected via environment variables set by CI.
# NOTE(review): `os` and `pathlib` appear unused in this file — confirm
# against the full file before removing.
import os
import pathlib
# NOTE(review): `query` is never used below, and `dns.e164` pulls in the
# third-party dnspython package — this looks like an accidental IDE
# auto-import; confirm and remove.
from dns.e164 import query
import cognee
from cognee.modules.graph.cognee_graph.CogneeGraphElements import Edge
from cognee.modules.retrieval.graph_completion_retriever import GraphCompletionRetriever
from cognee.modules.retrieval.graph_completion_context_extension_retriever import (
GraphCompletionContextExtensionRetriever,
)
from cognee.modules.retrieval.graph_completion_cot_retriever import GraphCompletionCotRetriever
from cognee.modules.retrieval.graph_summary_completion_retriever import (
GraphSummaryCompletionRetriever,
)
# NOTE(review): get_history, get_default_user and NodeSet are imported but
# not referenced anywhere visible in this file — confirm and prune.
from cognee.modules.search.operations import get_history
from cognee.modules.users.methods import get_default_user
from cognee.shared.logging_utils import get_logger
from cognee.modules.search.types import SearchType
from cognee.modules.engine.models import NodeSet

logger = get_logger()
async def main():
    """End-to-end search test for the configured database backends.

    Builds a tiny dataset, runs ``cognee.cognify``, then verifies that every
    graph-completion retriever flavor returns (1) a non-empty string context,
    (2) a non-empty list of scored ``Edge`` triplets with distances in
    ``[0, 1]``, and (3) that ``cognee.search`` answers the test question.

    The graph/vector/relational backends are chosen via environment
    variables (``GRAPH_DATABASE_PROVIDER``, ``VECTOR_DB_PROVIDER``,
    ``DB_PROVIDER``, ...), so CI can run this module against each stack.

    Raises:
        AssertionError: if any retriever or search result violates the
            expected contract (type, emptiness, content, distance range).
    """
    # This test runs for multiple db settings; to run this locally set the
    # corresponding db envs.
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    dataset_name = "test_dataset"

    text_1 = """Germany is located in europe right next to the Netherlands"""

    await cognee.add(text_1, dataset_name)

    text = """A quantum computer is a computer that takes advantage of quantum mechanical phenomena.
At small scales, physical matter exhibits properties of both particles and waves, and quantum computing leverages this behavior, specifically quantum superposition and entanglement, using specialized hardware that supports the preparation and manipulation of quantum states.
Classical physics cannot explain the operation of these quantum devices, and a scalable quantum computer could perform some calculations exponentially faster (with respect to input size scaling) than any modern "classical" computer. In particular, a large-scale quantum computer could break widely used encryption schemes and aid physicists in performing physical simulations; however, the current state of the technology is largely experimental and impractical, with several obstacles to useful applications. Moreover, scalable quantum computers do not hold promise for many practical tasks, and for many important tasks quantum speedups are proven impossible.
The basic unit of information in quantum computing is the qubit, similar to the bit in traditional digital electronics. Unlike a classical bit, a qubit can exist in a superposition of its two "basis" states. When measuring a qubit, the result is a probabilistic output of a classical bit, therefore making quantum computers nondeterministic in general. If a quantum computer manipulates the qubit in a particular way, wave interference effects can amplify the desired measurement results. The design of quantum algorithms involves creating procedures that allow a quantum computer to perform calculations efficiently and quickly.
Physically engineering high-quality qubits has proven challenging. If a physical qubit is not sufficiently isolated from its environment, it suffers from quantum decoherence, introducing noise into calculations. Paradoxically, perfectly isolating qubits is also undesirable because quantum computations typically need to initialize qubits, perform controlled qubit interactions, and measure the resulting quantum states. Each of those operations introduces errors and suffers from noise, and such inaccuracies accumulate.
In principle, a non-quantum (classical) computer can solve the same computational problems as a quantum computer, given enough time. Quantum advantage comes in the form of time complexity rather than computability, and quantum complexity theory shows that some quantum algorithms for carefully selected tasks require exponentially fewer computational steps than the best known non-quantum algorithms. Such tasks can in theory be solved on a large-scale quantum computer whereas classical computers would not finish computations in any reasonable amount of time. However, quantum speedup is not universal or even typical across computational tasks, since basic tasks such as sorting are proven to not allow any asymptotic quantum speedup. Claims of quantum supremacy have drawn significant attention to the discipline, but are demonstrated on contrived tasks, while near-term practical use cases remain limited.
"""

    await cognee.add([text], dataset_name)

    await cognee.cognify([dataset_name])

    # Single test question shared by all checks below (was repeated inline).
    question = "Next to which country is Germany located?"

    # 1) Context checks: each retriever flavor must produce a non-empty
    # string mentioning at least one of the two entities from text_1.
    context_gk = await GraphCompletionRetriever().get_context(query=question)
    context_gk_cot = await GraphCompletionCotRetriever().get_context(query=question)
    context_gk_ext = await GraphCompletionContextExtensionRetriever().get_context(query=question)
    context_gk_sum = await GraphSummaryCompletionRetriever().get_context(query=question)

    for name, context in [
        ("GraphCompletionRetriever", context_gk),
        ("GraphCompletionCotRetriever", context_gk_cot),
        ("GraphCompletionContextExtensionRetriever", context_gk_ext),
        ("GraphSummaryCompletionRetriever", context_gk_sum),
    ]:
        assert isinstance(context, str), f"{name}: Context should be a string"
        assert context.strip(), f"{name}: Context should not be empty"
        lower = context.lower()
        assert "germany" in lower or "netherlands" in lower, (
            f"{name}: Context did not contain 'germany' or 'netherlands'; got: {context!r}"
        )

    # 2) Triplet checks: each retriever must return scored Edge triplets,
    # with edge and endpoint vector distances in [0, 1].
    triplets_gk = await GraphCompletionRetriever().get_triplets(query=question)
    triplets_gk_cot = await GraphCompletionCotRetriever().get_triplets(query=question)
    triplets_gk_ext = await GraphCompletionContextExtensionRetriever().get_triplets(query=question)
    triplets_gk_sum = await GraphSummaryCompletionRetriever().get_triplets(query=question)

    for name, triplets in [
        ("GraphCompletionRetriever", triplets_gk),
        ("GraphCompletionCotRetriever", triplets_gk_cot),
        ("GraphCompletionContextExtensionRetriever", triplets_gk_ext),
        ("GraphSummaryCompletionRetriever", triplets_gk_sum),
    ]:
        assert isinstance(triplets, list), f"{name}: Triplets should be a list"
        assert triplets, f"{name}: Triplets list should not be empty"
        for edge in triplets:
            assert isinstance(edge, Edge), f"{name}: Elements should be Edge instances"
            distance = edge.attributes.get("vector_distance")
            node1_distance = edge.node1.attributes.get("vector_distance")
            node2_distance = edge.node2.attributes.get("vector_distance")
            assert isinstance(distance, float), (
                f"{name}: vector_distance should be float, got {type(distance)}"
            )
            assert 0 <= distance <= 1, (
                f"{name}: edge vector_distance {distance} out of [0,1], this shouldn't happen"
            )
            # BUG FIX: these two messages previously interpolated the edge's
            # `distance` instead of the node's own value, so failures
            # reported the wrong number.
            assert 0 <= node1_distance <= 1, (
                f"{name}: node_1 vector_distance {node1_distance} out of [0,1], this shouldn't happen"
            )
            assert 0 <= node2_distance <= 1, (
                f"{name}: node_2 vector_distance {node2_distance} out of [0,1], this shouldn't happen"
            )

    # 3) End-to-end search checks: each completion search type must return a
    # single non-empty string that names the expected neighbor country.
    completion_gk = await cognee.search(
        query_type=SearchType.GRAPH_COMPLETION,
        query_text=question,
    )
    completion_cot = await cognee.search(
        query_type=SearchType.GRAPH_COMPLETION_COT,
        query_text=question,
    )
    completion_ext = await cognee.search(
        query_type=SearchType.GRAPH_COMPLETION_CONTEXT_EXTENSION,
        query_text=question,
    )
    completion_sum = await cognee.search(
        query_type=SearchType.GRAPH_SUMMARY_COMPLETION,
        query_text=question,
    )

    for name, completion in [
        ("GRAPH_COMPLETION", completion_gk),
        ("GRAPH_COMPLETION_COT", completion_cot),
        ("GRAPH_COMPLETION_CONTEXT_EXTENSION", completion_ext),
        ("GRAPH_SUMMARY_COMPLETION", completion_sum),
    ]:
        assert isinstance(completion, list), f"{name}: should return a list"
        assert len(completion) == 1, f"{name}: expected single-element list, got {len(completion)}"
        # Renamed from `text` to avoid shadowing the dataset text above.
        completion_text = completion[0]
        assert isinstance(completion_text, str), f"{name}: element should be a string"
        assert completion_text.strip(), f"{name}: string should not be empty"
        assert "netherlands" in completion_text.lower(), (
            f"{name}: expected 'netherlands' in result, got: {completion_text!r}"
        )
if __name__ == "__main__":
    # Allow running this test module directly as a script.
    import asyncio

    asyncio.run(main())