Merge pull request #197 from topoteretes/COG-577-add-unit-test-task

Cog 577 add unit test task
2024-11-13 15:34:13 +01:00 · 2024-11-13 15:34:13 +01:00 · 7a72aa44d9
commit 7a72aa44d9
parent cdaf63f57c 49bc07d30d
24 changed files with 360 additions and 185 deletions
--- a/.github/workflows/test_python_3_10.yml
+++ b/.github/workflows/test_python_3_10.yml
@ -50,8 +50,11 @@ jobs:
      - name: Install dependencies
        run: poetry install --no-interaction
-      - name: Run tests
+      - name: Run unit tests
-        run: poetry run pytest tests/
+        run: poetry run pytest cognee/tests/unit/
      - name: Run integration tests
        run: poetry run pytest cognee/tests/integration/
      - name: Run default basic pipeline
        env:
--- a/.github/workflows/test_python_3_11.yml
+++ b/.github/workflows/test_python_3_11.yml
@ -50,8 +50,11 @@ jobs:
      - name: Install dependencies
        run: poetry install --no-interaction
-      - name: Run tests
+      - name: Run unit tests
-        run: poetry run pytest tests/
+        run: poetry run pytest cognee/tests/unit/
      - name: Run integration tests
        run: poetry run pytest cognee/tests/integration/
      - name: Run default basic pipeline
        env:
--- a/.github/workflows/test_python_3_9.yml
+++ b/.github/workflows/test_python_3_9.yml
@ -50,8 +50,11 @@ jobs:
      - name: Install dependencies
        run: poetry install --no-interaction
-      - name: Run tests
+      - name: Run unit tests
-        run: poetry run pytest tests/
+        run: poetry run pytest cognee/tests/unit/
      - name: Run integration tests
        run: poetry run pytest cognee/tests/integration/
      - name: Run default basic pipeline
        env:
--- a/cognee/infrastructure/engine/tests/model_to_graph_to_model.test.py
+++ b/cognee/infrastructure/engine/tests/model_to_graph_to_model.test.py
@ -1,72 +0,0 @@
 from enum import Enum
 from typing import Optional
 from cognee.infrastructure.engine import DataPoint
 from cognee.modules.graph.utils import get_graph_from_model, get_model_instance_from_graph
 if __name__ == "__main__":
    class CarTypeName(Enum):
        Pickup = "Pickup"
        Sedan = "Sedan"
        SUV = "SUV"
        Coupe = "Coupe"
        Convertible = "Convertible"
        Hatchback = "Hatchback"
        Wagon = "Wagon"
        Minivan = "Minivan"
        Van = "Van"
    class CarType(DataPoint):
        id: str
        name: CarTypeName
        _metadata: dict = dict(index_fields = ["name"])
    class Car(DataPoint):
        id: str
        brand: str
        model: str
        year: int
        color: str
        is_type: CarType
    class Person(DataPoint):
        id: str
        name: str
        age: int
        owns_car: list[Car]
        driving_licence: Optional[dict]
        _metadata: dict = dict(index_fields = ["name"])
    boris = Person(
        id = "boris",
        name = "Boris",
        age = 30,
        owns_car = [
            Car(
                id = "car1",
                brand = "Toyota",
                model = "Camry",
                year = 2020,
                color = "Blue",
                is_type = CarType(id = "sedan", name = CarTypeName.Sedan),
            ),
        ],
        driving_licence = {
            "issued_by": "PU Vrsac",
            "issued_on": "2025-11-06",
            "number": "1234567890",
            "expires_on": "2025-11-06",
        },
    )
    nodes, edges = get_graph_from_model(boris)
    print(nodes)
    print(edges)
    person_data = nodes[len(nodes) - 1]
    parsed_person = get_model_instance_from_graph(nodes, edges, 'boris')
    print(parsed_person)
--- a/cognee/modules/data/processing/document_types/tests/PdfDocument.test.py
+++ b/cognee/modules/data/processing/document_types/tests/PdfDocument.test.py
@ -1,13 +0,0 @@
 import os
 from cognee.modules.data.processing.document_types.PdfDocument import PdfDocument
 if __name__ == "__main__":
    test_file_path = os.path.join(os.path.dirname(__file__), "artificial-inteligence.pdf")
    pdf_doc = PdfDocument("Test document.pdf", test_file_path, chunking_strategy="paragraph")
    reader = pdf_doc.get_reader()
    for paragraph_data in reader.read():
        print(paragraph_data["word_count"])
        print(paragraph_data["text"])
        print(paragraph_data["cut_type"])
        print("\n")
--- a/cognee/modules/data/processing/document_types/tests/artificial-inteligence.pdf
+++ b/cognee/modules/data/processing/document_types/tests/artificial-inteligence.pdf
--- a/cognee/modules/data/processing/document_types/tests/soldiers-home.pdf
+++ b/cognee/modules/data/processing/document_types/tests/soldiers-home.pdf
--- a/cognee/modules/pipelines/operations/tests/init.py
+++ b/cognee/modules/pipelines/operations/tests/init.py
--- a/cognee/modules/pipelines/operations/tests/artificial-inteligence.v1.pdf
+++ b/cognee/modules/pipelines/operations/tests/artificial-inteligence.v1.pdf
--- a/cognee/modules/pipelines/operations/tests/artificial-inteligence.v2.pdf
+++ b/cognee/modules/pipelines/operations/tests/artificial-inteligence.v2.pdf
--- a/cognee/modules/pipelines/operations/tests/get_graph_url.py
+++ b/cognee/modules/pipelines/operations/tests/get_graph_url.py
@ -1,14 +0,0 @@
 import asyncio
 from cognee.shared.utils import render_graph
 from cognee.infrastructure.databases.graph import get_graph_engine
 if __name__ == "__main__":
    async def main():
        graph_client = await get_graph_engine()
        graph = graph_client.graph
        graph_url = await render_graph(graph)
        print(graph_url)
    asyncio.run(main())
--- a/cognee/tasks/chunks/tests/chunk_by_paragraph.test.py
+++ b/cognee/tasks/chunks/tests/chunk_by_paragraph.test.py
@ -1,53 +0,0 @@
 from cognee.tasks.chunks import chunk_by_paragraph
 if __name__ == "__main__":
    def test_chunking_on_whole_text():
        test_text = """This is example text. It contains multiple sentences.
        This is a second paragraph. First two paragraphs are whole.
        Third paragraph is a bit longer and is finished with a dot."""
        chunks = []
        for chunk_data in chunk_by_paragraph(test_text, 12, batch_paragraphs = False):
            chunks.append(chunk_data)
        assert len(chunks) == 3
        assert chunks[0]["text"] == "This is example text. It contains multiple sentences."
        assert chunks[0]["word_count"] == 8
        assert chunks[0]["cut_type"] == "paragraph_end"
        assert chunks[1]["text"] == "This is a second paragraph. First two paragraphs are whole."
        assert chunks[1]["word_count"] == 10
        assert chunks[1]["cut_type"] == "paragraph_end"
        assert chunks[2]["text"] == "Third paragraph is a bit longer and is finished with a dot."
        assert chunks[2]["word_count"] == 12
        assert chunks[2]["cut_type"] == "sentence_end"
    def test_chunking_on_cut_text():
        test_text = """This is example text. It contains multiple sentences.
        This is a second paragraph. First two paragraphs are whole.
        Third paragraph is cut and is missing the dot at the end"""
        chunks = []
        for chunk_data in chunk_by_paragraph(test_text, 12, batch_paragraphs = False):
            chunks.append(chunk_data)
        assert len(chunks) == 3
        assert chunks[0]["text"] == "This is example text. It contains multiple sentences."
        assert chunks[0]["word_count"] == 8
        assert chunks[0]["cut_type"] == "paragraph_end"
        assert chunks[1]["text"] == "This is a second paragraph. First two paragraphs are whole."
        assert chunks[1]["word_count"] == 10
        assert chunks[1]["cut_type"] == "paragraph_end"
        assert chunks[2]["text"] == "Third paragraph is cut and is missing the dot at the end"
        assert chunks[2]["word_count"] == 12
        assert chunks[2]["cut_type"] == "sentence_cut"
    test_chunking_on_whole_text()
    test_chunking_on_cut_text()
--- a/cognee/tasks/documents/classify_documents.py
+++ b/cognee/tasks/documents/classify_documents.py
@ -5,14 +5,12 @@ EXTENSION_TO_DOCUMENT_CLASS = {
    "pdf": PdfDocument,
    "audio": AudioDocument,
    "image": ImageDocument,
    "pdf": TextDocument,
    "txt": TextDocument
 }
 def classify_documents(data_documents: list[Data]) -> list[Document]:
    documents = [
-        EXTENSION_TO_DOCUMENT_CLASS[data_item.extension](id = data_item.id, title=f"{data_item.name}.{data_item.extension}", raw_data_location=data_item.raw_data_location)
+        EXTENSION_TO_DOCUMENT_CLASS[data_item.extension](id = data_item.id, title=f"{data_item.name}.{data_item.extension}", raw_data_location=data_item.raw_data_location, name=data_item.name)
        for data_item in data_documents
    ]
    return documents
--- a/cognee/tests/integration/run_toy_tasks/conftest.py
+++ b/cognee/tests/integration/run_toy_tasks/conftest.py
@ -0,0 +1,11 @@
 import os
 import pytest
@pytest.fixture(autouse=True, scope="session")
 def copy_cognee_db_to_target_location():
    """Copy the bundled ``cognee_db`` file into the cognee system database folder.

    Both paths are relative to the current working directory. The target
    directory is created when it does not exist yet.

    Raises:
        OSError: If the source database is missing or the copy fails.
    """
    import shutil  # local import keeps this fixture self-contained

    source_db = "cognee/tests/integration/run_toy_tasks/data/cognee_db"
    target_dir = "cognee/.cognee_system/databases/"
    os.makedirs(target_dir, exist_ok=True)
    # shutil.copy instead of shelling out to `cp`: it does not depend on a `cp`
    # binary being available and raises on failure, whereas os.system() only
    # returns an exit status that was being discarded.
    shutil.copy(source_db, os.path.join(target_dir, "cognee_db"))
--- a/cognee/tests/integration/run_toy_tasks/data/cognee_db
+++ b/cognee/tests/integration/run_toy_tasks/data/cognee_db
--- a/cognee/modules/pipelines/operations/tests/run_tasks_from_queue.test.py
+++ b/cognee/modules/pipelines/operations/tests/run_tasks_from_queue.test.py
@ -1,8 +1,10 @@
 import asyncio
 from queue import Queue
 from cognee.modules.pipelines.operations.run_tasks import run_tasks
 from cognee.modules.pipelines.tasks.Task import Task
 async def pipeline(data_queue):
    async def queue_consumer():
        while not data_queue.is_closed:
@ -17,20 +19,25 @@ async def pipeline(data_queue):
    async def multiply_by_two(num):
        yield num * 2
-    tasks_run = run_tasks([
+    tasks_run = run_tasks(
-        Task(queue_consumer),
+        [
-        Task(add_one),
+            Task(queue_consumer),
-        Task(multiply_by_two),
+            Task(add_one),
-    ])
+            Task(multiply_by_two),
        ],
        pipeline_name="test_run_tasks_from_queue",
    )
-    results = [2, 4, 6, 8, 10, 12, 14, 16, 18]
+    results = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
    index = 0
    async for result in tasks_run:
-        print(result)
+        assert (
-        assert result == results[index]
+            result == results[index]
        ), f"at {index = }: {result = } != {results[index] = }"
        index += 1
-async def main():
+
 async def run_queue():
    data_queue = Queue()
    data_queue.is_closed = False
@ -42,5 +49,6 @@ async def main():
    await asyncio.gather(pipeline(data_queue), queue_producer())
-if __name__ == "__main__":
+
-    asyncio.run(main())
+def test_run_tasks_from_queue():
    asyncio.run(run_queue())
--- a/cognee/modules/pipelines/operations/tests/run_tasks.test.py
+++ b/cognee/modules/pipelines/operations/tests/run_tasks.test.py
@ -1,9 +1,10 @@
 import asyncio
 from cognee.modules.pipelines.operations.run_tasks import run_tasks
 from cognee.modules.pipelines.tasks.Task import Task
-async def main():
+async def run_and_check_tasks():
    def number_generator(num):
        for i in range(num):
            yield i + 1
@ -18,19 +19,25 @@ async def main():
    async def add_one_single(num):
        yield num + 1
-    pipeline = run_tasks([
+    pipeline = run_tasks(
-        Task(number_generator),
+        [
-        Task(add_one, task_config = {"batch_size": 5}),
+            Task(number_generator),
-        Task(multiply_by_two, task_config = {"batch_size": 1}),
+            Task(add_one, task_config={"batch_size": 5}),
-        Task(add_one_single),
+            Task(multiply_by_two, task_config={"batch_size": 1}),
-    ], 10)
+            Task(add_one_single),
        ],
        10,
        pipeline_name="test_run_tasks",
    )
    results = [5, 7, 9, 11, 13, 15, 17, 19, 21, 23]
    index = 0
    async for result in pipeline:
-        print(result)
+        assert (
-        assert result == results[index]
+            result == results[index]
        ), f"at {index = }: {result = } != {results[index] = }"
        index += 1
-if __name__ == "__main__":
+
-    asyncio.run(main())
+def test_run_tasks():
    asyncio.run(run_and_check_tasks())
--- a/cognee/tests/unit/documents/PdfDocument_test.py
+++ b/cognee/tests/unit/documents/PdfDocument_test.py
@ -0,0 +1,34 @@
 import os
 import uuid
 from cognee.modules.data.processing.document_types.PdfDocument import PdfDocument
 GROUND_TRUTH = [
    {"word_count": 879, "len_text": 5622, "cut_type": "sentence_end"},
    {"word_count": 951, "len_text": 6384, "cut_type": "sentence_end"},
 ]
 def test_PdfDocument():
    """Compare the chunks read from the sample PDF against GROUND_TRUTH.

    For each expected entry the chunk's word count, text length and cut type
    must match, and the document must yield at least as many chunks as
    GROUND_TRUTH lists.
    """
    # Step up two directories from this file's folder with os.pardir rather
    # than splitting the path on os.sep and re-joining it, which is fragile
    # for relative paths and for paths that start with a drive letter.
    tests_root = os.path.abspath(
        os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)
    )
    test_file_path = os.path.join(
        tests_root,
        "test_data",
        "artificial-intelligence.pdf",
    )
    pdf_doc = PdfDocument(
        id=uuid.uuid4(), name="Test document.pdf", raw_data_location=test_file_path
    )

    compared_chunks = 0
    for ground_truth, paragraph_data in zip(
        GROUND_TRUTH, pdf_doc.read(chunk_size=1024)
    ):
        assert (
            ground_truth["word_count"] == paragraph_data.word_count
        ), f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
        assert ground_truth["len_text"] == len(
            paragraph_data.text
        ), f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'
        assert (
            ground_truth["cut_type"] == paragraph_data.cut_type
        ), f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }'
        compared_chunks += 1

    # zip() stops at the shorter input, so without this check a reader that
    # yields too few (or zero) chunks would let the test pass vacuously.
    assert compared_chunks == len(
        GROUND_TRUTH
    ), f"{compared_chunks = } != {len(GROUND_TRUTH) = }"
--- a/cognee/tests/unit/interfaces/graph/conftest.py
+++ b/cognee/tests/unit/interfaces/graph/conftest.py
@ -0,0 +1,80 @@
 from datetime import datetime, timezone
 from enum import Enum
 from typing import Optional
 import pytest
 from cognee.infrastructure.engine import DataPoint
 from cognee.modules.graph.utils import (
    get_graph_from_model,
    get_model_instance_from_graph,
 )
 class CarTypeName(Enum):
    """Closed set of car type names; used as the type of ``CarType.name``."""
    # Each member's value is the same string as its name.
    Pickup = "Pickup"
    Sedan = "Sedan"
    SUV = "SUV"
    Coupe = "Coupe"
    Convertible = "Convertible"
    Hatchback = "Hatchback"
    Wagon = "Wagon"
    Minivan = "Minivan"
    Van = "Van"
 class CarType(DataPoint):
    """Test model for a car type, identified by ``id`` and named by a CarTypeName."""
    id: str
    name: CarTypeName
    # Lists "name" under index_fields; how DataPoint consumes this metadata is
    # not visible here (presumably indexing) - confirm against DataPoint.
    _metadata: dict = dict(index_fields=["name"])
 class Car(DataPoint):
    """Test model for a car; references its CarType through ``is_type``."""
    id: str
    brand: str
    model: str
    year: int
    color: str
    # Nested DataPoint: the car's type is itself a model instance.
    is_type: CarType
 class Person(DataPoint):
    """Test model for a person who owns a list of cars."""
    id: str
    name: str
    age: int
    # List of nested DataPoint instances owned by this person.
    owns_car: list[Car]
    # Plain dict payload; annotated Optional but declared without a default.
    driving_license: Optional[dict]
    # Lists "name" under index_fields; how DataPoint consumes this metadata is
    # not visible here (presumably indexing) - confirm against DataPoint.
    _metadata: dict = dict(index_fields=["name"])
@pytest.fixture(scope="session")
 def graph_outputs():
    """Build the sample Person graph and return the pieces the tests inspect.

    Returns:
        A ``(car, person, edge, parsed_person)`` tuple: the first two nodes and
        the first edge produced by ``get_graph_from_model``, plus the model
        rebuilt from that graph by ``get_model_instance_from_graph``.
    """
    sedan_type = CarType(id="sedan", name=CarTypeName.Sedan)
    camry = Car(
        id="car1",
        brand="Toyota",
        model="Camry",
        year=2020,
        color="Blue",
        is_type=sedan_type,
    )
    license_details = {
        "issued_by": "PU Vrsac",
        "issued_on": "2025-11-06",
        "number": "1234567890",
        "expires_on": "2025-11-06",
    }
    boris = Person(
        id="boris",
        name="Boris",
        age=30,
        owns_car=[camry],
        driving_license=license_details,
    )

    nodes, edges = get_graph_from_model(boris)
    # The tests rely on this ordering: car node first, person node second.
    car = nodes[0]
    person = nodes[1]
    edge = edges[0]
    parsed_person = get_model_instance_from_graph(nodes, edges, "boris")
    return car, person, edge, parsed_person
--- a/cognee/tests/unit/interfaces/graph/get_graph_from_model_test.py
+++ b/cognee/tests/unit/interfaces/graph/get_graph_from_model_test.py
@ -0,0 +1,54 @@
 from cognee.tests.unit.interfaces.graph.util import run_test_against_ground_truth
 EDGE_GROUND_TRUTH = (
    "boris",
    "car1",
    "owns_car",
    {
        "source_node_id": "boris",
        "target_node_id": "car1",
        "relationship_name": "owns_car",
        "metadata": {"type": "list"},
    },
 )
 CAR_GROUND_TRUTH = {
    "id": "car1",
    "brand": "Toyota",
    "model": "Camry",
    "year": 2020,
    "color": "Blue",
 }
 PERSON_GROUND_TRUTH = {
    "id": "boris",
    "name": "Boris",
    "age": 30,
    "driving_license": {
        "issued_by": "PU Vrsac",
        "issued_on": "2025-11-06",
        "number": "1234567890",
        "expires_on": "2025-11-06",
    },
 }
 def test_extracted_person(graph_outputs):
    """The person node from the fixture matches PERSON_GROUND_TRUTH."""
    _car, person, _edge, _parsed_person = graph_outputs
    run_test_against_ground_truth("person", person, PERSON_GROUND_TRUTH)
 def test_extracted_car(graph_outputs):
    """The car node from the fixture matches CAR_GROUND_TRUTH."""
    car, _person, _edge, _parsed_person = graph_outputs
    run_test_against_ground_truth("car", car, CAR_GROUND_TRUTH)
 def test_extracted_edge(graph_outputs):
    """The first edge matches EDGE_GROUND_TRUTH.

    The first three positions are compared as a whole; the fourth is a
    properties mapping that is only checked for the keys the ground truth
    lists, so extra properties on the edge are tolerated.
    """
    _car, _person, edge, _parsed_person = graph_outputs

    assert (
        EDGE_GROUND_TRUTH[:3] == edge[:3]
    ), f"{EDGE_GROUND_TRUTH[:3] = } != {edge[:3] = }"

    for key, ground_truth in EDGE_GROUND_TRUTH[3].items():
        assert ground_truth == edge[3][key], f"{ground_truth = } != {edge[3][key] = }"
--- a/cognee/tests/unit/interfaces/graph/get_model_instance_from_graph_test.py
+++ b/cognee/tests/unit/interfaces/graph/get_model_instance_from_graph_test.py
@ -0,0 +1,29 @@
 from cognee.tests.unit.interfaces.graph.util import run_test_against_ground_truth
 PARSED_PERSON_GROUND_TRUTH = {
    "id": "boris",
    "name": "Boris",
    "age": 30,
    "driving_license": {
        "issued_by": "PU Vrsac",
        "issued_on": "2025-11-06",
        "number": "1234567890",
        "expires_on": "2025-11-06",
    },
 }
 CAR_GROUND_TRUTH = {
    "id": "car1",
    "brand": "Toyota",
    "model": "Camry",
    "year": 2020,
    "color": "Blue",
 }
 def test_parsed_person(graph_outputs):
    """The model rebuilt from the graph matches the person and car ground truths."""
    _car, _person, _edge, parsed_person = graph_outputs
    run_test_against_ground_truth(
        "parsed_person", parsed_person, PARSED_PERSON_GROUND_TRUTH
    )
    # The nested car must survive the round trip as well.
    first_car = parsed_person.owns_car[0]
    run_test_against_ground_truth("car", first_car, CAR_GROUND_TRUTH)
--- a/cognee/tests/unit/interfaces/graph/util.py
+++ b/cognee/tests/unit/interfaces/graph/util.py
@ -0,0 +1,30 @@
 from datetime import datetime, timezone
 from typing import Any, Dict
 def run_test_against_ground_truth(
    test_target_item_name: str, test_target_item: Any, ground_truth_dict: Dict[str, Any]
 ):
    """Assert that an object's attributes equal the expected values.

    Scalar expectations are compared with the attribute of the same name.
    Dict expectations are compared key by key against the attribute, one level
    deep, and only for the keys the expectation lists. Finally the object's
    ``updated_at`` must be less than 60 seconds older than the current UTC time.

    Args:
        test_target_item_name: Label for the item, used in failure messages.
        test_target_item: Object whose attributes are checked.
        ground_truth_dict: Expected attribute values keyed by attribute name.

    Raises:
        AssertionError: If a value differs or ``updated_at`` is too old.
    """
    for key, ground_truth in ground_truth_dict.items():
        if not isinstance(ground_truth, dict):
            # Scalar expectation: compare the attribute directly.
            assert ground_truth == getattr(
                test_target_item, key
            ), f"{test_target_item_name}/{key = }: {ground_truth = } != {getattr(test_target_item, key) = }"
            continue

        # Dict expectation: check only the keys the ground truth lists.
        for key2, ground_truth2 in ground_truth.items():
            assert (
                ground_truth2 == getattr(test_target_item, key)[key2]
            ), f"{test_target_item_name}/{key = }/{key2 = }: {ground_truth2 = } != {getattr(test_target_item, key)[key2] = }"

    # Freshness check on the item's update timestamp.
    now = datetime.now(timezone.utc)
    time_delta = now - test_target_item.updated_at
    assert time_delta.total_seconds() < 60, f"{ time_delta.total_seconds() = }"
--- a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py
+++ b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py
@ -0,0 +1,69 @@
 from cognee.tasks.chunks import chunk_by_paragraph
 GROUND_TRUTH = {
    "whole_text": [
        {
            "text": "This is example text. It contains multiple sentences.",
            "word_count": 8,
            "cut_type": "paragraph_end",
        },
        {
            "text": "This is a second paragraph. First two paragraphs are whole.",
            "word_count": 10,
            "cut_type": "paragraph_end",
        },
        {
            "text": "Third paragraph is a bit longer and is finished with a dot.",
            "word_count": 12,
            "cut_type": "sentence_end",
        },
    ],
    "cut_text": [
        {
            "text": "This is example text. It contains multiple sentences.",
            "word_count": 8,
            "cut_type": "paragraph_end",
        },
        {
            "text": "This is a second paragraph. First two paragraphs are whole.",
            "word_count": 10,
            "cut_type": "paragraph_end",
        },
        {
            "text": "Third paragraph is cut and is missing the dot at the end",
            "word_count": 12,
            "cut_type": "sentence_cut",
        },
    ],
 }
 INPUT_TEXT = {
    "whole_text": """This is example text. It contains multiple sentences.
    This is a second paragraph. First two paragraphs are whole.
    Third paragraph is a bit longer and is finished with a dot.""",
    "cut_text": """This is example text. It contains multiple sentences.
    This is a second paragraph. First two paragraphs are whole.
    Third paragraph is cut and is missing the dot at the end""",
 }
 def run_chunking_test(test_text, expected_chunks):
    """Chunk ``test_text`` by paragraph and compare the result with ``expected_chunks``.

    Args:
        test_text: Text passed to ``chunk_by_paragraph`` (called with 12 as the
            second argument and ``batch_paragraphs=False``).
        expected_chunks: Expected chunk dicts, in order; each must provide
            "text", "word_count" and "cut_type".

    Raises:
        AssertionError: If the number of chunks or any compared field differs.
    """
    chunks = list(chunk_by_paragraph(test_text, 12, batch_paragraphs=False))

    # Compare against the expectation's own length instead of a hard-coded 3,
    # so the helper stays correct for fixtures of any size and zip() below
    # can never silently drop unmatched chunks.
    assert len(chunks) == len(
        expected_chunks
    ), f"{len(chunks) = } != {len(expected_chunks) = }"

    for expected_chunks_item, chunk in zip(expected_chunks, chunks):
        for key in ["text", "word_count", "cut_type"]:
            assert (
                chunk[key] == expected_chunks_item[key]
            ), f"{key = }: {chunk[key] = } != {expected_chunks_item[key] = }"
 def test_chunking_whole_text():
    """Chunk the "whole_text" sample and compare it with its ground truth."""
    scenario = "whole_text"
    run_chunking_test(INPUT_TEXT[scenario], GROUND_TRUTH[scenario])
 def test_chunking_cut_text():
    """Chunk the "cut_text" sample and compare it with its ground truth."""
    scenario = "cut_text"
    run_chunking_test(INPUT_TEXT[scenario], GROUND_TRUTH[scenario])
--- a/pytest.ini
+++ b/pytest.ini
@ -1,2 +0,0 @@
 [pytest]
 addopts = tests/