Add first three unit tests

2024-11-11 16:39:46 +01:00 · 2024-11-11 16:39:46 +01:00 · dce894bfd3
commit dce894bfd3
parent cdaf63f57c
7 changed files with 21 additions and 14 deletions
--- a/cognee/modules/data/processing/document_types/tests/PdfDocument.test.py
+++ b/cognee/modules/data/processing/document_types/tests/PdfDocument.test.py
@ -1,13 +0,0 @@
-import os
-from cognee.modules.data.processing.document_types.PdfDocument import PdfDocument
-
-if __name__ == "__main__":
-    test_file_path = os.path.join(os.path.dirname(__file__), "artificial-inteligence.pdf")
-    pdf_doc = PdfDocument("Test document.pdf", test_file_path, chunking_strategy="paragraph")
-    reader = pdf_doc.get_reader()
-
-    for paragraph_data in reader.read():
-        print(paragraph_data["word_count"])
-        print(paragraph_data["text"])
-        print(paragraph_data["cut_type"])
-        print("\n")
--- a/cognee/modules/pipelines/operations/tests/artificial-inteligence.v1.pdf
+++ b/cognee/modules/pipelines/operations/tests/artificial-inteligence.v1.pdf
--- a/cognee/modules/pipelines/operations/tests/artificial-inteligence.v2.pdf
+++ b/cognee/modules/pipelines/operations/tests/artificial-inteligence.v2.pdf
--- a/cognee/tests/.DS_Store
+++ b/cognee/tests/.DS_Store
--- a/cognee/tests/unit/documents/pdf_document.py
+++ b/cognee/tests/unit/documents/pdf_document.py
@ -0,0 +1,36 @
+"""Smoke check: PdfDocument must split the sample PDF into the expected chunks.
+
+Meant to be run as a script; the checks sit behind the ``__main__`` guard.
+"""
+import os
+import uuid
+from cognee.modules.data.processing.document_types.PdfDocument import PdfDocument
+
+# Expected metrics, in order, for the leading chunks that
+# ``PdfDocument.read(chunk_size=1024)`` yields for the sample PDF.
+GROUND_TRUTH = [
+    {"word_count": 879, "len_text": 5622, "cut_type": "sentence_end"},
+    {"word_count": 951, "len_text": 6384, "cut_type": "sentence_end"},
+]
+
+
+if __name__ == "__main__":
+    # The sample PDF lives in ``test_data/``, two levels above this file's
+    # directory. abspath() keeps this correct when __file__ is relative, and
+    # walking up with dirname() avoids rebuilding the path from split parts,
+    # which breaks when the directory does not start with os.sep.
+    tests_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+    test_file_path = os.path.join(tests_root, "test_data", "artificial-intelligence.pdf")
+    pdf_doc = PdfDocument(id=uuid.uuid4(), name="Test document.pdf", raw_data_location=test_file_path)
+
+    # zip() stops at the shorter input, so without this count check a read()
+    # that yields too few chunks would pass without running every comparison.
+    chunks = list(pdf_doc.read(chunk_size=1024))
+    assert len(chunks) >= len(GROUND_TRUTH), f"{len(chunks) = } < {len(GROUND_TRUTH) = }"
+
+    for ground_truth, paragraph_data in zip(GROUND_TRUTH, chunks):
+        assert ground_truth["word_count"] == paragraph_data.word_count, f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
+        assert ground_truth["len_text"] == len(paragraph_data.text), f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'
+        assert ground_truth["cut_type"] == paragraph_data.cut_type, f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }'
+
+
--- a/cognee/modules/pipelines/operations/tests/run_tasks.test.py
+++ b/cognee/modules/pipelines/operations/tests/run_tasks.test.py
--- a/cognee/modules/pipelines/operations/tests/run_tasks_from_queue.test.py
+++ b/cognee/modules/pipelines/operations/tests/run_tasks_from_queue.test.py
@ -23,7 +23,7 @@ async def pipeline(data_queue):
        Task(multiply_by_two),
    ])

-    results = [2, 4, 6, 8, 10, 12, 14, 16, 18]
+    results = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
    index = 0
    async for result in tasks_run:
        print(result)