Add first three unit tests
This commit is contained in:
parent
cdaf63f57c
commit
dce894bfd3
7 changed files with 21 additions and 14 deletions
|
|
@ -1,13 +0,0 @@
|
||||||
import os

from cognee.modules.data.processing.document_types.PdfDocument import PdfDocument

if __name__ == "__main__":
    # Manual smoke test: open the bundled sample PDF next to this file and
    # dump each paragraph's metadata to stdout for eyeball inspection.
    pdf_path = os.path.join(os.path.dirname(__file__), "artificial-inteligence.pdf")

    document = PdfDocument("Test document.pdf", pdf_path, chunking_strategy="paragraph")
    document_reader = document.get_reader()

    for chunk in document_reader.read():
        # Print the same three fields, in the same order, for every paragraph.
        for field in ("word_count", "text", "cut_type"):
            print(chunk[field])
        print("\n")
|
|
||||||
Binary file not shown.
Binary file not shown.
BIN
cognee/tests/.DS_Store
vendored
Normal file
BIN
cognee/tests/.DS_Store
vendored
Normal file
Binary file not shown.
20
cognee/tests/unit/documents/pdf_document.py
Normal file
20
cognee/tests/unit/documents/pdf_document.py
Normal file
|
|
@ -0,0 +1,20 @@
|
||||||
|
import os
import uuid

from cognee.modules.data.processing.document_types.PdfDocument import PdfDocument

# Expected per-paragraph metrics for test_data/artificial-intelligence.pdf when
# read with chunk_size = 1024. Each entry corresponds to one produced paragraph.
GROUND_TRUTH = [
    {"word_count": 879, "len_text": 5622, "cut_type": "sentence_end"},
    {"word_count": 951, "len_text": 6384, "cut_type": "sentence_end"},
]


if __name__ == "__main__":
    # Resolve <...>/cognee/tests/test_data/artificial-intelligence.pdf relative
    # to this file. Nested dirname() calls on the absolute path replace the
    # original os.path.join(os.sep, *path.split(os.sep)[:-2], ...) trick, which
    # drops the drive letter on Windows and breaks when __file__ is relative.
    tests_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    test_file_path = os.path.join(tests_dir, "test_data", "artificial-intelligence.pdf")

    pdf_doc = PdfDocument(id = uuid.uuid4(), name = "Test document.pdf", raw_data_location = test_file_path)

    # zip() stops at the shorter sequence, so paragraphs beyond GROUND_TRUTH
    # are not checked; each assert reports both sides on failure.
    for ground_truth, paragraph_data in zip(GROUND_TRUTH, pdf_doc.read(chunk_size = 1024)):
        assert ground_truth["word_count"] == paragraph_data.word_count, f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
        assert ground_truth["len_text"] == len(paragraph_data.text), f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'
        assert ground_truth["cut_type"] == paragraph_data.cut_type, f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }'
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -23,7 +23,7 @@ async def pipeline(data_queue):
|
||||||
Task(multiply_by_two),
|
Task(multiply_by_two),
|
||||||
])
|
])
|
||||||
|
|
||||||
results = [2, 4, 6, 8, 10, 12, 14, 16, 18]
|
results = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
|
||||||
index = 0
|
index = 0
|
||||||
async for result in tasks_run:
|
async for result in tasks_run:
|
||||||
print(result)
|
print(result)
|
||||||
Loading…
Add table
Reference in a new issue