From bbd51e8c27c4105edefc415ae4750b9399b9ad9b Mon Sep 17 00:00:00 2001
From: vasilije
Date: Thu, 16 Jan 2025 21:08:54 +0100
Subject: [PATCH] Fix for now

---
 cognee-mcp/cognee_mcp/server.py                |  1 +
 cognee/shared/utils.py                         | 16 ---------
 .../documents/AudioDocument_test.py            | 18 +++++-----
 .../documents/ImageDocument_test.py            | 18 +++++-----
 .../integration/documents/PdfDocument_test.py  | 18 +++++-----
 .../documents/TextDocument_test.py             | 18 +++++-----
 .../documents/UnstructuredDocument_test.py     | 30 ++++++++--------
 cognee/tests/test_deduplication.py             | 12 +++----
 cognee/tests/test_falkordb.py                  |  6 ++--
 cognee/tests/test_library.py                   |  6 ++--
 cognee/tests/test_pgvector.py                  | 36 +++++++++----------
 .../chunks/chunk_by_paragraph_2_test.py        | 18 +++++-----
 .../chunks/chunk_by_paragraph_test.py          |  6 ++--
 .../chunks/chunk_by_sentence_test.py           | 12 +++----
 .../processing/chunks/chunk_by_word_test.py    |  6 ++--
 15 files changed, 103 insertions(+), 118 deletions(-)

diff --git a/cognee-mcp/cognee_mcp/server.py b/cognee-mcp/cognee_mcp/server.py
index 37238a783..087411fa5 100644
--- a/cognee-mcp/cognee_mcp/server.py
+++ b/cognee-mcp/cognee_mcp/server.py
@@ -130,6 +130,7 @@ def get_freshest_png(directory: str) -> Image.Image:
     except (IOError, OSError) as e:
         raise IOError(f"Failed to open PNG file {freshest_path}") from e
 
+
 @server.call_tool()
 async def handle_call_tool(
     name: str, arguments: dict | None
diff --git a/cognee/shared/utils.py b/cognee/shared/utils.py
index 944ae798e..f1eae1ace 100644
--- a/cognee/shared/utils.py
+++ b/cognee/shared/utils.py
@@ -11,8 +11,6 @@ import networkx as nx
 import pandas as pd
 import matplotlib.pyplot as plt
 import tiktoken
-import nltk
-import base64
 import time
 import logging
 
@@ -30,7 +28,6 @@ from cognee.shared.exceptions import IngestionError
 
 proxy_url = "https://test.prometh.ai"
 
-
 def get_entities(tagged_tokens):
     nltk.download("maxent_ne_chunker", quiet=True)
     from nltk.chunk import ne_chunk
@@ -271,11 +268,6 @@ async def render_graph(
 
 # return df.replace([np.inf, -np.inf, np.nan], None)
 
-
-
-
-
-
 logging.basicConfig(level=logging.INFO)
 
 
@@ -450,14 +442,6 @@ async def create_cognee_style_network_with_logo(
     )
     p.add_tools(hover_tool)
 
-    # Get the latest Unix timestamp as an integer
-    timestamp = int(time.time())
-
-    # Construct your filename
-    filename = f"{timestamp}.png"
-
-
-
     logging.info(f"Saving visualization to {output_filename}...")
     html_content = file_html(p, CDN, title)
     with open(output_filename, "w") as f:
diff --git a/cognee/tests/integration/documents/AudioDocument_test.py b/cognee/tests/integration/documents/AudioDocument_test.py
index e07a2431b..dbd43ddda 100644
--- a/cognee/tests/integration/documents/AudioDocument_test.py
+++ b/cognee/tests/integration/documents/AudioDocument_test.py
@@ -36,12 +36,12 @@ def test_AudioDocument():
     for ground_truth, paragraph_data in zip(
         GROUND_TRUTH, document.read(chunk_size=64, chunker="text_chunker")
     ):
-        assert ground_truth["word_count"] == paragraph_data.word_count, (
-            f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
-        )
-        assert ground_truth["len_text"] == len(paragraph_data.text), (
-            f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'
-        )
-        assert ground_truth["cut_type"] == paragraph_data.cut_type, (
-            f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }'
-        )
+        assert (
+            ground_truth["word_count"] == paragraph_data.word_count
+        ), f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
+        assert ground_truth["len_text"] == len(
+            paragraph_data.text
+        ), f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'
+        assert (
+            ground_truth["cut_type"] == paragraph_data.cut_type
+        ), f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }'
diff --git a/cognee/tests/integration/documents/ImageDocument_test.py b/cognee/tests/integration/documents/ImageDocument_test.py
index b8d585419..c0877ae99 100644
--- a/cognee/tests/integration/documents/ImageDocument_test.py
+++ b/cognee/tests/integration/documents/ImageDocument_test.py
@@ -25,12 +25,12 @@ def test_ImageDocument():
     for ground_truth, paragraph_data in zip(
         GROUND_TRUTH, document.read(chunk_size=64, chunker="text_chunker")
     ):
-        assert ground_truth["word_count"] == paragraph_data.word_count, (
-            f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
-        )
-        assert ground_truth["len_text"] == len(paragraph_data.text), (
-            f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'
-        )
-        assert ground_truth["cut_type"] == paragraph_data.cut_type, (
-            f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }'
-        )
+        assert (
+            ground_truth["word_count"] == paragraph_data.word_count
+        ), f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
+        assert ground_truth["len_text"] == len(
+            paragraph_data.text
+        ), f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'
+        assert (
+            ground_truth["cut_type"] == paragraph_data.cut_type
+        ), f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }'
diff --git a/cognee/tests/integration/documents/PdfDocument_test.py b/cognee/tests/integration/documents/PdfDocument_test.py
index fc4307846..8f28815d3 100644
--- a/cognee/tests/integration/documents/PdfDocument_test.py
+++ b/cognee/tests/integration/documents/PdfDocument_test.py
@@ -27,12 +27,12 @@ def test_PdfDocument():
     for ground_truth, paragraph_data in zip(
         GROUND_TRUTH, document.read(chunk_size=1024, chunker="text_chunker")
     ):
-        assert ground_truth["word_count"] == paragraph_data.word_count, (
-            f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
-        )
-        assert ground_truth["len_text"] == len(paragraph_data.text), (
-            f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'
-        )
-        assert ground_truth["cut_type"] == paragraph_data.cut_type, (
-            f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }'
-        )
+        assert (
+            ground_truth["word_count"] == paragraph_data.word_count
+        ), f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
+        assert ground_truth["len_text"] == len(
+            paragraph_data.text
+        ), f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'
+        assert (
+            ground_truth["cut_type"] == paragraph_data.cut_type
+        ), f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }'
diff --git a/cognee/tests/integration/documents/TextDocument_test.py b/cognee/tests/integration/documents/TextDocument_test.py
index 6daec62b7..1e143d563 100644
--- a/cognee/tests/integration/documents/TextDocument_test.py
+++ b/cognee/tests/integration/documents/TextDocument_test.py
@@ -39,12 +39,12 @@ def test_TextDocument(input_file, chunk_size):
     for ground_truth, paragraph_data in zip(
         GROUND_TRUTH[input_file], document.read(chunk_size=chunk_size, chunker="text_chunker")
     ):
-        assert ground_truth["word_count"] == paragraph_data.word_count, (
-            f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
-        )
-        assert ground_truth["len_text"] == len(paragraph_data.text), (
-            f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'
-        )
-        assert ground_truth["cut_type"] == paragraph_data.cut_type, (
-            f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }'
-        )
+        assert (
+            ground_truth["word_count"] == paragraph_data.word_count
+        ), f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
+        assert ground_truth["len_text"] == len(
+            paragraph_data.text
+        ), f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'
+        assert (
+            ground_truth["cut_type"] == paragraph_data.cut_type
+        ), f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }'
diff --git a/cognee/tests/integration/documents/UnstructuredDocument_test.py b/cognee/tests/integration/documents/UnstructuredDocument_test.py
index 773dc2293..e0278de81 100644
--- a/cognee/tests/integration/documents/UnstructuredDocument_test.py
+++ b/cognee/tests/integration/documents/UnstructuredDocument_test.py
@@ -71,32 +71,32 @@ def test_UnstructuredDocument():
     for paragraph_data in pptx_document.read(chunk_size=1024, chunker="text_chunker"):
         assert 19 == paragraph_data.word_count, f" 19 != {paragraph_data.word_count = }"
         assert 104 == len(paragraph_data.text), f" 104 != {len(paragraph_data.text) = }"
-        assert "sentence_cut" == paragraph_data.cut_type, (
-            f" sentence_cut != {paragraph_data.cut_type = }"
-        )
+        assert (
+            "sentence_cut" == paragraph_data.cut_type
+        ), f" sentence_cut != {paragraph_data.cut_type = }"
 
     # Test DOCX
     for paragraph_data in docx_document.read(chunk_size=1024, chunker="text_chunker"):
         assert 16 == paragraph_data.word_count, f" 16 != {paragraph_data.word_count = }"
         assert 145 == len(paragraph_data.text), f" 145 != {len(paragraph_data.text) = }"
-        assert "sentence_end" == paragraph_data.cut_type, (
-            f" sentence_end != {paragraph_data.cut_type = }"
-        )
+        assert (
+            "sentence_end" == paragraph_data.cut_type
+        ), f" sentence_end != {paragraph_data.cut_type = }"
 
     # TEST CSV
     for paragraph_data in csv_document.read(chunk_size=1024, chunker="text_chunker"):
         assert 15 == paragraph_data.word_count, f" 15 != {paragraph_data.word_count = }"
-        assert "A A A A A A A A A,A A A A A A,A A" == paragraph_data.text, (
-            f"Read text doesn't match expected text: {paragraph_data.text}"
-        )
-        assert "sentence_cut" == paragraph_data.cut_type, (
-            f" sentence_cut != {paragraph_data.cut_type = }"
-        )
+        assert (
+            "A A A A A A A A A,A A A A A A,A A" == paragraph_data.text
+        ), f"Read text doesn't match expected text: {paragraph_data.text}"
+        assert (
+            "sentence_cut" == paragraph_data.cut_type
+        ), f" sentence_cut != {paragraph_data.cut_type = }"
 
     # Test XLSX
     for paragraph_data in xlsx_document.read(chunk_size=1024, chunker="text_chunker"):
         assert 36 == paragraph_data.word_count, f" 36 != {paragraph_data.word_count = }"
         assert 171 == len(paragraph_data.text), f" 171 != {len(paragraph_data.text) = }"
-        assert "sentence_cut" == paragraph_data.cut_type, (
-            f" sentence_cut != {paragraph_data.cut_type = }"
-        )
+        assert (
+            "sentence_cut" == paragraph_data.cut_type
+        ), f" sentence_cut != {paragraph_data.cut_type = }"
diff --git a/cognee/tests/test_deduplication.py b/cognee/tests/test_deduplication.py
index 89c866f12..9c2df032d 100644
--- a/cognee/tests/test_deduplication.py
+++ b/cognee/tests/test_deduplication.py
@@ -30,9 +30,9 @@ async def test_deduplication():
 
     result = await relational_engine.get_all_data_from_table("data")
     assert len(result) == 1, "More than one data entity was found."
-    assert result[0]["name"] == "Natural_language_processing_copy", (
-        "Result name does not match expected value."
-    )
+    assert (
+        result[0]["name"] == "Natural_language_processing_copy"
+    ), "Result name does not match expected value."
 
     result = await relational_engine.get_all_data_from_table("datasets")
     assert len(result) == 2, "Unexpected number of datasets found."
@@ -61,9 +61,9 @@ async def test_deduplication():
 
     result = await relational_engine.get_all_data_from_table("data")
     assert len(result) == 1, "More than one data entity was found."
-    assert hashlib.md5(text.encode("utf-8")).hexdigest() in result[0]["name"], (
-        "Content hash is not a part of file name."
-    )
+    assert (
+        hashlib.md5(text.encode("utf-8")).hexdigest() in result[0]["name"]
+    ), "Content hash is not a part of file name."
 
     await cognee.prune.prune_data()
     await cognee.prune.prune_system(metadata=True)
diff --git a/cognee/tests/test_falkordb.py b/cognee/tests/test_falkordb.py
index af0e87916..07ece9eb2 100755
--- a/cognee/tests/test_falkordb.py
+++ b/cognee/tests/test_falkordb.py
@@ -85,9 +85,9 @@ async def main():
 
     from cognee.infrastructure.databases.relational import get_relational_engine
 
-    assert not os.path.exists(get_relational_engine().db_path), (
-        "SQLite relational database is not empty"
-    )
+    assert not os.path.exists(
+        get_relational_engine().db_path
+    ), "SQLite relational database is not empty"
 
     from cognee.infrastructure.databases.graph import get_graph_config
 
diff --git a/cognee/tests/test_library.py b/cognee/tests/test_library.py
index 192b67506..8352b4161 100755
--- a/cognee/tests/test_library.py
+++ b/cognee/tests/test_library.py
@@ -82,9 +82,9 @@ async def main():
 
     from cognee.infrastructure.databases.relational import get_relational_engine
 
-    assert not os.path.exists(get_relational_engine().db_path), (
-        "SQLite relational database is not empty"
-    )
+    assert not os.path.exists(
+        get_relational_engine().db_path
+    ), "SQLite relational database is not empty"
 
     from cognee.infrastructure.databases.graph import get_graph_config
 
diff --git a/cognee/tests/test_pgvector.py b/cognee/tests/test_pgvector.py
index 73b6be974..c241177f0 100644
--- a/cognee/tests/test_pgvector.py
+++ b/cognee/tests/test_pgvector.py
@@ -24,28 +24,28 @@ async def test_local_file_deletion(data_text, file_location):
         data_hash = hashlib.md5(encoded_text).hexdigest()
         # Get data entry from database based on hash contents
         data = (await session.scalars(select(Data).where(Data.content_hash == data_hash))).one()
-        assert os.path.isfile(data.raw_data_location), (
-            f"Data location doesn't exist: {data.raw_data_location}"
-        )
+        assert os.path.isfile(
+            data.raw_data_location
+        ), f"Data location doesn't exist: {data.raw_data_location}"
 
         # Test deletion of data along with local files created by cognee
         await engine.delete_data_entity(data.id)
-        assert not os.path.exists(data.raw_data_location), (
-            f"Data location still exists after deletion: {data.raw_data_location}"
-        )
+        assert not os.path.exists(
+            data.raw_data_location
+        ), f"Data location still exists after deletion: {data.raw_data_location}"
 
     async with engine.get_async_session() as session:
         # Get data entry from database based on file path
        data = (
             await session.scalars(select(Data).where(Data.raw_data_location == file_location))
         ).one()
-        assert os.path.isfile(data.raw_data_location), (
-            f"Data location doesn't exist: {data.raw_data_location}"
-        )
+        assert os.path.isfile(
+            data.raw_data_location
+        ), f"Data location doesn't exist: {data.raw_data_location}"
 
         # Test local files not created by cognee won't get deleted
         await engine.delete_data_entity(data.id)
-        assert os.path.exists(data.raw_data_location), (
-            f"Data location doesn't exists: {data.raw_data_location}"
-        )
+        assert os.path.exists(
+            data.raw_data_location
+        ), f"Data location doesn't exist: {data.raw_data_location}"
 
@@ -54,16 +54,16 @@ async def test_local_file_deletion(data_text, file_location):
 async def test_getting_of_documents(dataset_name_1):
     user = await get_default_user()
     document_ids = await get_document_ids_for_user(user.id, [dataset_name_1])
-    assert len(document_ids) == 1, (
-        f"Number of expected documents doesn't match {len(document_ids)} != 1"
-    )
+    assert (
+        len(document_ids) == 1
+    ), f"Number of expected documents doesn't match {len(document_ids)} != 1"
 
     # Test getting of documents for search when no dataset is provided
     user = await get_default_user()
     document_ids = await get_document_ids_for_user(user.id)
-    assert len(document_ids) == 2, (
-        f"Number of expected documents doesn't match {len(document_ids)} != 2"
-    )
+    assert (
+        len(document_ids) == 2
+    ), f"Number of expected documents doesn't match {len(document_ids)} != 2"
 
 
 async def main():
diff --git a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_2_test.py b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_2_test.py
index d8680a604..53098fc67 100644
--- a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_2_test.py
+++ b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_2_test.py
@@ -17,9 +17,9 @@ batch_paragraphs_vals = [True, False]
 def test_chunk_by_paragraph_isomorphism(input_text, paragraph_length, batch_paragraphs):
     chunks = chunk_by_paragraph(input_text, paragraph_length, batch_paragraphs)
     reconstructed_text = "".join([chunk["text"] for chunk in chunks])
-    assert reconstructed_text == input_text, (
-        f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }"
-    )
+    assert (
+        reconstructed_text == input_text
+    ), f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }"
 
 
 @pytest.mark.parametrize(
@@ -36,9 +36,9 @@ def test_paragraph_chunk_length(input_text, paragraph_length, batch_paragraphs):
     chunk_lengths = np.array([len(list(chunk_by_word(chunk["text"]))) for chunk in chunks])
 
     larger_chunks = chunk_lengths[chunk_lengths > paragraph_length]
-    assert np.all(chunk_lengths <= paragraph_length), (
-        f"{paragraph_length = }: {larger_chunks} are too large"
-    )
+    assert np.all(
+        chunk_lengths <= paragraph_length
+    ), f"{paragraph_length = }: {larger_chunks} are too large"
 
 
 @pytest.mark.parametrize(
@@ -50,6 +50,6 @@ def test_chunk_by_paragraph_chunk_numbering(input_text, paragraph_length, batch_
         data=input_text, paragraph_length=paragraph_length, batch_paragraphs=batch_paragraphs
     )
     chunk_indices = np.array([chunk["chunk_index"] for chunk in chunks])
-    assert np.all(chunk_indices == np.arange(len(chunk_indices))), (
-        f"{chunk_indices = } are not monotonically increasing"
-    )
+    assert np.all(
+        chunk_indices == np.arange(len(chunk_indices))
+    ), f"{chunk_indices = } are not monotonically increasing"
diff --git a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py
index e420b2e9f..e7d9a54ba 100644
--- a/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py
+++ b/cognee/tests/unit/processing/chunks/chunk_by_paragraph_test.py
@@ -58,9 +58,9 @@ def run_chunking_test(test_text, expected_chunks):
 
     for expected_chunks_item, chunk in zip(expected_chunks, chunks):
         for key in ["text", "word_count", "cut_type"]:
-            assert chunk[key] == expected_chunks_item[key], (
-                f"{key = }: {chunk[key] = } != {expected_chunks_item[key] = }"
-            )
+            assert (
+                chunk[key] == expected_chunks_item[key]
+            ), f"{key = }: {chunk[key] = } != {expected_chunks_item[key] = }"
 
 
 def test_chunking_whole_text():
diff --git a/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py b/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py
index efa053077..d1c75d7ed 100644
--- a/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py
+++ b/cognee/tests/unit/processing/chunks/chunk_by_sentence_test.py
@@ -16,9 +16,9 @@ maximum_length_vals = [None, 8, 64]
 def test_chunk_by_sentence_isomorphism(input_text, maximum_length):
     chunks = chunk_by_sentence(input_text, maximum_length)
     reconstructed_text = "".join([chunk[1] for chunk in chunks])
-    assert reconstructed_text == input_text, (
-        f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }"
-    )
+    assert (
+        reconstructed_text == input_text
+    ), f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }"
 
 
 @pytest.mark.parametrize(
@@ -36,6 +36,6 @@ def test_paragraph_chunk_length(input_text, maximum_length):
     chunk_lengths = np.array([len(list(chunk_by_word(chunk[1]))) for chunk in chunks])
 
     larger_chunks = chunk_lengths[chunk_lengths > maximum_length]
-    assert np.all(chunk_lengths <= maximum_length), (
-        f"{maximum_length = }: {larger_chunks} are too large"
-    )
+    assert np.all(
+        chunk_lengths <= maximum_length
+    ), f"{maximum_length = }: {larger_chunks} are too large"
diff --git a/cognee/tests/unit/processing/chunks/chunk_by_word_test.py b/cognee/tests/unit/processing/chunks/chunk_by_word_test.py
index d79fcdbc8..fb26638cb 100644
--- a/cognee/tests/unit/processing/chunks/chunk_by_word_test.py
+++ b/cognee/tests/unit/processing/chunks/chunk_by_word_test.py
@@ -17,9 +17,9 @@ from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS
 def test_chunk_by_word_isomorphism(input_text):
     chunks = chunk_by_word(input_text)
     reconstructed_text = "".join([chunk[0] for chunk in chunks])
-    assert reconstructed_text == input_text, (
-        f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }"
-    )
+    assert (
+        reconstructed_text == input_text
+    ), f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }"
 
 
 @pytest.mark.parametrize(