Fix for now

vasilije 2025-01-16 21:08:54 +01:00
parent 07836255ae
commit bbd51e8c27
15 changed files with 103 additions and 118 deletions

View file

@@ -130,6 +130,7 @@ def get_freshest_png(directory: str) -> Image.Image:
     except (IOError, OSError) as e:
         raise IOError(f"Failed to open PNG file {freshest_path}") from e
 @server.call_tool()
 async def handle_call_tool(
     name: str, arguments: dict | None
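The except/raise pair kept as context above uses Python's exception-chaining idiom: the low-level error is re-raised with a clearer message while "from e" preserves the original as __cause__. A minimal standalone sketch of that pattern, assuming Pillow's Image.open (consistent with the Image.Image return annotation in the hunk header); the function name and path argument here are illustrative, not the project's:

    from PIL import Image

    def open_png(path: str) -> Image.Image:
        try:
            return Image.open(path)
        except (IOError, OSError) as e:
            # Re-raise with context; "from e" keeps the original error
            # attached as __cause__ so the traceback shows both.
            raise IOError(f"Failed to open PNG file {path}") from e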

View file

@@ -11,8 +11,6 @@ import networkx as nx
 import pandas as pd
 import matplotlib.pyplot as plt
 import tiktoken
-import nltk
-import base64
 import time
 import logging
@@ -30,7 +28,6 @@ from cognee.shared.exceptions import IngestionError
 proxy_url = "https://test.prometh.ai"
 def get_entities(tagged_tokens):
     nltk.download("maxent_ne_chunker", quiet=True)
     from nltk.chunk import ne_chunk
@@ -271,11 +268,6 @@ async def render_graph(
     # return df.replace([np.inf, -np.inf, np.nan], None)
 logging.basicConfig(level=logging.INFO)
@@ -450,14 +442,6 @@ async def create_cognee_style_network_with_logo(
     )
     p.add_tools(hover_tool)
-    # Get the latest Unix timestamp as an integer
-    timestamp = int(time.time())
-    # Construct your filename
-    filename = f"{timestamp}.png"
     logging.info(f"Saving visualization to {output_filename}...")
     html_content = file_html(p, CDN, title)
     with open(output_filename, "w") as f:

View file

@@ -36,12 +36,12 @@ def test_AudioDocument():
     for ground_truth, paragraph_data in zip(
         GROUND_TRUTH, document.read(chunk_size=64, chunker="text_chunker")
     ):
-        assert ground_truth["word_count"] == paragraph_data.word_count, (
-            f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
-        )
-        assert ground_truth["len_text"] == len(paragraph_data.text), (
-            f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'
-        )
-        assert ground_truth["cut_type"] == paragraph_data.cut_type, (
-            f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }'
-        )
+        assert (
+            ground_truth["word_count"] == paragraph_data.word_count
+        ), f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
+        assert ground_truth["len_text"] == len(
+            paragraph_data.text
+        ), f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'
+        assert (
+            ground_truth["cut_type"] == paragraph_data.cut_type
+        ), f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }'
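The remaining test-file hunks below repeat this same mechanical rewrap: each assertion keeps the identical condition and failure message, and only the line wrapping moves between the condition and the message. A minimal runnable sketch of the two layouts (sample values are made up for illustration; both forms behave identically at runtime):

    # Stand-in data, purely illustrative.
    ground_truth = {"word_count": 3}
    word_count = 3

    # Layout on the removed side of the hunks: bare condition, parenthesized message.
    assert ground_truth["word_count"] == word_count, (
        f'{ground_truth["word_count"] = } != {word_count = }'
    )

    # Layout on the added side of the hunks: parenthesized condition, message on the closing line.
    assert (
        ground_truth["word_count"] == word_count
    ), f'{ground_truth["word_count"] = } != {word_count = }'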

View file

@@ -25,12 +25,12 @@ def test_ImageDocument():
     for ground_truth, paragraph_data in zip(
         GROUND_TRUTH, document.read(chunk_size=64, chunker="text_chunker")
     ):
-        assert ground_truth["word_count"] == paragraph_data.word_count, (
-            f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
-        )
-        assert ground_truth["len_text"] == len(paragraph_data.text), (
-            f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'
-        )
-        assert ground_truth["cut_type"] == paragraph_data.cut_type, (
-            f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }'
-        )
+        assert (
+            ground_truth["word_count"] == paragraph_data.word_count
+        ), f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
+        assert ground_truth["len_text"] == len(
+            paragraph_data.text
+        ), f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'
+        assert (
+            ground_truth["cut_type"] == paragraph_data.cut_type
+        ), f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }'

View file

@@ -27,12 +27,12 @@ def test_PdfDocument():
     for ground_truth, paragraph_data in zip(
         GROUND_TRUTH, document.read(chunk_size=1024, chunker="text_chunker")
     ):
-        assert ground_truth["word_count"] == paragraph_data.word_count, (
-            f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
-        )
-        assert ground_truth["len_text"] == len(paragraph_data.text), (
-            f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'
-        )
-        assert ground_truth["cut_type"] == paragraph_data.cut_type, (
-            f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }'
-        )
+        assert (
+            ground_truth["word_count"] == paragraph_data.word_count
+        ), f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
+        assert ground_truth["len_text"] == len(
+            paragraph_data.text
+        ), f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'
+        assert (
+            ground_truth["cut_type"] == paragraph_data.cut_type
+        ), f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }'

View file

@@ -39,12 +39,12 @@ def test_TextDocument(input_file, chunk_size):
     for ground_truth, paragraph_data in zip(
         GROUND_TRUTH[input_file], document.read(chunk_size=chunk_size, chunker="text_chunker")
     ):
-        assert ground_truth["word_count"] == paragraph_data.word_count, (
-            f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
-        )
-        assert ground_truth["len_text"] == len(paragraph_data.text), (
-            f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'
-        )
-        assert ground_truth["cut_type"] == paragraph_data.cut_type, (
-            f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }'
-        )
+        assert (
+            ground_truth["word_count"] == paragraph_data.word_count
+        ), f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
+        assert ground_truth["len_text"] == len(
+            paragraph_data.text
+        ), f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'
+        assert (
+            ground_truth["cut_type"] == paragraph_data.cut_type
+        ), f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }'

View file

@@ -71,32 +71,32 @@ def test_UnstructuredDocument():
     for paragraph_data in pptx_document.read(chunk_size=1024, chunker="text_chunker"):
         assert 19 == paragraph_data.word_count, f" 19 != {paragraph_data.word_count = }"
         assert 104 == len(paragraph_data.text), f" 104 != {len(paragraph_data.text) = }"
-        assert "sentence_cut" == paragraph_data.cut_type, (
-            f" sentence_cut != {paragraph_data.cut_type = }"
-        )
+        assert (
+            "sentence_cut" == paragraph_data.cut_type
+        ), f" sentence_cut != {paragraph_data.cut_type = }"
     # Test DOCX
     for paragraph_data in docx_document.read(chunk_size=1024, chunker="text_chunker"):
         assert 16 == paragraph_data.word_count, f" 16 != {paragraph_data.word_count = }"
         assert 145 == len(paragraph_data.text), f" 145 != {len(paragraph_data.text) = }"
-        assert "sentence_end" == paragraph_data.cut_type, (
-            f" sentence_end != {paragraph_data.cut_type = }"
-        )
+        assert (
+            "sentence_end" == paragraph_data.cut_type
+        ), f" sentence_end != {paragraph_data.cut_type = }"
     # TEST CSV
     for paragraph_data in csv_document.read(chunk_size=1024, chunker="text_chunker"):
         assert 15 == paragraph_data.word_count, f" 15 != {paragraph_data.word_count = }"
-        assert "A A A A A A A A A,A A A A A A,A A" == paragraph_data.text, (
-            f"Read text doesn't match expected text: {paragraph_data.text}"
-        )
-        assert "sentence_cut" == paragraph_data.cut_type, (
-            f" sentence_cut != {paragraph_data.cut_type = }"
-        )
+        assert (
+            "A A A A A A A A A,A A A A A A,A A" == paragraph_data.text
+        ), f"Read text doesn't match expected text: {paragraph_data.text}"
+        assert (
+            "sentence_cut" == paragraph_data.cut_type
+        ), f" sentence_cut != {paragraph_data.cut_type = }"
     # Test XLSX
     for paragraph_data in xlsx_document.read(chunk_size=1024, chunker="text_chunker"):
         assert 36 == paragraph_data.word_count, f" 36 != {paragraph_data.word_count = }"
         assert 171 == len(paragraph_data.text), f" 171 != {len(paragraph_data.text) = }"
-        assert "sentence_cut" == paragraph_data.cut_type, (
-            f" sentence_cut != {paragraph_data.cut_type = }"
-        )
+        assert (
+            "sentence_cut" == paragraph_data.cut_type
+        ), f" sentence_cut != {paragraph_data.cut_type = }"

View file

@@ -30,9 +30,9 @@ async def test_deduplication():
     result = await relational_engine.get_all_data_from_table("data")
     assert len(result) == 1, "More than one data entity was found."
-    assert result[0]["name"] == "Natural_language_processing_copy", (
-        "Result name does not match expected value."
-    )
+    assert (
+        result[0]["name"] == "Natural_language_processing_copy"
+    ), "Result name does not match expected value."
     result = await relational_engine.get_all_data_from_table("datasets")
     assert len(result) == 2, "Unexpected number of datasets found."
@@ -61,9 +61,9 @@ async def test_deduplication():
     result = await relational_engine.get_all_data_from_table("data")
     assert len(result) == 1, "More than one data entity was found."
-    assert hashlib.md5(text.encode("utf-8")).hexdigest() in result[0]["name"], (
-        "Content hash is not a part of file name."
-    )
+    assert (
+        hashlib.md5(text.encode("utf-8")).hexdigest() in result[0]["name"]
+    ), "Content hash is not a part of file name."
     await cognee.prune.prune_data()
     await cognee.prune.prune_system(metadata=True)

View file

@@ -85,9 +85,9 @@ async def main():
     from cognee.infrastructure.databases.relational import get_relational_engine
-    assert not os.path.exists(get_relational_engine().db_path), (
-        "SQLite relational database is not empty"
-    )
+    assert not os.path.exists(
+        get_relational_engine().db_path
+    ), "SQLite relational database is not empty"
     from cognee.infrastructure.databases.graph import get_graph_config

View file

@@ -82,9 +82,9 @@ async def main():
     from cognee.infrastructure.databases.relational import get_relational_engine
-    assert not os.path.exists(get_relational_engine().db_path), (
-        "SQLite relational database is not empty"
-    )
+    assert not os.path.exists(
+        get_relational_engine().db_path
+    ), "SQLite relational database is not empty"
     from cognee.infrastructure.databases.graph import get_graph_config

View file

@@ -24,28 +24,28 @@ async def test_local_file_deletion(data_text, file_location):
         data_hash = hashlib.md5(encoded_text).hexdigest()
         # Get data entry from database based on hash contents
         data = (await session.scalars(select(Data).where(Data.content_hash == data_hash))).one()
-        assert os.path.isfile(data.raw_data_location), (
-            f"Data location doesn't exist: {data.raw_data_location}"
-        )
+        assert os.path.isfile(
+            data.raw_data_location
+        ), f"Data location doesn't exist: {data.raw_data_location}"
         # Test deletion of data along with local files created by cognee
         await engine.delete_data_entity(data.id)
-        assert not os.path.exists(data.raw_data_location), (
-            f"Data location still exists after deletion: {data.raw_data_location}"
-        )
+        assert not os.path.exists(
+            data.raw_data_location
+        ), f"Data location still exists after deletion: {data.raw_data_location}"
     async with engine.get_async_session() as session:
         # Get data entry from database based on file path
         data = (
             await session.scalars(select(Data).where(Data.raw_data_location == file_location))
         ).one()
-        assert os.path.isfile(data.raw_data_location), (
-            f"Data location doesn't exist: {data.raw_data_location}"
-        )
+        assert os.path.isfile(
+            data.raw_data_location
+        ), f"Data location doesn't exist: {data.raw_data_location}"
         # Test local files not created by cognee won't get deleted
         await engine.delete_data_entity(data.id)
-        assert os.path.exists(data.raw_data_location), (
-            f"Data location doesn't exists: {data.raw_data_location}"
-        )
+        assert os.path.exists(
+            data.raw_data_location
+        ), f"Data location doesn't exists: {data.raw_data_location}"
 async def test_getting_of_documents(dataset_name_1):
@@ -54,16 +54,16 @@ async def test_getting_of_documents(dataset_name_1):
     user = await get_default_user()
     document_ids = await get_document_ids_for_user(user.id, [dataset_name_1])
-    assert len(document_ids) == 1, (
-        f"Number of expected documents doesn't match {len(document_ids)} != 1"
-    )
+    assert (
+        len(document_ids) == 1
+    ), f"Number of expected documents doesn't match {len(document_ids)} != 1"
     # Test getting of documents for search when no dataset is provided
     user = await get_default_user()
     document_ids = await get_document_ids_for_user(user.id)
-    assert len(document_ids) == 2, (
-        f"Number of expected documents doesn't match {len(document_ids)} != 2"
-    )
+    assert (
+        len(document_ids) == 2
+    ), f"Number of expected documents doesn't match {len(document_ids)} != 2"
 async def main():

View file

@@ -17,9 +17,9 @@ batch_paragraphs_vals = [True, False]
 def test_chunk_by_paragraph_isomorphism(input_text, paragraph_length, batch_paragraphs):
     chunks = chunk_by_paragraph(input_text, paragraph_length, batch_paragraphs)
     reconstructed_text = "".join([chunk["text"] for chunk in chunks])
-    assert reconstructed_text == input_text, (
-        f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }"
-    )
+    assert (
+        reconstructed_text == input_text
+    ), f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }"
 @pytest.mark.parametrize(
@@ -36,9 +36,9 @@ def test_paragraph_chunk_length(input_text, paragraph_length, batch_paragraphs):
     chunk_lengths = np.array([len(list(chunk_by_word(chunk["text"]))) for chunk in chunks])
     larger_chunks = chunk_lengths[chunk_lengths > paragraph_length]
-    assert np.all(chunk_lengths <= paragraph_length), (
-        f"{paragraph_length = }: {larger_chunks} are too large"
-    )
+    assert np.all(
+        chunk_lengths <= paragraph_length
+    ), f"{paragraph_length = }: {larger_chunks} are too large"
 @pytest.mark.parametrize(
@@ -50,6 +50,6 @@ def test_chunk_by_paragraph_chunk_numbering(input_text, paragraph_length, batch_
         data=input_text, paragraph_length=paragraph_length, batch_paragraphs=batch_paragraphs
     )
     chunk_indices = np.array([chunk["chunk_index"] for chunk in chunks])
-    assert np.all(chunk_indices == np.arange(len(chunk_indices))), (
-        f"{chunk_indices = } are not monotonically increasing"
-    )
+    assert np.all(
+        chunk_indices == np.arange(len(chunk_indices))
+    ), f"{chunk_indices = } are not monotonically increasing"

View file

@@ -58,9 +58,9 @@ def run_chunking_test(test_text, expected_chunks):
     for expected_chunks_item, chunk in zip(expected_chunks, chunks):
         for key in ["text", "word_count", "cut_type"]:
-            assert chunk[key] == expected_chunks_item[key], (
-                f"{key = }: {chunk[key] = } != {expected_chunks_item[key] = }"
-            )
+            assert (
+                chunk[key] == expected_chunks_item[key]
+            ), f"{key = }: {chunk[key] = } != {expected_chunks_item[key] = }"
 def test_chunking_whole_text():

View file

@@ -16,9 +16,9 @@ maximum_length_vals = [None, 8, 64]
 def test_chunk_by_sentence_isomorphism(input_text, maximum_length):
     chunks = chunk_by_sentence(input_text, maximum_length)
     reconstructed_text = "".join([chunk[1] for chunk in chunks])
-    assert reconstructed_text == input_text, (
-        f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }"
-    )
+    assert (
+        reconstructed_text == input_text
+    ), f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }"
 @pytest.mark.parametrize(
@@ -36,6 +36,6 @@ def test_paragraph_chunk_length(input_text, maximum_length):
     chunk_lengths = np.array([len(list(chunk_by_word(chunk[1]))) for chunk in chunks])
     larger_chunks = chunk_lengths[chunk_lengths > maximum_length]
-    assert np.all(chunk_lengths <= maximum_length), (
-        f"{maximum_length = }: {larger_chunks} are too large"
-    )
+    assert np.all(
+        chunk_lengths <= maximum_length
+    ), f"{maximum_length = }: {larger_chunks} are too large"

View file

@@ -17,9 +17,9 @@ from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS
 def test_chunk_by_word_isomorphism(input_text):
     chunks = chunk_by_word(input_text)
     reconstructed_text = "".join([chunk[0] for chunk in chunks])
-    assert reconstructed_text == input_text, (
-        f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }"
-    )
+    assert (
+        reconstructed_text == input_text
+    ), f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }"
 @pytest.mark.parametrize(