Merge branch 'dev' into feature/cog-971-preparing-swe-bench-run

This commit is contained in:
hajdul88 2025-01-10 15:58:31 +01:00 committed by GitHub
commit 46c33655ca
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
16 changed files with 105 additions and 106 deletions

View file

@ -493,7 +493,7 @@ class Neo4jAdapter(GraphDBInterface):
query_edges = f""" query_edges = f"""
MATCH (n)-[r]->(m) MATCH (n)-[r]->(m)
WHERE {where_clause} AND {where_clause.replace('n.', 'm.')} WHERE {where_clause} AND {where_clause.replace("n.", "m.")}
RETURN ID(n) AS source, ID(m) AS target, TYPE(r) AS type, properties(r) AS properties RETURN ID(n) AS source, ID(m) AS target, TYPE(r) AS type, properties(r) AS properties
""" """
result_edges = await self.query(query_edges) result_edges = await self.query(query_edges)

View file

@ -43,7 +43,7 @@ def format_triplets(edges):
edge_info = {key: value for key, value in edge_attributes.items() if value is not None} edge_info = {key: value for key, value in edge_attributes.items() if value is not None}
# Create the formatted triplet # Create the formatted triplet
triplet = f"Node1: {node1_info}\n" f"Edge: {edge_info}\n" f"Node2: {node2_info}\n\n\n" triplet = f"Node1: {node1_info}\nEdge: {edge_info}\nNode2: {node2_info}\n\n\n"
triplets.append(triplet) triplets.append(triplet)
return "".join(triplets) return "".join(triplets)

View file

@ -75,8 +75,7 @@ async def code_description_to_code_part(
llm_client = get_llm_client() llm_client = get_llm_client()
context_from_documents = await llm_client.acreate_structured_output( context_from_documents = await llm_client.acreate_structured_output(
text_input=f"The retrieved context from documents" text_input=f"The retrieved context from documents is {concatenated_descriptions}.",
f" is {concatenated_descriptions}.",
system_prompt="You are a Senior Software Engineer, summarize the context from documents" system_prompt="You are a Senior Software Engineer, summarize the context from documents"
f" in a way that it is gonna be provided next to codeparts as context" f" in a way that it is gonna be provided next to codeparts as context"
f" while trying to solve this github issue connected to the project: {query}]", f" while trying to solve this github issue connected to the project: {query}]",

View file

@ -36,12 +36,12 @@ def test_AudioDocument():
for ground_truth, paragraph_data in zip( for ground_truth, paragraph_data in zip(
GROUND_TRUTH, document.read(chunk_size=64, chunker="text_chunker") GROUND_TRUTH, document.read(chunk_size=64, chunker="text_chunker")
): ):
assert ( assert ground_truth["word_count"] == paragraph_data.word_count, (
ground_truth["word_count"] == paragraph_data.word_count f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
), f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }' )
assert ground_truth["len_text"] == len( assert ground_truth["len_text"] == len(paragraph_data.text), (
paragraph_data.text f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'
), f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }' )
assert ( assert ground_truth["cut_type"] == paragraph_data.cut_type, (
ground_truth["cut_type"] == paragraph_data.cut_type f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }'
), f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }' )

View file

@ -25,12 +25,12 @@ def test_ImageDocument():
for ground_truth, paragraph_data in zip( for ground_truth, paragraph_data in zip(
GROUND_TRUTH, document.read(chunk_size=64, chunker="text_chunker") GROUND_TRUTH, document.read(chunk_size=64, chunker="text_chunker")
): ):
assert ( assert ground_truth["word_count"] == paragraph_data.word_count, (
ground_truth["word_count"] == paragraph_data.word_count f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
), f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }' )
assert ground_truth["len_text"] == len( assert ground_truth["len_text"] == len(paragraph_data.text), (
paragraph_data.text f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'
), f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }' )
assert ( assert ground_truth["cut_type"] == paragraph_data.cut_type, (
ground_truth["cut_type"] == paragraph_data.cut_type f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }'
), f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }' )

View file

@ -27,12 +27,12 @@ def test_PdfDocument():
for ground_truth, paragraph_data in zip( for ground_truth, paragraph_data in zip(
GROUND_TRUTH, document.read(chunk_size=1024, chunker="text_chunker") GROUND_TRUTH, document.read(chunk_size=1024, chunker="text_chunker")
): ):
assert ( assert ground_truth["word_count"] == paragraph_data.word_count, (
ground_truth["word_count"] == paragraph_data.word_count f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
), f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }' )
assert ground_truth["len_text"] == len( assert ground_truth["len_text"] == len(paragraph_data.text), (
paragraph_data.text f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'
), f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }' )
assert ( assert ground_truth["cut_type"] == paragraph_data.cut_type, (
ground_truth["cut_type"] == paragraph_data.cut_type f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }'
), f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }' )

View file

@ -39,12 +39,12 @@ def test_TextDocument(input_file, chunk_size):
for ground_truth, paragraph_data in zip( for ground_truth, paragraph_data in zip(
GROUND_TRUTH[input_file], document.read(chunk_size=chunk_size, chunker="text_chunker") GROUND_TRUTH[input_file], document.read(chunk_size=chunk_size, chunker="text_chunker")
): ):
assert ( assert ground_truth["word_count"] == paragraph_data.word_count, (
ground_truth["word_count"] == paragraph_data.word_count f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
), f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }' )
assert ground_truth["len_text"] == len( assert ground_truth["len_text"] == len(paragraph_data.text), (
paragraph_data.text f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'
), f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }' )
assert ( assert ground_truth["cut_type"] == paragraph_data.cut_type, (
ground_truth["cut_type"] == paragraph_data.cut_type f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }'
), f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }' )

View file

@ -71,32 +71,32 @@ def test_UnstructuredDocument():
for paragraph_data in pptx_document.read(chunk_size=1024, chunker="text_chunker"): for paragraph_data in pptx_document.read(chunk_size=1024, chunker="text_chunker"):
assert 19 == paragraph_data.word_count, f" 19 != {paragraph_data.word_count = }" assert 19 == paragraph_data.word_count, f" 19 != {paragraph_data.word_count = }"
assert 104 == len(paragraph_data.text), f" 104 != {len(paragraph_data.text) = }" assert 104 == len(paragraph_data.text), f" 104 != {len(paragraph_data.text) = }"
assert ( assert "sentence_cut" == paragraph_data.cut_type, (
"sentence_cut" == paragraph_data.cut_type f" sentence_cut != {paragraph_data.cut_type = }"
), f" sentence_cut != {paragraph_data.cut_type = }" )
# Test DOCX # Test DOCX
for paragraph_data in docx_document.read(chunk_size=1024, chunker="text_chunker"): for paragraph_data in docx_document.read(chunk_size=1024, chunker="text_chunker"):
assert 16 == paragraph_data.word_count, f" 16 != {paragraph_data.word_count = }" assert 16 == paragraph_data.word_count, f" 16 != {paragraph_data.word_count = }"
assert 145 == len(paragraph_data.text), f" 145 != {len(paragraph_data.text) = }" assert 145 == len(paragraph_data.text), f" 145 != {len(paragraph_data.text) = }"
assert ( assert "sentence_end" == paragraph_data.cut_type, (
"sentence_end" == paragraph_data.cut_type f" sentence_end != {paragraph_data.cut_type = }"
), f" sentence_end != {paragraph_data.cut_type = }" )
# TEST CSV # TEST CSV
for paragraph_data in csv_document.read(chunk_size=1024, chunker="text_chunker"): for paragraph_data in csv_document.read(chunk_size=1024, chunker="text_chunker"):
assert 15 == paragraph_data.word_count, f" 15 != {paragraph_data.word_count = }" assert 15 == paragraph_data.word_count, f" 15 != {paragraph_data.word_count = }"
assert ( assert "A A A A A A A A A,A A A A A A,A A" == paragraph_data.text, (
"A A A A A A A A A,A A A A A A,A A" == paragraph_data.text f"Read text doesn't match expected text: {paragraph_data.text}"
), f"Read text doesn't match expected text: {paragraph_data.text}" )
assert ( assert "sentence_cut" == paragraph_data.cut_type, (
"sentence_cut" == paragraph_data.cut_type f" sentence_cut != {paragraph_data.cut_type = }"
), f" sentence_cut != {paragraph_data.cut_type = }" )
# Test XLSX # Test XLSX
for paragraph_data in xlsx_document.read(chunk_size=1024, chunker="text_chunker"): for paragraph_data in xlsx_document.read(chunk_size=1024, chunker="text_chunker"):
assert 36 == paragraph_data.word_count, f" 36 != {paragraph_data.word_count = }" assert 36 == paragraph_data.word_count, f" 36 != {paragraph_data.word_count = }"
assert 171 == len(paragraph_data.text), f" 171 != {len(paragraph_data.text) = }" assert 171 == len(paragraph_data.text), f" 171 != {len(paragraph_data.text) = }"
assert ( assert "sentence_cut" == paragraph_data.cut_type, (
"sentence_cut" == paragraph_data.cut_type f" sentence_cut != {paragraph_data.cut_type = }"
), f" sentence_cut != {paragraph_data.cut_type = }" )

View file

@ -30,9 +30,9 @@ async def test_deduplication():
result = await relational_engine.get_all_data_from_table("data") result = await relational_engine.get_all_data_from_table("data")
assert len(result) == 1, "More than one data entity was found." assert len(result) == 1, "More than one data entity was found."
assert ( assert result[0]["name"] == "Natural_language_processing_copy", (
result[0]["name"] == "Natural_language_processing_copy" "Result name does not match expected value."
), "Result name does not match expected value." )
result = await relational_engine.get_all_data_from_table("datasets") result = await relational_engine.get_all_data_from_table("datasets")
assert len(result) == 2, "Unexpected number of datasets found." assert len(result) == 2, "Unexpected number of datasets found."
@ -61,9 +61,9 @@ async def test_deduplication():
result = await relational_engine.get_all_data_from_table("data") result = await relational_engine.get_all_data_from_table("data")
assert len(result) == 1, "More than one data entity was found." assert len(result) == 1, "More than one data entity was found."
assert ( assert hashlib.md5(text.encode("utf-8")).hexdigest() in result[0]["name"], (
hashlib.md5(text.encode("utf-8")).hexdigest() in result[0]["name"] "Content hash is not a part of file name."
), "Content hash is not a part of file name." )
await cognee.prune.prune_data() await cognee.prune.prune_data()
await cognee.prune.prune_system(metadata=True) await cognee.prune.prune_system(metadata=True)

View file

@ -85,9 +85,9 @@ async def main():
from cognee.infrastructure.databases.relational import get_relational_engine from cognee.infrastructure.databases.relational import get_relational_engine
assert not os.path.exists( assert not os.path.exists(get_relational_engine().db_path), (
get_relational_engine().db_path "SQLite relational database is not empty"
), "SQLite relational database is not empty" )
from cognee.infrastructure.databases.graph import get_graph_config from cognee.infrastructure.databases.graph import get_graph_config

View file

@ -82,9 +82,9 @@ async def main():
from cognee.infrastructure.databases.relational import get_relational_engine from cognee.infrastructure.databases.relational import get_relational_engine
assert not os.path.exists( assert not os.path.exists(get_relational_engine().db_path), (
get_relational_engine().db_path "SQLite relational database is not empty"
), "SQLite relational database is not empty" )
from cognee.infrastructure.databases.graph import get_graph_config from cognee.infrastructure.databases.graph import get_graph_config

View file

@ -24,28 +24,28 @@ async def test_local_file_deletion(data_text, file_location):
data_hash = hashlib.md5(encoded_text).hexdigest() data_hash = hashlib.md5(encoded_text).hexdigest()
# Get data entry from database based on hash contents # Get data entry from database based on hash contents
data = (await session.scalars(select(Data).where(Data.content_hash == data_hash))).one() data = (await session.scalars(select(Data).where(Data.content_hash == data_hash))).one()
assert os.path.isfile( assert os.path.isfile(data.raw_data_location), (
data.raw_data_location f"Data location doesn't exist: {data.raw_data_location}"
), f"Data location doesn't exist: {data.raw_data_location}" )
# Test deletion of data along with local files created by cognee # Test deletion of data along with local files created by cognee
await engine.delete_data_entity(data.id) await engine.delete_data_entity(data.id)
assert not os.path.exists( assert not os.path.exists(data.raw_data_location), (
data.raw_data_location f"Data location still exists after deletion: {data.raw_data_location}"
), f"Data location still exists after deletion: {data.raw_data_location}" )
async with engine.get_async_session() as session: async with engine.get_async_session() as session:
# Get data entry from database based on file path # Get data entry from database based on file path
data = ( data = (
await session.scalars(select(Data).where(Data.raw_data_location == file_location)) await session.scalars(select(Data).where(Data.raw_data_location == file_location))
).one() ).one()
assert os.path.isfile( assert os.path.isfile(data.raw_data_location), (
data.raw_data_location f"Data location doesn't exist: {data.raw_data_location}"
), f"Data location doesn't exist: {data.raw_data_location}" )
# Test local files not created by cognee won't get deleted # Test local files not created by cognee won't get deleted
await engine.delete_data_entity(data.id) await engine.delete_data_entity(data.id)
assert os.path.exists( assert os.path.exists(data.raw_data_location), (
data.raw_data_location f"Data location doesn't exists: {data.raw_data_location}"
), f"Data location doesn't exists: {data.raw_data_location}" )
async def test_getting_of_documents(dataset_name_1): async def test_getting_of_documents(dataset_name_1):
@ -54,16 +54,16 @@ async def test_getting_of_documents(dataset_name_1):
user = await get_default_user() user = await get_default_user()
document_ids = await get_document_ids_for_user(user.id, [dataset_name_1]) document_ids = await get_document_ids_for_user(user.id, [dataset_name_1])
assert ( assert len(document_ids) == 1, (
len(document_ids) == 1 f"Number of expected documents doesn't match {len(document_ids)} != 1"
), f"Number of expected documents doesn't match {len(document_ids)} != 1" )
# Test getting of documents for search when no dataset is provided # Test getting of documents for search when no dataset is provided
user = await get_default_user() user = await get_default_user()
document_ids = await get_document_ids_for_user(user.id) document_ids = await get_document_ids_for_user(user.id)
assert ( assert len(document_ids) == 2, (
len(document_ids) == 2 f"Number of expected documents doesn't match {len(document_ids)} != 2"
), f"Number of expected documents doesn't match {len(document_ids)} != 2" )
async def main(): async def main():

View file

@ -17,9 +17,9 @@ batch_paragraphs_vals = [True, False]
def test_chunk_by_paragraph_isomorphism(input_text, paragraph_length, batch_paragraphs): def test_chunk_by_paragraph_isomorphism(input_text, paragraph_length, batch_paragraphs):
chunks = chunk_by_paragraph(input_text, paragraph_length, batch_paragraphs) chunks = chunk_by_paragraph(input_text, paragraph_length, batch_paragraphs)
reconstructed_text = "".join([chunk["text"] for chunk in chunks]) reconstructed_text = "".join([chunk["text"] for chunk in chunks])
assert ( assert reconstructed_text == input_text, (
reconstructed_text == input_text f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }"
), f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }" )
@pytest.mark.parametrize( @pytest.mark.parametrize(
@ -36,9 +36,9 @@ def test_paragraph_chunk_length(input_text, paragraph_length, batch_paragraphs):
chunk_lengths = np.array([len(list(chunk_by_word(chunk["text"]))) for chunk in chunks]) chunk_lengths = np.array([len(list(chunk_by_word(chunk["text"]))) for chunk in chunks])
larger_chunks = chunk_lengths[chunk_lengths > paragraph_length] larger_chunks = chunk_lengths[chunk_lengths > paragraph_length]
assert np.all( assert np.all(chunk_lengths <= paragraph_length), (
chunk_lengths <= paragraph_length f"{paragraph_length = }: {larger_chunks} are too large"
), f"{paragraph_length = }: {larger_chunks} are too large" )
@pytest.mark.parametrize( @pytest.mark.parametrize(
@ -50,6 +50,6 @@ def test_chunk_by_paragraph_chunk_numbering(input_text, paragraph_length, batch_
data=input_text, paragraph_length=paragraph_length, batch_paragraphs=batch_paragraphs data=input_text, paragraph_length=paragraph_length, batch_paragraphs=batch_paragraphs
) )
chunk_indices = np.array([chunk["chunk_index"] for chunk in chunks]) chunk_indices = np.array([chunk["chunk_index"] for chunk in chunks])
assert np.all( assert np.all(chunk_indices == np.arange(len(chunk_indices))), (
chunk_indices == np.arange(len(chunk_indices)) f"{chunk_indices = } are not monotonically increasing"
), f"{chunk_indices = } are not monotonically increasing" )

View file

@ -58,9 +58,9 @@ def run_chunking_test(test_text, expected_chunks):
for expected_chunks_item, chunk in zip(expected_chunks, chunks): for expected_chunks_item, chunk in zip(expected_chunks, chunks):
for key in ["text", "word_count", "cut_type"]: for key in ["text", "word_count", "cut_type"]:
assert ( assert chunk[key] == expected_chunks_item[key], (
chunk[key] == expected_chunks_item[key] f"{key = }: {chunk[key] = } != {expected_chunks_item[key] = }"
), f"{key = }: {chunk[key] = } != {expected_chunks_item[key] = }" )
def test_chunking_whole_text(): def test_chunking_whole_text():

View file

@ -16,9 +16,9 @@ maximum_length_vals = [None, 8, 64]
def test_chunk_by_sentence_isomorphism(input_text, maximum_length): def test_chunk_by_sentence_isomorphism(input_text, maximum_length):
chunks = chunk_by_sentence(input_text, maximum_length) chunks = chunk_by_sentence(input_text, maximum_length)
reconstructed_text = "".join([chunk[1] for chunk in chunks]) reconstructed_text = "".join([chunk[1] for chunk in chunks])
assert ( assert reconstructed_text == input_text, (
reconstructed_text == input_text f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }"
), f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }" )
@pytest.mark.parametrize( @pytest.mark.parametrize(
@ -36,6 +36,6 @@ def test_paragraph_chunk_length(input_text, maximum_length):
chunk_lengths = np.array([len(list(chunk_by_word(chunk[1]))) for chunk in chunks]) chunk_lengths = np.array([len(list(chunk_by_word(chunk[1]))) for chunk in chunks])
larger_chunks = chunk_lengths[chunk_lengths > maximum_length] larger_chunks = chunk_lengths[chunk_lengths > maximum_length]
assert np.all( assert np.all(chunk_lengths <= maximum_length), (
chunk_lengths <= maximum_length f"{maximum_length = }: {larger_chunks} are too large"
), f"{maximum_length = }: {larger_chunks} are too large" )

View file

@ -17,9 +17,9 @@ from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS
def test_chunk_by_word_isomorphism(input_text): def test_chunk_by_word_isomorphism(input_text):
chunks = chunk_by_word(input_text) chunks = chunk_by_word(input_text)
reconstructed_text = "".join([chunk[0] for chunk in chunks]) reconstructed_text = "".join([chunk[0] for chunk in chunks])
assert ( assert reconstructed_text == input_text, (
reconstructed_text == input_text f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }"
), f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }" )
@pytest.mark.parametrize( @pytest.mark.parametrize(