From ac5fe4761be7e02dd2ad131056e77c5acb595c29 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Wed, 15 Oct 2025 10:08:10 +0200 Subject: [PATCH 1/4] test: Add entity extraction test --- .../entity_extraction_test.py | 96 +++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 cognee/tests/tasks/entity_extraction/entity_extraction_test.py diff --git a/cognee/tests/tasks/entity_extraction/entity_extraction_test.py b/cognee/tests/tasks/entity_extraction/entity_extraction_test.py new file mode 100644 index 000000000..c63ecfaa1 --- /dev/null +++ b/cognee/tests/tasks/entity_extraction/entity_extraction_test.py @@ -0,0 +1,96 @@ +import os +import pathlib +import asyncio + +import cognee +import cognee.modules.ingestion as ingestion +from cognee.infrastructure.llm import get_max_chunk_tokens +from cognee.infrastructure.llm.extraction import extract_content_graph +from cognee.modules.chunking.TextChunker import TextChunker +from cognee.modules.data.processing.document_types import TextDocument +from cognee.modules.users.methods import get_default_user +from cognee.shared.data_models import KnowledgeGraph +from cognee.tasks.documents import extract_chunks_from_documents +from cognee.tasks.ingestion import save_data_item_to_storage +from cognee.infrastructure.files.utils.open_data_file import open_data_file + + +async def extract_graphs(document_chunks): + """ + Extract graph, and check if entities are present + """ + + extraction_results = await asyncio.gather( + *[ + extract_content_graph(chunk.text, KnowledgeGraph) + for chunk in document_chunks + ] + ) + + return all( + any(term in node.name.lower() + for extraction_result in extraction_results + for node in extraction_result.nodes) + for term in ("qubit", "algorithm", "superposition") + ) + +async def main(): + """ + Test how well the entity extraction works. Repeat graph generation a few times. + If 80% or more graphs are correctly generated, the test passes. + """ + + file_path = os.path.join( + pathlib.Path(__file__).parent.parent.parent, "test_data/Quantum_computers.txt" + ) + + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + + + await cognee.add("NLP is a subfield of computer science.") + + original_file_path = await save_data_item_to_storage(file_path) + + async with open_data_file(original_file_path) as file: + classified_data = ingestion.classify(file) + + # data_id is the hash of original file contents + owner id to avoid duplicate data + data_id = ingestion.identify(classified_data, await get_default_user()) + + await cognee.add(file_path) + + text_document = TextDocument( + id=data_id, + type="text", + mime_type="text/plain", + name="quantum_text", + raw_data_location=file_path, + external_metadata=None + ) + + document_chunks = [] + async for chunk in extract_chunks_from_documents( + [text_document], + max_chunk_size=get_max_chunk_tokens(), + chunker=TextChunker + ): + document_chunks.append(chunk) + + + number_of_reps = 5 + + graph_results = await asyncio.gather( + *[ + extract_graphs(document_chunks) + for _ in range(number_of_reps) + ] + ) + + + correct_graphs = [result for result in graph_results if result] + + assert len(correct_graphs) >= 0.8 * number_of_reps + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file From 6cb54c94f1ccb19918466603f092f5f24c9be837 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Wed, 15 Oct 2025 10:08:40 +0200 Subject: [PATCH 2/4] chore: Format --- .../entity_extraction_test.py | 41 ++++++++----------- 1 file changed, 17 insertions(+), 24 deletions(-) diff --git a/cognee/tests/tasks/entity_extraction/entity_extraction_test.py b/cognee/tests/tasks/entity_extraction/entity_extraction_test.py index c63ecfaa1..39e883e09 100644 --- a/cognee/tests/tasks/entity_extraction/entity_extraction_test.py +++ b/cognee/tests/tasks/entity_extraction/entity_extraction_test.py @@ -17,27 +17,27 @@ from cognee.infrastructure.files.utils.open_data_file import open_data_file async def extract_graphs(document_chunks): """ - Extract graph, and check if entities are present + Extract graph, and check if entities are present """ extraction_results = await asyncio.gather( - *[ - extract_content_graph(chunk.text, KnowledgeGraph) - for chunk in document_chunks - ] + *[extract_content_graph(chunk.text, KnowledgeGraph) for chunk in document_chunks] ) return all( - any(term in node.name.lower() - for extraction_result in extraction_results - for node in extraction_result.nodes) - for term in ("qubit", "algorithm", "superposition") + any( + term in node.name.lower() + for extraction_result in extraction_results + for node in extraction_result.nodes ) + for term in ("qubit", "algorithm", "superposition") + ) + async def main(): """ - Test how well the entity extraction works. Repeat graph generation a few times. - If 80% or more graphs are correctly generated, the test passes. + Test how well the entity extraction works. Repeat graph generation a few times. + If 80% or more graphs are correctly generated, the test passes. """ file_path = os.path.join( @@ -47,7 +47,6 @@ async def main(): await cognee.prune.prune_data() await cognee.prune.prune_system(metadata=True) - await cognee.add("NLP is a subfield of computer science.") original_file_path = await save_data_item_to_storage(file_path) @@ -66,31 +65,25 @@ async def main(): mime_type="text/plain", name="quantum_text", raw_data_location=file_path, - external_metadata=None + external_metadata=None, ) document_chunks = [] async for chunk in extract_chunks_from_documents( - [text_document], - max_chunk_size=get_max_chunk_tokens(), - chunker=TextChunker + [text_document], max_chunk_size=get_max_chunk_tokens(), chunker=TextChunker ): - document_chunks.append(chunk) - + document_chunks.append(chunk) number_of_reps = 5 graph_results = await asyncio.gather( - *[ - extract_graphs(document_chunks) - for _ in range(number_of_reps) - ] + *[extract_graphs(document_chunks) for _ in range(number_of_reps)] ) - correct_graphs = [result for result in graph_results if result] assert len(correct_graphs) >= 0.8 * number_of_reps + if __name__ == "__main__": - asyncio.run(main()) \ No newline at end of file + asyncio.run(main()) From e0663baba4ad5034f3671a5729098f31bc9f4a47 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Wed, 15 Oct 2025 18:17:06 +0200 Subject: [PATCH 3/4] test: Add test to e2e workflow --- .github/workflows/e2e_tests.yml | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/.github/workflows/e2e_tests.yml b/.github/workflows/e2e_tests.yml index 3fe7a7992..3de3fbb16 100644 --- a/.github/workflows/e2e_tests.yml +++ b/.github/workflows/e2e_tests.yml @@ -264,3 +264,31 @@ jobs: EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} run: uv run python ./cognee/tests/test_edge_ingestion.py + + test-entity-extraction: + name: Test Entity Extraction + runs-on: ubuntu-22.04 + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Cognee Setup + uses: ./.github/actions/cognee_setup + with: + python-version: '3.11.x' + + - name: Dependencies already installed + run: echo "Dependencies already installed in setup" + + - name: Run Entity Extraction Test + env: + ENV: 'dev' + LLM_MODEL: 'gpt-5-2025-08-07' + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} + run: uv run python ./cognee/tests/tasks/entity_extraction/entity_extraction_test.py From e8523bf4aaed7bbaa57d8ce4cdf61b52759704e3 Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Tue, 21 Oct 2025 13:00:42 +0200 Subject: [PATCH 4/4] test: Add entity extraction test. Minor checks and fixes. --- .github/workflows/e2e_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/e2e_tests.yml b/.github/workflows/e2e_tests.yml index 1ad688bed..9548ef493 100644 --- a/.github/workflows/e2e_tests.yml +++ b/.github/workflows/e2e_tests.yml @@ -346,7 +346,7 @@ jobs: - name: Run Entity Extraction Test env: ENV: 'dev' - LLM_MODEL: 'gpt-5-2025-08-07' + LLM_MODEL: ${{ secrets.LLM_MODEL }} LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} LLM_API_KEY: ${{ secrets.LLM_API_KEY }} LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}