From ac5fe4761be7e02dd2ad131056e77c5acb595c29 Mon Sep 17 00:00:00 2001
From: Andrej Milicevic <milicevi@Andrejs-MacBook-Pro.local>
Date: Wed, 15 Oct 2025 10:08:10 +0200
Subject: [PATCH 1/4] test: Add entity extraction test

---
 .../entity_extraction_test.py                 | 96 +++++++++++++++++++
 1 file changed, 96 insertions(+)
 create mode 100644 cognee/tests/tasks/entity_extraction/entity_extraction_test.py

diff --git a/cognee/tests/tasks/entity_extraction/entity_extraction_test.py b/cognee/tests/tasks/entity_extraction/entity_extraction_test.py
new file mode 100644
index 000000000..c63ecfaa1
--- /dev/null
+++ b/cognee/tests/tasks/entity_extraction/entity_extraction_test.py
@@ -0,0 +1,96 @@
+import os
+import pathlib
+import asyncio
+
+import cognee
+import cognee.modules.ingestion as ingestion
+from cognee.infrastructure.llm import get_max_chunk_tokens
+from cognee.infrastructure.llm.extraction import extract_content_graph
+from cognee.modules.chunking.TextChunker import TextChunker
+from cognee.modules.data.processing.document_types import TextDocument
+from cognee.modules.users.methods import get_default_user
+from cognee.shared.data_models import KnowledgeGraph
+from cognee.tasks.documents import extract_chunks_from_documents
+from cognee.tasks.ingestion import save_data_item_to_storage
+from cognee.infrastructure.files.utils.open_data_file import open_data_file
+
+
+async def extract_graphs(document_chunks):
+    """
+        Extract graph, and check if entities are present
+    """
+
+    extraction_results = await asyncio.gather(
+        *[
+            extract_content_graph(chunk.text, KnowledgeGraph)
+            for chunk in document_chunks
+        ]
+    )
+
+    return all(
+            any(term in node.name.lower()
+                for extraction_result in extraction_results
+                for node in extraction_result.nodes)
+            for term in ("qubit", "algorithm", "superposition")
+        )
+
+async def main():
+    """
+        Test how well the entity extraction works. Repeat graph generation a few times.
+        If 80% or more graphs are correctly generated, the test passes.
+    """
+
+    file_path = os.path.join(
+        pathlib.Path(__file__).parent.parent.parent, "test_data/Quantum_computers.txt"
+    )
+
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+
+
+    await cognee.add("NLP is a subfield of computer science.")
+
+    original_file_path = await save_data_item_to_storage(file_path)
+
+    async with open_data_file(original_file_path) as file:
+        classified_data = ingestion.classify(file)
+
+        # data_id is a hash of the original file contents and the owner id, to avoid duplicates
+        data_id = ingestion.identify(classified_data, await get_default_user())
+
+    await cognee.add(file_path)
+
+    text_document = TextDocument(
+        id=data_id,
+        type="text",
+        mime_type="text/plain",
+        name="quantum_text",
+        raw_data_location=file_path,
+        external_metadata=None
+    )
+
+    document_chunks = []
+    async for chunk in extract_chunks_from_documents(
+        [text_document],
+        max_chunk_size=get_max_chunk_tokens(),
+        chunker=TextChunker
+    ):
+         document_chunks.append(chunk)
+
+
+    number_of_reps = 5
+
+    graph_results = await asyncio.gather(
+        *[
+            extract_graphs(document_chunks)
+            for _ in range(number_of_reps)
+        ]
+    )
+
+
+    correct_graphs = [result for result in graph_results if result]
+
+    assert len(correct_graphs) >= 0.8 * number_of_reps
+
+if __name__ == "__main__":
+    asyncio.run(main())
\ No newline at end of file

From 6cb54c94f1ccb19918466603f092f5f24c9be837 Mon Sep 17 00:00:00 2001
From: Andrej Milicevic <milicevi@Andrejs-MacBook-Pro.local>
Date: Wed, 15 Oct 2025 10:08:40 +0200
Subject: [PATCH 2/4] chore: Format

---
 .../entity_extraction_test.py                 | 41 ++++++++-----------
 1 file changed, 17 insertions(+), 24 deletions(-)

diff --git a/cognee/tests/tasks/entity_extraction/entity_extraction_test.py b/cognee/tests/tasks/entity_extraction/entity_extraction_test.py
index c63ecfaa1..39e883e09 100644
--- a/cognee/tests/tasks/entity_extraction/entity_extraction_test.py
+++ b/cognee/tests/tasks/entity_extraction/entity_extraction_test.py
@@ -17,27 +17,27 @@ from cognee.infrastructure.files.utils.open_data_file import open_data_file
 
 async def extract_graphs(document_chunks):
     """
-        Extract graph, and check if entities are present
+    Extract a graph from each chunk and check that the expected entities are present.
     """
 
     extraction_results = await asyncio.gather(
-        *[
-            extract_content_graph(chunk.text, KnowledgeGraph)
-            for chunk in document_chunks
-        ]
+        *[extract_content_graph(chunk.text, KnowledgeGraph) for chunk in document_chunks]
     )
 
     return all(
-            any(term in node.name.lower()
-                for extraction_result in extraction_results
-                for node in extraction_result.nodes)
-            for term in ("qubit", "algorithm", "superposition")
+        any(
+            term in node.name.lower()
+            for extraction_result in extraction_results
+            for node in extraction_result.nodes
         )
+        for term in ("qubit", "algorithm", "superposition")
+    )
+
 
 async def main():
     """
-        Test how well the entity extraction works. Repeat graph generation a few times.
-        If 80% or more graphs are correctly generated, the test passes.
+    Test how well entity extraction works by repeating graph generation several times.
+    The test passes if at least 80% of the generated graphs contain the expected entities.
     """
 
     file_path = os.path.join(
@@ -47,7 +47,6 @@ async def main():
     await cognee.prune.prune_data()
     await cognee.prune.prune_system(metadata=True)
 
-
     await cognee.add("NLP is a subfield of computer science.")
 
     original_file_path = await save_data_item_to_storage(file_path)
@@ -66,31 +65,25 @@ async def main():
         mime_type="text/plain",
         name="quantum_text",
         raw_data_location=file_path,
-        external_metadata=None
+        external_metadata=None,
     )
 
     document_chunks = []
     async for chunk in extract_chunks_from_documents(
-        [text_document],
-        max_chunk_size=get_max_chunk_tokens(),
-        chunker=TextChunker
+        [text_document], max_chunk_size=get_max_chunk_tokens(), chunker=TextChunker
     ):
-         document_chunks.append(chunk)
-
+        document_chunks.append(chunk)
 
     number_of_reps = 5
 
     graph_results = await asyncio.gather(
-        *[
-            extract_graphs(document_chunks)
-            for _ in range(number_of_reps)
-        ]
+        *[extract_graphs(document_chunks) for _ in range(number_of_reps)]
     )
 
-
     correct_graphs = [result for result in graph_results if result]
 
     assert len(correct_graphs) >= 0.8 * number_of_reps
 
+
 if __name__ == "__main__":
-    asyncio.run(main())
\ No newline at end of file
+    asyncio.run(main())

From e0663baba4ad5034f3671a5729098f31bc9f4a47 Mon Sep 17 00:00:00 2001
From: Andrej Milicevic <milicevi@Andrejs-MacBook-Pro.local>
Date: Wed, 15 Oct 2025 18:17:06 +0200
Subject: [PATCH 3/4] test: Add test to e2e workflow

---
 .github/workflows/e2e_tests.yml | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/.github/workflows/e2e_tests.yml b/.github/workflows/e2e_tests.yml
index 3fe7a7992..3de3fbb16 100644
--- a/.github/workflows/e2e_tests.yml
+++ b/.github/workflows/e2e_tests.yml
@@ -264,3 +264,31 @@ jobs:
           EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
           EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
         run: uv run python ./cognee/tests/test_edge_ingestion.py
+
+  test-entity-extraction:
+    name: Test Entity Extraction
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@v4
+
+      - name: Cognee Setup
+        uses: ./.github/actions/cognee_setup
+        with:
+          python-version: '3.11.x'
+
+      - name: Dependencies already installed
+        run: echo "Dependencies already installed in setup"
+
+      - name: Run Entity Extraction Test
+        env:
+          ENV: 'dev'
+          LLM_MODEL: 'gpt-5-2025-08-07'
+          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+          EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+          EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+          EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+          EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
+        run: uv run python ./cognee/tests/tasks/entity_extraction/entity_extraction_test.py

From e8523bf4aaed7bbaa57d8ce4cdf61b52759704e3 Mon Sep 17 00:00:00 2001
From: Andrej Milicevic <milicevi@Andrejs-MacBook-Pro.local>
Date: Tue, 21 Oct 2025 13:00:42 +0200
Subject: [PATCH 4/4] test: Add entity extraction test. Minor checks and fixes.

---
 .github/workflows/e2e_tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/e2e_tests.yml b/.github/workflows/e2e_tests.yml
index 1ad688bed..9548ef493 100644
--- a/.github/workflows/e2e_tests.yml
+++ b/.github/workflows/e2e_tests.yml
@@ -346,7 +346,7 @@ jobs:
       - name: Run Entity Extraction Test
         env:
           ENV: 'dev'
-          LLM_MODEL: 'gpt-5-2025-08-07'
+          LLM_MODEL: ${{ secrets.LLM_MODEL }}
           LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
           LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
           LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}