diff --git a/.github/workflows/e2e_tests.yml b/.github/workflows/e2e_tests.yml
index 05c2b055e..ba7dfdce7 100644
--- a/.github/workflows/e2e_tests.yml
+++ b/.github/workflows/e2e_tests.yml
@@ -265,7 +265,7 @@ jobs:
         run: |
           poetry install
 
-      - name: Run parallel databases test
+      - name: Run permissions test
         env:
           ENV: 'dev'
           LLM_MODEL: ${{ secrets.LLM_MODEL }}
@@ -277,3 +277,32 @@ jobs:
           EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
           EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
         run: poetry run python ./cognee/tests/test_permissions.py
+
+  test-knowledge-graph-quality:
+    name: Test Knowledge Graph Quality with GPT-4o
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@v4
+
+      - name: Cognee Setup
+        uses: ./.github/actions/cognee_setup
+        with:
+          python-version: '3.11.x'
+
+      - name: Install dependencies
+        run: |
+          poetry install
+
+      - name: Run Knowledge Graph Quality Test
+        env:
+          ENV: 'dev'
+          LLM_MODEL: 'gpt-4o'
+          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+          EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+          EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+          EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+          EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
+        run: poetry run python ./cognee/tests/test_knowledge_graph_quality.py
diff --git a/cognee/tests/test_knowledge_graph_quality.py b/cognee/tests/test_knowledge_graph_quality.py
new file mode 100644
index 000000000..dd4b610ac
--- /dev/null
+++ b/cognee/tests/test_knowledge_graph_quality.py
@@ -0,0 +1,286 @@
+import os
+import asyncio
+import cognee
+import pathlib
+from cognee.modules.search.types import SearchType
+from cognee.modules.users.methods import get_default_user
+from cognee.shared.logging_utils import get_logger
+
+logger = get_logger()
+
+
+async def test_knowledge_graph_quality_with_gpt4o():
+    """
+    Test that verifies all main concepts and entities from a specific document are found
+    in the knowledge graph using GPT-4o model for high-quality entity extraction.
+
+    This test addresses the issue where HotPotQA questions may not reflect diminishing
+    quality of knowledge graph creation after data model changes.
+    """
+
+    # Configure GPT-4o for best quality
+    os.environ["LLM_MODEL"] = "gpt-4o"
+    cognee.config.set_llm_model("gpt-4o")
+
+    # Ensure we have API key
+    if not os.environ.get("LLM_API_KEY"):
+        raise ValueError("LLM_API_KEY must be set for this test")
+
+    # Set up test directories
+    data_directory_path = str(
+        pathlib.Path(
+            os.path.join(pathlib.Path(__file__).parent, ".data_storage/test_kg_quality")
+        ).resolve()
+    )
+    cognee_directory_path = str(
+        pathlib.Path(
+            os.path.join(pathlib.Path(__file__).parent, ".cognee_system/test_kg_quality")
+        ).resolve()
+    )
+
+    cognee.config.data_root_directory(data_directory_path)
+    cognee.config.system_root_directory(cognee_directory_path)
+
+    # Clean up before starting
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+
+    # Get test document path
+    test_document_path = os.path.join(
+        pathlib.Path(__file__).parent, "test_data/Natural_language_processing.txt"
+    )
+
+    # Expected entities and concepts from the NLP document
+    expected_entities = [
+        "Natural language processing",
+        "NLP",
+        "computer science",
+        "information retrieval",
+        "machine learning",
+        "neural network",
+        "speech recognition",
+        "natural-language understanding",
+        "natural-language generation",
+        "theoretical linguistics",
+        "text corpora",
+        "speech corpora",
+        "statistical approaches",
+        "probabilistic approaches",
+        "rule-based approaches",
+        "documents",
+        "language",
+        "computers",
+    ]
+
+    expected_concepts = [
+        "NLP is a subfield of computer science",
+        "NLP is interdisciplinary",
+        "NLP involves processing natural language datasets",
+        "NLP uses machine learning approaches",
+        "NLP borrows ideas from theoretical linguistics",
+        "NLP can extract information from documents",
+        "NLP can categorize and organize documents",
+        "NLP involves speech recognition",
+        "NLP involves natural-language understanding",
+        "NLP involves natural-language generation",
+        "computers can understand document contents",
+        "neural networks are used in NLP",
+        "statistical approaches are used in NLP",
+    ]
+
+    print("=" * 80)
+    print("KNOWLEDGE GRAPH QUALITY TEST WITH GPT-4o")
+    print("=" * 80)
+    print(f"Using model: {os.environ.get('LLM_MODEL', 'gpt-4o')}")
+    print(f"Test document: {test_document_path}")
+    print()
+
+    # Add and process the document
+    print("Adding document to cognee...")
+    await cognee.add([test_document_path], dataset_name="NLP_TEST")
+
+    user = await get_default_user()
+
+    print("Processing document with cognify...")
+    await cognee.cognify(["NLP_TEST"], user=user)
+    print("Document processing completed.")
+    print()
+
+    # Test different search types to find entities and concepts
+    search_types_to_test = [
+        (SearchType.INSIGHTS, "Get entity relationships and connections"),
+        (SearchType.GRAPH_COMPLETION, "Natural language completion with graph context"),
+        (SearchType.CHUNKS, "Find relevant document chunks"),
+        (SearchType.SUMMARIES, "Get content summaries"),
+    ]
+
+    all_found_results = {}
+
+    for search_type, description in search_types_to_test:
+        print(f"Testing {search_type.value} search - {description}")
+        print("-" * 60)
+
+        # Search for entities
+        entity_results = await cognee.search(
+            query_type=search_type,
+            query_text="What are the main entities, concepts, and terms mentioned in this document?",
+            user=user,
+            top_k=20,
+        )
+
+        # Search for relationships
+        relationship_results = await cognee.search(
+            query_type=search_type,
+            query_text="What are the key relationships and connections between concepts in this document?",
+            user=user,
+            top_k=20,
+        )
+
+        all_found_results[search_type.value] = {
+            "entities": entity_results,
+            "relationships": relationship_results,
+        }
+
+        print(f"Entity search results ({len(entity_results)} items):")
+        for i, result in enumerate(entity_results[:3]):  # Show first 3 results
+            print(f"  {i + 1}. {result}")
+
+        print(f"Relationship search results ({len(relationship_results)} items):")
+        for i, result in enumerate(relationship_results[:3]):  # Show first 3 results
+            print(f"  {i + 1}. {result}")
+        print()
+
+    # Analyze results and check for expected entities and concepts
+    print("ANALYSIS: Expected vs Found")
+    print("=" * 80)
+
+    # Combine all results into a single text for analysis
+    all_results_text = ""
+    for search_type, results in all_found_results.items():
+        for result_type, result_list in results.items():
+            all_results_text += f" {' '.join(str(r) for r in result_list)}"
+
+    all_results_text = all_results_text.lower()
+
+    print("ENTITY ANALYSIS:")
+    print("-" * 40)
+    found_entities = []
+    missing_entities = []
+
+    for entity in expected_entities:
+        entity_lower = entity.lower()
+        # Check if entity or its variations are found
+        if (
+            entity_lower in all_results_text
+            or entity_lower.replace("-", " ") in all_results_text
+            or entity_lower.replace(" ", "-") in all_results_text
+        ):
+            found_entities.append(entity)
+            print(f"✓ FOUND: {entity}")
+        else:
+            missing_entities.append(entity)
+            print(f"✗ MISSING: {entity}")
+
+    print()
+    print("CONCEPT ANALYSIS:")
+    print("-" * 40)
+    found_concepts = []
+    missing_concepts = []
+
+    for concept in expected_concepts:
+        concept_lower = concept.lower()
+        # Check if key parts of the concept are found
+        concept_words = concept_lower.split()
+        key_words = [
+            word
+            for word in concept_words
+            if len(word) > 2
+            and word not in ["the", "and", "are", "can", "involves", "uses", "from"]
+        ]
+
+        if len(key_words) > 0:
+            found_key_words = sum(1 for word in key_words if word in all_results_text)
+            coverage = found_key_words / len(key_words)
+
+            if coverage >= 0.6:  # At least 60% of key words found
+                found_concepts.append(concept)
+                print(f"✓ FOUND: {concept} (coverage: {coverage:.1%})")
+            else:
+                missing_concepts.append(concept)
+                print(f"✗ MISSING: {concept} (coverage: {coverage:.1%})")
+        else:
+            missing_concepts.append(concept)
+            print(f"✗ MISSING: {concept} (no key words)")
+
+    print()
+    print("SUMMARY:")
+    print("=" * 40)
+    print(f"Expected entities: {len(expected_entities)}")
+    print(f"Found entities: {len(found_entities)}")
+    print(f"Missing entities: {len(missing_entities)}")
+    print(f"Entity coverage: {len(found_entities) / len(expected_entities):.1%}")
+    print()
+    print(f"Expected concepts: {len(expected_concepts)}")
+    print(f"Found concepts: {len(found_concepts)}")
+    print(f"Missing concepts: {len(missing_concepts)}")
+    print(f"Concept coverage: {len(found_concepts) / len(expected_concepts):.1%}")
+    print()
+
+    # Test assertions
+    entity_coverage = len(found_entities) / len(expected_entities)
+    concept_coverage = len(found_concepts) / len(expected_concepts)
+
+    print("QUALITY ASSESSMENT:")
+    print("-" * 40)
+
+    # We expect high coverage with GPT-4o
+    min_entity_coverage = 0.70  # At least 70% of entities should be found
+    min_concept_coverage = 0.60  # At least 60% of concepts should be found
+
+    if entity_coverage >= min_entity_coverage:
+        print(
+            f"✓ PASS: Entity coverage ({entity_coverage:.1%}) meets minimum requirement ({min_entity_coverage:.1%})"
+        )
+    else:
+        print(
+            f"✗ FAIL: Entity coverage ({entity_coverage:.1%}) below minimum requirement ({min_entity_coverage:.1%})"
+        )
+
+    if concept_coverage >= min_concept_coverage:
+        print(
+            f"✓ PASS: Concept coverage ({concept_coverage:.1%}) meets minimum requirement ({min_concept_coverage:.1%})"
+        )
+    else:
+        print(
+            f"✗ FAIL: Concept coverage ({concept_coverage:.1%}) below minimum requirement ({min_concept_coverage:.1%})"
+        )
+
+    overall_quality = (entity_coverage + concept_coverage) / 2
+    print(f"Overall quality score: {overall_quality:.1%}")
+
+    # Assert that we have acceptable quality
+    assert entity_coverage >= min_entity_coverage, (
+        f"Entity coverage {entity_coverage:.1%} below minimum {min_entity_coverage:.1%}"
+    )
+    assert concept_coverage >= min_concept_coverage, (
+        f"Concept coverage {concept_coverage:.1%} below minimum {min_concept_coverage:.1%}"
+    )
+
+    print()
+    print("=" * 80)
+    print("KNOWLEDGE GRAPH QUALITY TEST COMPLETED SUCCESSFULLY")
+    print("=" * 80)
+
+    return {
+        "entity_coverage": entity_coverage,
+        "concept_coverage": concept_coverage,
+        "overall_quality": overall_quality,
+        "found_entities": found_entities,
+        "missing_entities": missing_entities,
+        "found_concepts": found_concepts,
+        "missing_concepts": missing_concepts,
+    }
+
+
+if __name__ == "__main__":
+    asyncio.run(test_knowledge_graph_quality_with_gpt4o())
diff --git a/cognee/tests/test_permissions.py b/cognee/tests/test_permissions.py
index ddf07f26d..b83c150ce 100644
--- a/cognee/tests/test_permissions.py
+++ b/cognee/tests/test_permissions.py
@@ -9,279 +9,6 @@ from cognee.modules.users.methods import get_default_user, create_user
 from cognee.modules.users.permissions.methods import authorized_give_permission_on_datasets
 
 
-async def test_knowledge_graph_quality_with_gpt4o():
-    """
-    Test that verifies all main concepts and entities from a specific document are found
-    in the knowledge graph using GPT-4o model for high-quality entity extraction.
-
-    This test addresses the issue where HotPotQA questions may not reflect diminishing
-    quality of knowledge graph creation after data model changes.
-    """
-
-    # Configure GPT-4o for best quality
-    os.environ["LLM_MODEL"] = "gpt-4o"
-    cognee.config.set_llm_model("gpt-4o")
-
-    # Ensure we have API key
-    if not os.environ.get("LLM_API_KEY"):
-        raise ValueError("LLM_API_KEY must be set for this test")
-
-    # Set up test directories
-    data_directory_path = str(
-        pathlib.Path(
-            os.path.join(pathlib.Path(__file__).parent, ".data_storage/test_kg_quality")
-        ).resolve()
-    )
-    cognee_directory_path = str(
-        pathlib.Path(
-            os.path.join(pathlib.Path(__file__).parent, ".cognee_system/test_kg_quality")
-        ).resolve()
-    )
-
-    cognee.config.data_root_directory(data_directory_path)
-    cognee.config.system_root_directory(cognee_directory_path)
-
-    # Clean up before starting
-    await cognee.prune.prune_data()
-    await cognee.prune.prune_system(metadata=True)
-
-    # Get test document path
-    test_document_path = os.path.join(
-        pathlib.Path(__file__).parent, "test_data/Natural_language_processing.txt"
-    )
-
-    # Expected entities and concepts from the NLP document
-    expected_entities = [
-        "Natural language processing",
-        "NLP",
-        "computer science",
-        "information retrieval",
-        "machine learning",
-        "neural network",
-        "speech recognition",
-        "natural-language understanding",
-        "natural-language generation",
-        "theoretical linguistics",
-        "text corpora",
-        "speech corpora",
-        "statistical approaches",
-        "probabilistic approaches",
-        "rule-based approaches",
-        "documents",
-        "language",
-        "computers",
-    ]
-
-    expected_concepts = [
-        "NLP is a subfield of computer science",
-        "NLP is interdisciplinary",
-        "NLP involves processing natural language datasets",
-        "NLP uses machine learning approaches",
-        "NLP borrows ideas from theoretical linguistics",
-        "NLP can extract information from documents",
-        "NLP can categorize and organize documents",
-        "NLP involves speech recognition",
-        "NLP involves natural-language understanding",
-        "NLP involves natural-language generation",
-        "computers can understand document contents",
-        "neural networks are used in NLP",
-        "statistical approaches are used in NLP",
-    ]
-
-    print("=" * 80)
-    print("KNOWLEDGE GRAPH QUALITY TEST WITH GPT-4o")
-    print("=" * 80)
-    print(f"Using model: {os.environ.get('LLM_MODEL', 'gpt-4o')}")
-    print(f"Test document: {test_document_path}")
-    print()
-
-    # Add and process the document
-    print("Adding document to cognee...")
-    await cognee.add([test_document_path], dataset_name="NLP_TEST")
-
-    user = await get_default_user()
-
-    print("Processing document with cognify...")
-    await cognee.cognify(["NLP_TEST"], user=user)
-    print("Document processing completed.")
-    print()
-
-    # Test different search types to find entities and concepts
-    search_types_to_test = [
-        (SearchType.INSIGHTS, "Get entity relationships and connections"),
-        (SearchType.GRAPH_COMPLETION, "Natural language completion with graph context"),
-        (SearchType.CHUNKS, "Find relevant document chunks"),
-        (SearchType.SUMMARIES, "Get content summaries"),
-    ]
-
-    all_found_results = {}
-
-    for search_type, description in search_types_to_test:
-        print(f"Testing {search_type.value} search - {description}")
-        print("-" * 60)
-
-        # Search for entities
-        entity_results = await cognee.search(
-            query_type=search_type,
-            query_text="What are the main entities, concepts, and terms mentioned in this document?",
-            user=user,
-            top_k=20,
-        )
-
-        # Search for relationships
-        relationship_results = await cognee.search(
-            query_type=search_type,
-            query_text="What are the key relationships and connections between concepts in this document?",
-            user=user,
-            top_k=20,
-        )
-
-        all_found_results[search_type.value] = {
-            "entities": entity_results,
-            "relationships": relationship_results,
-        }
-
-        print(f"Entity search results ({len(entity_results)} items):")
-        for i, result in enumerate(entity_results[:3]):  # Show first 3 results
-            print(f"  {i + 1}. {result}")
-
-        print(f"Relationship search results ({len(relationship_results)} items):")
-        for i, result in enumerate(relationship_results[:3]):  # Show first 3 results
-            print(f"  {i + 1}. {result}")
-        print()
-
-    # Analyze results and check for expected entities and concepts
-    print("ANALYSIS: Expected vs Found")
-    print("=" * 80)
-
-    # Combine all results into a single text for analysis
-    all_results_text = ""
-    for search_type, results in all_found_results.items():
-        for result_type, result_list in results.items():
-            all_results_text += f" {' '.join(str(r) for r in result_list)}"
-
-    all_results_text = all_results_text.lower()
-
-    print("ENTITY ANALYSIS:")
-    print("-" * 40)
-    found_entities = []
-    missing_entities = []
-
-    for entity in expected_entities:
-        entity_lower = entity.lower()
-        # Check if entity or its variations are found
-        if (
-            entity_lower in all_results_text
-            or entity_lower.replace("-", " ") in all_results_text
-            or entity_lower.replace(" ", "-") in all_results_text
-        ):
-            found_entities.append(entity)
-            print(f"✓ FOUND: {entity}")
-        else:
-            missing_entities.append(entity)
-            print(f"✗ MISSING: {entity}")
-
-    print()
-    print("CONCEPT ANALYSIS:")
-    print("-" * 40)
-    found_concepts = []
-    missing_concepts = []
-
-    for concept in expected_concepts:
-        concept_lower = concept.lower()
-        # Check if key parts of the concept are found
-        concept_words = concept_lower.split()
-        key_words = [
-            word
-            for word in concept_words
-            if len(word) > 2
-            and word not in ["the", "and", "are", "can", "involves", "uses", "from"]
-        ]
-
-        if len(key_words) > 0:
-            found_key_words = sum(1 for word in key_words if word in all_results_text)
-            coverage = found_key_words / len(key_words)
-
-            if coverage >= 0.6:  # At least 60% of key words found
-                found_concepts.append(concept)
-                print(f"✓ FOUND: {concept} (coverage: {coverage:.1%})")
-            else:
-                missing_concepts.append(concept)
-                print(f"✗ MISSING: {concept} (coverage: {coverage:.1%})")
-        else:
-            missing_concepts.append(concept)
-            print(f"✗ MISSING: {concept} (no key words)")
-
-    print()
-    print("SUMMARY:")
-    print("=" * 40)
-    print(f"Expected entities: {len(expected_entities)}")
-    print(f"Found entities: {len(found_entities)}")
-    print(f"Missing entities: {len(missing_entities)}")
-    print(f"Entity coverage: {len(found_entities) / len(expected_entities):.1%}")
-    print()
-    print(f"Expected concepts: {len(expected_concepts)}")
-    print(f"Found concepts: {len(found_concepts)}")
-    print(f"Missing concepts: {len(missing_concepts)}")
-    print(f"Concept coverage: {len(found_concepts) / len(expected_concepts):.1%}")
-    print()
-
-    # Test assertions
-    entity_coverage = len(found_entities) / len(expected_entities)
-    concept_coverage = len(found_concepts) / len(expected_concepts)
-
-    print("QUALITY ASSESSMENT:")
-    print("-" * 40)
-
-    # We expect high coverage with GPT-4o
-    min_entity_coverage = 0.70  # At least 70% of entities should be found
-    min_concept_coverage = 0.60  # At least 60% of concepts should be found
-
-    if entity_coverage >= min_entity_coverage:
-        print(
-            f"✓ PASS: Entity coverage ({entity_coverage:.1%}) meets minimum requirement ({min_entity_coverage:.1%})"
-        )
-    else:
-        print(
-            f"✗ FAIL: Entity coverage ({entity_coverage:.1%}) below minimum requirement ({min_entity_coverage:.1%})"
-        )
-
-    if concept_coverage >= min_concept_coverage:
-        print(
-            f"✓ PASS: Concept coverage ({concept_coverage:.1%}) meets minimum requirement ({min_concept_coverage:.1%})"
-        )
-    else:
-        print(
-            f"✗ FAIL: Concept coverage ({concept_coverage:.1%}) below minimum requirement ({min_concept_coverage:.1%})"
-        )
-
-    overall_quality = (entity_coverage + concept_coverage) / 2
-    print(f"Overall quality score: {overall_quality:.1%}")
-
-    # Assert that we have acceptable quality
-    assert entity_coverage >= min_entity_coverage, (
-        f"Entity coverage {entity_coverage:.1%} below minimum {min_entity_coverage:.1%}"
-    )
-    assert concept_coverage >= min_concept_coverage, (
-        f"Concept coverage {concept_coverage:.1%} below minimum {min_concept_coverage:.1%}"
-    )
-
-    print()
-    print("=" * 80)
-    print("KNOWLEDGE GRAPH QUALITY TEST COMPLETED SUCCESSFULLY")
-    print("=" * 80)
-
-    return {
-        "entity_coverage": entity_coverage,
-        "concept_coverage": concept_coverage,
-        "overall_quality": overall_quality,
-        "found_entities": found_entities,
-        "missing_entities": missing_entities,
-        "found_concepts": found_concepts,
-        "missing_concepts": missing_concepts,
-    }
-
-
 logger = get_logger()
 
 
@@ -471,18 +198,7 @@ async def main():
     await cognee.delete([explanation_file_path], dataset_id=test_user_dataset_id, user=default_user)
 
 
-async def main_quality_test():
-    """Main function to run the knowledge graph quality test"""
-    await test_knowledge_graph_quality_with_gpt4o()
-
-
 if __name__ == "__main__":
     import asyncio
-    import sys
 
-    if len(sys.argv) > 1 and sys.argv[1] == "quality":
-        print("Running Knowledge Graph Quality Test...")
-        asyncio.run(main_quality_test())
-    else:
-        print("Running Permissions Test...")
-        asyncio.run(main())
+    asyncio.run(main())
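
For reviewers, the concept check in the new test reduces to a small keyword-coverage heuristic. The standalone Python sketch below is not part of the patch; the helper name concept_found and the example strings are illustrative. It distills the same rule the test applies: a concept counts as found when at least 60% of its key words (words longer than two characters and outside the test's small stop-word list) appear anywhere in the combined search output.

# Standalone sketch of the concept-coverage heuristic used in
# test_knowledge_graph_quality.py; the function name and example
# corpus below are illustrative, not taken from the patch.
STOP_WORDS = {"the", "and", "are", "can", "involves", "uses", "from"}


def concept_found(concept: str, results_text: str, threshold: float = 0.6) -> bool:
    """Return True when enough of the concept's key words appear in results_text."""
    corpus = results_text.lower()
    # Key words: longer than two characters and not in the stop-word list.
    key_words = [
        word
        for word in concept.lower().split()
        if len(word) > 2 and word not in STOP_WORDS
    ]
    if not key_words:
        return False  # the test marks concepts with no key words as missing
    coverage = sum(1 for word in key_words if word in corpus) / len(key_words)
    return coverage >= threshold


# Matches even when the graph output phrases the idea differently:
assert concept_found(
    "NLP is a subfield of computer science",
    "nlp is widely described as a subfield of computer science",
)

Because matching is substring-based over the pooled output of all four search types, the heuristic is deliberately lenient; the 70% entity and 60% concept thresholds sit on top of that leniency, so a failing run signals a substantial drop in extraction quality rather than mere phrasing drift.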