# cognee/examples/python/extract_usage_frequency_example.py
"""
End-to-end example demonstrating usage frequency tracking in Cognee.
This example shows how to:
1. Add data and build a knowledge graph
2. Run searches with save_interaction=True to track usage
3. Extract and apply frequency weights using the memify pipeline
4. Query and analyze the frequency data
The frequency weights can be used to:
- Rank frequently referenced entities higher during retrieval
- Adjust scoring for completion strategies
- Expose usage metrics in dashboards or audits
"""
import asyncio
from datetime import timedelta
from typing import List

import cognee
from cognee.api.v1.search import SearchType
from cognee.tasks.memify.extract_usage_frequency import (
    create_usage_frequency_pipeline,
    run_usage_frequency_update,
)
from cognee.infrastructure.databases.graph import get_graph_engine
from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph
from cognee.shared.logging_utils import get_logger

logger = get_logger("usage_frequency_example")


async def setup_knowledge_base():
    """Set up a fresh knowledge base with sample data."""
    logger.info("Setting up knowledge base...")

    # Reset cognee state for a clean slate
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)
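    # Both prune calls above are destructive: they wipe previously stored data
    # and system metadata, so only use them when a fresh start is intended.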

    # Sample conversation about AI/ML topics
    conversation = [
        "Alice discusses machine learning algorithms and their applications in computer vision.",
        "Bob asks about neural networks and how they differ from traditional algorithms.",
        "Alice explains deep learning concepts including CNNs and transformers.",
        "Bob wants more details about neural networks and backpropagation.",
        "Alice describes reinforcement learning and its use in robotics.",
        "Bob inquires about natural language processing and transformers.",
    ]

    # Add conversation data and build knowledge graph
    logger.info("Adding conversation data...")
    await cognee.add(conversation, dataset_name="ai_ml_conversation")
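
    # cognify() builds the knowledge graph from the added data: broadly, it
    # chunks the documents and uses the configured LLM to extract entities
    # and relationships.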
logger.info("Building knowledge graph (cognify)...")
await cognee.cognify()
logger.info("Knowledge base setup complete")


async def simulate_user_searches():
    """Simulate multiple user searches to generate interaction data."""
    logger.info("Simulating user searches with save_interaction=True...")

    # Different queries that will create CogneeUserInteraction nodes
    queries = [
        "What is machine learning?",
        "Explain neural networks",
        "Tell me about deep learning",
        "What are neural networks?",  # Repeat to increase frequency
        "How does machine learning work?",
        "Describe transformers in NLP",
        "What is reinforcement learning?",
        "Explain neural networks again",  # Another repeat
    ]
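
    # Each search below runs with save_interaction=True, which persists the
    # interaction to the graph; those interaction nodes are what the frequency
    # pipeline later counts. Repeated queries should therefore raise the
    # frequency of the graph elements they touch.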
    search_count = 0
    for query in queries:
        try:
            logger.info(f"Searching: '{query}'")
            results = await cognee.search(
                query_type=SearchType.GRAPH_COMPLETION,
                query_text=query,
                save_interaction=True,  # Critical: saves the interaction to the graph
                top_k=5,
            )
            search_count += 1
            logger.debug(f"Search completed, got {len(results) if results else 0} results")
        except Exception as e:
            logger.warning(f"Search failed for '{query}': {e}")

    logger.info(f"Completed {search_count} searches with interactions saved")
    return search_count


async def retrieve_interaction_graph() -> List[CogneeGraph]:
    """Retrieve the graph containing interaction nodes."""
    logger.info("Retrieving graph with interaction data...")

    graph_engine = await get_graph_engine()
    graph = CogneeGraph()

    # Project the full graph, including CogneeUserInteraction nodes
    await graph.project_graph_from_db(
        adapter=graph_engine,
        node_properties_to_project=["type", "node_type", "timestamp", "created_at", "text", "name"],
        edge_properties_to_project=["relationship_type", "timestamp", "created_at"],
        directed=True,
    )
    logger.info(f"Retrieved graph: {len(graph.nodes)} nodes, {len(graph.edges)} edges")

    # Count interaction nodes for verification
    interaction_count = sum(
        1 for node in graph.nodes.values()
        if node.attributes.get("type") == "CogneeUserInteraction"
        or node.attributes.get("node_type") == "CogneeUserInteraction"
    )
    logger.info(f"Found {interaction_count} CogneeUserInteraction nodes in graph")
    return [graph]


async def run_frequency_pipeline_method1():
    """Method 1: Using the pipeline creation function."""
    logger.info("\n=== Method 1: Using create_usage_frequency_pipeline ===")

    graph_engine = await get_graph_engine()
    subgraphs = await retrieve_interaction_graph()

    # Create the pipeline tasks
    extraction_tasks, enrichment_tasks = await create_usage_frequency_pipeline(
        graph_adapter=graph_engine,
        time_window=timedelta(days=30),  # Last 30 days
        min_interaction_threshold=1,  # Count all interactions
        batch_size=100,
    )
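
    # The extraction tasks compute usage counts from the interaction nodes;
    # the enrichment tasks write those counts back onto graph elements as
    # frequency_weight properties (see analyze_frequency_weights below).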
logger.info("Running extraction tasks...")
# Note: In real memify pipeline, these would be executed by the pipeline runner
# For this example, we'll execute them manually
for task in extraction_tasks:
if hasattr(task, 'function'):
result = await task.function(
subgraphs=subgraphs,
time_window=timedelta(days=30),
min_interaction_threshold=1
)
logger.info(f"Extraction result: {result.get('interactions_in_window')} interactions processed")
logger.info("Running enrichment tasks...")
for task in enrichment_tasks:
if hasattr(task, 'function'):
await task.function(
graph_adapter=graph_engine,
usage_frequencies=result
)
return result


async def run_frequency_pipeline_method2():
    """Method 2: Using the convenience function."""
    logger.info("\n=== Method 2: Using run_usage_frequency_update ===")

    graph_engine = await get_graph_engine()
    subgraphs = await retrieve_interaction_graph()

    # Run the complete pipeline in one call
    stats = await run_usage_frequency_update(
        graph_adapter=graph_engine,
        subgraphs=subgraphs,
        time_window=timedelta(days=30),
        min_interaction_threshold=1,
    )
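
    # As used in this example, `stats` carries at least total_interactions,
    # interactions_in_window, and node_frequencies, and optionally
    # element_type_frequencies and time_window_days.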
logger.info("Frequency update statistics:")
logger.info(f" Total interactions: {stats['total_interactions']}")
logger.info(f" Interactions in window: {stats['interactions_in_window']}")
logger.info(f" Nodes with frequency weights: {len(stats['node_frequencies'])}")
logger.info(f" Element types: {stats.get('element_type_frequencies', {})}")
return stats


async def analyze_frequency_weights():
    """Analyze and display the frequency weights that were added."""
    logger.info("\n=== Analyzing Frequency Weights ===")

    graph_engine = await get_graph_engine()
    graph = CogneeGraph()

    # Project the graph with frequency weights
    await graph.project_graph_from_db(
        adapter=graph_engine,
        node_properties_to_project=[
            "type",
            "node_type",
            "text",
            "name",
            "frequency_weight",  # The property written by the enrichment step
            "frequency_updated_at",
        ],
        edge_properties_to_project=["relationship_type"],
        directed=True,
    )

    # Find nodes with frequency weights
    weighted_nodes = []
    for node_id, node in graph.nodes.items():
        freq_weight = node.attributes.get("frequency_weight")
        if freq_weight is not None:
            weighted_nodes.append({
                "id": node_id,
                "type": node.attributes.get("type") or node.attributes.get("node_type"),
                "text": (node.attributes.get("text") or "")[:100],  # First 100 chars; guards against a None value
                "name": node.attributes.get("name", ""),
                "frequency_weight": freq_weight,
                "updated_at": node.attributes.get("frequency_updated_at"),
            })

    # Sort by frequency (descending)
    weighted_nodes.sort(key=lambda x: x["frequency_weight"], reverse=True)

    logger.info(f"\nFound {len(weighted_nodes)} nodes with frequency weights:")
    logger.info("\nTop 10 Most Frequently Referenced Elements:")
    logger.info("-" * 80)
    for i, node in enumerate(weighted_nodes[:10], 1):
        logger.info(f"\n{i}. Frequency: {node['frequency_weight']}")
        logger.info(f"   Type: {node['type']}")
        logger.info(f"   Name: {node['name']}")
        logger.info(f"   Text: {node['text']}")
        logger.info(f"   ID: {str(node['id'])[:50]}...")
    return weighted_nodes


async def demonstrate_retrieval_with_frequencies():
    """Demonstrate how frequency weights can be used in retrieval."""
    logger.info("\n=== Demonstrating Retrieval with Frequency Weights ===")

    # This is a conceptual demonstration of how frequency weights
    # could be used to boost search results.
    query = "neural networks"
    logger.info(f"Searching for: '{query}'")
    try:
        # Standard search
        standard_results = await cognee.search(
            query_type=SearchType.GRAPH_COMPLETION,
            query_text=query,
            save_interaction=False,  # Don't add more interactions
            top_k=5,
        )
        logger.info(f"Standard search returned {len(standard_results) if standard_results else 0} results")

        # Note: To actually use frequency_weight in scoring, you would need to:
        # 1. Modify the retrieval/ranking logic to consider frequency_weight
        # 2. Add frequency_weight as a scoring factor in the completion strategy
        # 3. Use it in analytics dashboards to show popular topics
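        #
        # A minimal sketch of point 1 (hypothetical helper, not part of the
        # Cognee API):
        #
        #     def boosted_score(base_score: float, frequency_weight: float,
        #                       alpha: float = 0.1) -> float:
        #         """Blend semantic relevance with a usage-frequency signal."""
        #         return base_score * (1.0 + alpha * frequency_weight)
        #
        # Applied per candidate before final ranking, this would nudge
        # frequently referenced elements upward without letting popularity
        # override semantic relevance.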
logger.info("\nFrequency weights can now be used for:")
logger.info(" - Boosting frequently-accessed nodes in search rankings")
logger.info(" - Adjusting triplet importance scores")
logger.info(" - Building usage analytics dashboards")
logger.info(" - Identifying 'hot' topics in the knowledge graph")
except Exception as e:
logger.warning(f"Demonstration search failed: {e}")


async def main():
    """Main execution flow."""
    logger.info("=" * 80)
    logger.info("Usage Frequency Tracking Example")
    logger.info("=" * 80)

    try:
        # Step 1: Set up the knowledge base
        await setup_knowledge_base()

        # Step 2: Simulate user searches with save_interaction=True
        search_count = await simulate_user_searches()
        if search_count == 0:
            logger.warning("No searches completed - cannot demonstrate frequency tracking")
            return

        # Step 3: Run frequency extraction and enrichment.
        # Either method works; both accomplish the same thing.

        # Option A: Using the convenience function (recommended)
        stats = await run_frequency_pipeline_method2()

        # Option B: Using the pipeline creation function (for custom pipelines)
        # stats = await run_frequency_pipeline_method1()

        # Step 4: Analyze the results
        weighted_nodes = await analyze_frequency_weights()

        # Step 5: Demonstrate retrieval usage
        await demonstrate_retrieval_with_frequencies()

        # Summary
        logger.info("\n" + "=" * 80)
        logger.info("SUMMARY")
        logger.info("=" * 80)
        logger.info(f"Searches performed: {search_count}")
        logger.info(f"Interactions tracked: {stats.get('interactions_in_window', 0)}")
        logger.info(f"Nodes weighted: {len(weighted_nodes)}")
        logger.info(f"Time window: {stats.get('time_window_days', 0)} days")
        logger.info("\nFrequency weights have been added to the graph!")
        logger.info("These can now be used in retrieval, ranking, and analytics.")
        logger.info("=" * 80)
    except Exception as e:
        logger.error(f"Example failed: {e}", exc_info=True)
        raise


if __name__ == "__main__":
    asyncio.run(main())