# Source listing header (non-code metadata from the original page):
# cognee/examples/python/extract_usage_frequency_example.py
# 482 lines, 16 KiB, Python
#!/usr/bin/env python3
"""
End-to-End Example: Usage Frequency Tracking in Cognee
This example demonstrates the complete workflow for tracking and analyzing
how frequently different graph elements are accessed through user searches.
Features demonstrated:
- Setting up a knowledge base
- Running searches with interaction tracking (save_interaction=True)
- Extracting usage frequencies from interaction data
- Applying frequency weights to graph nodes
- Analyzing and visualizing the results
Use cases:
- Ranking search results by popularity
- Identifying "hot topics" in your knowledge base
- Understanding user behavior and interests
- Improving retrieval based on usage patterns
"""
import asyncio
import os
from datetime import timedelta
from typing import List, Dict, Any
from dotenv import load_dotenv
import cognee
from cognee.api.v1.search import SearchType
from cognee.infrastructure.databases.graph import get_graph_engine
from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph
from cognee.tasks.memify.extract_usage_frequency import run_usage_frequency_update
# Load environment variables from a local .env file (LLM_API_KEY,
# GRAPH_DATABASE_PROVIDER, etc.) before any cognee calls read them.
load_dotenv()
# ============================================================================
# STEP 1: Setup and Configuration
# ============================================================================
async def setup_knowledge_base():
    """
    Build a fresh demo knowledge base from a handful of sample documents.

    In a real application, you would:
    - Load documents from files, databases, or APIs
    - Process larger datasets
    - Organize content by datasets/categories
    """
    rule = "=" * 80
    print(rule)
    print("STEP 1: Setting up knowledge base")
    print(rule)

    # Wipe both stored data and system metadata so the demo starts clean.
    # (Optional in production.)
    print("\nResetting Cognee state...")
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)
    print("✓ Reset complete")

    # Sample content: AI/ML educational material.
    sample_docs = [
        """
Machine Learning Fundamentals:
Machine learning is a subset of artificial intelligence that enables systems
to learn and improve from experience without being explicitly programmed.
The three main types are supervised learning, unsupervised learning, and
reinforcement learning.
""",
        """
Neural Networks Explained:
Neural networks are computing systems inspired by biological neural networks.
They consist of layers of interconnected nodes (neurons) that process information
through weighted connections. Deep learning uses neural networks with many layers
to automatically learn hierarchical representations of data.
""",
        """
Natural Language Processing:
NLP enables computers to understand, interpret, and generate human language.
Modern NLP uses transformer architectures like BERT and GPT, which have
revolutionized tasks such as translation, summarization, and question answering.
""",
        """
Computer Vision Applications:
Computer vision allows machines to interpret visual information from the world.
Convolutional neural networks (CNNs) are particularly effective for image
recognition, object detection, and image segmentation tasks.
""",
    ]

    print(f"\nAdding {len(sample_docs)} documents to knowledge base...")
    await cognee.add(sample_docs, dataset_name="ai_ml_fundamentals")
    print("✓ Documents added")

    # Turn the raw documents into a knowledge graph.
    print("\nBuilding knowledge graph (cognify)...")
    await cognee.cognify()
    print("✓ Knowledge graph built")
    print("\n" + rule)
# ============================================================================
# STEP 2: Simulate User Searches with Interaction Tracking
# ============================================================================
async def simulate_user_searches(queries: List[str]):
    """
    Run each query against the knowledge base as if a user had searched it.

    The key parameter is save_interaction=True, which creates:
    - CogneeUserInteraction nodes (one per search)
    - used_graph_element_to_answer edges (connecting queries to relevant nodes)

    Args:
        queries: List of search queries to simulate

    Returns:
        Number of successful searches
    """
    print("=" * 80)
    print("STEP 2: Simulating user searches with interaction tracking")
    print("=" * 80)

    ok_count = 0
    total = len(queries)
    for idx, query in enumerate(queries, 1):
        print(f"\nSearch {idx}/{total}: '{query}'")
        try:
            results = await cognee.search(
                query_type=SearchType.GRAPH_COMPLETION,
                query_text=query,
                save_interaction=True,  # ← THIS IS CRITICAL!
                top_k=5,
            )
            ok_count += 1
            # Show only the first 100 characters of the raw result.
            preview = str(results)[:100] if results else "No results"
            print(f" ✓ Completed ({preview}...)")
        except Exception as e:
            # Best-effort demo: report the failure and move on to the next query.
            print(f" ✗ Failed: {e}")

    print(f"\n✓ Completed {ok_count}/{total} searches")
    print("=" * 80)
    return ok_count
# ============================================================================
# STEP 3: Extract and Apply Usage Frequencies
# ============================================================================
async def extract_and_apply_frequencies(
    time_window_days: int = 7, min_threshold: int = 1
) -> Dict[str, Any]:
    """
    Extract usage frequencies from interactions and apply them to the graph.

    This function:
    1. Retrieves the graph with interaction data
    2. Counts how often each node was accessed
    3. Writes frequency_weight property back to nodes

    Args:
        time_window_days: Only count interactions from last N days
        min_threshold: Minimum accesses to track (filter out rarely used nodes)

    Returns:
        Dictionary with statistics about the frequency update
    """
    print("=" * 80)
    print("STEP 3: Extracting and applying usage frequencies")
    print("=" * 80)

    # Graph adapter: used for both the projection and the weight write-back.
    graph_engine = await get_graph_engine()

    # Project only the properties the frequency task needs.
    node_props = [
        "type",
        "node_type",
        "timestamp",
        "created_at",
        "text",
        "name",
        "query_text",
        "frequency_weight",
    ]
    edge_props = ["relationship_type", "timestamp"]

    print("\nRetrieving graph from database...")
    projected = CogneeGraph()
    await projected.project_graph_from_db(
        adapter=graph_engine,
        node_properties_to_project=node_props,
        edge_properties_to_project=edge_props,
        directed=True,
    )
    print(f"✓ Retrieved: {len(projected.nodes)} nodes, {len(projected.edges)} edges")

    def _is_interaction(node) -> bool:
        # Interaction nodes may carry the marker under either attribute name.
        attrs = node.attributes
        return "CogneeUserInteraction" in (attrs.get("type"), attrs.get("node_type"))

    interaction_nodes = [n for n in projected.nodes.values() if _is_interaction(n)]
    print(f"✓ Found {len(interaction_nodes)} interaction nodes")

    # Count accesses within the window and write frequency_weight back.
    print(f"\nExtracting frequencies (time window: {time_window_days} days)...")
    stats = await run_usage_frequency_update(
        graph_adapter=graph_engine,
        subgraphs=[projected],
        time_window=timedelta(days=time_window_days),
        min_interaction_threshold=min_threshold,
    )

    print("\n✓ Frequency extraction complete!")
    print(
        f" - Interactions processed: {stats['interactions_in_window']}/{stats['total_interactions']}"
    )
    print(f" - Nodes weighted: {len(stats['node_frequencies'])}")
    print(f" - Element types tracked: {stats.get('element_type_frequencies', {})}")
    print("=" * 80)
    return stats
# ============================================================================
# STEP 4: Analyze and Display Results
# ============================================================================
async def analyze_results(stats: Dict[str, Any]):
    """
    Analyze and display the frequency tracking results.

    Shows:
    - Top most frequently accessed nodes
    - Element type distribution
    - Verification that weights were written to database

    Args:
        stats: Statistics dict from extract_and_apply_frequencies();
            expected keys are "node_frequencies" and (optionally)
            "element_type_frequencies".
    """
    print("=" * 80)
    print("STEP 4: Analyzing usage frequency results")
    print("=" * 80)

    # Fetch the adapter once and reuse it for both the node-detail
    # projection and the database verification (it was fetched twice before).
    graph_engine = await get_graph_engine()

    # Display top nodes by frequency. Use .get() so a stats dict missing
    # the key degrades gracefully instead of raising KeyError.
    node_frequencies = stats.get("node_frequencies") or {}
    if node_frequencies:
        print("\n📊 Top 10 Most Frequently Accessed Elements:")
        print("-" * 80)
        sorted_nodes = sorted(node_frequencies.items(), key=lambda x: x[1], reverse=True)

        # Re-project the graph so we can show human-readable node details.
        graph = CogneeGraph()
        await graph.project_graph_from_db(
            adapter=graph_engine,
            node_properties_to_project=["type", "text", "name"],
            edge_properties_to_project=[],
            directed=True,
        )

        for i, (node_id, frequency) in enumerate(sorted_nodes[:10], 1):
            node = graph.get_node(node_id)
            if node:
                node_type = node.attributes.get("type", "Unknown")
                text = node.attributes.get("text") or node.attributes.get("name") or ""
                text_preview = text[:60] + "..." if len(text) > 60 else text
                print(f"\n{i}. Frequency: {frequency} accesses")
                print(f" Type: {node_type}")
                print(f" Content: {text_preview}")
            else:
                print(f"\n{i}. Frequency: {frequency} accesses")
                # BUG FIX: node_id may be a UUID object, which does not
                # support slicing — stringify before truncating.
                print(f" Node ID: {str(node_id)[:50]}...")

    # Display element type distribution (how often each element type was hit).
    if stats.get("element_type_frequencies"):
        print("\n\n📈 Element Type Distribution:")
        print("-" * 80)
        type_dist = stats["element_type_frequencies"]
        for elem_type, count in sorted(type_dist.items(), key=lambda x: x[1], reverse=True):
            print(f" {elem_type}: {count} accesses")

    # Verify weights in database via direct Cypher (Neo4j only).
    print("\n\n🔍 Verifying weights in database...")
    print("-" * 80)
    adapter_type = type(graph_engine).__name__
    if adapter_type == "Neo4jAdapter":
        try:
            result = await graph_engine.query("""
MATCH (n)
WHERE n.frequency_weight IS NOT NULL
RETURN count(n) as weighted_count
""")
            count = result[0]["weighted_count"] if result else 0
            if count > 0:
                print(f"{count} nodes have frequency_weight in Neo4j database")
                # Show a few of the heaviest nodes as a sanity check.
                sample = await graph_engine.query("""
MATCH (n)
WHERE n.frequency_weight IS NOT NULL
RETURN n.frequency_weight as weight, labels(n) as labels
ORDER BY n.frequency_weight DESC
LIMIT 3
""")
                print("\nSample weighted nodes:")
                for row in sample:
                    print(f" - Weight: {row['weight']}, Type: {row['labels']}")
            else:
                print("⚠ No nodes with frequency_weight found in database")
        except Exception as e:
            # Verification is best-effort; never let it crash the demo.
            print(f"Could not verify in Neo4j: {e}")
    else:
        print(f"Database verification not implemented for {adapter_type}")
    print("\n" + "=" * 80)
# ============================================================================
# STEP 5: Demonstrate Usage in Retrieval
# ============================================================================
async def demonstrate_retrieval_usage():
    """
    Show, conceptually, how frequency weights could feed back into retrieval.

    Note: This is a conceptual demonstration. To actually use frequency
    weights in ranking, you would need to modify the retrieval/completion
    strategies to incorporate the frequency_weight property.
    """
    bar = "=" * 80
    print(bar)
    print("STEP 5: How to use frequency weights in retrieval")
    print(bar)

    # Static guidance text — nothing here touches the database.
    guidance = """
Frequency weights can be used to improve search results:
1. RANKING BOOST:
- Multiply relevance scores by frequency_weight
- Prioritize frequently accessed nodes in results
2. COMPLETION STRATEGIES:
- Adjust triplet importance based on usage
- Filter out rarely accessed information
3. ANALYTICS:
- Track trending topics over time
- Understand user interests and behavior
- Identify knowledge gaps (low-frequency nodes)
4. ADAPTIVE RETRIEVAL:
- Personalize results based on team usage patterns
- Surface popular answers faster
Example Cypher query with frequency boost (Neo4j):
MATCH (n)
WHERE n.text CONTAINS $search_term
RETURN n, n.frequency_weight as boost
ORDER BY (n.relevance_score * COALESCE(n.frequency_weight, 1)) DESC
LIMIT 10
To integrate this into Cognee, you would modify the completion
strategy to include frequency_weight in the scoring function.
"""
    print(guidance)
    print(bar)
# ============================================================================
# MAIN: Run Complete Example
# ============================================================================
async def main():
    """
    Run the complete end-to-end usage frequency tracking example.
    """

    def _banner(title: str) -> None:
        # Framed title surrounded by blank lines; output matches the
        # original inline banner prints exactly.
        print("\n")
        print("" + "=" * 78 + "")
        print("" + " " * 78 + "")
        print("" + title.center(78) + "")
        print("" + " " * 78 + "")
        print("" + "=" * 78 + "")
        print("\n")

    _banner(" Usage Frequency Tracking - End-to-End Example")

    # Echo the relevant environment so misconfiguration is obvious up front.
    print("Configuration:")
    print(f" Graph Provider: {os.getenv('GRAPH_DATABASE_PROVIDER')}")
    print(f" Graph Handler: {os.getenv('GRAPH_DATASET_HANDLER')}")
    print(f" LLM Provider: {os.getenv('LLM_PROVIDER')}")

    # Bail out early when no usable LLM key is configured.
    api_key = os.getenv("LLM_API_KEY")
    if not api_key or api_key == "sk-your-key-here":
        print("\n⚠ WARNING: LLM_API_KEY not set in .env file")
        print(" Set your API key to run searches")
        return

    print("\n")
    try:
        # Step 1: Setup
        await setup_knowledge_base()

        # Step 2: Simulate searches.
        # Repeated queries deliberately inflate the frequency of those topics.
        queries = [
            "What is machine learning?",
            "Explain neural networks",
            "How does deep learning work?",
            "Tell me about neural networks",  # Repeat - increases frequency
            "What are transformers in NLP?",
            "Explain neural networks again",  # Another repeat
            "How does computer vision work?",
            "What is reinforcement learning?",
            "Tell me more about neural networks",  # Third repeat
        ]
        successful_searches = await simulate_user_searches(queries)
        if not successful_searches:
            print("⚠ No searches completed - cannot demonstrate frequency tracking")
            return

        # Step 3: Extract frequencies
        stats = await extract_and_apply_frequencies(time_window_days=7, min_threshold=1)

        # Step 4: Analyze results
        await analyze_results(stats)

        # Step 5: Show usage examples
        await demonstrate_retrieval_usage()

        _banner(" Example Complete!")
        print("Summary:")
        # NOTE(review): document count is hard-coded; keep in sync with the
        # list inside setup_knowledge_base().
        print(" ✓ Documents added: 4")
        print(f" ✓ Searches performed: {successful_searches}")
        print(f" ✓ Interactions tracked: {stats['interactions_in_window']}")
        print(f" ✓ Nodes weighted: {len(stats['node_frequencies'])}")
        print("\nNext steps:")
        print(" 1. Open Neo4j Browser (http://localhost:7474) to explore the graph")
        print(" 2. Modify retrieval strategies to use frequency_weight")
        print(" 3. Build analytics dashboards using element_type_frequencies")
        print(" 4. Run periodic frequency updates to track trends over time")
        print("\n")
    except Exception as e:
        # Top-level boundary: report the failure with a traceback instead of
        # letting the script die silently.
        print(f"\n✗ Example failed: {e}")
        import traceback

        traceback.print_exc()
# Script entry point: asyncio.run creates an event loop, runs the full demo
# coroutine to completion, and closes the loop.
if __name__ == "__main__":
    asyncio.run(main())