# Source listing header (non-code metadata from the original page):
# cognee/examples/python/extract_usage_frequency_example.py
# 482 lines, 16 KiB, Python
#!/usr/bin/env python3
"""
End-to-End Example: Usage Frequency Tracking in Cognee
This example demonstrates the complete workflow for tracking and analyzing
how frequently different graph elements are accessed through user searches.
Features demonstrated:
- Setting up a knowledge base
- Running searches with interaction tracking (save_interaction=True)
- Extracting usage frequencies from interaction data
- Applying frequency weights to graph nodes
- Analyzing and visualizing the results
Use cases:
- Ranking search results by popularity
- Identifying "hot topics" in your knowledge base
- Understanding user behavior and interests
- Improving retrieval based on usage patterns
"""
import asyncio
import os
from datetime import timedelta
from typing import List, Dict, Any
from dotenv import load_dotenv
import cognee
from cognee.api.v1.search import SearchType
from cognee.infrastructure.databases.graph import get_graph_engine
from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph
from cognee.tasks.memify.extract_usage_frequency import run_usage_frequency_update
# Load environment variables from a local .env file (LLM_API_KEY,
# GRAPH_DATABASE_PROVIDER, etc.) before any cognee calls read them.
load_dotenv()
# ============================================================================
# STEP 1: Setup and Configuration
# ============================================================================
async def setup_knowledge_base():
    """
    Build a fresh demo knowledge base from a handful of sample documents.

    In a real application, you would:
    - Load documents from files, databases, or APIs
    - Process larger datasets
    - Organize content by datasets/categories
    """
    rule = "=" * 80
    print(rule)
    print("STEP 1: Setting up knowledge base")
    print(rule)

    # Wipe both stored data and system metadata so the demo starts clean.
    # (Optional in production.)
    print("\nResetting Cognee state...")
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)
    print("✓ Reset complete")

    # Sample content: AI/ML educational material.
    sample_docs = [
        """
Machine Learning Fundamentals:
Machine learning is a subset of artificial intelligence that enables systems
to learn and improve from experience without being explicitly programmed.
The three main types are supervised learning, unsupervised learning, and
reinforcement learning.
""",
        """
Neural Networks Explained:
Neural networks are computing systems inspired by biological neural networks.
They consist of layers of interconnected nodes (neurons) that process information
through weighted connections. Deep learning uses neural networks with many layers
to automatically learn hierarchical representations of data.
""",
        """
Natural Language Processing:
NLP enables computers to understand, interpret, and generate human language.
Modern NLP uses transformer architectures like BERT and GPT, which have
revolutionized tasks such as translation, summarization, and question answering.
""",
        """
Computer Vision Applications:
Computer vision allows machines to interpret visual information from the world.
Convolutional neural networks (CNNs) are particularly effective for image
recognition, object detection, and image segmentation tasks.
""",
    ]

    print(f"\nAdding {len(sample_docs)} documents to knowledge base...")
    await cognee.add(sample_docs, dataset_name="ai_ml_fundamentals")
    print("✓ Documents added")

    # Turn the raw documents into a knowledge graph.
    print("\nBuilding knowledge graph (cognify)...")
    await cognee.cognify()
    print("✓ Knowledge graph built")
    print("\n" + rule)
# ============================================================================
# STEP 2: Simulate User Searches with Interaction Tracking
# ============================================================================
async def simulate_user_searches(queries: List[str]):
    """
    Run each query against the knowledge base as if a user had searched it.

    The key parameter is save_interaction=True, which creates:
    - CogneeUserInteraction nodes (one per search)
    - used_graph_element_to_answer edges (connecting queries to relevant nodes)

    Args:
        queries: List of search queries to simulate

    Returns:
        Number of successful searches
    """
    print("=" * 80)
    print("STEP 2: Simulating user searches with interaction tracking")
    print("=" * 80)

    ok_count = 0
    total = len(queries)
    for idx, query in enumerate(queries, 1):
        print(f"\nSearch {idx}/{total}: '{query}'")
        try:
            results = await cognee.search(
                query_type=SearchType.GRAPH_COMPLETION,
                query_text=query,
                save_interaction=True,  # ← THIS IS CRITICAL!
                top_k=5,
            )
            ok_count += 1
            # Show only the first 100 characters of the raw result.
            preview = str(results)[:100] if results else "No results"
            print(f" ✓ Completed ({preview}...)")
        except Exception as e:
            # Best-effort demo: report the failure and move on to the next query.
            print(f" ✗ Failed: {e}")

    print(f"\n✓ Completed {ok_count}/{total} searches")
    print("=" * 80)
    return ok_count
# ============================================================================
# STEP 3: Extract and Apply Usage Frequencies
# ============================================================================
async def extract_and_apply_frequencies(
    time_window_days: int = 7, min_threshold: int = 1
) -> Dict[str, Any]:
    """
    Extract usage frequencies from interactions and apply them to the graph.

    This function:
    1. Retrieves the graph with interaction data
    2. Counts how often each node was accessed
    3. Writes frequency_weight property back to nodes

    Args:
        time_window_days: Only count interactions from last N days
        min_threshold: Minimum accesses to track (filter out rarely used nodes)

    Returns:
        Dictionary with statistics about the frequency update
    """
    print("=" * 80)
    print("STEP 3: Extracting and applying usage frequencies")
    print("=" * 80)

    # Graph adapter: used for both the projection and the weight write-back.
    graph_engine = await get_graph_engine()

    # Project only the properties the frequency task needs.
    node_props = [
        "type",
        "node_type",
        "timestamp",
        "created_at",
        "text",
        "name",
        "query_text",
        "frequency_weight",
    ]
    edge_props = ["relationship_type", "timestamp"]

    print("\nRetrieving graph from database...")
    projected = CogneeGraph()
    await projected.project_graph_from_db(
        adapter=graph_engine,
        node_properties_to_project=node_props,
        edge_properties_to_project=edge_props,
        directed=True,
    )
    print(f"✓ Retrieved: {len(projected.nodes)} nodes, {len(projected.edges)} edges")

    def _is_interaction(node) -> bool:
        # Interaction nodes may carry the marker under either attribute name.
        attrs = node.attributes
        return "CogneeUserInteraction" in (attrs.get("type"), attrs.get("node_type"))

    interaction_nodes = [n for n in projected.nodes.values() if _is_interaction(n)]
    print(f"✓ Found {len(interaction_nodes)} interaction nodes")

    # Count accesses within the window and write frequency_weight back.
    print(f"\nExtracting frequencies (time window: {time_window_days} days)...")
    stats = await run_usage_frequency_update(
        graph_adapter=graph_engine,
        subgraphs=[projected],
        time_window=timedelta(days=time_window_days),
        min_interaction_threshold=min_threshold,
    )

    print("\n✓ Frequency extraction complete!")
    print(
        f" - Interactions processed: {stats['interactions_in_window']}/{stats['total_interactions']}"
    )
    print(f" - Nodes weighted: {len(stats['node_frequencies'])}")
    print(f" - Element types tracked: {stats.get('element_type_frequencies', {})}")
    print("=" * 80)
    return stats
# ============================================================================
# STEP 4: Analyze and Display Results
# ============================================================================
async def analyze_results(stats: Dict[str, Any]):
    """
    Analyze and display the frequency tracking results.

    Shows:
    - Top most frequently accessed nodes
    - Element type distribution
    - Verification that weights were written to database

    Args:
        stats: Statistics dict from extract_and_apply_frequencies();
            expected keys are "node_frequencies" and (optionally)
            "element_type_frequencies".
    """
    print("=" * 80)
    print("STEP 4: Analyzing usage frequency results")
    print("=" * 80)

    # Fetch the adapter once and reuse it for both the node-detail
    # projection and the database verification (it was fetched twice before).
    graph_engine = await get_graph_engine()

    # Display top nodes by frequency. Use .get() so a stats dict missing
    # the key degrades gracefully instead of raising KeyError.
    node_frequencies = stats.get("node_frequencies") or {}
    if node_frequencies:
        print("\n📊 Top 10 Most Frequently Accessed Elements:")
        print("-" * 80)
        sorted_nodes = sorted(node_frequencies.items(), key=lambda x: x[1], reverse=True)

        # Re-project the graph so we can show human-readable node details.
        graph = CogneeGraph()
        await graph.project_graph_from_db(
            adapter=graph_engine,
            node_properties_to_project=["type", "text", "name"],
            edge_properties_to_project=[],
            directed=True,
        )

        for i, (node_id, frequency) in enumerate(sorted_nodes[:10], 1):
            node = graph.get_node(node_id)
            if node:
                node_type = node.attributes.get("type", "Unknown")
                text = node.attributes.get("text") or node.attributes.get("name") or ""
                text_preview = text[:60] + "..." if len(text) > 60 else text
                print(f"\n{i}. Frequency: {frequency} accesses")
                print(f" Type: {node_type}")
                print(f" Content: {text_preview}")
            else:
                print(f"\n{i}. Frequency: {frequency} accesses")
                # BUG FIX: node_id may be a UUID object, which does not
                # support slicing — stringify before truncating.
                print(f" Node ID: {str(node_id)[:50]}...")

    # Display element type distribution (how often each element type was hit).
    if stats.get("element_type_frequencies"):
        print("\n\n📈 Element Type Distribution:")
        print("-" * 80)
        type_dist = stats["element_type_frequencies"]
        for elem_type, count in sorted(type_dist.items(), key=lambda x: x[1], reverse=True):
            print(f" {elem_type}: {count} accesses")

    # Verify weights in database via direct Cypher (Neo4j only).
    print("\n\n🔍 Verifying weights in database...")
    print("-" * 80)
    adapter_type = type(graph_engine).__name__
    if adapter_type == "Neo4jAdapter":
        try:
            result = await graph_engine.query("""
MATCH (n)
WHERE n.frequency_weight IS NOT NULL
RETURN count(n) as weighted_count
""")
            count = result[0]["weighted_count"] if result else 0
            if count > 0:
                print(f"{count} nodes have frequency_weight in Neo4j database")
                # Show a few of the heaviest nodes as a sanity check.
                sample = await graph_engine.query("""
MATCH (n)
WHERE n.frequency_weight IS NOT NULL
RETURN n.frequency_weight as weight, labels(n) as labels
ORDER BY n.frequency_weight DESC
LIMIT 3
""")
                print("\nSample weighted nodes:")
                for row in sample:
                    print(f" - Weight: {row['weight']}, Type: {row['labels']}")
            else:
                print("⚠ No nodes with frequency_weight found in database")
        except Exception as e:
            # Verification is best-effort; never let it crash the demo.
            print(f"Could not verify in Neo4j: {e}")
    else:
        print(f"Database verification not implemented for {adapter_type}")
    print("\n" + "=" * 80)
# ============================================================================
# STEP 5: Demonstrate Usage in Retrieval
# ============================================================================
async def demonstrate_retrieval_usage():
    """
    Show, conceptually, how frequency weights could feed back into retrieval.

    Note: This is a conceptual demonstration. To actually use frequency
    weights in ranking, you would need to modify the retrieval/completion
    strategies to incorporate the frequency_weight property.
    """
    bar = "=" * 80
    print(bar)
    print("STEP 5: How to use frequency weights in retrieval")
    print(bar)

    # Static guidance text — nothing here touches the database.
    guidance = """
Frequency weights can be used to improve search results:
1. RANKING BOOST:
- Multiply relevance scores by frequency_weight
- Prioritize frequently accessed nodes in results
2. COMPLETION STRATEGIES:
- Adjust triplet importance based on usage
- Filter out rarely accessed information
3. ANALYTICS:
- Track trending topics over time
- Understand user interests and behavior
- Identify knowledge gaps (low-frequency nodes)
4. ADAPTIVE RETRIEVAL:
- Personalize results based on team usage patterns
- Surface popular answers faster
Example Cypher query with frequency boost (Neo4j):
MATCH (n)
WHERE n.text CONTAINS $search_term
RETURN n, n.frequency_weight as boost
ORDER BY (n.relevance_score * COALESCE(n.frequency_weight, 1)) DESC
LIMIT 10
To integrate this into Cognee, you would modify the completion
strategy to include frequency_weight in the scoring function.
"""
    print(guidance)
    print(bar)
# ============================================================================
# MAIN: Run Complete Example
# ============================================================================
async def main():
    """
    Run the complete end-to-end usage frequency tracking example.
    """

    def _banner(title: str) -> None:
        # Framed title surrounded by blank lines; output matches the
        # original inline banner prints exactly.
        print("\n")
        print("" + "=" * 78 + "")
        print("" + " " * 78 + "")
        print("" + title.center(78) + "")
        print("" + " " * 78 + "")
        print("" + "=" * 78 + "")
        print("\n")

    _banner(" Usage Frequency Tracking - End-to-End Example")

    # Echo the relevant environment so misconfiguration is obvious up front.
    print("Configuration:")
    print(f" Graph Provider: {os.getenv('GRAPH_DATABASE_PROVIDER')}")
    print(f" Graph Handler: {os.getenv('GRAPH_DATASET_HANDLER')}")
    print(f" LLM Provider: {os.getenv('LLM_PROVIDER')}")

    # Bail out early when no usable LLM key is configured.
    api_key = os.getenv("LLM_API_KEY")
    if not api_key or api_key == "sk-your-key-here":
        print("\n⚠ WARNING: LLM_API_KEY not set in .env file")
        print(" Set your API key to run searches")
        return

    print("\n")
    try:
        # Step 1: Setup
        await setup_knowledge_base()

        # Step 2: Simulate searches.
        # Repeated queries deliberately inflate the frequency of those topics.
        queries = [
            "What is machine learning?",
            "Explain neural networks",
            "How does deep learning work?",
            "Tell me about neural networks",  # Repeat - increases frequency
            "What are transformers in NLP?",
            "Explain neural networks again",  # Another repeat
            "How does computer vision work?",
            "What is reinforcement learning?",
            "Tell me more about neural networks",  # Third repeat
        ]
        successful_searches = await simulate_user_searches(queries)
        if not successful_searches:
            print("⚠ No searches completed - cannot demonstrate frequency tracking")
            return

        # Step 3: Extract frequencies
        stats = await extract_and_apply_frequencies(time_window_days=7, min_threshold=1)

        # Step 4: Analyze results
        await analyze_results(stats)

        # Step 5: Show usage examples
        await demonstrate_retrieval_usage()

        _banner(" Example Complete!")
        print("Summary:")
        # NOTE(review): document count is hard-coded; keep in sync with the
        # list inside setup_knowledge_base().
        print(" ✓ Documents added: 4")
        print(f" ✓ Searches performed: {successful_searches}")
        print(f" ✓ Interactions tracked: {stats['interactions_in_window']}")
        print(f" ✓ Nodes weighted: {len(stats['node_frequencies'])}")
        print("\nNext steps:")
        print(" 1. Open Neo4j Browser (http://localhost:7474) to explore the graph")
        print(" 2. Modify retrieval strategies to use frequency_weight")
        print(" 3. Build analytics dashboards using element_type_frequencies")
        print(" 4. Run periodic frequency updates to track trends over time")
        print("\n")
    except Exception as e:
        # Top-level boundary: report the failure with a traceback instead of
        # letting the script die silently.
        print(f"\n✗ Example failed: {e}")
        import traceback

        traceback.print_exc()
# Script entry point: asyncio.run creates an event loop, runs the full demo
# coroutine to completion, and closes the loop.
if __name__ == "__main__":
    asyncio.run(main())