{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"id": "958375a6ffc0c2e4",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-09-20T14:02:47.336283Z",
|
||
"start_time": "2024-09-20T14:02:43.652444Z"
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"import asyncio\n",
|
||
"import logging\n",
|
||
"from typing import Union\n",
|
||
"\n",
|
||
"from cognee.modules.cognify.config import get_cognify_config\n",
|
||
"from cognee.shared.data_models import KnowledgeGraph\n",
|
||
"from cognee.modules.data.models import Dataset, Data\n",
|
||
"from cognee.modules.data.methods.get_dataset_data import get_dataset_data\n",
|
||
"from cognee.modules.data.methods import get_datasets, get_datasets_by_name\n",
|
||
"from cognee.modules.pipelines.tasks.Task import Task\n",
|
||
"from cognee.modules.pipelines import run_tasks, run_tasks_parallel\n",
|
||
"from cognee.modules.users.models import User\n",
|
||
"from cognee.modules.users.methods import get_default_user\n",
|
||
"from cognee.modules.pipelines.operations.get_pipeline_status import get_pipeline_status\n",
|
||
"from cognee.modules.pipelines.operations.log_pipeline_status import log_pipeline_status\n",
|
||
"from cognee.tasks import chunk_extract_summary, \\\n",
|
||
" chunk_naive_llm_classifier, \\\n",
|
||
" chunk_remove_disconnected, \\\n",
|
||
" infer_data_ontology, \\\n",
|
||
" save_chunks_to_store, \\\n",
|
||
" chunk_update_check, \\\n",
|
||
" chunks_into_graph, \\\n",
|
||
" source_documents_to_chunks, \\\n",
|
||
" check_permissions_on_documents, \\\n",
|
||
" classify_documents"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"id": "df16431d0f48b006",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-09-20T14:02:48.519686Z",
|
||
"start_time": "2024-09-20T14:02:48.515589Z"
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"job_position = \"\"\"Senior Data Scientist (Machine Learning)\n",
|
||
"\n",
|
||
"Company: TechNova Solutions\n",
|
||
"Location: San Francisco, CA\n",
|
||
"\n",
|
||
"Job Description:\n",
|
||
"\n",
|
||
"TechNova Solutions is seeking a Senior Data Scientist specializing in Machine Learning to join our dynamic analytics team. The ideal candidate will have a strong background in developing and deploying machine learning models, working with large datasets, and translating complex data into actionable insights.\n",
|
||
"\n",
|
||
"Responsibilities:\n",
|
||
"\n",
|
||
"Develop and implement advanced machine learning algorithms and models.\n",
|
||
"Analyze large, complex datasets to extract meaningful patterns and insights.\n",
|
||
"Collaborate with cross-functional teams to integrate predictive models into products.\n",
|
||
"Stay updated with the latest advancements in machine learning and data science.\n",
|
||
"Mentor junior data scientists and provide technical guidance.\n",
|
||
"Qualifications:\n",
|
||
"\n",
|
||
"Master’s or Ph.D. in Data Science, Computer Science, Statistics, or a related field.\n",
|
||
"5+ years of experience in data science and machine learning.\n",
|
||
"Proficient in Python, R, and SQL.\n",
|
||
"Experience with deep learning frameworks (e.g., TensorFlow, PyTorch).\n",
|
||
"Strong problem-solving skills and attention to detail.\n",
|
||
"Candidate CVs\n",
|
||
"\"\"\"\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"id": "9086abf3af077ab4",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-09-20T14:02:49.120838Z",
|
||
"start_time": "2024-09-20T14:02:49.118294Z"
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"job_1 = \"\"\"\n",
|
||
"CV 1: Relevant\n",
|
||
"Name: Dr. Emily Carter\n",
|
||
"Contact Information:\n",
|
||
"\n",
|
||
"Email: emily.carter@example.com\n",
|
||
"Phone: (555) 123-4567\n",
|
||
"Summary:\n",
|
||
"\n",
|
||
"Senior Data Scientist with over 8 years of experience in machine learning and predictive analytics. Expertise in developing advanced algorithms and deploying scalable models in production environments.\n",
|
||
"\n",
|
||
"Education:\n",
|
||
"\n",
|
||
"Ph.D. in Computer Science, Stanford University (2014)\n",
|
||
"B.S. in Mathematics, University of California, Berkeley (2010)\n",
|
||
"Experience:\n",
|
||
"\n",
|
||
"Senior Data Scientist, InnovateAI Labs (2016 – Present)\n",
|
||
"Led a team in developing machine learning models for natural language processing applications.\n",
|
||
"Implemented deep learning algorithms that improved prediction accuracy by 25%.\n",
|
||
"Collaborated with cross-functional teams to integrate models into cloud-based platforms.\n",
|
||
"Data Scientist, DataWave Analytics (2014 – 2016)\n",
|
||
"Developed predictive models for customer segmentation and churn analysis.\n",
|
||
"Analyzed large datasets using Hadoop and Spark frameworks.\n",
|
||
"Skills:\n",
|
||
"\n",
|
||
"Programming Languages: Python, R, SQL\n",
|
||
"Machine Learning: TensorFlow, Keras, Scikit-Learn\n",
|
||
"Big Data Technologies: Hadoop, Spark\n",
|
||
"Data Visualization: Tableau, Matplotlib\n",
|
||
"\"\"\""
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"id": "a9de0cc07f798b7f",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-09-20T14:02:49.675003Z",
|
||
"start_time": "2024-09-20T14:02:49.671615Z"
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"job_2 = \"\"\"\n",
|
||
"CV 2: Relevant\n",
|
||
"Name: Michael Rodriguez\n",
|
||
"Contact Information:\n",
|
||
"\n",
|
||
"Email: michael.rodriguez@example.com\n",
|
||
"Phone: (555) 234-5678\n",
|
||
"Summary:\n",
|
||
"\n",
|
||
"Data Scientist with a strong background in machine learning and statistical modeling. Skilled in handling large datasets and translating data into actionable business insights.\n",
|
||
"\n",
|
||
"Education:\n",
|
||
"\n",
|
||
"M.S. in Data Science, Carnegie Mellon University (2013)\n",
|
||
"B.S. in Computer Science, University of Michigan (2011)\n",
|
||
"Experience:\n",
|
||
"\n",
|
||
"Senior Data Scientist, Alpha Analytics (2017 – Present)\n",
|
||
"Developed machine learning models to optimize marketing strategies.\n",
|
||
"Reduced customer acquisition cost by 15% through predictive modeling.\n",
|
||
"Data Scientist, TechInsights (2013 – 2017)\n",
|
||
"Analyzed user behavior data to improve product features.\n",
|
||
"Implemented A/B testing frameworks to evaluate product changes.\n",
|
||
"Skills:\n",
|
||
"\n",
|
||
"Programming Languages: Python, Java, SQL\n",
|
||
"Machine Learning: Scikit-Learn, XGBoost\n",
|
||
"Data Visualization: Seaborn, Plotly\n",
|
||
"Databases: MySQL, MongoDB\n",
|
||
"\"\"\""
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"id": "185ff1c102d06111",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-09-20T14:02:50.286828Z",
|
||
"start_time": "2024-09-20T14:02:50.284369Z"
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"job_3 = \"\"\"\n",
|
||
"CV 3: Relevant\n",
|
||
"Name: Sarah Nguyen\n",
|
||
"Contact Information:\n",
|
||
"\n",
|
||
"Email: sarah.nguyen@example.com\n",
|
||
"Phone: (555) 345-6789\n",
|
||
"Summary:\n",
|
||
"\n",
|
||
"Data Scientist specializing in machine learning with 6 years of experience. Passionate about leveraging data to drive business solutions and improve product performance.\n",
|
||
"\n",
|
||
"Education:\n",
|
||
"\n",
|
||
"M.S. in Statistics, University of Washington (2014)\n",
|
||
"B.S. in Applied Mathematics, University of Texas at Austin (2012)\n",
|
||
"Experience:\n",
|
||
"\n",
|
||
"Data Scientist, QuantumTech (2016 – Present)\n",
|
||
"Designed and implemented machine learning algorithms for financial forecasting.\n",
|
||
"Improved model efficiency by 20% through algorithm optimization.\n",
|
||
"Junior Data Scientist, DataCore Solutions (2014 – 2016)\n",
|
||
"Assisted in developing predictive models for supply chain optimization.\n",
|
||
"Conducted data cleaning and preprocessing on large datasets.\n",
|
||
"Skills:\n",
|
||
"\n",
|
||
"Programming Languages: Python, R\n",
|
||
"Machine Learning Frameworks: PyTorch, Scikit-Learn\n",
|
||
"Statistical Analysis: SAS, SPSS\n",
|
||
"Cloud Platforms: AWS, Azure\n",
|
||
"\"\"\""
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"id": "d55ce4c58f8efb67",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-09-20T14:02:50.950343Z",
|
||
"start_time": "2024-09-20T14:02:50.946378Z"
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"job_4 = \"\"\"\n",
|
||
"CV 4: Not Relevant\n",
|
||
"Name: David Thompson\n",
|
||
"Contact Information:\n",
|
||
"\n",
|
||
"Email: david.thompson@example.com\n",
|
||
"Phone: (555) 456-7890\n",
|
||
"Summary:\n",
|
||
"\n",
|
||
"Creative Graphic Designer with over 8 years of experience in visual design and branding. Proficient in Adobe Creative Suite and passionate about creating compelling visuals.\n",
|
||
"\n",
|
||
"Education:\n",
|
||
"\n",
|
||
"B.F.A. in Graphic Design, Rhode Island School of Design (2012)\n",
|
||
"Experience:\n",
|
||
"\n",
|
||
"Senior Graphic Designer, CreativeWorks Agency (2015 – Present)\n",
|
||
"Led design projects for clients in various industries.\n",
|
||
"Created branding materials that increased client engagement by 30%.\n",
|
||
"Graphic Designer, Visual Innovations (2012 – 2015)\n",
|
||
"Designed marketing collateral, including brochures, logos, and websites.\n",
|
||
"Collaborated with the marketing team to develop cohesive brand strategies.\n",
|
||
"Skills:\n",
|
||
"\n",
|
||
"Design Software: Adobe Photoshop, Illustrator, InDesign\n",
|
||
"Web Design: HTML, CSS\n",
|
||
"Specialties: Branding and Identity, Typography\n",
|
||
"\"\"\""
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"id": "ca4ecc32721ad332",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-09-20T14:02:51.548191Z",
|
||
"start_time": "2024-09-20T14:02:51.545520Z"
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"job_5 = \"\"\"\n",
|
||
"CV 5: Not Relevant\n",
|
||
"Name: Jessica Miller\n",
|
||
"Contact Information:\n",
|
||
"\n",
|
||
"Email: jessica.miller@example.com\n",
|
||
"Phone: (555) 567-8901\n",
|
||
"Summary:\n",
|
||
"\n",
|
||
"Experienced Sales Manager with a strong track record in driving sales growth and building high-performing teams. Excellent communication and leadership skills.\n",
|
||
"\n",
|
||
"Education:\n",
|
||
"\n",
|
||
"B.A. in Business Administration, University of Southern California (2010)\n",
|
||
"Experience:\n",
|
||
"\n",
|
||
"Sales Manager, Global Enterprises (2015 – Present)\n",
|
||
"Managed a sales team of 15 members, achieving a 20% increase in annual revenue.\n",
|
||
"Developed sales strategies that expanded customer base by 25%.\n",
|
||
"Sales Representative, Market Leaders Inc. (2010 – 2015)\n",
|
||
"Consistently exceeded sales targets and received the 'Top Salesperson' award in 2013.\n",
|
||
"Skills:\n",
|
||
"\n",
|
||
"Sales Strategy and Planning\n",
|
||
"Team Leadership and Development\n",
|
||
"CRM Software: Salesforce, Zoho\n",
|
||
"Negotiation and Relationship Building\n",
|
||
"\"\"\""
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "904df61ba484a8e5",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-09-20T14:02:54.243987Z",
|
||
"start_time": "2024-09-20T14:02:52.498195Z"
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"import cognee\n",
|
||
"from os import listdir, path\n",
|
||
"\n",
|
||
"data_path = path.abspath(\".data\")\n",
|
||
"\n",
|
||
"results = await cognee.add([job_1, job_2,job_3,job_4,job_5,job_position], \"example\")\n",
|
||
"\n",
|
||
"for result in results:\n",
|
||
" print(result)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"id": "6f9b564de121713d",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-09-20T14:02:55.564445Z",
|
||
"start_time": "2024-09-20T14:02:55.562784Z"
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": []
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 9,
|
||
"id": "8911f8bd4f8c440a",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-09-20T14:02:56.714408Z",
|
||
"start_time": "2024-09-20T14:02:56.711812Z"
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"# from enum import Enum, auto\n",
|
||
"# from typing import Optional, List, Union, Dict, Any\n",
|
||
"# from pydantic import BaseModel, Field\n",
|
||
"# \n",
|
||
"# class Node(BaseModel):\n",
|
||
"# \"\"\"Node in a knowledge graph.\"\"\"\n",
|
||
"# id: str\n",
|
||
"# name: str\n",
|
||
"# type: str\n",
|
||
"# description: str\n",
|
||
"# properties: Optional[Dict[str, Any]] = Field(None, description = \"A dictionary of properties associated with the node.\")\n",
|
||
"# \n",
|
||
"# class Edge(BaseModel):\n",
|
||
"# \"\"\"Edge in a knowledge graph.\"\"\"\n",
|
||
"# source_node_id: str\n",
|
||
"# target_node_id: str\n",
|
||
"# relationship_name: str\n",
|
||
"# properties: Optional[Dict[str, Any]] = Field(None, description = \"A dictionary of properties associated with the edge.\")\n",
|
||
"# \n",
|
||
"# class KnowledgeGraph(BaseModel):\n",
|
||
"# \"\"\"Knowledge graph.\"\"\"\n",
|
||
"# nodes: List[Node] = Field(..., default_factory=list)\n",
|
||
"# edges: List[Edge] = Field(..., default_factory=list)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"id": "7c431fdef4921ae0",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-09-20T14:02:57.925667Z",
|
||
"start_time": "2024-09-20T14:02:57.922353Z"
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"async def run_cognify_pipeline(dataset: Dataset, user: User = None):\n",
|
||
" data_documents: list[Data] = await get_dataset_data(dataset_id = dataset.id)\n",
|
||
"\n",
|
||
" try:\n",
|
||
"\n",
|
||
" root_node_id = None\n",
|
||
"\n",
|
||
" tasks = [\n",
|
||
" Task(classify_documents),\n",
|
||
" Task(check_permissions_on_documents, user = user, permissions = [\"write\"]),\n",
|
||
" Task(infer_data_ontology, root_node_id = root_node_id, ontology_model = KnowledgeGraph),\n",
|
||
" Task(source_documents_to_chunks, chunk_size = 800, parent_node_id = root_node_id), # Classify documents and save them as a nodes in graph db, extract text chunks based on the document type\n",
|
||
" Task(chunks_into_graph, graph_model = KnowledgeGraph, collection_name = \"entities\", task_config = { \"batch_size\": 10 }), # Generate knowledge graphs from the document chunks and attach it to chunk nodes\n",
|
||
" Task(chunk_update_check, collection_name = \"chunks\"), # Find all affected chunks, so we don't process unchanged chunks\n",
|
||
" Task(\n",
|
||
" save_chunks_to_store,\n",
|
||
" collection_name = \"chunks\",\n",
|
||
" ), \n",
|
||
" Task(chunk_remove_disconnected), # Remove the obsolete document chunks.\n",
|
||
" ]\n",
|
||
"\n",
|
||
" pipeline = run_tasks(tasks, data_documents)\n",
|
||
"\n",
|
||
" async for result in pipeline:\n",
|
||
" print(result)\n",
|
||
" except Exception as error:\n",
|
||
" raise error"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "f0a91b99c6215e09",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-09-20T14:02:58.905774Z",
|
||
"start_time": "2024-09-20T14:02:58.625915Z"
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"user = await get_default_user()\n",
|
||
"datasets = await get_datasets_by_name([\"example\"], user.id)\n",
|
||
"await run_cognify_pipeline(datasets[0], user)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "080389e5",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import os\n",
|
||
"from cognee.shared.utils import render_graph\n",
|
||
"from cognee.infrastructure.databases.graph import get_graph_engine\n",
|
||
"import graphistry\n",
|
||
"\n",
|
||
"# # Setting an environment variable\n",
|
||
"# os.environ[\"GRAPHISTRY_USERNAME\"] = placeholder\n",
|
||
"# os.environ[\"GRAPHISTRY_PASSWORD\"] = placeholder\n",
|
||
"\n",
|
||
"\n",
|
||
"graphistry.login(username=os.getenv(\"GRAPHISTRY_USERNAME\"), password=os.getenv(\"GRAPHISTRY_PASSWORD\"))\n",
|
||
"\n",
|
||
"graph_engine = await get_graph_engine()\n",
|
||
"\n",
|
||
"graph_url = await render_graph(graph_engine.graph)\n",
|
||
"print(graph_url)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "e5e7dfc8",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"async def search(\n",
|
||
" vector_engine,\n",
|
||
" collection_name: str,\n",
|
||
" query_text: str = None,\n",
|
||
"):\n",
|
||
" query_vector = (await vector_engine.embedding_engine.embed_text([query_text]))[0]\n",
|
||
"\n",
|
||
" connection = await vector_engine.get_connection()\n",
|
||
" collection = await connection.open_table(collection_name)\n",
|
||
"\n",
|
||
" results = await collection.vector_search(query_vector).limit(10).to_pandas()\n",
|
||
"\n",
|
||
" result_values = list(results.to_dict(\"index\").values())\n",
|
||
"\n",
|
||
" return [dict(\n",
|
||
" id = str(result[\"id\"]),\n",
|
||
" payload = result[\"payload\"],\n",
|
||
" score = result[\"_distance\"],\n",
|
||
" ) for result in result_values]\n",
|
||
"\n",
|
||
"\n",
|
||
"from cognee.infrastructure.databases.vector import get_vector_engine\n",
|
||
"\n",
|
||
"vector_engine = get_vector_engine()\n",
|
||
"results = await search(vector_engine, \"entities\", \"sarah.nguyen@example.com\")\n",
|
||
"for result in results:\n",
|
||
" print(result)"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3 (ipykernel)",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.9.18"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|