{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "958375a6ffc0c2e4", "metadata": { "ExecuteTime": { "end_time": "2024-09-20T14:02:47.336283Z", "start_time": "2024-09-20T14:02:43.652444Z" } }, "outputs": [], "source": [ "import asyncio\n", "import logging\n", "from typing import Union\n", "\n", "from cognee.modules.cognify.config import get_cognify_config\n", "from cognee.shared.data_models import KnowledgeGraph\n", "from cognee.modules.data.models import Dataset, Data\n", "from cognee.modules.data.methods.get_dataset_data import get_dataset_data\n", "from cognee.modules.data.methods import get_datasets, get_datasets_by_name\n", "from cognee.modules.pipelines.tasks.Task import Task\n", "from cognee.modules.pipelines import run_tasks, run_tasks_parallel\n", "from cognee.modules.users.models import User\n", "from cognee.modules.users.methods import get_default_user\n", "from cognee.modules.pipelines.operations.get_pipeline_status import get_pipeline_status\n", "from cognee.modules.pipelines.operations.log_pipeline_status import log_pipeline_status\n", "from cognee.tasks import chunk_extract_summary, \\\n", " chunk_naive_llm_classifier, \\\n", " chunk_remove_disconnected, \\\n", " infer_data_ontology, \\\n", " save_chunks_to_store, \\\n", " chunk_update_check, \\\n", " chunks_into_graph, \\\n", " source_documents_to_chunks, \\\n", " check_permissions_on_documents, \\\n", " classify_documents" ] }, { "cell_type": "code", "execution_count": 2, "id": "df16431d0f48b006", "metadata": { "ExecuteTime": { "end_time": "2024-09-20T14:02:48.519686Z", "start_time": "2024-09-20T14:02:48.515589Z" } }, "outputs": [], "source": [ "job_position = \"\"\"Senior Data Scientist (Machine Learning)\n", "\n", "Company: TechNova Solutions\n", "Location: San Francisco, CA\n", "\n", "Job Description:\n", "\n", "TechNova Solutions is seeking a Senior Data Scientist specializing in Machine Learning to join our dynamic analytics team. 
The ideal candidate will have a strong background in developing and deploying machine learning models, working with large datasets, and translating complex data into actionable insights.\n", "\n", "Responsibilities:\n", "\n", "Develop and implement advanced machine learning algorithms and models.\n", "Analyze large, complex datasets to extract meaningful patterns and insights.\n", "Collaborate with cross-functional teams to integrate predictive models into products.\n", "Stay updated with the latest advancements in machine learning and data science.\n", "Mentor junior data scientists and provide technical guidance.\n", "Qualifications:\n", "\n", "Master’s or Ph.D. in Data Science, Computer Science, Statistics, or a related field.\n", "5+ years of experience in data science and machine learning.\n", "Proficient in Python, R, and SQL.\n", "Experience with deep learning frameworks (e.g., TensorFlow, PyTorch).\n", "Strong problem-solving skills and attention to detail.\n", "Candidate CVs\n", "\"\"\"\n" ] }, { "cell_type": "code", "execution_count": 3, "id": "9086abf3af077ab4", "metadata": { "ExecuteTime": { "end_time": "2024-09-20T14:02:49.120838Z", "start_time": "2024-09-20T14:02:49.118294Z" } }, "outputs": [], "source": [ "job_1 = \"\"\"\n", "CV 1: Relevant\n", "Name: Dr. Emily Carter\n", "Contact Information:\n", "\n", "Email: emily.carter@example.com\n", "Phone: (555) 123-4567\n", "Summary:\n", "\n", "Senior Data Scientist with over 8 years of experience in machine learning and predictive analytics. Expertise in developing advanced algorithms and deploying scalable models in production environments.\n", "\n", "Education:\n", "\n", "Ph.D. in Computer Science, Stanford University (2014)\n", "B.S. 
in Mathematics, University of California, Berkeley (2010)\n", "Experience:\n", "\n", "Senior Data Scientist, InnovateAI Labs (2016 – Present)\n", "Led a team in developing machine learning models for natural language processing applications.\n", "Implemented deep learning algorithms that improved prediction accuracy by 25%.\n", "Collaborated with cross-functional teams to integrate models into cloud-based platforms.\n", "Data Scientist, DataWave Analytics (2014 – 2016)\n", "Developed predictive models for customer segmentation and churn analysis.\n", "Analyzed large datasets using Hadoop and Spark frameworks.\n", "Skills:\n", "\n", "Programming Languages: Python, R, SQL\n", "Machine Learning: TensorFlow, Keras, Scikit-Learn\n", "Big Data Technologies: Hadoop, Spark\n", "Data Visualization: Tableau, Matplotlib\n", "\"\"\"" ] }, { "cell_type": "code", "execution_count": 4, "id": "a9de0cc07f798b7f", "metadata": { "ExecuteTime": { "end_time": "2024-09-20T14:02:49.675003Z", "start_time": "2024-09-20T14:02:49.671615Z" } }, "outputs": [], "source": [ "job_2 = \"\"\"\n", "CV 2: Relevant\n", "Name: Michael Rodriguez\n", "Contact Information:\n", "\n", "Email: michael.rodriguez@example.com\n", "Phone: (555) 234-5678\n", "Summary:\n", "\n", "Data Scientist with a strong background in machine learning and statistical modeling. Skilled in handling large datasets and translating data into actionable business insights.\n", "\n", "Education:\n", "\n", "M.S. in Data Science, Carnegie Mellon University (2013)\n", "B.S. 
in Computer Science, University of Michigan (2011)\n", "Experience:\n", "\n", "Senior Data Scientist, Alpha Analytics (2017 – Present)\n", "Developed machine learning models to optimize marketing strategies.\n", "Reduced customer acquisition cost by 15% through predictive modeling.\n", "Data Scientist, TechInsights (2013 – 2017)\n", "Analyzed user behavior data to improve product features.\n", "Implemented A/B testing frameworks to evaluate product changes.\n", "Skills:\n", "\n", "Programming Languages: Python, Java, SQL\n", "Machine Learning: Scikit-Learn, XGBoost\n", "Data Visualization: Seaborn, Plotly\n", "Databases: MySQL, MongoDB\n", "\"\"\"" ] }, { "cell_type": "code", "execution_count": 5, "id": "185ff1c102d06111", "metadata": { "ExecuteTime": { "end_time": "2024-09-20T14:02:50.286828Z", "start_time": "2024-09-20T14:02:50.284369Z" } }, "outputs": [], "source": [ "job_3 = \"\"\"\n", "CV 3: Relevant\n", "Name: Sarah Nguyen\n", "Contact Information:\n", "\n", "Email: sarah.nguyen@example.com\n", "Phone: (555) 345-6789\n", "Summary:\n", "\n", "Data Scientist specializing in machine learning with 6 years of experience. Passionate about leveraging data to drive business solutions and improve product performance.\n", "\n", "Education:\n", "\n", "M.S. in Statistics, University of Washington (2014)\n", "B.S. 
in Applied Mathematics, University of Texas at Austin (2012)\n", "Experience:\n", "\n", "Data Scientist, QuantumTech (2016 – Present)\n", "Designed and implemented machine learning algorithms for financial forecasting.\n", "Improved model efficiency by 20% through algorithm optimization.\n", "Junior Data Scientist, DataCore Solutions (2014 – 2016)\n", "Assisted in developing predictive models for supply chain optimization.\n", "Conducted data cleaning and preprocessing on large datasets.\n", "Skills:\n", "\n", "Programming Languages: Python, R\n", "Machine Learning Frameworks: PyTorch, Scikit-Learn\n", "Statistical Analysis: SAS, SPSS\n", "Cloud Platforms: AWS, Azure\n", "\"\"\"" ] }, { "cell_type": "code", "execution_count": 6, "id": "d55ce4c58f8efb67", "metadata": { "ExecuteTime": { "end_time": "2024-09-20T14:02:50.950343Z", "start_time": "2024-09-20T14:02:50.946378Z" } }, "outputs": [], "source": [ "job_4 = \"\"\"\n", "CV 4: Not Relevant\n", "Name: David Thompson\n", "Contact Information:\n", "\n", "Email: david.thompson@example.com\n", "Phone: (555) 456-7890\n", "Summary:\n", "\n", "Creative Graphic Designer with over 8 years of experience in visual design and branding. Proficient in Adobe Creative Suite and passionate about creating compelling visuals.\n", "\n", "Education:\n", "\n", "B.F.A. 
in Graphic Design, Rhode Island School of Design (2012)\n", "Experience:\n", "\n", "Senior Graphic Designer, CreativeWorks Agency (2015 – Present)\n", "Led design projects for clients in various industries.\n", "Created branding materials that increased client engagement by 30%.\n", "Graphic Designer, Visual Innovations (2012 – 2015)\n", "Designed marketing collateral, including brochures, logos, and websites.\n", "Collaborated with the marketing team to develop cohesive brand strategies.\n", "Skills:\n", "\n", "Design Software: Adobe Photoshop, Illustrator, InDesign\n", "Web Design: HTML, CSS\n", "Specialties: Branding and Identity, Typography\n", "\"\"\"" ] }, { "cell_type": "code", "execution_count": 7, "id": "ca4ecc32721ad332", "metadata": { "ExecuteTime": { "end_time": "2024-09-20T14:02:51.548191Z", "start_time": "2024-09-20T14:02:51.545520Z" } }, "outputs": [], "source": [ "job_5 = \"\"\"\n", "CV 5: Not Relevant\n", "Name: Jessica Miller\n", "Contact Information:\n", "\n", "Email: jessica.miller@example.com\n", "Phone: (555) 567-8901\n", "Summary:\n", "\n", "Experienced Sales Manager with a strong track record in driving sales growth and building high-performing teams. Excellent communication and leadership skills.\n", "\n", "Education:\n", "\n", "B.A. in Business Administration, University of Southern California (2010)\n", "Experience:\n", "\n", "Sales Manager, Global Enterprises (2015 – Present)\n", "Managed a sales team of 15 members, achieving a 20% increase in annual revenue.\n", "Developed sales strategies that expanded customer base by 25%.\n", "Sales Representative, Market Leaders Inc. 
(2010 – 2015)\n", "Consistently exceeded sales targets and received the 'Top Salesperson' award in 2013.\n", "Skills:\n", "\n", "Sales Strategy and Planning\n", "Team Leadership and Development\n", "CRM Software: Salesforce, Zoho\n", "Negotiation and Relationship Building\n", "\"\"\"" ] },
{ "cell_type": "code", "execution_count": null, "id": "904df61ba484a8e5", "metadata": { "ExecuteTime": { "end_time": "2024-09-20T14:02:54.243987Z", "start_time": "2024-09-20T14:02:52.498195Z" } }, "outputs": [], "source": [
"import cognee\n",
"from os import listdir, path\n",
"\n",
"# job_1 .. job_5 hold the candidate CV texts; job_position holds the job posting.\n",
"results = await cognee.add([job_1, job_2, job_3, job_4, job_5, job_position], \"example\")\n",
"\n",
"for result in results:\n",
"    print(result)" ] },
{ "cell_type": "code", "execution_count": 8, "id": "6f9b564de121713d", "metadata": { "ExecuteTime": { "end_time": "2024-09-20T14:02:55.564445Z", "start_time": "2024-09-20T14:02:55.562784Z" } }, "outputs": [], "source": [] },
{ "cell_type": "code", "execution_count": 9, "id": "8911f8bd4f8c440a", "metadata": { "ExecuteTime": { "end_time": "2024-09-20T14:02:56.714408Z", "start_time": "2024-09-20T14:02:56.711812Z" } }, "outputs": [], "source": [ "# from enum import Enum, auto\n", "# from typing import Optional, List, Union, Dict, Any\n", "# from pydantic import BaseModel, Field\n", "# \n", "# class Node(BaseModel):\n", "# \"\"\"Node in a knowledge graph.\"\"\"\n", "# id: str\n", "# name: str\n", "# type: str\n", "# description: str\n", "# properties: Optional[Dict[str, Any]] = Field(None, description = \"A dictionary of properties associated with the node.\")\n", "# \n", "# class Edge(BaseModel):\n", "# \"\"\"Edge in a knowledge graph.\"\"\"\n", "# source_node_id: str\n", "# target_node_id: str\n", "# relationship_name: str\n", "# properties: Optional[Dict[str, Any]] = Field(None, description = \"A dictionary of properties associated with the edge.\")\n", "# \n", "# class KnowledgeGraph(BaseModel):\n", "# \"\"\"Knowledge graph.\"\"\"\n", "# nodes: List[Node] = Field(..., default_factory=list)\n", "# edges: List[Edge] = Field(..., default_factory=list)" ] },
{ "cell_type": "code", "execution_count": 10, "id": "7c431fdef4921ae0", "metadata": { "ExecuteTime": { "end_time": "2024-09-20T14:02:57.925667Z", "start_time": "2024-09-20T14:02:57.922353Z" } }, "outputs": [], "source": [
"async def run_cognify_pipeline(dataset: Dataset, user: Union[User, None] = None):\n",
"    \"\"\"Run the cognify task pipeline over every document stored in `dataset`.\n",
"\n",
"    Args:\n",
"        dataset: Dataset whose `id` is used to look up the documents to process.\n",
"        user: User handed to the write-permission check. When None, the default user is loaded.\n",
"\n",
"    Each result yielded by the pipeline is printed; errors raised by a task propagate to the caller.\n",
"    \"\"\"\n",
"    if user is None:\n",
"        # Resolve the default user here rather than handing None to the permission check.\n",
"        user = await get_default_user()\n",
"\n",
"    data_documents: list[Data] = await get_dataset_data(dataset_id = dataset.id)\n",
"\n",
"    # No parent node is supplied: the ontology and chunking tasks both receive None.\n",
"    root_node_id = None\n",
"\n",
"    tasks = [\n",
"        Task(classify_documents),\n",
"        Task(check_permissions_on_documents, user = user, permissions = [\"write\"]),\n",
"        Task(infer_data_ontology, root_node_id = root_node_id, ontology_model = KnowledgeGraph),\n",
"        Task(source_documents_to_chunks, chunk_size = 800, parent_node_id = root_node_id), # Classify documents and save them as nodes in graph db, extract text chunks based on the document type\n",
"        Task(chunks_into_graph, graph_model = KnowledgeGraph, collection_name = \"entities\", task_config = { \"batch_size\": 10 }), # Generate knowledge graphs from the document chunks and attach it to chunk nodes\n",
"        Task(chunk_update_check, collection_name = \"chunks\"), # Find all affected chunks, so we don't process unchanged chunks\n",
"        Task(save_chunks_to_store, collection_name = \"chunks\"),\n",
"        Task(chunk_remove_disconnected), # Remove the obsolete document chunks.\n",
"    ]\n",
"\n",
"    async for result in run_tasks(tasks, data_documents):\n",
"        print(result)" ] },
{ "cell_type": "code", "execution_count": null, "id": "f0a91b99c6215e09", "metadata": { "ExecuteTime": { "end_time": "2024-09-20T14:02:58.905774Z", "start_time": "2024-09-20T14:02:58.625915Z" } }, "outputs": [], "source": [ "user = await get_default_user()\n", "datasets = await 
get_datasets_by_name([\"example\"], user.id)\n", "await run_cognify_pipeline(datasets[0], user)" ] },
{ "cell_type": "code", "execution_count": null, "id": "080389e5", "metadata": {}, "outputs": [], "source": [ "import os\n", "from cognee.shared.utils import render_graph\n", "from cognee.infrastructure.databases.graph import get_graph_engine\n", "import graphistry\n", "\n", "# # Setting an environment variable\n", "# os.environ[\"GRAPHISTRY_USERNAME\"] = placeholder\n", "# os.environ[\"GRAPHISTRY_PASSWORD\"] = placeholder\n", "\n", "\n", "graphistry.login(username=os.getenv(\"GRAPHISTRY_USERNAME\"), password=os.getenv(\"GRAPHISTRY_PASSWORD\"))\n", "\n", "graph_engine = await get_graph_engine()\n", "\n", "graph_url = await render_graph(graph_engine.graph)\n", "print(graph_url)" ] },
{ "cell_type": "code", "execution_count": null, "id": "e5e7dfc8", "metadata": {}, "outputs": [], "source": [
"from cognee.infrastructure.databases.vector import get_vector_engine\n",
"\n",
"\n",
"async def search(\n",
"    vector_engine,\n",
"    collection_name: str,\n",
"    query_text: str = None,\n",
"    limit: int = 10,\n",
"):\n",
"    \"\"\"Embed `query_text` and run a vector search against one collection.\n",
"\n",
"    Args:\n",
"        vector_engine: Engine providing `embedding_engine.embed_text` and `get_connection`.\n",
"        collection_name: Name of the table opened on the engine's connection.\n",
"        query_text: Text to embed and search for; must be a non-empty string.\n",
"        limit: Maximum number of rows requested from the vector search.\n",
"\n",
"    Returns:\n",
"        A list of dicts with `id` (as str), `payload` and `score` (the row's `_distance` value).\n",
"\n",
"    Raises:\n",
"        ValueError: If `query_text` is None or empty.\n",
"    \"\"\"\n",
"    if not query_text:\n",
"        # The None default would otherwise be sent to the embedding engine as-is.\n",
"        raise ValueError(\"query_text must be a non-empty string\")\n",
"\n",
"    query_vector = (await vector_engine.embedding_engine.embed_text([query_text]))[0]\n",
"\n",
"    connection = await vector_engine.get_connection()\n",
"    collection = await connection.open_table(collection_name)\n",
"\n",
"    matches = await collection.vector_search(query_vector).limit(limit).to_pandas()\n",
"\n",
"    return [\n",
"        dict(\n",
"            id = str(row[\"id\"]),\n",
"            payload = row[\"payload\"],\n",
"            score = row[\"_distance\"],\n",
"        )\n",
"        for row in matches.to_dict(\"records\")\n",
"    ]\n",
"\n",
"\n",
"vector_engine = get_vector_engine()\n",
"results = await search(vector_engine, \"entities\", \"sarah.nguyen@example.com\")\n",
"for result in results:\n",
"    print(result)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", 
"version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.18" } }, "nbformat": 4, "nbformat_minor": 5 }