{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"id": "958375a6ffc0c2e4",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-09-20T14:02:47.336283Z",
|
||
"start_time": "2024-09-20T14:02:43.652444Z"
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"import asyncio\n",
|
||
"import logging\n",
|
||
"from typing import Union\n",
|
||
"\n",
|
||
"from cognee.modules.cognify.config import get_cognify_config\n",
|
||
"from cognee.shared.data_models import KnowledgeGraph\n",
|
||
"from cognee.modules.data.models import Dataset, Data\n",
|
||
"from cognee.modules.data.methods.get_dataset_data import get_dataset_data\n",
|
||
"from cognee.modules.data.methods import get_datasets, get_datasets_by_name\n",
|
||
"from cognee.modules.pipelines.tasks.Task import Task\n",
|
||
"from cognee.modules.pipelines import run_tasks, run_tasks_parallel\n",
|
||
"from cognee.modules.users.models import User\n",
|
||
"from cognee.modules.users.methods import get_default_user\n",
|
||
"from cognee.modules.pipelines.operations.get_pipeline_status import get_pipeline_status\n",
|
||
"from cognee.modules.pipelines.operations.log_pipeline_status import log_pipeline_status\n",
|
||
"from cognee.tasks import chunk_extract_summary, \\\n",
|
||
" chunk_naive_llm_classifier, \\\n",
|
||
" chunk_remove_disconnected, \\\n",
|
||
" infer_data_ontology, \\\n",
|
||
" save_chunks_to_store, \\\n",
|
||
" chunk_update_check, \\\n",
|
||
" chunks_into_graph, \\\n",
|
||
" source_documents_to_chunks, \\\n",
|
||
" check_permissions_on_documents, \\\n",
|
||
" classify_documents"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"id": "df16431d0f48b006",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-09-20T14:02:48.519686Z",
|
||
"start_time": "2024-09-20T14:02:48.515589Z"
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"job_position = \"\"\"Senior Data Scientist (Machine Learning)\n",
|
||
"\n",
|
||
"Company: TechNova Solutions\n",
|
||
"Location: San Francisco, CA\n",
|
||
"\n",
|
||
"Job Description:\n",
|
||
"\n",
|
||
"TechNova Solutions is seeking a Senior Data Scientist specializing in Machine Learning to join our dynamic analytics team. The ideal candidate will have a strong background in developing and deploying machine learning models, working with large datasets, and translating complex data into actionable insights.\n",
|
||
"\n",
|
||
"Responsibilities:\n",
|
||
"\n",
|
||
"Develop and implement advanced machine learning algorithms and models.\n",
|
||
"Analyze large, complex datasets to extract meaningful patterns and insights.\n",
|
||
"Collaborate with cross-functional teams to integrate predictive models into products.\n",
|
||
"Stay updated with the latest advancements in machine learning and data science.\n",
|
||
"Mentor junior data scientists and provide technical guidance.\n",
|
||
"Qualifications:\n",
|
||
"\n",
|
||
"Master’s or Ph.D. in Data Science, Computer Science, Statistics, or a related field.\n",
|
||
"5+ years of experience in data science and machine learning.\n",
|
||
"Proficient in Python, R, and SQL.\n",
|
||
"Experience with deep learning frameworks (e.g., TensorFlow, PyTorch).\n",
|
||
"Strong problem-solving skills and attention to detail.\n",
|
||
"Candidate CVs\n",
|
||
"\"\"\"\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"id": "9086abf3af077ab4",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-09-20T14:02:49.120838Z",
|
||
"start_time": "2024-09-20T14:02:49.118294Z"
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"job_1 = \"\"\"\n",
|
||
"CV 1: Relevant\n",
|
||
"Name: Dr. Emily Carter\n",
|
||
"Contact Information:\n",
|
||
"\n",
|
||
"Email: emily.carter@example.com\n",
|
||
"Phone: (555) 123-4567\n",
|
||
"Summary:\n",
|
||
"\n",
|
||
"Senior Data Scientist with over 8 years of experience in machine learning and predictive analytics. Expertise in developing advanced algorithms and deploying scalable models in production environments.\n",
|
||
"\n",
|
||
"Education:\n",
|
||
"\n",
|
||
"Ph.D. in Computer Science, Stanford University (2014)\n",
|
||
"B.S. in Mathematics, University of California, Berkeley (2010)\n",
|
||
"Experience:\n",
|
||
"\n",
|
||
"Senior Data Scientist, InnovateAI Labs (2016 – Present)\n",
|
||
"Led a team in developing machine learning models for natural language processing applications.\n",
|
||
"Implemented deep learning algorithms that improved prediction accuracy by 25%.\n",
|
||
"Collaborated with cross-functional teams to integrate models into cloud-based platforms.\n",
|
||
"Data Scientist, DataWave Analytics (2014 – 2016)\n",
|
||
"Developed predictive models for customer segmentation and churn analysis.\n",
|
||
"Analyzed large datasets using Hadoop and Spark frameworks.\n",
|
||
"Skills:\n",
|
||
"\n",
|
||
"Programming Languages: Python, R, SQL\n",
|
||
"Machine Learning: TensorFlow, Keras, Scikit-Learn\n",
|
||
"Big Data Technologies: Hadoop, Spark\n",
|
||
"Data Visualization: Tableau, Matplotlib\n",
|
||
"\"\"\""
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"id": "a9de0cc07f798b7f",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-09-20T14:02:49.675003Z",
|
||
"start_time": "2024-09-20T14:02:49.671615Z"
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"job_2 = \"\"\"\n",
|
||
"CV 2: Relevant\n",
|
||
"Name: Michael Rodriguez\n",
|
||
"Contact Information:\n",
|
||
"\n",
|
||
"Email: michael.rodriguez@example.com\n",
|
||
"Phone: (555) 234-5678\n",
|
||
"Summary:\n",
|
||
"\n",
|
||
"Data Scientist with a strong background in machine learning and statistical modeling. Skilled in handling large datasets and translating data into actionable business insights.\n",
|
||
"\n",
|
||
"Education:\n",
|
||
"\n",
|
||
"M.S. in Data Science, Carnegie Mellon University (2013)\n",
|
||
"B.S. in Computer Science, University of Michigan (2011)\n",
|
||
"Experience:\n",
|
||
"\n",
|
||
"Senior Data Scientist, Alpha Analytics (2017 – Present)\n",
|
||
"Developed machine learning models to optimize marketing strategies.\n",
|
||
"Reduced customer acquisition cost by 15% through predictive modeling.\n",
|
||
"Data Scientist, TechInsights (2013 – 2017)\n",
|
||
"Analyzed user behavior data to improve product features.\n",
|
||
"Implemented A/B testing frameworks to evaluate product changes.\n",
|
||
"Skills:\n",
|
||
"\n",
|
||
"Programming Languages: Python, Java, SQL\n",
|
||
"Machine Learning: Scikit-Learn, XGBoost\n",
|
||
"Data Visualization: Seaborn, Plotly\n",
|
||
"Databases: MySQL, MongoDB\n",
|
||
"\"\"\""
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"id": "185ff1c102d06111",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-09-20T14:02:50.286828Z",
|
||
"start_time": "2024-09-20T14:02:50.284369Z"
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"job_3 = \"\"\"\n",
|
||
"CV 3: Relevant\n",
|
||
"Name: Sarah Nguyen\n",
|
||
"Contact Information:\n",
|
||
"\n",
|
||
"Email: sarah.nguyen@example.com\n",
|
||
"Phone: (555) 345-6789\n",
|
||
"Summary:\n",
|
||
"\n",
|
||
"Data Scientist specializing in machine learning with 6 years of experience. Passionate about leveraging data to drive business solutions and improve product performance.\n",
|
||
"\n",
|
||
"Education:\n",
|
||
"\n",
|
||
"M.S. in Statistics, University of Washington (2014)\n",
|
||
"B.S. in Applied Mathematics, University of Texas at Austin (2012)\n",
|
||
"Experience:\n",
|
||
"\n",
|
||
"Data Scientist, QuantumTech (2016 – Present)\n",
|
||
"Designed and implemented machine learning algorithms for financial forecasting.\n",
|
||
"Improved model efficiency by 20% through algorithm optimization.\n",
|
||
"Junior Data Scientist, DataCore Solutions (2014 – 2016)\n",
|
||
"Assisted in developing predictive models for supply chain optimization.\n",
|
||
"Conducted data cleaning and preprocessing on large datasets.\n",
|
||
"Skills:\n",
|
||
"\n",
|
||
"Programming Languages: Python, R\n",
|
||
"Machine Learning Frameworks: PyTorch, Scikit-Learn\n",
|
||
"Statistical Analysis: SAS, SPSS\n",
|
||
"Cloud Platforms: AWS, Azure\n",
|
||
"\"\"\""
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"id": "d55ce4c58f8efb67",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-09-20T14:02:50.950343Z",
|
||
"start_time": "2024-09-20T14:02:50.946378Z"
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"job_4 = \"\"\"\n",
|
||
"CV 4: Not Relevant\n",
|
||
"Name: David Thompson\n",
|
||
"Contact Information:\n",
|
||
"\n",
|
||
"Email: david.thompson@example.com\n",
|
||
"Phone: (555) 456-7890\n",
|
||
"Summary:\n",
|
||
"\n",
|
||
"Creative Graphic Designer with over 8 years of experience in visual design and branding. Proficient in Adobe Creative Suite and passionate about creating compelling visuals.\n",
|
||
"\n",
|
||
"Education:\n",
|
||
"\n",
|
||
"B.F.A. in Graphic Design, Rhode Island School of Design (2012)\n",
|
||
"Experience:\n",
|
||
"\n",
|
||
"Senior Graphic Designer, CreativeWorks Agency (2015 – Present)\n",
|
||
"Led design projects for clients in various industries.\n",
|
||
"Created branding materials that increased client engagement by 30%.\n",
|
||
"Graphic Designer, Visual Innovations (2012 – 2015)\n",
|
||
"Designed marketing collateral, including brochures, logos, and websites.\n",
|
||
"Collaborated with the marketing team to develop cohesive brand strategies.\n",
|
||
"Skills:\n",
|
||
"\n",
|
||
"Design Software: Adobe Photoshop, Illustrator, InDesign\n",
|
||
"Web Design: HTML, CSS\n",
|
||
"Specialties: Branding and Identity, Typography\n",
|
||
"\"\"\""
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"id": "ca4ecc32721ad332",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-09-20T14:02:51.548191Z",
|
||
"start_time": "2024-09-20T14:02:51.545520Z"
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"job_5 = \"\"\"\n",
|
||
"CV 5: Not Relevant\n",
|
||
"Name: Jessica Miller\n",
|
||
"Contact Information:\n",
|
||
"\n",
|
||
"Email: jessica.miller@example.com\n",
|
||
"Phone: (555) 567-8901\n",
|
||
"Summary:\n",
|
||
"\n",
|
||
"Experienced Sales Manager with a strong track record in driving sales growth and building high-performing teams. Excellent communication and leadership skills.\n",
|
||
"\n",
|
||
"Education:\n",
|
||
"\n",
|
||
"B.A. in Business Administration, University of Southern California (2010)\n",
|
||
"Experience:\n",
|
||
"\n",
|
||
"Sales Manager, Global Enterprises (2015 – Present)\n",
|
||
"Managed a sales team of 15 members, achieving a 20% increase in annual revenue.\n",
|
||
"Developed sales strategies that expanded customer base by 25%.\n",
|
||
"Sales Representative, Market Leaders Inc. (2010 – 2015)\n",
|
||
"Consistently exceeded sales targets and received the 'Top Salesperson' award in 2013.\n",
|
||
"Skills:\n",
|
||
"\n",
|
||
"Sales Strategy and Planning\n",
|
||
"Team Leadership and Development\n",
|
||
"CRM Software: Salesforce, Zoho\n",
|
||
"Negotiation and Relationship Building\n",
|
||
"\"\"\""
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "904df61ba484a8e5",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-09-20T14:02:54.243987Z",
|
||
"start_time": "2024-09-20T14:02:52.498195Z"
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"import cognee\n",
|
||
"from os import listdir, path\n",
|
||
"\n",
|
||
"data_path = path.abspath(\".data\")\n",
|
||
"\n",
|
||
"results = await cognee.add([job_1, job_2,job_3,job_4,job_5,job_position], \"example\")\n",
|
||
"\n",
|
||
"for result in results:\n",
|
||
" print(result)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"id": "6f9b564de121713d",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-09-20T14:02:55.564445Z",
|
||
"start_time": "2024-09-20T14:02:55.562784Z"
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": []
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 9,
|
||
"id": "8911f8bd4f8c440a",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-09-20T14:02:56.714408Z",
|
||
"start_time": "2024-09-20T14:02:56.711812Z"
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"# from enum import Enum, auto\n",
|
||
"# from typing import Optional, List, Union, Dict, Any\n",
|
||
"# from pydantic import BaseModel, Field\n",
|
||
"# \n",
|
||
"# class Node(BaseModel):\n",
|
||
"# \"\"\"Node in a knowledge graph.\"\"\"\n",
|
||
"# id: str\n",
|
||
"# name: str\n",
|
||
"# type: str\n",
|
||
"# description: str\n",
|
||
"# properties: Optional[Dict[str, Any]] = Field(None, description = \"A dictionary of properties associated with the node.\")\n",
|
||
"# \n",
|
||
"# class Edge(BaseModel):\n",
|
||
"# \"\"\"Edge in a knowledge graph.\"\"\"\n",
|
||
"# source_node_id: str\n",
|
||
"# target_node_id: str\n",
|
||
"# relationship_name: str\n",
|
||
"# properties: Optional[Dict[str, Any]] = Field(None, description = \"A dictionary of properties associated with the edge.\")\n",
|
||
"# \n",
|
||
"# class KnowledgeGraph(BaseModel):\n",
|
||
"# \"\"\"Knowledge graph.\"\"\"\n",
|
||
"# nodes: List[Node] = Field(..., default_factory=list)\n",
|
||
"# edges: List[Edge] = Field(..., default_factory=list)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"id": "7c431fdef4921ae0",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-09-20T14:02:57.925667Z",
|
||
"start_time": "2024-09-20T14:02:57.922353Z"
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"async def run_cognify_pipeline(dataset: Dataset, user: User = None):\n",
|
||
" data_documents: list[Data] = await get_dataset_data(dataset_id = dataset.id)\n",
|
||
"\n",
|
||
" try:\n",
|
||
"\n",
|
||
" root_node_id = None\n",
|
||
"\n",
|
||
" tasks = [\n",
|
||
" Task(classify_documents),\n",
|
||
" Task(check_permissions_on_documents, user = user, permissions = [\"write\"]),\n",
|
||
" Task(infer_data_ontology, root_node_id = root_node_id, ontology_model = KnowledgeGraph),\n",
|
||
" Task(source_documents_to_chunks, chunk_size = 800, parent_node_id = root_node_id), # Classify documents and save them as a nodes in graph db, extract text chunks based on the document type\n",
|
||
" Task(chunks_into_graph, graph_model = KnowledgeGraph, collection_name = \"entities\", task_config = { \"batch_size\": 10 }), # Generate knowledge graphs from the document chunks and attach it to chunk nodes\n",
|
||
" Task(chunk_update_check, collection_name = \"chunks\"), # Find all affected chunks, so we don't process unchanged chunks\n",
|
||
" Task(\n",
|
||
" save_chunks_to_store,\n",
|
||
" collection_name = \"chunks\",\n",
|
||
" ), \n",
|
||
" Task(chunk_remove_disconnected), # Remove the obsolete document chunks.\n",
|
||
" ]\n",
|
||
"\n",
|
||
" pipeline = run_tasks(tasks, data_documents)\n",
|
||
"\n",
|
||
" async for result in pipeline:\n",
|
||
" print(result)\n",
|
||
" except Exception as error:\n",
|
||
" raise error"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "f0a91b99c6215e09",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-09-20T14:02:58.905774Z",
|
||
"start_time": "2024-09-20T14:02:58.625915Z"
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"user = await get_default_user()\n",
|
||
"datasets = await get_datasets_by_name([\"example\"], user.id)\n",
|
||
"await run_cognify_pipeline(datasets[0], user)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "080389e5",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import os\n",
|
||
"from cognee.shared.utils import render_graph\n",
|
||
"from cognee.infrastructure.databases.graph import get_graph_engine\n",
|
||
"import graphistry\n",
|
||
"\n",
|
||
"# # Setting an environment variable\n",
|
||
"# os.environ[\"GRAPHISTRY_USERNAME\"] = placeholder\n",
|
||
"# os.environ[\"GRAPHISTRY_PASSWORD\"] = placeholder\n",
|
||
"\n",
|
||
"\n",
|
||
"graphistry.login(username=os.getenv(\"GRAPHISTRY_USERNAME\"), password=os.getenv(\"GRAPHISTRY_PASSWORD\"))\n",
|
||
"\n",
|
||
"graph_engine = await get_graph_engine()\n",
|
||
"\n",
|
||
"graph_url = await render_graph(graph_engine.graph)\n",
|
||
"print(graph_url)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "e5e7dfc8",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"async def search(\n",
|
||
" vector_engine,\n",
|
||
" collection_name: str,\n",
|
||
" query_text: str = None,\n",
|
||
"):\n",
|
||
" query_vector = (await vector_engine.embedding_engine.embed_text([query_text]))[0]\n",
|
||
"\n",
|
||
" connection = await vector_engine.get_connection()\n",
|
||
" collection = await connection.open_table(collection_name)\n",
|
||
"\n",
|
||
" results = await collection.vector_search(query_vector).limit(10).to_pandas()\n",
|
||
"\n",
|
||
" result_values = list(results.to_dict(\"index\").values())\n",
|
||
"\n",
|
||
" return [dict(\n",
|
||
" id = str(result[\"id\"]),\n",
|
||
" payload = result[\"payload\"],\n",
|
||
" score = result[\"_distance\"],\n",
|
||
" ) for result in result_values]\n",
|
||
"\n",
|
||
"\n",
|
||
"from cognee.infrastructure.databases.vector import get_vector_engine\n",
|
||
"\n",
|
||
"vector_engine = get_vector_engine()\n",
|
||
"results = await search(vector_engine, \"entities\", \"sarah.nguyen@example.com\")\n",
|
||
"for result in results:\n",
|
||
" print(result)"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3 (ipykernel)",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.9.18"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|