Merge remote-tracking branch 'origin/dev' into COG-650-replace-pylint

2024-12-20 15:23:55 +01:00 · 2024-12-20 15:23:55 +01:00 · a90a232e95
commit a90a232e95
parent e31033c326 de2394c392
15 changed files with 1650 additions and 756 deletions
--- a/cognee/api/v1/cognify/code_graph_pipeline.py
+++ b/cognee/api/v1/cognify/code_graph_pipeline.py
@ -1,127 +1,38 @@
-# NOTICE: This module contains deprecated functions.
-# Use only the run_code_graph_pipeline function; all other functions are deprecated.
-# Related issue: COG-906
-
 import asyncio
 import logging
 from pathlib import Path
-from typing import Union

-from cognee.modules.data.methods import get_datasets, get_datasets_by_name
-from cognee.modules.data.methods.get_dataset_data import get_dataset_data
-from cognee.modules.data.models import Data, Dataset
+from cognee.base_config import get_base_config
+from cognee.modules.cognify.config import get_cognify_config
 from cognee.modules.pipelines import run_tasks
-from cognee.modules.pipelines.models import PipelineRunStatus
-from cognee.modules.pipelines.operations.get_pipeline_status import \
-    get_pipeline_status
-from cognee.modules.pipelines.operations.log_pipeline_status import \
-    log_pipeline_status
 from cognee.modules.pipelines.tasks.Task import Task
 from cognee.modules.users.methods import get_default_user
-from cognee.modules.users.models import User
-from cognee.shared.SourceCodeGraph import SourceCodeGraph
-from cognee.shared.utils import send_telemetry
-from cognee.tasks.documents import (check_permissions_on_documents,
-                                    classify_documents,
+from cognee.shared.data_models import KnowledgeGraph, MonitoringTool
+from cognee.tasks.documents import (classify_documents,
                                    extract_chunks_from_documents)
-from cognee.tasks.graph import extract_graph_from_code
+from cognee.tasks.graph import extract_graph_from_data
+from cognee.tasks.ingestion import ingest_data_with_metadata
 from cognee.tasks.repo_processor import (enrich_dependency_graph,
                                         expand_dependency_graph,
+                                         get_data_list_for_user,
+                                         get_non_code_files,
                                         get_repo_file_dependencies)
 from cognee.tasks.storage import add_data_points
-from cognee.tasks.summarization import summarize_code
+
+monitoring = get_base_config().monitoring_tool
+if monitoring == MonitoringTool.LANGFUSE:
+    from langfuse.decorators import observe
+
+from cognee.tasks.summarization import summarize_code, summarize_text

 logger = logging.getLogger("code_graph_pipeline")
-
 update_status_lock = asyncio.Lock()

-async def code_graph_pipeline(datasets: Union[str, list[str]] = None, user: User = None):
-    if user is None:
-        user = await get_default_user()
-
-    existing_datasets = await get_datasets(user.id)
-
-    if datasets is None or len(datasets) == 0:
-        # If no datasets are provided, cognify all existing datasets.
-        datasets = existing_datasets
-
-    if type(datasets[0]) == str:
-        datasets = await get_datasets_by_name(datasets, user.id)
-
-    existing_datasets_map = {
-        generate_dataset_name(dataset.name): True for dataset in existing_datasets
-    }
-
-    awaitables = []
-
-    for dataset in datasets:
-        dataset_name = generate_dataset_name(dataset.name)
-
-        if dataset_name in existing_datasets_map:
-            awaitables.append(run_pipeline(dataset, user))
-
-    return await asyncio.gather(*awaitables)
-
-
-async def run_pipeline(dataset: Dataset, user: User):
-    '''DEPRECATED: Use `run_code_graph_pipeline` instead. This function will be removed.'''
-    data_documents: list[Data] = await get_dataset_data(dataset_id = dataset.id)
-
-    document_ids_str = [str(document.id) for document in data_documents]
-
-    dataset_id = dataset.id
-    dataset_name = generate_dataset_name(dataset.name)
-
-    send_telemetry("code_graph_pipeline EXECUTION STARTED", user.id)
-
-    async with update_status_lock:
-        task_status = await get_pipeline_status([dataset_id])
-
-        if dataset_id in task_status and task_status[dataset_id] == PipelineRunStatus.DATASET_PROCESSING_STARTED:
-            logger.info("Dataset %s is already being processed.", dataset_name)
-            return
-
-        await log_pipeline_status(dataset_id, PipelineRunStatus.DATASET_PROCESSING_STARTED, {
-            "dataset_name": dataset_name,
-            "files": document_ids_str,
-        })
-    try:
-        tasks = [
-            Task(classify_documents),
-            Task(check_permissions_on_documents, user = user, permissions = ["write"]),
-            Task(extract_chunks_from_documents), # Extract text chunks based on the document type.
-            Task(add_data_points, task_config = { "batch_size": 10 }),
-            Task(extract_graph_from_code, graph_model = SourceCodeGraph, task_config = { "batch_size": 10 }), # Generate knowledge graphs from the document chunks.
-        ]
-
-        pipeline = run_tasks(tasks, data_documents, "code_graph_pipeline")
-
-        async for result in pipeline:
-            print(result)
-
-        send_telemetry("code_graph_pipeline EXECUTION COMPLETED", user.id)
-
-        await log_pipeline_status(dataset_id, PipelineRunStatus.DATASET_PROCESSING_COMPLETED, {
-            "dataset_name": dataset_name,
-            "files": document_ids_str,
-        })
-    except Exception as error:
-        send_telemetry("code_graph_pipeline EXECUTION ERRORED", user.id)
-
-        await log_pipeline_status(dataset_id, PipelineRunStatus.DATASET_PROCESSING_ERRORED, {
-            "dataset_name": dataset_name,
-            "files": document_ids_str,
-        })
-        raise error
-
-
-def generate_dataset_name(dataset_name: str) -> str:
-    return dataset_name.replace(".", "_").replace(" ", "_")
-
-
-async def run_code_graph_pipeline(repo_path):
+@observe
+async def run_code_graph_pipeline(repo_path, include_docs=True):
    import os
    import pathlib
+
    import cognee
    from cognee.infrastructure.databases.relational import create_db_and_tables

@ -135,6 +46,9 @@ async def run_code_graph_pipeline(repo_path):
    await cognee.prune.prune_system(metadata=True)
    await create_db_and_tables()

+    cognee_config = get_cognify_config()
+    user = await get_default_user()
+
    tasks = [
        Task(get_repo_file_dependencies),
        Task(enrich_dependency_graph, task_config={"batch_size": 50}),
@ -143,4 +57,24 @@ async def run_code_graph_pipeline(repo_path):
        Task(add_data_points, task_config={"batch_size": 50}),
    ]

-    return run_tasks(tasks, repo_path, "cognify_code_pipeline")
+    if include_docs:
+        non_code_tasks = [
+            Task(get_non_code_files, task_config={"batch_size": 50}),
+            Task(ingest_data_with_metadata, dataset_name="repo_docs", user=user),
+            Task(get_data_list_for_user, dataset_name="repo_docs", user=user),
+            Task(classify_documents),
+            Task(extract_chunks_from_documents),
+            Task(extract_graph_from_data, graph_model=KnowledgeGraph, task_config={"batch_size": 50}),
+            Task(
+                summarize_text,
+                summarization_model=cognee_config.summarization_model,
+                task_config={"batch_size": 50}
+            ),
+        ]
+
+    if include_docs:
+        async for result in run_tasks(non_code_tasks, repo_path):
+            yield result
+
+    async for result in run_tasks(tasks, repo_path, "cognify_code_pipeline"):
+        yield result
--- a/cognee/base_config.py
+++ b/cognee/base_config.py
@ -10,7 +10,9 @@ class BaseConfig(BaseSettings):
    monitoring_tool: object = MonitoringTool.LANGFUSE
    graphistry_username: Optional[str] = os.getenv("GRAPHISTRY_USERNAME")
    graphistry_password: Optional[str] = os.getenv("GRAPHISTRY_PASSWORD")
-
+    langfuse_public_key: Optional[str] = os.getenv("LANGFUSE_PUBLIC_KEY")
+    langfuse_secret_key: Optional[str] = os.getenv("LANGFUSE_SECRET_KEY")
+    langfuse_host: Optional[str] = os.getenv("LANGFUSE_HOST")
    model_config = SettingsConfigDict(env_file = ".env", extra = "allow")

    def to_dict(self) -> dict:
--- a/cognee/infrastructure/llm/openai/adapter.py
+++ b/cognee/infrastructure/llm/openai/adapter.py
@ -6,26 +6,31 @@ from typing import Type
 import litellm
 import instructor
 from pydantic import BaseModel
-
+from cognee.shared.data_models import MonitoringTool
 from cognee.exceptions import InvalidValueError
 from cognee.infrastructure.llm.llm_interface import LLMInterface
 from cognee.infrastructure.llm.prompts import read_query_prompt
+from cognee.base_config import get_base_config
+
+if MonitoringTool.LANGFUSE:
+    from langfuse.decorators import observe

 class OpenAIAdapter(LLMInterface):
    name = "OpenAI"
    model: str
    api_key: str
    api_version: str
-  
+
    """Adapter for OpenAI's GPT-3, GPT=4 API"""
+
    def __init__(
-        self,
-        api_key: str,
-        endpoint: str,
-        api_version: str,
-        model: str,
-        transcription_model: str,
-        streaming: bool = False,
+            self,
+            api_key: str,
+            endpoint: str,
+            api_version: str,
+            model: str,
+            transcription_model: str,
+            streaming: bool = False,
    ):
        self.aclient = instructor.from_litellm(litellm.acompletion)
        self.client = instructor.from_litellm(litellm.completion)
@ -35,13 +40,18 @@ class OpenAIAdapter(LLMInterface):
        self.endpoint = endpoint
        self.api_version = api_version
        self.streaming = streaming
+        base_config = get_base_config()
+
+
+    @observe()
+    async def acreate_structured_output(self, text_input: str, system_prompt: str,
+                                        response_model: Type[BaseModel]) -> BaseModel:

-    async def acreate_structured_output(self, text_input: str, system_prompt: str, response_model: Type[BaseModel]) -> BaseModel:
        """Generate a response from a user query."""

        return await self.aclient.chat.completions.create(
-            model = self.model,
-            messages = [{
+            model=self.model,
+            messages=[{
                "role": "user",
                "content": f"""Use the given format to
                extract information from the following input: {text_input}. """,
@ -49,19 +59,21 @@ class OpenAIAdapter(LLMInterface):
                "role": "system",
                "content": system_prompt,
            }],
-            api_key = self.api_key,
-            api_base = self.endpoint,
-            api_version = self.api_version,
-            response_model = response_model,
-            max_retries = 5,
+            api_key=self.api_key,
+            api_base=self.endpoint,
+            api_version=self.api_version,
+            response_model=response_model,
+            max_retries=5,
        )

-    def create_structured_output(self, text_input: str, system_prompt: str, response_model: Type[BaseModel]) -> BaseModel:
+    @observe
+    def create_structured_output(self, text_input: str, system_prompt: str,
+                                 response_model: Type[BaseModel]) -> BaseModel:
        """Generate a response from a user query."""

        return self.client.chat.completions.create(
-            model = self.model,
-            messages = [{
+            model=self.model,
+            messages=[{
                "role": "user",
                "content": f"""Use the given format to
                extract information from the following input: {text_input}. """,
@ -69,11 +81,11 @@ class OpenAIAdapter(LLMInterface):
                "role": "system",
                "content": system_prompt,
            }],
-            api_key = self.api_key,
-            api_base = self.endpoint,
-            api_version = self.api_version,
-            response_model = response_model,
-            max_retries = 5,
+            api_key=self.api_key,
+            api_base=self.endpoint,
+            api_version=self.api_version,
+            response_model=response_model,
+            max_retries=5,
        )

    def create_transcript(self, input):
@ -86,12 +98,12 @@ class OpenAIAdapter(LLMInterface):
        #     audio_data = audio_file.read()

        transcription = litellm.transcription(
-            model = self.transcription_model,
-            file = Path(input),
+            model=self.transcription_model,
+            file=Path(input),
            api_key=self.api_key,
            api_base=self.endpoint,
            api_version=self.api_version,
-            max_retries = 5,
+            max_retries=5,
        )

        return transcription
@ -101,8 +113,8 @@ class OpenAIAdapter(LLMInterface):
            encoded_image = base64.b64encode(image_file.read()).decode('utf-8')

        return litellm.completion(
-            model = self.model,
-            messages = [{
+            model=self.model,
+            messages=[{
                "role": "user",
                "content": [
                    {
@ -119,8 +131,8 @@ class OpenAIAdapter(LLMInterface):
            api_key=self.api_key,
            api_base=self.endpoint,
            api_version=self.api_version,
-            max_tokens = 300,
-            max_retries = 5,
+            max_tokens=300,
+            max_retries=5,
        )

    def show_prompt(self, text_input: str, system_prompt: str) -> str:
@ -132,4 +144,4 @@ class OpenAIAdapter(LLMInterface):
        system_prompt = read_query_prompt(system_prompt)

        formatted_prompt = f"""System Prompt:\n{system_prompt}\n\nUser Input:\n{text_input}\n""" if system_prompt else None
-        return formatted_prompt
+        return formatted_prompt
--- a/cognee/modules/data/extraction/extract_summary.py
+++ b/cognee/modules/data/extraction/extract_summary.py
@ -1,10 +1,11 @@
 from typing import Type
-
+import os
 from pydantic import BaseModel

 from cognee.infrastructure.llm.get_llm_client import get_llm_client
 from cognee.infrastructure.llm.prompts import read_query_prompt
-from cognee.shared.data_models import SummarizedCode
+from cognee.shared.data_models import SummarizedCode, SummarizedClass, SummarizedFunction
+from cognee.tasks.summarization.mock_summary import get_mock_summarized_code


 async def extract_summary(content: str, response_model: Type[BaseModel]):
@ -17,5 +18,14 @@ async def extract_summary(content: str, response_model: Type[BaseModel]):
    return llm_output

 async def extract_code_summary(content: str):
-    
-    return await extract_summary(content, response_model=SummarizedCode)
+    enable_mocking = os.getenv("MOCK_CODE_SUMMARY", "false")
+    if isinstance(enable_mocking, bool):
+        enable_mocking = str(enable_mocking).lower()
+    enable_mocking = enable_mocking in ("true", "1", "yes")
+
+    if enable_mocking:
+        result = get_mock_summarized_code()
+        return result
+    else:
+        result = await extract_summary(content, response_model=SummarizedCode)
+        return result
--- a/cognee/tasks/repo_processor/init.py
+++ b/cognee/tasks/repo_processor/init.py
@ -4,4 +4,5 @@ logger = logging.getLogger("task:repo_processor")

 from .enrich_dependency_graph import enrich_dependency_graph
 from .expand_dependency_graph import expand_dependency_graph
+from .get_non_code_files import get_data_list_for_user, get_non_py_files
 from .get_repo_file_dependencies import get_repo_file_dependencies
--- a/cognee/tasks/repo_processor/get_non_code_files.py
+++ b/cognee/tasks/repo_processor/get_non_code_files.py
@ -0,0 +1,48 @@
+import os
+
+import aiofiles
+
+import cognee.modules.ingestion as ingestion
+from cognee.infrastructure.engine import DataPoint
+from cognee.modules.data.methods import get_datasets
+from cognee.modules.data.methods.get_dataset_data import get_dataset_data
+from cognee.modules.data.methods.get_datasets_by_name import \
+    get_datasets_by_name
+from cognee.modules.data.models import Data
+from cognee.modules.data.operations.write_metadata import write_metadata
+from cognee.modules.ingestion.data_types import BinaryData
+from cognee.modules.users.methods import get_default_user
+from cognee.shared.CodeGraphEntities import Repository
+
+
+async def get_non_py_files(repo_path):
+    """Get the paths of files under repo_path that are not .py files"""
+    if not os.path.exists(repo_path):
+        return {}
+
+    IGNORED_PATTERNS = {
+        '.git', '__pycache__', '*.pyc', '*.pyo', '*.pyd',
+        'node_modules', '*.egg-info'
+    }
+
+    def should_process(path):
+        return not any(pattern in path for pattern in IGNORED_PATTERNS)
+
+    non_py_files_paths = [
+        os.path.join(root, file)
+        for root, _, files in os.walk(repo_path) for file in files 
+        if not file.endswith(".py") and should_process(os.path.join(root, file))
+    ]
+    return non_py_files_paths
+
+
+async def get_data_list_for_user(_, dataset_name, user):
+    # Note: This method is meant to be used as a Task in a pipeline.
+    # By the nature of pipelines, the output of the previous Task will be passed as the first argument here,
+    # but it is not needed here, hence the "_" input.
+    datasets = await get_datasets_by_name(dataset_name, user.id)
+    data_documents: list[Data] = []
+    for dataset in datasets:
+        data_docs: list[Data] = await get_dataset_data(dataset_id=dataset.id)
+        data_documents.extend(data_docs)
+    return data_documents
--- a/cognee/tasks/repo_processor/get_repo_file_dependencies.py
+++ b/cognee/tasks/repo_processor/get_repo_file_dependencies.py
@ -73,7 +73,7 @@ async def get_repo_file_dependencies(repo_path: str) -> AsyncGenerator[list, Non

    yield repo

-    with ProcessPoolExecutor() as executor:
+    with ProcessPoolExecutor(max_workers = 12) as executor:
        loop = asyncio.get_event_loop()

        tasks = [
--- a/cognee/tasks/summarization/mock_summary.py
+++ b/cognee/tasks/summarization/mock_summary.py
@ -0,0 +1,37 @@
+from cognee.shared.data_models import SummarizedCode, SummarizedClass, SummarizedFunction
+
+def get_mock_summarized_code() -> SummarizedCode:
+    return SummarizedCode(
+        file_name="mock_file.py",
+        high_level_summary="This is a mock high-level summary.",
+        key_features=["Mock feature 1", "Mock feature 2"],
+        imports=["mock_import1", "mock_import2"],
+        constants=["MOCK_CONSTANT = 'mock_value'"],
+        classes=[
+            SummarizedClass(
+                name="MockClass",
+                description="This is a mock description of the MockClass.",
+                methods=[
+                    SummarizedFunction(
+                        name="mock_method",
+                        description="This is a description of the mock method.",
+                        docstring="This is a mock method.",
+                        inputs=["mock_input: str"],
+                        outputs=["mock_output: str"],
+                        decorators=None,
+                    )
+                ],
+            )
+        ],
+        functions=[
+            SummarizedFunction(
+                name="mock_function",
+                description="This is a description of the mock function.",
+                docstring="This is a mock function.",
+                inputs=["mock_input: str"],
+                outputs=["mock_output: str"],
+                decorators=None,
+            )
+        ],
+        workflow_description="This is a mock workflow description.",
+    )
--- a/cognee/tests/test_code_generation.py
+++ b/cognee/tests/test_code_generation.py
@ -1,38 +0,0 @@
-import os
-import logging
-import pathlib
-import cognee
-from cognee.api.v1.cognify.code_graph_pipeline import code_graph_pipeline
-from cognee.api.v1.search import SearchType
-from cognee.shared.utils import render_graph
-
-logging.basicConfig(level = logging.DEBUG)
-
-async def  main():
-    data_directory_path = str(pathlib.Path(os.path.join(pathlib.Path(__file__).parent, ".data_storage/test_code_generation")).resolve())
-    cognee.config.data_root_directory(data_directory_path)
-    cognee_directory_path = str(pathlib.Path(os.path.join(pathlib.Path(__file__).parent, ".cognee_system/test_code_generation")).resolve())
-    cognee.config.system_root_directory(cognee_directory_path)
-
-    await cognee.prune.prune_data()
-    await cognee.prune.prune_system(metadata = True)
-
-    dataset_name = "artificial_intelligence"
-
-    ai_text_file_path = os.path.join(pathlib.Path(__file__).parent, "test_data/code.txt")
-    await cognee.add([ai_text_file_path], dataset_name)
-
-    await code_graph_pipeline([dataset_name])
-
-    await render_graph(None, include_nodes = True, include_labels = True)
-
-    search_results = await cognee.search(SearchType.CHUNKS, query_text = "Student")
-    assert len(search_results) != 0, "The search results list is empty."
-    print("\n\nExtracted chunks are:\n")
-    for result in search_results:
-        print(f"{result}\n")
-
-
-if __name__ == "__main__":
-    import asyncio
-    asyncio.run(main(), debug=True)
--- a/examples/python/code_graph_example.py
+++ b/examples/python/code_graph_example.py
@ -1,15 +1,16 @@
 import argparse
 import asyncio
+
 from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline


-async def main(repo_path):
-    async for result in await run_code_graph_pipeline(repo_path):
+async def main(repo_path, include_docs):
+    async for result in run_code_graph_pipeline(repo_path, include_docs):
        print(result)

 if __name__ == "__main__":
    parser = argparse.ArgumentParser()
-    parser.add_argument("--repo-path", type=str, required=True, help="Path to the repository")
+    parser.add_argument("--repo_path", type=str, required=True, help="Path to the repository")
+    parser.add_argument("--include_docs", type=bool, default=True, help="Whether or not to process non-code files")
    args = parser.parse_args()
-    asyncio.run(main(args.repo_path))
-
+    asyncio.run(main(args.repo_path, args.include_docs))
--- a/notebooks/cognee_code_graph_demo.ipynb
+++ b/notebooks/cognee_code_graph_demo.ipynb
@ -104,12 +104,58 @@
    "await render_graph(None, include_nodes = True, include_labels = True)"
   ]
  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "# Let's check the evaluations"
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null,
+   "source": [
+    "from evals.eval_on_hotpot import eval_on_hotpotQA\n",
+    "from evals.eval_on_hotpot import answer_with_cognee\n",
+    "from evals.eval_on_hotpot import answer_without_cognee\n",
+    "from evals.eval_on_hotpot import eval_answers\n",
+    "from cognee.base_config import get_base_config\n",
+    "from pathlib import Path\n",
+    "from tqdm import tqdm\n",
+    "import wget\n",
+    "import json\n",
+    "import statistics"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
-   "source": []
+   "source": [
+    "answer_provider = answer_with_cognee # For native LLM answers use answer_without_cognee\n",
+    "num_samples = 10 # With cognee, it takes ~1m10s per sample\n",
+    "\n",
+    "base_config = get_base_config()\n",
+    "data_root_dir = base_config.data_root_directory\n",
+    "\n",
+    "if not Path(data_root_dir).exists():\n",
+    "    Path(data_root_dir).mkdir()\n",
+    "\n",
+    "filepath = data_root_dir / Path(\"hotpot_dev_fullwiki_v1.json\")\n",
+    "if not filepath.exists():\n",
+    "    url = 'http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_fullwiki_v1.json'\n",
+    "    wget.download(url, out=data_root_dir)\n",
+    "\n",
+    "with open(filepath, \"r\") as file:\n",
+    "    dataset = json.load(file)\n",
+    "\n",
+    "instances = dataset if not num_samples else dataset[:num_samples]\n",
+    "answers = []\n",
+    "for instance in tqdm(instances, desc=\"Getting answers\"):\n",
+    "    answer = answer_provider(instance)\n",
+    "    answers.append(answer)"
+   ]
  }
 ],
 "metadata": {
--- a/notebooks/cognee_hotpot_eval.ipynb
+++ b/notebooks/cognee_hotpot_eval.ipynb
@ -0,0 +1,215 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Evaluation on the hotpotQA dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from evals.eval_on_hotpot import eval_on_hotpotQA\n",
+    "from evals.eval_on_hotpot import answer_with_cognee\n",
+    "from evals.eval_on_hotpot import answer_without_cognee\n",
+    "from evals.eval_on_hotpot import eval_answers\n",
+    "from cognee.base_config import get_base_config\n",
+    "from pathlib import Path\n",
+    "from tqdm import tqdm\n",
+    "import wget\n",
+    "import json\n",
+    "import statistics"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Getting the answers for the first num_samples questions of the dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "answer_provider = answer_with_cognee # For native LLM answers use answer_without_cognee\n",
+    "num_samples = 10 # With cognee, it takes ~1m10s per sample\n",
+    "\n",
+    "base_config = get_base_config()\n",
+    "data_root_dir = base_config.data_root_directory\n",
+    "\n",
+    "if not Path(data_root_dir).exists():\n",
+    "    Path(data_root_dir).mkdir()\n",
+    "\n",
+    "filepath = data_root_dir / Path(\"hotpot_dev_fullwiki_v1.json\")\n",
+    "if not filepath.exists():\n",
+    "    url = 'http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_fullwiki_v1.json'\n",
+    "    wget.download(url, out=data_root_dir)\n",
+    "\n",
+    "with open(filepath, \"r\") as file:\n",
+    "    dataset = json.load(file)\n",
+    "\n",
+    "instances = dataset if not num_samples else dataset[:num_samples]\n",
+    "answers = []\n",
+    "for instance in tqdm(instances, desc=\"Getting answers\"):\n",
+    "    answer = await answer_provider(instance)\n",
+    "    answers.append(answer)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Calculating the official HotpotQA benchmark metrics: F1 score and EM"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from evals.deepeval_metrics import f1_score_metric\n",
+    "from evals.deepeval_metrics import em_score_metric"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "f1_metric = f1_score_metric()\n",
+    "eval_results = await eval_answers(instances, answers, f1_metric)\n",
+    "avg_f1_score = statistics.mean([result.metrics_data[0].score for result in eval_results.test_results])\n",
+    "print(\"F1 score: \", avg_f1_score)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "em_metric = em_score_metric()\n",
+    "eval_results = await eval_answers(instances, answers, em_metric)\n",
+    "avg_em_score = statistics.mean([result.metrics_data[0].score for result in eval_results.test_results])\n",
+    "print(\"EM score: \", avg_em_score)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Calculating a custom metric called Correctness\n",
+    "##### Correctness is judged by an LLM"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from evals.deepeval_metrics import correctness_metric"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "eval_results = await eval_answers(instances, answers, correctness_metric) # note that instantiation is not needed for correctness metric as it is already an instance\n",
+    "avg_correctness_score = statistics.mean([result.metrics_data[0].score for result in eval_results.test_results])\n",
+    "print(\"Correctness score: \", avg_correctness_score)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Using a metric from Deepeval"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from deepeval.metrics import AnswerRelevancyMetric"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "relevancy_metric = AnswerRelevancyMetric()\n",
+    "eval_results = await eval_answers(instances, answers, relevancy_metric) # relevancy_metric is the AnswerRelevancyMetric instance created above\n",
+    "avg_relevancy_score = statistics.mean([result.metrics_data[0].score for result in eval_results.test_results])\n",
+    "print(\"Relevancy score: \", avg_relevancy_score)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Answering and eval in one step"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "answer_provider = answer_without_cognee\n",
+    "f1_metric = f1_score_metric()\n",
+    "f1_score =  await eval_on_hotpotQA(answer_provider, num_samples=10, eval_metric=f1_metric) # takes ~1m10s per sample\n",
+    "print(\"F1 score: \", f1_score)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "myenv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.20"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/notebooks/hr_demo.ipynb
+++ b/notebooks/hr_demo.ipynb
@ -0,0 +1,710 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "d35ac8ce-0f92-46f5-9ba4-a46970f0ce19",
+   "metadata": {},
+   "source": [
+    "# Cognee - Get Started"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "074f0ea8-c659-4736-be26-be4b0e5ac665",
+   "metadata": {},
+   "source": [
+    "# Demo time"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0587d91d",
+   "metadata": {},
+   "source": [
+    "#### First let's define some data that we will cognify and perform a search on"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "df16431d0f48b006",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-09-20T14:02:48.519686Z",
+     "start_time": "2024-09-20T14:02:48.515589Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "job_position = \"\"\"Senior Data Scientist (Machine Learning)\n",
+    "\n",
+    "Company: TechNova Solutions\n",
+    "Location: San Francisco, CA\n",
+    "\n",
+    "Job Description:\n",
+    "\n",
+    "TechNova Solutions is seeking a Senior Data Scientist specializing in Machine Learning to join our dynamic analytics team. The ideal candidate will have a strong background in developing and deploying machine learning models, working with large datasets, and translating complex data into actionable insights.\n",
+    "\n",
+    "Responsibilities:\n",
+    "\n",
+    "Develop and implement advanced machine learning algorithms and models.\n",
+    "Analyze large, complex datasets to extract meaningful patterns and insights.\n",
+    "Collaborate with cross-functional teams to integrate predictive models into products.\n",
+    "Stay updated with the latest advancements in machine learning and data science.\n",
+    "Mentor junior data scientists and provide technical guidance.\n",
+    "Qualifications:\n",
+    "\n",
+    "Master’s or Ph.D. in Data Science, Computer Science, Statistics, or a related field.\n",
+    "5+ years of experience in data science and machine learning.\n",
+    "Proficient in Python, R, and SQL.\n",
+    "Experience with deep learning frameworks (e.g., TensorFlow, PyTorch).\n",
+    "Strong problem-solving skills and attention to detail.\n",
+    "Candidate CVs\n",
+    "\"\"\"\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9086abf3af077ab4",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-09-20T14:02:49.120838Z",
+     "start_time": "2024-09-20T14:02:49.118294Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "job_1 = \"\"\"\n",
+    "CV 1: Relevant\n",
+    "Name: Dr. Emily Carter\n",
+    "Contact Information:\n",
+    "\n",
+    "Email: emily.carter@example.com\n",
+    "Phone: (555) 123-4567\n",
+    "Summary:\n",
+    "\n",
+    "Senior Data Scientist with over 8 years of experience in machine learning and predictive analytics. Expertise in developing advanced algorithms and deploying scalable models in production environments.\n",
+    "\n",
+    "Education:\n",
+    "\n",
+    "Ph.D. in Computer Science, Stanford University (2014)\n",
+    "B.S. in Mathematics, University of California, Berkeley (2010)\n",
+    "Experience:\n",
+    "\n",
+    "Senior Data Scientist, InnovateAI Labs (2016 – Present)\n",
+    "Led a team in developing machine learning models for natural language processing applications.\n",
+    "Implemented deep learning algorithms that improved prediction accuracy by 25%.\n",
+    "Collaborated with cross-functional teams to integrate models into cloud-based platforms.\n",
+    "Data Scientist, DataWave Analytics (2014 – 2016)\n",
+    "Developed predictive models for customer segmentation and churn analysis.\n",
+    "Analyzed large datasets using Hadoop and Spark frameworks.\n",
+    "Skills:\n",
+    "\n",
+    "Programming Languages: Python, R, SQL\n",
+    "Machine Learning: TensorFlow, Keras, Scikit-Learn\n",
+    "Big Data Technologies: Hadoop, Spark\n",
+    "Data Visualization: Tableau, Matplotlib\n",
+    "\"\"\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a9de0cc07f798b7f",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-09-20T14:02:49.675003Z",
+     "start_time": "2024-09-20T14:02:49.671615Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "job_2 = \"\"\"\n",
+    "CV 2: Relevant\n",
+    "Name: Michael Rodriguez\n",
+    "Contact Information:\n",
+    "\n",
+    "Email: michael.rodriguez@example.com\n",
+    "Phone: (555) 234-5678\n",
+    "Summary:\n",
+    "\n",
+    "Data Scientist with a strong background in machine learning and statistical modeling. Skilled in handling large datasets and translating data into actionable business insights.\n",
+    "\n",
+    "Education:\n",
+    "\n",
+    "M.S. in Data Science, Carnegie Mellon University (2013)\n",
+    "B.S. in Computer Science, University of Michigan (2011)\n",
+    "Experience:\n",
+    "\n",
+    "Senior Data Scientist, Alpha Analytics (2017 – Present)\n",
+    "Developed machine learning models to optimize marketing strategies.\n",
+    "Reduced customer acquisition cost by 15% through predictive modeling.\n",
+    "Data Scientist, TechInsights (2013 – 2017)\n",
+    "Analyzed user behavior data to improve product features.\n",
+    "Implemented A/B testing frameworks to evaluate product changes.\n",
+    "Skills:\n",
+    "\n",
+    "Programming Languages: Python, Java, SQL\n",
+    "Machine Learning: Scikit-Learn, XGBoost\n",
+    "Data Visualization: Seaborn, Plotly\n",
+    "Databases: MySQL, MongoDB\n",
+    "\"\"\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "185ff1c102d06111",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-09-20T14:02:50.286828Z",
+     "start_time": "2024-09-20T14:02:50.284369Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "job_3 = \"\"\"\n",
+    "CV 3: Relevant\n",
+    "Name: Sarah Nguyen\n",
+    "Contact Information:\n",
+    "\n",
+    "Email: sarah.nguyen@example.com\n",
+    "Phone: (555) 345-6789\n",
+    "Summary:\n",
+    "\n",
+    "Data Scientist specializing in machine learning with 6 years of experience. Passionate about leveraging data to drive business solutions and improve product performance.\n",
+    "\n",
+    "Education:\n",
+    "\n",
+    "M.S. in Statistics, University of Washington (2014)\n",
+    "B.S. in Applied Mathematics, University of Texas at Austin (2012)\n",
+    "Experience:\n",
+    "\n",
+    "Data Scientist, QuantumTech (2016 – Present)\n",
+    "Designed and implemented machine learning algorithms for financial forecasting.\n",
+    "Improved model efficiency by 20% through algorithm optimization.\n",
+    "Junior Data Scientist, DataCore Solutions (2014 – 2016)\n",
+    "Assisted in developing predictive models for supply chain optimization.\n",
+    "Conducted data cleaning and preprocessing on large datasets.\n",
+    "Skills:\n",
+    "\n",
+    "Programming Languages: Python, R\n",
+    "Machine Learning Frameworks: PyTorch, Scikit-Learn\n",
+    "Statistical Analysis: SAS, SPSS\n",
+    "Cloud Platforms: AWS, Azure\n",
+    "\"\"\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d55ce4c58f8efb67",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-09-20T14:02:50.950343Z",
+     "start_time": "2024-09-20T14:02:50.946378Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "job_4 = \"\"\"\n",
+    "CV 4: Not Relevant\n",
+    "Name: David Thompson\n",
+    "Contact Information:\n",
+    "\n",
+    "Email: david.thompson@example.com\n",
+    "Phone: (555) 456-7890\n",
+    "Summary:\n",
+    "\n",
+    "Creative Graphic Designer with over 8 years of experience in visual design and branding. Proficient in Adobe Creative Suite and passionate about creating compelling visuals.\n",
+    "\n",
+    "Education:\n",
+    "\n",
+    "B.F.A. in Graphic Design, Rhode Island School of Design (2012)\n",
+    "Experience:\n",
+    "\n",
+    "Senior Graphic Designer, CreativeWorks Agency (2015 – Present)\n",
+    "Led design projects for clients in various industries.\n",
+    "Created branding materials that increased client engagement by 30%.\n",
+    "Graphic Designer, Visual Innovations (2012 – 2015)\n",
+    "Designed marketing collateral, including brochures, logos, and websites.\n",
+    "Collaborated with the marketing team to develop cohesive brand strategies.\n",
+    "Skills:\n",
+    "\n",
+    "Design Software: Adobe Photoshop, Illustrator, InDesign\n",
+    "Web Design: HTML, CSS\n",
+    "Specialties: Branding and Identity, Typography\n",
+    "\"\"\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ca4ecc32721ad332",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-09-20T14:02:51.548191Z",
+     "start_time": "2024-09-20T14:02:51.545520Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "job_5 = \"\"\"\n",
+    "CV 5: Not Relevant\n",
+    "Name: Jessica Miller\n",
+    "Contact Information:\n",
+    "\n",
+    "Email: jessica.miller@example.com\n",
+    "Phone: (555) 567-8901\n",
+    "Summary:\n",
+    "\n",
+    "Experienced Sales Manager with a strong track record in driving sales growth and building high-performing teams. Excellent communication and leadership skills.\n",
+    "\n",
+    "Education:\n",
+    "\n",
+    "B.A. in Business Administration, University of Southern California (2010)\n",
+    "Experience:\n",
+    "\n",
+    "Sales Manager, Global Enterprises (2015 – Present)\n",
+    "Managed a sales team of 15 members, achieving a 20% increase in annual revenue.\n",
+    "Developed sales strategies that expanded customer base by 25%.\n",
+    "Sales Representative, Market Leaders Inc. (2010 – 2015)\n",
+    "Consistently exceeded sales targets and received the 'Top Salesperson' award in 2013.\n",
+    "Skills:\n",
+    "\n",
+    "Sales Strategy and Planning\n",
+    "Team Leadership and Development\n",
+    "CRM Software: Salesforce, Zoho\n",
+    "Negotiation and Relationship Building\n",
+    "\"\"\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4415446a",
+   "metadata": {},
+   "source": [
+    "#### Please add the necessary environment information below:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bce39dc6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "# Setting environment variables\n",
+    "if \"GRAPHISTRY_USERNAME\" not in os.environ: \n",
+    "    os.environ[\"GRAPHISTRY_USERNAME\"] = \"\"\n",
+    "\n",
+    "if \"GRAPHISTRY_PASSWORD\" not in os.environ: \n",
+    "    os.environ[\"GRAPHISTRY_PASSWORD\"] = \"\"\n",
+    "\n",
+    "if \"LLM_API_KEY\" not in os.environ:\n",
+    "    os.environ[\"LLM_API_KEY\"] = \"\"\n",
+    "\n",
+    "# \"neo4j\" or \"networkx\"\n",
+    "os.environ[\"GRAPH_DATABASE_PROVIDER\"]=\"networkx\" \n",
+    "# Not needed if using networkx\n",
+    "#os.environ[\"GRAPH_DATABASE_URL\"]=\"\"\n",
+    "#os.environ[\"GRAPH_DATABASE_USERNAME\"]=\"\"\n",
+    "#os.environ[\"GRAPH_DATABASE_PASSWORD\"]=\"\"\n",
+    "\n",
+    "# \"pgvector\", \"qdrant\", \"weaviate\" or \"lancedb\"\n",
+    "os.environ[\"VECTOR_DB_PROVIDER\"]=\"lancedb\" \n",
+    "# Not needed if using \"lancedb\" or \"pgvector\"\n",
+    "# os.environ[\"VECTOR_DB_URL\"]=\"\"\n",
+    "# os.environ[\"VECTOR_DB_KEY\"]=\"\"\n",
+    "\n",
+    "# Relational Database provider \"sqlite\" or \"postgres\"\n",
+    "os.environ[\"DB_PROVIDER\"]=\"sqlite\"\n",
+    "\n",
+    "# Database name\n",
+    "os.environ[\"DB_NAME\"]=\"cognee_db\"\n",
+    "\n",
+    "# Postgres specific parameters (Only if Postgres or PGVector is used)\n",
+    "# os.environ[\"DB_HOST\"]=\"127.0.0.1\"\n",
+    "# os.environ[\"DB_PORT\"]=\"5432\"\n",
+    "# os.environ[\"DB_USERNAME\"]=\"cognee\"\n",
+    "# os.environ[\"DB_PASSWORD\"]=\"cognee\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9f1a1dbd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Reset the cognee system with the following command:\n",
+    "\n",
+    "import cognee\n",
+    "\n",
+    "await cognee.prune.prune_data()\n",
+    "await cognee.prune.prune_system(metadata=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "383d6971",
+   "metadata": {},
+   "source": [
+    "#### After we have defined and gathered our data let's add it to cognee "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "904df61ba484a8e5",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-09-20T14:02:54.243987Z",
+     "start_time": "2024-09-20T14:02:52.498195Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import cognee\n",
+    "\n",
+    "await cognee.add([job_1, job_2, job_3, job_4, job_5, job_position], \"example\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0f15c5b1",
+   "metadata": {},
+   "source": [
+    "#### All good, let's cognify it."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7c431fdef4921ae0",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-09-20T14:02:57.925667Z",
+     "start_time": "2024-09-20T14:02:57.922353Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from cognee.shared.data_models import KnowledgeGraph\n",
+    "from cognee.modules.data.models import Dataset, Data\n",
+    "from cognee.modules.data.methods.get_dataset_data import get_dataset_data\n",
+    "from cognee.modules.cognify.config import get_cognify_config\n",
+    "from cognee.modules.pipelines.tasks.Task import Task\n",
+    "from cognee.modules.pipelines import run_tasks\n",
+    "from cognee.modules.users.models import User\n",
+    "from cognee.tasks.documents import check_permissions_on_documents, classify_documents, extract_chunks_from_documents\n",
+    "from cognee.tasks.graph import extract_graph_from_data\n",
+    "from cognee.tasks.storage import add_data_points\n",
+    "from cognee.tasks.summarization import summarize_text\n",
+    "\n",
+    "async def run_cognify_pipeline(dataset: Dataset, user: User = None):\n",
+    "    data_documents: list[Data] = await get_dataset_data(dataset_id = dataset.id)\n",
+    "\n",
+    "    try:\n",
+    "        cognee_config = get_cognify_config()\n",
+    "\n",
+    "        tasks = [\n",
+    "            Task(classify_documents),\n",
+    "            Task(check_permissions_on_documents, user = user, permissions = [\"write\"]),\n",
+    "            Task(extract_chunks_from_documents), # Extract text chunks based on the document type.\n",
+    "            Task(extract_graph_from_data, graph_model = KnowledgeGraph, task_config = { \"batch_size\": 10 }), # Generate knowledge graphs from the document chunks.\n",
+    "            Task(\n",
+    "                summarize_text,\n",
+    "                summarization_model = cognee_config.summarization_model,\n",
+    "                task_config = { \"batch_size\": 10 }\n",
+    "            ),\n",
+    "            Task(add_data_points, task_config = { \"batch_size\": 10 }),\n",
+    "        ]\n",
+    "\n",
+    "        pipeline = run_tasks(tasks, data_documents)\n",
+    "\n",
+    "        async for result in pipeline:\n",
+    "            print(result)\n",
+    "    except Exception as error:\n",
+    "        raise error\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f0a91b99c6215e09",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-09-20T14:02:58.905774Z",
+     "start_time": "2024-09-20T14:02:58.625915Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from cognee.modules.users.methods import get_default_user\n",
+    "from cognee.modules.data.methods import get_datasets_by_name\n",
+    "\n",
+    "user = await get_default_user()\n",
+    "\n",
+    "datasets = await get_datasets_by_name([\"example\"], user.id)\n",
+    "\n",
+    "await run_cognify_pipeline(datasets[0], user)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "219a6d41",
+   "metadata": {},
+   "source": [
+    "#### We get the URL to the graph on Graphistry in the notebook cell below, showing nodes and connections made by the cognify process."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "080389e5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "from cognee.shared.utils import render_graph\n",
+    "from cognee.infrastructure.databases.graph import get_graph_engine\n",
+    "import graphistry\n",
+    "\n",
+    "graphistry.login(username=os.getenv(\"GRAPHISTRY_USERNAME\"), password=os.getenv(\"GRAPHISTRY_PASSWORD\"))\n",
+    "\n",
+    "graph_engine = await get_graph_engine()\n",
+    "\n",
+    "graph_url = await render_graph(graph_engine.graph)\n",
+    "print(graph_url)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "59e6c3c3",
+   "metadata": {},
+   "source": [
+    "#### We can also do a search on the data to explore the knowledge."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e5e7dfc8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "async def search(\n",
+    "    vector_engine,\n",
+    "    collection_name: str,\n",
+    "    query_text: str = None,\n",
+    "):\n",
+    "    query_vector = (await vector_engine.embedding_engine.embed_text([query_text]))[0]\n",
+    "\n",
+    "    connection = await vector_engine.get_connection()\n",
+    "    collection = await connection.open_table(collection_name)\n",
+    "\n",
+    "    results = await collection.vector_search(query_vector).limit(10).to_pandas()\n",
+    "\n",
+    "    result_values = list(results.to_dict(\"index\").values())\n",
+    "\n",
+    "    return [dict(\n",
+    "        id = str(result[\"id\"]),\n",
+    "        payload = result[\"payload\"],\n",
+    "        score = result[\"_distance\"],\n",
+    "    ) for result in result_values]\n",
+    "\n",
+    "\n",
+    "from cognee.infrastructure.databases.vector import get_vector_engine\n",
+    "\n",
+    "vector_engine = get_vector_engine()\n",
+    "results = await search(vector_engine, \"entity_name\", \"sarah.nguyen@example.com\")\n",
+    "for result in results:\n",
+    "    print(result)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "81fa2b00",
+   "metadata": {},
+   "source": [
+    "#### We normalize search output scores so that the lower a search result's score is, the higher the chance that it's what you're looking for. In the example above we searched for node entities in the knowledge graph related to \"sarah.nguyen@example.com\"."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1b94ff96",
+   "metadata": {},
+   "source": [
+    "#### In the example below we'll use cognee search to summarize information regarding the node most related to \"sarah.nguyen@example.com\" in the knowledge graph"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "21a3e9a6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from cognee.api.v1.search import SearchType\n",
+    "\n",
+    "node = (await vector_engine.search(\"entity_name\", \"sarah.nguyen@example.com\"))[0]\n",
+    "node_name = node.payload[\"text\"]\n",
+    "\n",
+    "search_results = await cognee.search(SearchType.SUMMARIES, query_text = node_name)\n",
+    "print(\"\\n\\nExtracted summaries are:\\n\")\n",
+    "for result in search_results:\n",
+    "    print(f\"{result}\\n\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fd6e5fe2",
+   "metadata": {},
+   "source": [
+    "#### In this example we'll use cognee search to find the chunks that the node most related to \"sarah.nguyen@example.com\" is a part of"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c7a8abff",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "search_results = await cognee.search(SearchType.CHUNKS, query_text = node_name)\n",
+    "print(\"\\n\\nExtracted chunks are:\\n\")\n",
+    "for result in search_results:\n",
+    "    print(f\"{result}\\n\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "47f0112f",
+   "metadata": {},
+   "source": [
+    "#### In this example we'll use cognee search to give us insights from the knowledge graph related to the node most related to \"sarah.nguyen@example.com\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "706a3954",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "search_results = await cognee.search(SearchType.INSIGHTS, query_text = node_name)\n",
+    "print(\"\\n\\nExtracted sentences are:\\n\")\n",
+    "for result in search_results:\n",
+    "    print(f\"{result}\\n\")"
+   ]
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "## Let's add evals",
+   "id": "e519e30c0423c2a"
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-12-19T18:01:11.387716Z",
+     "start_time": "2024-12-19T18:01:11.278042Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "from evals.eval_on_hotpot import eval_on_hotpotQA\n",
+    "from evals.eval_on_hotpot import answer_with_cognee\n",
+    "from evals.eval_on_hotpot import answer_without_cognee\n",
+    "from evals.eval_on_hotpot import eval_answers\n",
+    "from cognee.base_config import get_base_config\n",
+    "from pathlib import Path\n",
+    "from tqdm import tqdm\n",
+    "import wget\n",
+    "import json\n",
+    "import statistics\n"
+   ],
+   "id": "b22ae3d868fa5606",
+   "outputs": [
+    {
+     "ename": "ModuleNotFoundError",
+     "evalue": "No module named 'deepeval'",
+     "output_type": "error",
+     "traceback": [
+      "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
+      "\u001B[0;31mModuleNotFoundError\u001B[0m                       Traceback (most recent call last)",
+      "Cell \u001B[0;32mIn[3], line 1\u001B[0m\n\u001B[0;32m----> 1\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mevals\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01meval_on_hotpot\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m eval_on_hotpotQA\n\u001B[1;32m      2\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mevals\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01meval_on_hotpot\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m answer_with_cognee\n\u001B[1;32m      3\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mevals\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01meval_on_hotpot\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m answer_without_cognee\n",
+      "File \u001B[0;32m~/cognee/evals/eval_on_hotpot.py:7\u001B[0m\n\u001B[1;32m      4\u001B[0m \u001B[38;5;28;01mimport\u001B[39;00m \u001B[38;5;21;01mstatistics\u001B[39;00m\n\u001B[1;32m      5\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mpathlib\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Path\n\u001B[0;32m----> 7\u001B[0m \u001B[38;5;28;01mimport\u001B[39;00m \u001B[38;5;21;01mdeepeval\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mmetrics\u001B[39;00m\n\u001B[1;32m      8\u001B[0m \u001B[38;5;28;01mimport\u001B[39;00m \u001B[38;5;21;01mwget\u001B[39;00m\n\u001B[1;32m      9\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mdeepeval\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mdataset\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m EvaluationDataset\n",
+      "\u001B[0;31mModuleNotFoundError\u001B[0m: No module named 'deepeval'"
+     ]
+    }
+   ],
+   "execution_count": 3
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null,
+   "source": [
+    "answer_provider = answer_with_cognee # For native LLM answers use answer_without_cognee\n",
+    "num_samples = 10 # With cognee, it takes ~1m10s per sample\n",
+    "\n",
+    "base_config = get_base_config()\n",
+    "data_root_dir = base_config.data_root_directory\n",
+    "\n",
+    "if not Path(data_root_dir).exists():\n",
+    "    Path(data_root_dir).mkdir()\n",
+    "\n",
+    "filepath = data_root_dir / Path(\"hotpot_dev_fullwiki_v1.json\")\n",
+    "if not filepath.exists():\n",
+    "    url = 'http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_fullwiki_v1.json'\n",
+    "    wget.download(url, out=data_root_dir)\n",
+    "\n",
+    "with open(filepath, \"r\") as file:\n",
+    "    dataset = json.load(file)\n",
+    "\n",
+    "instances = dataset if not num_samples else dataset[:num_samples]\n",
+    "answers = []\n",
+    "for instance in tqdm(instances, desc=\"Getting answers\"):\n",
+    "    answer = answer_provider(instance)\n",
+    "    answers.append(answer)"
+   ],
+   "id": "728355d390e3a01b"
+  },
+  {
+   "cell_type": "markdown",
+   "id": "288ab570",
+   "metadata": {},
+   "source": [
+    "# Give us a star if you like it!\n",
+    "https://github.com/topoteretes/cognee"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@ -46,7 +46,7 @@ aiofiles = "^23.2.1"
 qdrant-client = {version = "^1.9.0", optional = true}
 graphistry = "^0.33.5"
 tenacity = "^8.4.1"
-weaviate-client = {version = "4.6.7", optional = true}
+weaviate-client = {version = "4.9.6", optional = true}
 scikit-learn = "^1.5.0"
 pypdf = "^4.1.0"
 neo4j = {version = "^5.20.0", optional = true}
@ -60,7 +60,7 @@ posthog = {version = "^3.5.0", optional = true}
 lancedb = "0.15.0"
 litellm = "1.49.1"
 groq = {version = "0.8.0", optional = true}
-langfuse = {version = "^2.32.0", optional = true}
+langfuse = "^2.32.0"
 pydantic-settings = "^2.2.1"
 anthropic = "^0.26.1"
 sentry-sdk = {extras = ["fastapi"], version = "^2.9.0"}
@ -75,6 +75,7 @@ transformers = "^4.46.3"
 pymilvus = {version = "^2.5.0", optional = true}
 unstructured = { extras = ["csv", "doc", "docx", "epub", "md", "odt", "org", "ppt", "pptx", "rst", "rtf", "tsv", "xlsx"], version = "^0.16.10", optional = true }
 pre-commit = "^4.0.1"
+httpx = "0.27.0"