From a2ff42332ed411512178ddc5f6ed798d5018db74 Mon Sep 17 00:00:00 2001
From: Leon Luithlen
Date: Thu, 28 Nov 2024 11:49:28 +0100
Subject: [PATCH 1/3] DummyLLMAdapter WIP

---
 .../infrastructure/llm/anthropic/adapter.py   | 15 ------
 .../llm/generic_llm_api/adapter.py            | 11 -----
 cognee/infrastructure/llm/llm_interface.py    | 14 +++++-
 cognee/infrastructure/llm/openai/adapter.py   | 12 -----
 .../llm/prompts/generate_cog_layers.txt       | 14 ------
 .../extraction/extract_cognitive_layers.py    | 11 -----
 profiling/util/DummyLLMAdapter.py             | 47 +++++++++++++++++++
 7 files changed, 59 insertions(+), 65 deletions(-)
 delete mode 100644 cognee/infrastructure/llm/prompts/generate_cog_layers.txt
 delete mode 100644 cognee/modules/data/extraction/extract_cognitive_layers.py
 create mode 100644 profiling/util/DummyLLMAdapter.py

diff --git a/cognee/infrastructure/llm/anthropic/adapter.py b/cognee/infrastructure/llm/anthropic/adapter.py
index 8df59e3e5..7577bc12f 100644
--- a/cognee/infrastructure/llm/anthropic/adapter.py
+++ b/cognee/infrastructure/llm/anthropic/adapter.py
@@ -4,7 +4,6 @@ import instructor
 from tenacity import retry, stop_after_attempt
 import anthropic
 from cognee.infrastructure.llm.llm_interface import LLMInterface
-from cognee.infrastructure.llm.prompts import read_query_prompt
 
 
 class AnthropicAdapter(LLMInterface):
@@ -38,17 +37,3 @@ class AnthropicAdapter(LLMInterface):
             }],
             response_model = response_model,
         )
-
-    def show_prompt(self, text_input: str, system_prompt: str) -> str:
-        """Format and display the prompt for a user query."""
-
-        if not text_input:
-            text_input = "No user input provided."
-        if not system_prompt:
-            raise ValueError("No system prompt path provided.")
-
-        system_prompt = read_query_prompt(system_prompt)
-
-        formatted_prompt = f"""System Prompt:\n{system_prompt}\n\nUser Input:\n{text_input}\n""" if system_prompt else None
-
-        return formatted_prompt
diff --git a/cognee/infrastructure/llm/generic_llm_api/adapter.py b/cognee/infrastructure/llm/generic_llm_api/adapter.py
index f65d559d5..5d1436ba0 100644
--- a/cognee/infrastructure/llm/generic_llm_api/adapter.py
+++ b/cognee/infrastructure/llm/generic_llm_api/adapter.py
@@ -6,7 +6,6 @@ import instructor
 from tenacity import retry, stop_after_attempt
 import openai
 from cognee.infrastructure.llm.llm_interface import LLMInterface
-from cognee.infrastructure.llm.prompts import read_query_prompt
 from cognee.shared.data_models import MonitoringTool
 from cognee.base_config import get_base_config
 from cognee.infrastructure.llm.config import get_llm_config
@@ -123,13 +122,3 @@ class GenericAPIAdapter(LLMInterface):
             response_model = response_model,
         )
 
-    def show_prompt(self, text_input: str, system_prompt: str) -> str:
-        """Format and display the prompt for a user query."""
-        if not text_input:
-            text_input = "No user input provided."
-        if not system_prompt:
-            raise ValueError("No system prompt path provided.")
-        system_prompt = read_query_prompt(system_prompt)
-
-        formatted_prompt = f"""System Prompt:\n{system_prompt}\n\nUser Input:\n{text_input}\n""" if system_prompt else None
-        return formatted_prompt
diff --git a/cognee/infrastructure/llm/llm_interface.py b/cognee/infrastructure/llm/llm_interface.py
index f0c6db133..069efb22d 100644
--- a/cognee/infrastructure/llm/llm_interface.py
+++ b/cognee/infrastructure/llm/llm_interface.py
@@ -3,6 +3,8 @@ from typing import Type, Protocol
 from abc import abstractmethod
 from pydantic import BaseModel
 
+from cognee.infrastructure.llm.prompts import read_query_prompt
+
 
 class LLMInterface(Protocol):
     """ LLM Interface """
@@ -16,5 +18,13 @@ class LLMInterface(Protocol):
 
     @abstractmethod
     def show_prompt(self, text_input: str, system_prompt: str) -> str:
-        """To get structured output, import/call this function"""
-        raise NotImplementedError
+        """Format and display the prompt for a user query."""
+        if not text_input:
+            text_input = "No user input provided."
+        if not system_prompt:
+            raise ValueError("No system prompt path provided.")
+        system_prompt = read_query_prompt(system_prompt)
+
+        formatted_prompt = f"""System Prompt:\n{system_prompt}\n\nUser Input:\n{text_input}\n"""
+
+        return formatted_prompt
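Review note: this default implementation replaces the three per-adapter copies of show_prompt that this patch deletes from the Anthropic, generic, and OpenAI adapters. One behavioral difference: the adapter copies returned None when read_query_prompt produced a falsy result (the trailing `if system_prompt else None`), while this version always returns the formatted string. A minimal usage sketch, assuming get_llm_client() returns an adapter implementing LLMInterface; the prompt file name below is hypothetical and must exist wherever read_query_prompt looks up prompts:

    from cognee.infrastructure.llm.get_llm_client import get_llm_client

    llm_client = get_llm_client()
    # show_prompt takes a prompt file name (resolved by read_query_prompt),
    # not the literal system prompt text.
    print(llm_client.show_prompt("What is cognee?", "summarize_content.txt"))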
diff --git a/cognee/infrastructure/llm/openai/adapter.py b/cognee/infrastructure/llm/openai/adapter.py
index 1dc9b70f5..e74cbbd33 100644
--- a/cognee/infrastructure/llm/openai/adapter.py
+++ b/cognee/infrastructure/llm/openai/adapter.py
@@ -8,7 +8,6 @@ import instructor
 from pydantic import BaseModel
 
 from cognee.infrastructure.llm.llm_interface import LLMInterface
-from cognee.infrastructure.llm.prompts import read_query_prompt
 
 class OpenAIAdapter(LLMInterface):
     name = "OpenAI"
@@ -121,14 +120,3 @@ class OpenAIAdapter(LLMInterface):
             max_tokens = 300,
             max_retries = 5,
         )
-
-    def show_prompt(self, text_input: str, system_prompt: str) -> str:
-        """Format and display the prompt for a user query."""
-        if not text_input:
-            text_input = "No user input provided."
-        if not system_prompt:
-            raise ValueError("No system prompt path provided.")
-        system_prompt = read_query_prompt(system_prompt)
-
-        formatted_prompt = f"""System Prompt:\n{system_prompt}\n\nUser Input:\n{text_input}\n""" if system_prompt else None
-        return formatted_prompt
diff --git a/cognee/infrastructure/llm/prompts/generate_cog_layers.txt b/cognee/infrastructure/llm/prompts/generate_cog_layers.txt
deleted file mode 100644
index 925588189..000000000
--- a/cognee/infrastructure/llm/prompts/generate_cog_layers.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-You are tasked with analyzing `{{ data_type }}` files, especially in a multilayer network context for tasks such as analysis, categorization, and feature extraction. Various layers can be incorporated to capture the depth and breadth of information contained within the {{ data_type }}.
-
-These layers can help in understanding the content, context, and characteristics of the `{{ data_type }}`.
-
-Your objective is to extract meaningful layers of information that will contribute to constructing a detailed multilayer network or knowledge graph.
-
-Approach this task by considering the unique characteristics and inherent properties of the data at hand.
-
-VERY IMPORTANT: The context you are working in is `{{ category_name }}` and the specific domain you are extracting data on is `{{ category_name }}`.
-
-Guidelines for Layer Extraction:
-Take into account: The content type, in this case, is: `{{ category_name }}`, should play a major role in how you decompose into layers.
-
-Based on your analysis, define and describe the layers you've identified, explaining their relevance and contribution to understanding the dataset. Your independent identification of layers will enable a nuanced and multifaceted representation of the data, enhancing applications in knowledge discovery, content analysis, and information retrieval.
diff --git a/cognee/modules/data/extraction/extract_cognitive_layers.py b/cognee/modules/data/extraction/extract_cognitive_layers.py
deleted file mode 100644
index 82e9e8a94..000000000
--- a/cognee/modules/data/extraction/extract_cognitive_layers.py
+++ /dev/null
@@ -1,11 +0,0 @@
-from typing import Type, Dict
-from pydantic import BaseModel
-from cognee.infrastructure.llm.prompts import render_prompt
-from cognee.infrastructure.llm.get_llm_client import get_llm_client
-
-async def extract_cognitive_layers(content: str, category: Dict, response_model: Type[BaseModel]):
-    llm_client = get_llm_client()
-
-    system_prompt = render_prompt("generate_cog_layers.txt", category)
-
-    return await llm_client.acreate_structured_output(content, system_prompt, response_model)
diff --git a/profiling/util/DummyLLMAdapter.py b/profiling/util/DummyLLMAdapter.py
new file mode 100644
index 000000000..40698b938
--- /dev/null
+++ b/profiling/util/DummyLLMAdapter.py
@@ -0,0 +1,47 @@
+import spacy
+import textacy
+from typing import Type
+from uuid import uuid4
+from pydantic import BaseModel
+from cognee.infrastructure.llm.llm_interface import LLMInterface
+from cognee.shared.data_models import SummarizedContent
+from cognee.shared.data_models import KnowledgeGraph, Node, Edge
+
+
+class DummyLLMAdapter(LLMInterface):
+    nlp = spacy.load('en_core_web_sm')
+    async def acreate_structured_output(self,
+                                        text_input: str,
+                                        system_prompt: str,
+                                        response_model: Type[BaseModel]) -> BaseModel:
+
+        if isinstance(response_model, SummarizedContent):
+            return(dummy_summarize_content(text_input))
+        elif isinstance(response_model, KnowledgeGraph):
+            return(dummy_extract_knowledge_graph(text_input, nlp))
+        else:
+            raise Exception("Currently dummy acreate_structured_input is only implemented for SummarizedContent and KnowledgeGraph")
+
+
+def dummy_extract_knowledge_graph(text, nlp):
+    doc = nlp(text)
+    triples = list(textacy.extract.subject_verb_object_triples(doc))
+
+    nodes = {}
+    edges = []
+    for triple in triples:
+        source = "_".join([str(e) for e in triple.subject])
+        target = "_".join([str(e) for e in triple.object])
+        nodes[source] = nodes.get(source, Node(id=str(uuid4()), name=source, type="object", description="") )
+        nodes[target] = nodes.get(target, Node(id=str(uuid4()), name=target, type="object", description="") )
+        edge_type = "_".join([str(e) for e in triple.verb])
+        edges.append(Edge(source_node_id=nodes[source].id, target_node_id=nodes[target].id, relationship_name=edge_type))
+    return(KnowledgeGraph(nodes=list(nodes.keys()), edges=edges))
+
+
+def dummy_summarize_content(text):
+    words = [(word, len(word)) for word in set(text.split(" "))]
+    words = sorted(words, key=lambda x: x[1], reverse=True)
+    summary = " ".join([word for word, _ in words[:100]])
+    description = " ".join([word for word, _ in words[:10]])
+    return(SummarizedContent(summary=summary, description=description))
\ No newline at end of file
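Review note: as the WIP subject signals, this first cut has three bugs, all addressed in the next patch: isinstance() is handed the response_model class rather than an instance, so both branches are always False; dummy_extract_knowledge_graph receives the bare name nlp, which is undefined in the method scope, instead of self.nlp; and KnowledgeGraph is built from nodes.keys() (name strings) where Node objects are expected. For comparison, a minimal sketch of dispatching on the model class directly, assuming callers always pass the exact classes:

    # response_model is a class (Type[BaseModel]), so compare classes, not instances.
    if response_model is SummarizedContent:
        return dummy_summarize_content(text_input)
    elif response_model is KnowledgeGraph:
        return dummy_extract_knowledge_graph(text_input, self.nlp)

issubclass(response_model, SummarizedContent) would additionally accept subclasses.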
From 5c9fd44680730be365438131b9224b0d2423ac15 Mon Sep 17 00:00:00 2001
From: Leon Luithlen
Date: Thu, 28 Nov 2024 12:26:01 +0100
Subject: [PATCH 2/3] Fix DummyLLMAdapter

---
 cognee/infrastructure/llm/llm_interface.py |  1 -
 profiling/util/DummyLLMAdapter.py          | 66 ++++++++++++++--------
 2 files changed, 42 insertions(+), 25 deletions(-)

diff --git a/cognee/infrastructure/llm/llm_interface.py b/cognee/infrastructure/llm/llm_interface.py
index 069efb22d..4a5bd79b6 100644
--- a/cognee/infrastructure/llm/llm_interface.py
+++ b/cognee/infrastructure/llm/llm_interface.py
@@ -16,7 +16,6 @@ class LLMInterface(Protocol):
         """To get structured output, import/call this function"""
         raise NotImplementedError
 
-    @abstractmethod
     def show_prompt(self, text_input: str, system_prompt: str) -> str:
         """Format and display the prompt for a user query."""
         if not text_input:
diff --git a/profiling/util/DummyLLMAdapter.py b/profiling/util/DummyLLMAdapter.py
index 40698b938..df81ce123 100644
--- a/profiling/util/DummyLLMAdapter.py
+++ b/profiling/util/DummyLLMAdapter.py
@@ -1,26 +1,34 @@
-import spacy
-import textacy
 from typing import Type
 from uuid import uuid4
+
+import spacy
+import textacy
 from pydantic import BaseModel
+
 from cognee.infrastructure.llm.llm_interface import LLMInterface
-from cognee.shared.data_models import SummarizedContent
-from cognee.shared.data_models import KnowledgeGraph, Node, Edge
+from cognee.shared.data_models import Edge, KnowledgeGraph, Node, SummarizedContent
 
 
 class DummyLLMAdapter(LLMInterface):
-    nlp = spacy.load('en_core_web_sm')
-    async def acreate_structured_output(self,
-                                        text_input: str,
-                                        system_prompt: str,
-                                        response_model: Type[BaseModel]) -> BaseModel:
-
-        if isinstance(response_model, SummarizedContent):
-            return(dummy_summarize_content(text_input))
-        elif isinstance(response_model, KnowledgeGraph):
-            return(dummy_extract_knowledge_graph(text_input, nlp))
+    nlp = spacy.load("en_core_web_sm")
+
+    async def acreate_structured_output(
+        self, text_input: str, system_prompt: str, response_model: Type[BaseModel]
+    ) -> BaseModel:
+
+        if (
+            str(response_model)
+            == "<class 'cognee.shared.data_models.SummarizedContent'>"
+        ):
+            return dummy_summarize_content(text_input)
+        elif (
+            str(response_model) == "<class 'cognee.shared.data_models.KnowledgeGraph'>"
+        ):
+            return dummy_extract_knowledge_graph(text_input, self.nlp)
         else:
-            raise Exception("Currently dummy acreate_structured_input is only implemented for SummarizedContent and KnowledgeGraph")
+            raise Exception(
+                "Currently dummy acreate_structured_input is only implemented for SummarizedContent and KnowledgeGraph"
+            )
 
 
 def dummy_extract_knowledge_graph(text, nlp):
@@ -31,17 +39,27 @@ def dummy_extract_knowledge_graph(text, nlp):
     edges = []
     for triple in triples:
         source = "_".join([str(e) for e in triple.subject])
-        target = "_".join([str(e) for e in triple.object])
-        nodes[source] = nodes.get(source, Node(id=str(uuid4()), name=source, type="object", description="") )
-        nodes[target] = nodes.get(target, Node(id=str(uuid4()), name=target, type="object", description="") )
+        target = "_".join([str(e) for e in triple.object])
+        nodes[source] = nodes.get(
+            source, Node(id=str(uuid4()), name=source, type="object", description="")
+        )
+        nodes[target] = nodes.get(
+            target, Node(id=str(uuid4()), name=target, type="object", description="")
+        )
         edge_type = "_".join([str(e) for e in triple.verb])
-        edges.append(Edge(source_node_id=nodes[source].id, target_node_id=nodes[target].id, relationship_name=edge_type))
-    return(KnowledgeGraph(nodes=list(nodes.keys()), edges=edges))
-
+        edges.append(
+            Edge(
+                source_node_id=nodes[source].id,
+                target_node_id=nodes[target].id,
+                relationship_name=edge_type,
+            )
+        )
+    return KnowledgeGraph(nodes=list(nodes.values()), edges=edges)
+
 
 def dummy_summarize_content(text):
     words = [(word, len(word)) for word in set(text.split(" "))]
     words = sorted(words, key=lambda x: x[1], reverse=True)
-    summary = " ".join([word for word, _ in words[:100]])
-    description = " ".join([word for word, _ in words[:10]])
-    return(SummarizedContent(summary=summary, description=description))
\ No newline at end of file
+    summary = " ".join([word for word, _ in words[:50]])
+    description = " ".join([word for word, _ in words[:10]])
+    return SummarizedContent(summary=summary, description=description)
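Review note: the fix dispatches on str(response_model) against the classes' repr strings, which works when the exact classes are passed but misses subclasses and breaks if the models ever move modules; `response_model is SummarizedContent` would say the same thing more directly. Removing @abstractmethod from show_prompt is presumably what makes DummyLLMAdapter instantiable, since it inherits show_prompt without overriding it and Protocol's ABC metaclass blocks instantiation while abstract methods remain. A quick smoke test, assuming en_core_web_sm is downloaded and profiling/util is importable:

    import asyncio

    from profiling.util.DummyLLMAdapter import DummyLLMAdapter
    from cognee.shared.data_models import KnowledgeGraph

    adapter = DummyLLMAdapter()
    graph = asyncio.run(
        adapter.acreate_structured_output("Cats chase mice.", "", KnowledgeGraph)
    )
    # Expect something like: ['Cats', 'mice'] ['chase']
    print([n.name for n in graph.nodes], [e.relationship_name for e in graph.edges])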
From 3e1949d895f1450a3b8a436eb9491c11e60daad8 Mon Sep 17 00:00:00 2001
From: Leon Luithlen
Date: Thu, 28 Nov 2024 15:42:20 +0100
Subject: [PATCH 3/3] Remove unnecessary nesting in embed_text and add
 DummyEmbeddingEngine

---
 .../embeddings/LiteLLMEmbeddingEngine.py | 29 ++++++++-----------
 profiling/util/DummyEmbeddingEngine.py   |  9 ++++++
 2 files changed, 21 insertions(+), 17 deletions(-)
 create mode 100644 profiling/util/DummyEmbeddingEngine.py

diff --git a/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py b/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py
index de30640e5..edc8eb57f 100644
--- a/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py
+++ b/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py
@@ -28,24 +28,19 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine):
         self.dimensions = dimensions
 
     async def embed_text(self, text: List[str]) -> List[List[float]]:
-        async def get_embedding(text_):
-            try:
-                response = await litellm.aembedding(
-                    self.model,
-                    input = text_,
-                    api_key = self.api_key,
-                    api_base = self.endpoint,
-                    api_version = self.api_version
-                )
-            except litellm.exceptions.BadRequestError as error:
-                logger.error("Error embedding text: %s", str(error))
-                raise error
+        try:
+            response = await litellm.aembedding(
+                self.model,
+                input = text,
+                api_key = self.api_key,
+                api_base = self.endpoint,
+                api_version = self.api_version
+            )
+        except litellm.exceptions.BadRequestError as error:
+            logger.error("Error embedding text: %s", str(error))
+            raise error
 
-            return [data["embedding"] for data in response.data]
-
-        # tasks = [get_embedding(text_) for text_ in text]
-        result = await get_embedding(text)
-        return result
+        return [data["embedding"] for data in response.data]
 
     def get_vector_size(self) -> int:
         return self.dimensions
diff --git a/profiling/util/DummyEmbeddingEngine.py b/profiling/util/DummyEmbeddingEngine.py
new file mode 100644
index 000000000..7f5b3e847
--- /dev/null
+++ b/profiling/util/DummyEmbeddingEngine.py
@@ -0,0 +1,9 @@
+import numpy as np
+from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine
+
+class DummyEmbeddingEngine(EmbeddingEngine):
+    async def embed_text(self, text: list[str]) -> list[list[float]]:
+        return(list(list(np.random.randn(3072))))
+
+    def get_vector_size(self) -> int:
+        return(3072)
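Review note: in DummyEmbeddingEngine, list(list(np.random.randn(3072))) flattens to a single list of 3072 floats, so embed_text returns one vector no matter how many input strings it receives, despite the list[list[float]] annotation. Harmless for pure profiling, but anything that zips embeddings against inputs will misbehave. A sketch of a per-input variant under the same 3072-dimension assumption:

    import numpy as np

    from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine


    class DummyEmbeddingEngine(EmbeddingEngine):
        async def embed_text(self, text: list[str]) -> list[list[float]]:
            # One random 3072-dimensional vector per input string.
            return [list(np.random.randn(3072)) for _ in text]

        def get_vector_size(self) -> int:
            return 3072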