Cog 669 implement dummy llm adapter (#37)

Adds the `DummyLLMAdapter(LLMInterface)` class to the top-level `profiling/util` location so that large datasets can be profiled without making actual LLM calls. I also moved the `show_prompt` implementation from the child classes to `LLMInterface`, since the implementations were identical. I expanded the scope to also include a `DummyEmbeddingEngine`.
2024-11-30 17:02:49 +01:00 · 2024-11-30 17:02:49 +01:00 · bbaf78f54e
commit bbaf78f54e
parent 4d02560f1c 3e1949d895
9 changed files with 98 additions and 83 deletions
--- a/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py
+++ b/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py
@ -28,24 +28,19 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine):
        self.dimensions = dimensions
    async def embed_text(self, text: List[str]) -> List[List[float]]:
-        async def get_embedding(text_):
+        try:
-            try:
+            response = await litellm.aembedding(
-                response = await litellm.aembedding(
+                self.model,
-                    self.model,
+                input = text,
-                    input = text_,
+                api_key = self.api_key,
-                    api_key = self.api_key,
+                api_base = self.endpoint,
-                    api_base = self.endpoint,
+                api_version = self.api_version
-                    api_version = self.api_version
+            )
-                )
+        except litellm.exceptions.BadRequestError as error:
-            except litellm.exceptions.BadRequestError as error:
+            logger.error("Error embedding text: %s", str(error))
-                logger.error("Error embedding text: %s", str(error))
+            raise error
                raise error
-            return [data["embedding"] for data in response.data]
+        return [data["embedding"] for data in response.data]
        # tasks = [get_embedding(text_) for text_ in text]
        result = await get_embedding(text)
        return result
    def get_vector_size(self) -> int:
        return self.dimensions
--- a/cognee/infrastructure/llm/anthropic/adapter.py
+++ b/cognee/infrastructure/llm/anthropic/adapter.py
@ -4,7 +4,6 @@ import instructor
 from tenacity import retry, stop_after_attempt
 import anthropic
 from cognee.infrastructure.llm.llm_interface import LLMInterface
 from cognee.infrastructure.llm.prompts import read_query_prompt
 class AnthropicAdapter(LLMInterface):
@ -38,17 +37,3 @@ class AnthropicAdapter(LLMInterface):
            }],
            response_model = response_model,
        )
    def show_prompt(self, text_input: str, system_prompt: str) -> str:
        """Build a printable view of the system prompt together with the user input.

        Args:
            text_input: The user's text; a placeholder sentence is substituted
                when it is empty or None.
            system_prompt: Identifier handed to ``read_query_prompt`` to load
                the system prompt text (the error message calls it a path).

        Returns:
            The formatted prompt, or None when ``read_query_prompt`` returns
            a falsy value.

        Raises:
            ValueError: If ``system_prompt`` is empty or None.
        """
        # Reject a missing prompt identifier before doing any lookup.
        if not system_prompt:
            raise ValueError("No system prompt path provided.")

        user_text = text_input or "No user input provided."
        prompt_body = read_query_prompt(system_prompt)

        # Nothing was loaded, so there is nothing to format.
        if not prompt_body:
            return None

        return f"System Prompt:\n{prompt_body}\n\nUser Input:\n{user_text}\n"
--- a/cognee/infrastructure/llm/generic_llm_api/adapter.py
+++ b/cognee/infrastructure/llm/generic_llm_api/adapter.py
@ -6,7 +6,6 @@ import instructor
 from tenacity import retry, stop_after_attempt
 import openai
 from cognee.infrastructure.llm.llm_interface import LLMInterface
 from cognee.infrastructure.llm.prompts import read_query_prompt
 from cognee.shared.data_models import MonitoringTool
 from cognee.base_config import get_base_config
 from cognee.infrastructure.llm.config import get_llm_config
@ -123,13 +122,3 @@ class GenericAPIAdapter(LLMInterface):
            response_model = response_model,
        )
    def show_prompt(self, text_input: str, system_prompt: str) -> str:
        """Return the system prompt and the user input rendered as one display string.

        Args:
            text_input: The user's text; replaced by a placeholder sentence
                when empty or None.
            system_prompt: Identifier passed to ``read_query_prompt`` to fetch
                the system prompt text (the error message calls it a path).

        Returns:
            The formatted prompt, or None when ``read_query_prompt`` returns
            a falsy value.

        Raises:
            ValueError: If ``system_prompt`` is empty or None.
        """
        # A prompt identifier is mandatory; fail before attempting the lookup.
        if not system_prompt:
            raise ValueError("No system prompt path provided.")

        shown_input = text_input or "No user input provided."
        loaded_prompt = read_query_prompt(system_prompt)

        # An empty lookup result means there is no prompt to show.
        if not loaded_prompt:
            return None

        return f"System Prompt:\n{loaded_prompt}\n\nUser Input:\n{shown_input}\n"
--- a/cognee/infrastructure/llm/llm_interface.py
+++ b/cognee/infrastructure/llm/llm_interface.py
@ -3,6 +3,8 @@
 from typing import Type, Protocol
 from abc import abstractmethod
 from pydantic import BaseModel
 from cognee.infrastructure.llm.prompts import read_query_prompt
 class LLMInterface(Protocol):
    """ LLM Interface """
@ -14,7 +16,14 @@ class LLMInterface(Protocol):
        """To get structured output, import/call this function"""
        raise NotImplementedError
    @abstractmethod
    def show_prompt(self, text_input: str, system_prompt: str) -> str:
-        """To get structured output, import/call this function"""
+        """Format and display the prompt for a user query."""
-        raise NotImplementedError
+        if not text_input:
            text_input = "No user input provided."
        if not system_prompt:
            raise ValueError("No system prompt path provided.")
        system_prompt = read_query_prompt(system_prompt)
        formatted_prompt = f"""System Prompt:\n{system_prompt}\n\nUser Input:\n{text_input}\n"""
        return formatted_prompt
--- a/cognee/infrastructure/llm/openai/adapter.py
+++ b/cognee/infrastructure/llm/openai/adapter.py
@ -8,7 +8,6 @@ import instructor
 from pydantic import BaseModel
 from cognee.infrastructure.llm.llm_interface import LLMInterface
 from cognee.infrastructure.llm.prompts import read_query_prompt
 class OpenAIAdapter(LLMInterface):
    name = "OpenAI"
@ -121,14 +120,3 @@ class OpenAIAdapter(LLMInterface):
            max_tokens = 300,
            max_retries = 5,
        )
    def show_prompt(self, text_input: str, system_prompt: str) -> str:
        """Assemble a human-readable rendering of the system prompt and user input.

        Args:
            text_input: The user's text; a placeholder sentence stands in for
                an empty or None value.
            system_prompt: Identifier given to ``read_query_prompt`` to obtain
                the system prompt text (the error message calls it a path).

        Returns:
            The formatted prompt, or None when ``read_query_prompt`` returns
            a falsy value.

        Raises:
            ValueError: If ``system_prompt`` is empty or None.
        """
        # Without a prompt identifier there is nothing to load.
        if not system_prompt:
            raise ValueError("No system prompt path provided.")

        query_text = text_input or "No user input provided."
        resolved_prompt = read_query_prompt(system_prompt)

        # Skip formatting when the lookup produced nothing.
        if not resolved_prompt:
            return None

        return f"System Prompt:\n{resolved_prompt}\n\nUser Input:\n{query_text}\n"
--- a/cognee/infrastructure/llm/prompts/generate_cog_layers.txt
+++ b/cognee/infrastructure/llm/prompts/generate_cog_layers.txt
@ -1,14 +0,0 @@
 You are tasked with analyzing `{{ data_type }}` files, especially in a multilayer network context for tasks such as analysis, categorization, and feature extraction. Various layers can be incorporated to capture the depth and breadth of information contained within the {{ data_type }}.
 These layers can help in understanding the content, context, and characteristics of the `{{ data_type }}`.
 Your objective is to extract meaningful layers of information that will contribute to constructing a detailed multilayer network or knowledge graph.
 Approach this task by considering the unique characteristics and inherent properties of the data at hand.
 VERY IMPORTANT: The context you are working in is `{{ category_name }}` and the specific domain you are extracting data on is `{{ category_name }}`.
 Guidelines for Layer Extraction:
 Take into account: The content type, in this case, is: `{{ category_name }}`, should play a major role in how you decompose into layers.
 Based on your analysis, define and describe the layers you've identified, explaining their relevance and contribution to understanding the dataset. Your independent identification of layers will enable a nuanced and multifaceted representation of the data, enhancing applications in knowledge discovery, content analysis, and information retrieval.
--- a/cognee/modules/data/extraction/extract_cognitive_layers.py
+++ b/cognee/modules/data/extraction/extract_cognitive_layers.py
@ -1,11 +0,0 @@
 from typing import Type, Dict
 from pydantic import BaseModel
 from cognee.infrastructure.llm.prompts import render_prompt
 from cognee.infrastructure.llm.get_llm_client import get_llm_client
 async def extract_cognitive_layers(content: str, category: Dict, response_model: Type[BaseModel]):
    """Request structured "cognitive layer" output for ``content`` from the LLM client.

    Args:
        content: Text forwarded to the LLM client as the input.
        category: Values used to render the ``generate_cog_layers.txt``
            prompt template.
        response_model: Pydantic model class passed to the client as the
            desired output structure.

    Returns:
        Whatever the client's ``acreate_structured_output`` returns.
    """
    # The client is obtained before the prompt is rendered, as in the original.
    client = get_llm_client()
    rendered_prompt = render_prompt("generate_cog_layers.txt", category)

    structured_output = await client.acreate_structured_output(
        content,
        rendered_prompt,
        response_model,
    )
    return structured_output
--- a/profiling/util/DummyEmbeddingEngine.py
+++ b/profiling/util/DummyEmbeddingEngine.py
@ -0,0 +1,9 @@
 import numpy as np
 from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine
 class DummyEmbeddingEngine(EmbeddingEngine):
    """Embedding engine stand-in that returns random vectors instead of calling a model.

    Lives under ``profiling/util`` for runs where real embedding calls are
    not wanted.
    """

    # Length of every generated vector. Both methods read this single value
    # so the reported size and the produced vectors cannot drift apart.
    VECTOR_SIZE = 3072

    async def embed_text(self, text: list[str]) -> list[list[float]]:
        """Return one random vector per input string.

        Args:
            text: The strings to "embed"; only their count is used.

        Returns:
            A list with ``len(text)`` entries, each a list of ``VECTOR_SIZE``
            floats drawn from a standard normal distribution. An empty input
            yields an empty list.
        """
        # Draw the whole batch in one call. tolist() turns the 2-D array into
        # nested lists of plain Python floats, matching the annotation.
        return np.random.randn(len(text), self.VECTOR_SIZE).tolist()

    def get_vector_size(self) -> int:
        """Return the length of each vector produced by ``embed_text``."""
        return self.VECTOR_SIZE
--- a/profiling/util/DummyLLMAdapter.py
+++ b/profiling/util/DummyLLMAdapter.py
@ -0,0 +1,65 @@
 from typing import Type
 from uuid import uuid4
 import spacy
 import textacy
 from pydantic import BaseModel
 from cognee.infrastructure.llm.llm_interface import LLMInterface
 from cognee.shared.data_models import Edge, KnowledgeGraph, Node, SummarizedContent
 class DummyLLMAdapter(LLMInterface):
    """LLM adapter stand-in that fabricates structured output locally.

    No LLM is called: summaries come from ``dummy_summarize_content`` and
    knowledge graphs from ``dummy_extract_knowledge_graph``.
    """

    # Loaded once when the class body executes and shared by all instances.
    nlp = spacy.load("en_core_web_sm")

    async def acreate_structured_output(
        self, text_input: str, system_prompt: str, response_model: Type[BaseModel]
    ) -> BaseModel:
        """Produce a dummy instance of ``response_model`` from ``text_input``.

        Args:
            text_input: Text the dummy output is derived from.
            system_prompt: Accepted for interface compatibility; not used.
            response_model: The model class to produce. Only
                ``SummarizedContent`` and ``KnowledgeGraph`` are supported.

        Returns:
            A ``SummarizedContent`` or ``KnowledgeGraph`` instance.

        Raises:
            NotImplementedError: If ``response_model`` is any other class.
        """
        # Compare the class objects themselves rather than their repr text,
        # which would silently stop matching if the module were renamed.
        if response_model is SummarizedContent:
            return dummy_summarize_content(text_input)
        if response_model is KnowledgeGraph:
            return dummy_extract_knowledge_graph(text_input, self.nlp)

        # NotImplementedError is still an Exception, so existing handlers work.
        raise NotImplementedError(
            "Currently dummy acreate_structured_output is only implemented for SummarizedContent and KnowledgeGraph"
        )
 def dummy_extract_knowledge_graph(text, nlp):
    """Build a KnowledgeGraph from the subject-verb-object triples found in ``text``.

    Args:
        text: The text to analyse.
        nlp: Callable that turns ``text`` into the document object passed to
            ``textacy.extract.subject_verb_object_triples``.

    Returns:
        A ``KnowledgeGraph`` with one node per distinct subject or object
        name and one edge per extracted triple.
    """
    doc = nlp(text)
    nodes = {}
    edges = []

    def joined(tokens):
        # Multi-token phrases become a single "_"-separated name.
        return "_".join(str(token) for token in tokens)

    def node_for(tokens):
        # Create the node and its UUID only the first time a name is seen,
        # instead of building a throwaway default on every lookup.
        name = joined(tokens)
        if name not in nodes:
            nodes[name] = Node(id=str(uuid4()), name=name, type="object", description="")
        return nodes[name]

    for triple in textacy.extract.subject_verb_object_triples(doc):
        # Subject is registered before object, which fixes the node order.
        source_node = node_for(triple.subject)
        target_node = node_for(triple.object)
        edges.append(
            Edge(
                source_node_id=source_node.id,
                target_node_id=target_node.id,
                relationship_name=joined(triple.verb),
            )
        )

    return KnowledgeGraph(nodes=list(nodes.values()), edges=edges)
 def dummy_summarize_content(text):
    """Fabricate a summary and description from the longest distinct words of ``text``.

    Args:
        text: The text to "summarize".

    Returns:
        A ``SummarizedContent`` whose ``summary`` is the 50 longest distinct
        words and whose ``description`` is the 10 longest, space-separated.
    """
    # split() with no argument splits on any whitespace and drops the empty
    # tokens that split(" ") would produce for repeated spaces.
    distinct_words = set(text.split())

    # Longest first, with an alphabetical tie-break so the result does not
    # depend on set iteration order, which can differ between processes.
    ranked = sorted(distinct_words, key=lambda word: (-len(word), word))

    summary = " ".join(ranked[:50])
    description = " ".join(ranked[:10])
    return SummarizedContent(summary=summary, description=description)