COG-669 Implement dummy LLM adapter (#37)

Adds a `DummyLLMAdapter(LLMInterface)` class in the top-level `profiling/util` location, for profiling large datasets without making actual LLM calls.

I also moved the `show_prompt` implementation from the child classes to `LLMInterface`, since the implementations were identical.

I expanded the scope to also include a `DummyEmbeddingEngine`.
Commit bbaf78f54e by Vasilije, 2024-11-30 17:02:49 +01:00, committed by GitHub.
9 changed files with 98 additions and 83 deletions


@@ -28,24 +28,19 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine):
         self.dimensions = dimensions

     async def embed_text(self, text: List[str]) -> List[List[float]]:
-        async def get_embedding(text_):
-            try:
-                response = await litellm.aembedding(
-                    self.model,
-                    input = text_,
-                    api_key = self.api_key,
-                    api_base = self.endpoint,
-                    api_version = self.api_version
-                )
-            except litellm.exceptions.BadRequestError as error:
-                logger.error("Error embedding text: %s", str(error))
-                raise error
-
-            return [data["embedding"] for data in response.data]
-
-        # tasks = [get_embedding(text_) for text_ in text]
-        result = await get_embedding(text)
-        return result
+        try:
+            response = await litellm.aembedding(
+                self.model,
+                input = text,
+                api_key = self.api_key,
+                api_base = self.endpoint,
+                api_version = self.api_version
+            )
+        except litellm.exceptions.BadRequestError as error:
+            logger.error("Error embedding text: %s", str(error))
+            raise error
+
+        return [data["embedding"] for data in response.data]

     def get_vector_size(self) -> int:
         return self.dimensions
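Net effect: `embed_text` now embeds the whole batch through a single `litellm.aembedding` call instead of routing one string through the inner `get_embedding` helper (whose batching loop was commented out). A minimal sketch of the new call shape; the import path and constructor signature are inferred from this hunk, and the model/credential values are placeholders:

import asyncio

# Assumed module path, alongside EmbeddingEngine.
from cognee.infrastructure.databases.vector.embeddings.LiteLLMEmbeddingEngine import LiteLLMEmbeddingEngine

async def main():
    engine = LiteLLMEmbeddingEngine(
        model="openai/text-embedding-3-large",  # placeholder model name
        api_key="...",                          # placeholder credentials
        endpoint=None,
        api_version=None,
        dimensions=3072,
    )
    # One batched call; litellm returns one embedding per input string.
    vectors = await engine.embed_text(["first document", "second document"])
    print(len(vectors), len(vectors[0]))

asyncio.run(main())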


@@ -4,7 +4,6 @@ import instructor
 from tenacity import retry, stop_after_attempt
 import anthropic
 from cognee.infrastructure.llm.llm_interface import LLMInterface
-from cognee.infrastructure.llm.prompts import read_query_prompt

 class AnthropicAdapter(LLMInterface):
@@ -38,17 +37,3 @@ class AnthropicAdapter(LLMInterface):
             }],
             response_model = response_model,
         )
-
-    def show_prompt(self, text_input: str, system_prompt: str) -> str:
-        """Format and display the prompt for a user query."""
-        if not text_input:
-            text_input = "No user input provided."
-        if not system_prompt:
-            raise ValueError("No system prompt path provided.")
-        system_prompt = read_query_prompt(system_prompt)
-
-        formatted_prompt = f"""System Prompt:\n{system_prompt}\n\nUser Input:\n{text_input}\n""" if system_prompt else None
-        return formatted_prompt


@@ -6,7 +6,6 @@ import instructor
 from tenacity import retry, stop_after_attempt
 import openai
 from cognee.infrastructure.llm.llm_interface import LLMInterface
-from cognee.infrastructure.llm.prompts import read_query_prompt
 from cognee.shared.data_models import MonitoringTool
 from cognee.base_config import get_base_config
 from cognee.infrastructure.llm.config import get_llm_config
@@ -123,13 +122,3 @@ class GenericAPIAdapter(LLMInterface):
             response_model = response_model,
         )
-
-    def show_prompt(self, text_input: str, system_prompt: str) -> str:
-        """Format and display the prompt for a user query."""
-        if not text_input:
-            text_input = "No user input provided."
-        if not system_prompt:
-            raise ValueError("No system prompt path provided.")
-        system_prompt = read_query_prompt(system_prompt)
-
-        formatted_prompt = f"""System Prompt:\n{system_prompt}\n\nUser Input:\n{text_input}\n""" if system_prompt else None
-        return formatted_prompt


@@ -3,6 +3,8 @@
 from typing import Type, Protocol
 from abc import abstractmethod
 from pydantic import BaseModel
+from cognee.infrastructure.llm.prompts import read_query_prompt

 class LLMInterface(Protocol):
     """ LLM Interface """
@@ -14,7 +16,14 @@ class LLMInterface(Protocol):
         """To get structured output, import/call this function"""
         raise NotImplementedError

-    @abstractmethod
     def show_prompt(self, text_input: str, system_prompt: str) -> str:
-        """To get structured output, import/call this function"""
-        raise NotImplementedError
+        """Format and display the prompt for a user query."""
+        if not text_input:
+            text_input = "No user input provided."
+        if not system_prompt:
+            raise ValueError("No system prompt path provided.")
+        system_prompt = read_query_prompt(system_prompt)
+
+        formatted_prompt = f"""System Prompt:\n{system_prompt}\n\nUser Input:\n{text_input}\n"""
+        return formatted_prompt
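With the implementation hoisted into the interface, every adapter (including the new `DummyLLMAdapter` below) inherits the same `show_prompt`. A small sketch; the import path under `profiling/util` and the prompt file name are assumptions:

from profiling.util.DummyLLMAdapter import DummyLLMAdapter  # assumed path

adapter = DummyLLMAdapter()
# read_query_prompt resolves the file name against the prompts directory.
preview = adapter.show_prompt(
    text_input="What is a knowledge graph?",
    system_prompt="summarize_content.txt",  # hypothetical prompt file name
)
print(preview)  # "System Prompt:\n...\n\nUser Input:\n..."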


@@ -8,7 +8,6 @@ import instructor
 from pydantic import BaseModel
 from cognee.infrastructure.llm.llm_interface import LLMInterface
-from cognee.infrastructure.llm.prompts import read_query_prompt

 class OpenAIAdapter(LLMInterface):
     name = "OpenAI"
@@ -121,14 +120,3 @@ class OpenAIAdapter(LLMInterface):
             max_tokens = 300,
             max_retries = 5,
         )
-
-    def show_prompt(self, text_input: str, system_prompt: str) -> str:
-        """Format and display the prompt for a user query."""
-        if not text_input:
-            text_input = "No user input provided."
-        if not system_prompt:
-            raise ValueError("No system prompt path provided.")
-        system_prompt = read_query_prompt(system_prompt)
-
-        formatted_prompt = f"""System Prompt:\n{system_prompt}\n\nUser Input:\n{text_input}\n""" if system_prompt else None
-        return formatted_prompt


@@ -1,14 +0,0 @@
-You are tasked with analyzing `{{ data_type }}` files, especially in a multilayer network context for tasks such as analysis, categorization, and feature extraction. Various layers can be incorporated to capture the depth and breadth of information contained within the {{ data_type }}.
-These layers can help in understanding the content, context, and characteristics of the `{{ data_type }}`.
-Your objective is to extract meaningful layers of information that will contribute to constructing a detailed multilayer network or knowledge graph.
-Approach this task by considering the unique characteristics and inherent properties of the data at hand.
-
-VERY IMPORTANT: The context you are working in is `{{ category_name }}` and the specific domain you are extracting data on is `{{ category_name }}`.
-
-Guidelines for Layer Extraction:
-Take into account: The content type, in this case, is: `{{ category_name }}`, should play a major role in how you decompose into layers.
-
-Based on your analysis, define and describe the layers you've identified, explaining their relevance and contribution to understanding the dataset. Your independent identification of layers will enable a nuanced and multifaceted representation of the data, enhancing applications in knowledge discovery, content analysis, and information retrieval.


@@ -1,11 +0,0 @@
-from typing import Type, Dict
-from pydantic import BaseModel
-from cognee.infrastructure.llm.prompts import render_prompt
-from cognee.infrastructure.llm.get_llm_client import get_llm_client
-
-async def extract_cognitive_layers(content: str, category: Dict, response_model: Type[BaseModel]):
-    llm_client = get_llm_client()
-
-    system_prompt = render_prompt("generate_cog_layers.txt", category)
-
-    return await llm_client.acreate_structured_output(content, system_prompt, response_model)


@@ -0,0 +1,9 @@
+import numpy as np
+
+from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine
+
+class DummyEmbeddingEngine(EmbeddingEngine):
+    async def embed_text(self, text: list[str]) -> list[list[float]]:
+        # Fixed here: return one random 3072-dimensional vector per input
+        # text; the committed return(list(list(np.random.randn(3072))))
+        # produced a single flat vector regardless of the input length.
+        return [list(np.random.randn(3072)) for _ in text]
+
+    def get_vector_size(self) -> int:
+        return 3072
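A quick sanity check for the dummy engine; the import path under `profiling/util` is an assumption, and the per-text return shape reflects the fix noted above:

import asyncio

from profiling.util.DummyEmbeddingEngine import DummyEmbeddingEngine  # assumed path

async def main():
    engine = DummyEmbeddingEngine()
    vectors = await engine.embed_text(["alpha", "beta"])
    # One random 3072-dimensional vector per input string.
    assert len(vectors) == 2
    assert all(len(v) == engine.get_vector_size() for v in vectors)

asyncio.run(main())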


@@ -0,0 +1,65 @@
+from typing import Type
+from uuid import uuid4
+
+import spacy
+import textacy
+from pydantic import BaseModel
+
+from cognee.infrastructure.llm.llm_interface import LLMInterface
+from cognee.shared.data_models import Edge, KnowledgeGraph, Node, SummarizedContent
+
+class DummyLLMAdapter(LLMInterface):
+    nlp = spacy.load("en_core_web_sm")
+
+    async def acreate_structured_output(
+        self, text_input: str, system_prompt: str, response_model: Type[BaseModel]
+    ) -> BaseModel:
+        if (
+            str(response_model)
+            == "<class 'cognee.shared.data_models.SummarizedContent'>"
+        ):
+            return dummy_summarize_content(text_input)
+        elif (
+            str(response_model) == "<class 'cognee.shared.data_models.KnowledgeGraph'>"
+        ):
+            return dummy_extract_knowledge_graph(text_input, self.nlp)
+        else:
+            raise Exception(
+                "Currently the dummy acreate_structured_output is only implemented for SummarizedContent and KnowledgeGraph"
+            )
+
+def dummy_extract_knowledge_graph(text, nlp):
+    doc = nlp(text)
+    triples = list(textacy.extract.subject_verb_object_triples(doc))
+
+    nodes = {}
+    edges = []
+
+    for triple in triples:
+        source = "_".join([str(e) for e in triple.subject])
+        target = "_".join([str(e) for e in triple.object])
+        nodes[source] = nodes.get(
+            source, Node(id=str(uuid4()), name=source, type="object", description="")
+        )
+        nodes[target] = nodes.get(
+            target, Node(id=str(uuid4()), name=target, type="object", description="")
+        )
+        edge_type = "_".join([str(e) for e in triple.verb])
+        edges.append(
+            Edge(
+                source_node_id=nodes[source].id,
+                target_node_id=nodes[target].id,
+                relationship_name=edge_type,
+            )
+        )
+
+    return KnowledgeGraph(nodes=list(nodes.values()), edges=edges)
+
+def dummy_summarize_content(text):
+    words = [(word, len(word)) for word in set(text.split(" "))]
+    words = sorted(words, key=lambda x: x[1], reverse=True)
+
+    summary = " ".join([word for word, _ in words[:50]])
+    description = " ".join([word for word, _ in words[:10]])
+
+    return SummarizedContent(summary=summary, description=description)
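A sketch of the adapter standing in for a real LLM during profiling: it dispatches on `response_model`, returning spaCy/textacy SVO triples as a `KnowledgeGraph` and the longest unique words as a `SummarizedContent`. The import path is an assumption, and `en_core_web_sm` must be installed (`python -m spacy download en_core_web_sm`):

import asyncio

from profiling.util.DummyLLMAdapter import DummyLLMAdapter  # assumed path
from cognee.shared.data_models import KnowledgeGraph, SummarizedContent

async def main():
    adapter = DummyLLMAdapter()
    text = "Cognee builds knowledge graphs from documents."

    # The system prompt is ignored by the dummy adapter.
    graph = await adapter.acreate_structured_output(text, "unused", KnowledgeGraph)
    summary = await adapter.acreate_structured_output(text, "unused", SummarizedContent)

    print(len(graph.nodes), len(graph.edges), summary.description)

asyncio.run(main())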