Cog 669 implement dummy llm adapter (#37)
Adds the `DummyLLMAdapter(LLMInterface)` class in the top-level `profiling/util` directory, so that large datasets can be profiled without making actual LLM calls. Also moves the `show_prompt` implementation from the child classes into `LLMInterface`, since the implementations were identical, and expands the scope to include a `DummyEmbeddingEngine`.
This commit is contained in:
commit
bbaf78f54e
9 changed files with 98 additions and 83 deletions
|
|
@ -28,24 +28,19 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine):
|
||||||
self.dimensions = dimensions
|
self.dimensions = dimensions
|
||||||
|
|
||||||
async def embed_text(self, text: List[str]) -> List[List[float]]:
|
async def embed_text(self, text: List[str]) -> List[List[float]]:
|
||||||
async def get_embedding(text_):
|
try:
|
||||||
try:
|
response = await litellm.aembedding(
|
||||||
response = await litellm.aembedding(
|
self.model,
|
||||||
self.model,
|
input = text,
|
||||||
input = text_,
|
api_key = self.api_key,
|
||||||
api_key = self.api_key,
|
api_base = self.endpoint,
|
||||||
api_base = self.endpoint,
|
api_version = self.api_version
|
||||||
api_version = self.api_version
|
)
|
||||||
)
|
except litellm.exceptions.BadRequestError as error:
|
||||||
except litellm.exceptions.BadRequestError as error:
|
logger.error("Error embedding text: %s", str(error))
|
||||||
logger.error("Error embedding text: %s", str(error))
|
raise error
|
||||||
raise error
|
|
||||||
|
|
||||||
return [data["embedding"] for data in response.data]
|
return [data["embedding"] for data in response.data]
|
||||||
|
|
||||||
# tasks = [get_embedding(text_) for text_ in text]
|
|
||||||
result = await get_embedding(text)
|
|
||||||
return result
|
|
||||||
|
|
||||||
def get_vector_size(self) -> int:
|
def get_vector_size(self) -> int:
|
||||||
return self.dimensions
|
return self.dimensions
|
||||||
|
|
|
||||||
|
|
@ -4,7 +4,6 @@ import instructor
|
||||||
from tenacity import retry, stop_after_attempt
|
from tenacity import retry, stop_after_attempt
|
||||||
import anthropic
|
import anthropic
|
||||||
from cognee.infrastructure.llm.llm_interface import LLMInterface
|
from cognee.infrastructure.llm.llm_interface import LLMInterface
|
||||||
from cognee.infrastructure.llm.prompts import read_query_prompt
|
|
||||||
|
|
||||||
|
|
||||||
class AnthropicAdapter(LLMInterface):
|
class AnthropicAdapter(LLMInterface):
|
||||||
|
|
@ -38,17 +37,3 @@ class AnthropicAdapter(LLMInterface):
|
||||||
}],
|
}],
|
||||||
response_model = response_model,
|
response_model = response_model,
|
||||||
)
|
)
|
||||||
|
|
||||||
def show_prompt(self, text_input: str, system_prompt: str) -> str:
|
|
||||||
"""Format and display the prompt for a user query."""
|
|
||||||
|
|
||||||
if not text_input:
|
|
||||||
text_input = "No user input provided."
|
|
||||||
if not system_prompt:
|
|
||||||
raise ValueError("No system prompt path provided.")
|
|
||||||
|
|
||||||
system_prompt = read_query_prompt(system_prompt)
|
|
||||||
|
|
||||||
formatted_prompt = f"""System Prompt:\n{system_prompt}\n\nUser Input:\n{text_input}\n""" if system_prompt else None
|
|
||||||
|
|
||||||
return formatted_prompt
|
|
||||||
|
|
|
||||||
|
|
@ -6,7 +6,6 @@ import instructor
|
||||||
from tenacity import retry, stop_after_attempt
|
from tenacity import retry, stop_after_attempt
|
||||||
import openai
|
import openai
|
||||||
from cognee.infrastructure.llm.llm_interface import LLMInterface
|
from cognee.infrastructure.llm.llm_interface import LLMInterface
|
||||||
from cognee.infrastructure.llm.prompts import read_query_prompt
|
|
||||||
from cognee.shared.data_models import MonitoringTool
|
from cognee.shared.data_models import MonitoringTool
|
||||||
from cognee.base_config import get_base_config
|
from cognee.base_config import get_base_config
|
||||||
from cognee.infrastructure.llm.config import get_llm_config
|
from cognee.infrastructure.llm.config import get_llm_config
|
||||||
|
|
@ -123,13 +122,3 @@ class GenericAPIAdapter(LLMInterface):
|
||||||
response_model = response_model,
|
response_model = response_model,
|
||||||
)
|
)
|
||||||
|
|
||||||
def show_prompt(self, text_input: str, system_prompt: str) -> str:
|
|
||||||
"""Format and display the prompt for a user query."""
|
|
||||||
if not text_input:
|
|
||||||
text_input = "No user input provided."
|
|
||||||
if not system_prompt:
|
|
||||||
raise ValueError("No system prompt path provided.")
|
|
||||||
system_prompt = read_query_prompt(system_prompt)
|
|
||||||
|
|
||||||
formatted_prompt = f"""System Prompt:\n{system_prompt}\n\nUser Input:\n{text_input}\n""" if system_prompt else None
|
|
||||||
return formatted_prompt
|
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,8 @@
|
||||||
from typing import Type, Protocol
|
from typing import Type, Protocol
|
||||||
from abc import abstractmethod
|
from abc import abstractmethod
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
from cognee.infrastructure.llm.prompts import read_query_prompt
|
||||||
|
|
||||||
class LLMInterface(Protocol):
|
class LLMInterface(Protocol):
|
||||||
""" LLM Interface """
|
""" LLM Interface """
|
||||||
|
|
||||||
|
|
@ -14,7 +16,14 @@ class LLMInterface(Protocol):
|
||||||
"""To get structured output, import/call this function"""
|
"""To get structured output, import/call this function"""
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
def show_prompt(self, text_input: str, system_prompt: str) -> str:
|
def show_prompt(self, text_input: str, system_prompt: str) -> str:
|
||||||
"""To get structured output, import/call this function"""
|
"""Format and display the prompt for a user query."""
|
||||||
raise NotImplementedError
|
if not text_input:
|
||||||
|
text_input = "No user input provided."
|
||||||
|
if not system_prompt:
|
||||||
|
raise ValueError("No system prompt path provided.")
|
||||||
|
system_prompt = read_query_prompt(system_prompt)
|
||||||
|
|
||||||
|
formatted_prompt = f"""System Prompt:\n{system_prompt}\n\nUser Input:\n{text_input}\n"""
|
||||||
|
|
||||||
|
return formatted_prompt
|
||||||
|
|
|
||||||
|
|
@ -8,7 +8,6 @@ import instructor
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
from cognee.infrastructure.llm.llm_interface import LLMInterface
|
from cognee.infrastructure.llm.llm_interface import LLMInterface
|
||||||
from cognee.infrastructure.llm.prompts import read_query_prompt
|
|
||||||
|
|
||||||
class OpenAIAdapter(LLMInterface):
|
class OpenAIAdapter(LLMInterface):
|
||||||
name = "OpenAI"
|
name = "OpenAI"
|
||||||
|
|
@ -121,14 +120,3 @@ class OpenAIAdapter(LLMInterface):
|
||||||
max_tokens = 300,
|
max_tokens = 300,
|
||||||
max_retries = 5,
|
max_retries = 5,
|
||||||
)
|
)
|
||||||
|
|
||||||
def show_prompt(self, text_input: str, system_prompt: str) -> str:
|
|
||||||
"""Format and display the prompt for a user query."""
|
|
||||||
if not text_input:
|
|
||||||
text_input = "No user input provided."
|
|
||||||
if not system_prompt:
|
|
||||||
raise ValueError("No system prompt path provided.")
|
|
||||||
system_prompt = read_query_prompt(system_prompt)
|
|
||||||
|
|
||||||
formatted_prompt = f"""System Prompt:\n{system_prompt}\n\nUser Input:\n{text_input}\n""" if system_prompt else None
|
|
||||||
return formatted_prompt
|
|
||||||
|
|
|
||||||
|
|
@ -1,14 +0,0 @@
|
||||||
You are tasked with analyzing `{{ data_type }}` files, especially in a multilayer network context for tasks such as analysis, categorization, and feature extraction. Various layers can be incorporated to capture the depth and breadth of information contained within the {{ data_type }}.
|
|
||||||
|
|
||||||
These layers can help in understanding the content, context, and characteristics of the `{{ data_type }}`.
|
|
||||||
|
|
||||||
Your objective is to extract meaningful layers of information that will contribute to constructing a detailed multilayer network or knowledge graph.
|
|
||||||
|
|
||||||
Approach this task by considering the unique characteristics and inherent properties of the data at hand.
|
|
||||||
|
|
||||||
VERY IMPORTANT: The context you are working in is `{{ category_name }}` and the specific domain you are extracting data on is `{{ category_name }}`.
|
|
||||||
|
|
||||||
Guidelines for Layer Extraction:
|
|
||||||
Take into account: The content type, in this case, is: `{{ category_name }}`, should play a major role in how you decompose into layers.
|
|
||||||
|
|
||||||
Based on your analysis, define and describe the layers you've identified, explaining their relevance and contribution to understanding the dataset. Your independent identification of layers will enable a nuanced and multifaceted representation of the data, enhancing applications in knowledge discovery, content analysis, and information retrieval.
|
|
||||||
|
|
@ -1,11 +0,0 @@
|
||||||
from typing import Type, Dict
|
|
||||||
from pydantic import BaseModel
|
|
||||||
from cognee.infrastructure.llm.prompts import render_prompt
|
|
||||||
from cognee.infrastructure.llm.get_llm_client import get_llm_client
|
|
||||||
|
|
||||||
async def extract_cognitive_layers(content: str, category: Dict, response_model: Type[BaseModel]):
|
|
||||||
llm_client = get_llm_client()
|
|
||||||
|
|
||||||
system_prompt = render_prompt("generate_cog_layers.txt", category)
|
|
||||||
|
|
||||||
return await llm_client.acreate_structured_output(content, system_prompt, response_model)
|
|
||||||
9
profiling/util/DummyEmbeddingEngine.py
Normal file
9
profiling/util/DummyEmbeddingEngine.py
Normal file
|
|
@ -0,0 +1,9 @@
|
||||||
|
import numpy as np
|
||||||
|
from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine
|
||||||
|
|
||||||
|
class DummyEmbeddingEngine(EmbeddingEngine):
    """Embedding engine stand-in that returns random vectors.

    No embedding model or remote service is called; every vector is drawn
    from ``np.random.randn``, so the values carry no meaning and differ
    between calls.
    """

    # Length of every generated vector; also reported by get_vector_size().
    VECTOR_SIZE = 3072

    async def embed_text(self, text: list[str]) -> list[list[float]]:
        """Return one random vector per input string.

        Args:
            text: The strings to "embed". Only their count is used.

        Returns:
            A list with ``len(text)`` entries, each a list of
            ``VECTOR_SIZE`` Python floats.
        """
        # One vector per input, converted to plain floats with tolist() so the
        # result matches the declared list[list[float]] instead of being a
        # single flat list of numpy scalars.
        return [np.random.randn(self.VECTOR_SIZE).tolist() for _ in text]

    def get_vector_size(self) -> int:
        """Return the length of the vectors produced by ``embed_text``."""
        return self.VECTOR_SIZE
|
||||||
65
profiling/util/DummyLLMAdapter.py
Normal file
65
profiling/util/DummyLLMAdapter.py
Normal file
|
|
@ -0,0 +1,65 @@
|
||||||
|
from typing import Type
|
||||||
|
from uuid import uuid4
|
||||||
|
|
||||||
|
import spacy
|
||||||
|
import textacy
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
from cognee.infrastructure.llm.llm_interface import LLMInterface
|
||||||
|
from cognee.shared.data_models import Edge, KnowledgeGraph, Node, SummarizedContent
|
||||||
|
|
||||||
|
|
||||||
|
class DummyLLMAdapter(LLMInterface):
    """LLM adapter stand-in that produces structured output without an LLM.

    Supports only the ``SummarizedContent`` and ``KnowledgeGraph`` response
    models; the results are built locally by ``dummy_summarize_content`` and
    ``dummy_extract_knowledge_graph``.
    """

    # spaCy pipeline shared by all instances. It is loaded when the class body
    # executes, i.e. when this module is imported.
    nlp = spacy.load("en_core_web_sm")

    async def acreate_structured_output(
        self, text_input: str, system_prompt: str, response_model: Type[BaseModel]
    ) -> BaseModel:
        """Build a dummy instance of ``response_model`` from ``text_input``.

        Args:
            text_input: The text to summarize or to extract a graph from.
            system_prompt: Accepted for interface compatibility; unused.
            response_model: Either ``SummarizedContent`` or ``KnowledgeGraph``.

        Returns:
            A ``SummarizedContent`` or ``KnowledgeGraph`` instance.

        Raises:
            NotImplementedError: If ``response_model`` is any other class.
        """
        # Compare the class objects themselves rather than their repr strings,
        # which silently stop matching if the models are moved or renamed.
        if response_model is SummarizedContent:
            return dummy_summarize_content(text_input)

        if response_model is KnowledgeGraph:
            return dummy_extract_knowledge_graph(text_input, self.nlp)

        raise NotImplementedError(
            "Currently dummy acreate_structured_output is only implemented for SummarizedContent and KnowledgeGraph"
        )
|
||||||
|
|
||||||
|
|
||||||
|
def dummy_extract_knowledge_graph(text, nlp):
    """Build a ``KnowledgeGraph`` from subject-verb-object triples in ``text``.

    Args:
        text: The text to analyse.
        nlp: A callable that turns ``text`` into a document accepted by
            ``textacy.extract.subject_verb_object_triples`` (the adapter
            passes its loaded spaCy pipeline).

    Returns:
        A ``KnowledgeGraph`` with one node per distinct subject/object name
        and one edge per extracted triple, named after the triple's verb.
    """
    doc = nlp(text)

    nodes = {}
    edges = []

    def node_for(name):
        # Create the node (and its UUID) only on first sight of a name, so
        # repeated mentions reuse the same node without building throwaways.
        if name not in nodes:
            nodes[name] = Node(id=str(uuid4()), name=name, type="object", description="")
        return nodes[name]

    for triple in textacy.extract.subject_verb_object_triples(doc):
        # Each part of a triple is a sequence of tokens; join their text with
        # underscores to get a single name.
        source = "_".join(str(token) for token in triple.subject)
        target = "_".join(str(token) for token in triple.object)
        edge_type = "_".join(str(token) for token in triple.verb)

        # Resolve the source first so nodes keep their first-seen order.
        source_node = node_for(source)
        target_node = node_for(target)

        edges.append(
            Edge(
                source_node_id=source_node.id,
                target_node_id=target_node.id,
                relationship_name=edge_type,
            )
        )

    return KnowledgeGraph(nodes=list(nodes.values()), edges=edges)
|
||||||
|
|
||||||
|
|
||||||
|
def dummy_summarize_content(text):
    """Build a ``SummarizedContent`` from the longest distinct words in ``text``.

    Args:
        text: The text to "summarize".

    Returns:
        A ``SummarizedContent`` whose ``summary`` is the 50 longest distinct
        words and whose ``description`` is the 10 longest, each joined by
        single spaces.
    """
    # split() without an argument splits on any whitespace run, so repeated
    # spaces do not produce empty "words" and newlines/tabs separate words too.
    unique_words = set(text.split())

    # Longest first; ties are broken alphabetically. Sorting by length alone
    # would leave ties in set iteration order, which changes between
    # interpreter runs because string hashing is randomized.
    ranked = sorted(unique_words, key=lambda word: (-len(word), word))

    summary = " ".join(ranked[:50])
    description = " ".join(ranked[:10])
    return SummarizedContent(summary=summary, description=description)
|
||||||
Loading…
Add table
Reference in a new issue