From a2ff42332ed411512178ddc5f6ed798d5018db74 Mon Sep 17 00:00:00 2001
From: Leon Luithlen
Date: Thu, 28 Nov 2024 11:49:28 +0100
Subject: [PATCH 1/3] DummyLLMAdapter WIP

---
 .../infrastructure/llm/anthropic/adapter.py   | 15 ------
 .../llm/generic_llm_api/adapter.py            | 11 -----
 cognee/infrastructure/llm/llm_interface.py    | 14 +++++-
 cognee/infrastructure/llm/openai/adapter.py   | 12 -----
 .../llm/prompts/generate_cog_layers.txt       | 14 ------
 .../extraction/extract_cognitive_layers.py    | 11 -----
 profiling/util/DummyLLMAdapter.py             | 47 +++++++++++++++++++
 7 files changed, 59 insertions(+), 65 deletions(-)
 delete mode 100644 cognee/infrastructure/llm/prompts/generate_cog_layers.txt
 delete mode 100644 cognee/modules/data/extraction/extract_cognitive_layers.py
 create mode 100644 profiling/util/DummyLLMAdapter.py

diff --git a/cognee/infrastructure/llm/anthropic/adapter.py b/cognee/infrastructure/llm/anthropic/adapter.py
index 8df59e3e5..7577bc12f 100644
--- a/cognee/infrastructure/llm/anthropic/adapter.py
+++ b/cognee/infrastructure/llm/anthropic/adapter.py
@@ -4,7 +4,6 @@ import instructor
 from tenacity import retry, stop_after_attempt
 import anthropic
 from cognee.infrastructure.llm.llm_interface import LLMInterface
-from cognee.infrastructure.llm.prompts import read_query_prompt
 
 
 class AnthropicAdapter(LLMInterface):
@@ -38,17 +37,3 @@ class AnthropicAdapter(LLMInterface):
             }],
             response_model = response_model,
         )
-
-    def show_prompt(self, text_input: str, system_prompt: str) -> str:
-        """Format and display the prompt for a user query."""
-
-        if not text_input:
-            text_input = "No user input provided."
-        if not system_prompt:
-            raise ValueError("No system prompt path provided.")
-
-        system_prompt = read_query_prompt(system_prompt)
-
-        formatted_prompt = f"""System Prompt:\n{system_prompt}\n\nUser Input:\n{text_input}\n""" if system_prompt else None
-
-        return formatted_prompt
diff --git a/cognee/infrastructure/llm/generic_llm_api/adapter.py b/cognee/infrastructure/llm/generic_llm_api/adapter.py
index f65d559d5..5d1436ba0 100644
--- a/cognee/infrastructure/llm/generic_llm_api/adapter.py
+++ b/cognee/infrastructure/llm/generic_llm_api/adapter.py
@@ -6,7 +6,6 @@ import instructor
 from tenacity import retry, stop_after_attempt
 import openai
 from cognee.infrastructure.llm.llm_interface import LLMInterface
-from cognee.infrastructure.llm.prompts import read_query_prompt
 from cognee.shared.data_models import MonitoringTool
 from cognee.base_config import get_base_config
 from cognee.infrastructure.llm.config import get_llm_config
@@ -123,13 +122,3 @@ class GenericAPIAdapter(LLMInterface):
             response_model = response_model,
         )
 
-    def show_prompt(self, text_input: str, system_prompt: str) -> str:
-        """Format and display the prompt for a user query."""
-        if not text_input:
-            text_input = "No user input provided."
-        if not system_prompt:
-            raise ValueError("No system prompt path provided.")
-        system_prompt = read_query_prompt(system_prompt)
-
-        formatted_prompt = f"""System Prompt:\n{system_prompt}\n\nUser Input:\n{text_input}\n""" if system_prompt else None
-        return formatted_prompt
diff --git a/cognee/infrastructure/llm/llm_interface.py b/cognee/infrastructure/llm/llm_interface.py
index f0c6db133..069efb22d 100644
--- a/cognee/infrastructure/llm/llm_interface.py
+++ b/cognee/infrastructure/llm/llm_interface.py
@@ -3,6 +3,8 @@ from typing import Type, Protocol
 from abc import abstractmethod
 from pydantic import BaseModel
 
+from cognee.infrastructure.llm.prompts import read_query_prompt
+
 
 class LLMInterface(Protocol):
     """ LLM Interface """
@@ -16,5 +18,13 @@ class LLMInterface(Protocol):
 
     @abstractmethod
     def show_prompt(self, text_input: str, system_prompt: str) -> str:
-        """To get structured output, import/call this function"""
-        raise NotImplementedError
+        """Format and display the prompt for a user query."""
+        if not text_input:
+            text_input = "No user input provided."
+        if not system_prompt:
+            raise ValueError("No system prompt path provided.")
+        system_prompt = read_query_prompt(system_prompt)
+
+        formatted_prompt = f"""System Prompt:\n{system_prompt}\n\nUser Input:\n{text_input}\n"""
+
+        return formatted_prompt
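Review note: this default implementation replaces the three per-adapter copies of show_prompt that this patch deletes from the Anthropic, generic, and OpenAI adapters. One behavioral difference: the adapter copies returned None when read_query_prompt produced a falsy result (the trailing `if system_prompt else None`), while this version always returns the formatted string. A minimal usage sketch, assuming get_llm_client() returns an adapter implementing LLMInterface; the prompt file name below is hypothetical and must exist wherever read_query_prompt looks up prompts:

    from cognee.infrastructure.llm.get_llm_client import get_llm_client

    llm_client = get_llm_client()
    # show_prompt takes a prompt file name (resolved by read_query_prompt),
    # not the literal system prompt text.
    print(llm_client.show_prompt("What is cognee?", "summarize_content.txt"))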
diff --git a/cognee/infrastructure/llm/openai/adapter.py b/cognee/infrastructure/llm/openai/adapter.py
index 1dc9b70f5..e74cbbd33 100644
--- a/cognee/infrastructure/llm/openai/adapter.py
+++ b/cognee/infrastructure/llm/openai/adapter.py
@@ -8,7 +8,6 @@ import instructor
 from pydantic import BaseModel
 
 from cognee.infrastructure.llm.llm_interface import LLMInterface
-from cognee.infrastructure.llm.prompts import read_query_prompt
 
 class OpenAIAdapter(LLMInterface):
     name = "OpenAI"
@@ -121,14 +120,3 @@ class OpenAIAdapter(LLMInterface):
             max_tokens = 300,
             max_retries = 5,
         )
-
-    def show_prompt(self, text_input: str, system_prompt: str) -> str:
-        """Format and display the prompt for a user query."""
-        if not text_input:
-            text_input = "No user input provided."
-        if not system_prompt:
-            raise ValueError("No system prompt path provided.")
-        system_prompt = read_query_prompt(system_prompt)
-
-        formatted_prompt = f"""System Prompt:\n{system_prompt}\n\nUser Input:\n{text_input}\n""" if system_prompt else None
-        return formatted_prompt
diff --git a/cognee/infrastructure/llm/prompts/generate_cog_layers.txt b/cognee/infrastructure/llm/prompts/generate_cog_layers.txt
deleted file mode 100644
index 925588189..000000000
--- a/cognee/infrastructure/llm/prompts/generate_cog_layers.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-You are tasked with analyzing `{{ data_type }}` files, especially in a multilayer network context for tasks such as analysis, categorization, and feature extraction. Various layers can be incorporated to capture the depth and breadth of information contained within the {{ data_type }}.
-
-These layers can help in understanding the content, context, and characteristics of the `{{ data_type }}`.
-
-Your objective is to extract meaningful layers of information that will contribute to constructing a detailed multilayer network or knowledge graph.
-
-Approach this task by considering the unique characteristics and inherent properties of the data at hand.
-
-VERY IMPORTANT: The context you are working in is `{{ category_name }}` and the specific domain you are extracting data on is `{{ category_name }}`.
-
-Guidelines for Layer Extraction:
-Take into account: The content type, in this case, is: `{{ category_name }}`, should play a major role in how you decompose into layers.
-
-Based on your analysis, define and describe the layers you've identified, explaining their relevance and contribution to understanding the dataset. Your independent identification of layers will enable a nuanced and multifaceted representation of the data, enhancing applications in knowledge discovery, content analysis, and information retrieval.
diff --git a/cognee/modules/data/extraction/extract_cognitive_layers.py b/cognee/modules/data/extraction/extract_cognitive_layers.py
deleted file mode 100644
index 82e9e8a94..000000000
--- a/cognee/modules/data/extraction/extract_cognitive_layers.py
+++ /dev/null
@@ -1,11 +0,0 @@
-from typing import Type, Dict
-from pydantic import BaseModel
-from cognee.infrastructure.llm.prompts import render_prompt
-from cognee.infrastructure.llm.get_llm_client import get_llm_client
-
-async def extract_cognitive_layers(content: str, category: Dict, response_model: Type[BaseModel]):
-    llm_client = get_llm_client()
-
-    system_prompt = render_prompt("generate_cog_layers.txt", category)
-
-    return await llm_client.acreate_structured_output(content, system_prompt, response_model)
diff --git a/profiling/util/DummyLLMAdapter.py b/profiling/util/DummyLLMAdapter.py
new file mode 100644
index 000000000..40698b938
--- /dev/null
+++ b/profiling/util/DummyLLMAdapter.py
@@ -0,0 +1,47 @@
+import spacy
+import textacy
+from typing import Type
+from uuid import uuid4
+from pydantic import BaseModel
+from cognee.infrastructure.llm.llm_interface import LLMInterface
+from cognee.shared.data_models import SummarizedContent
+from cognee.shared.data_models import KnowledgeGraph, Node, Edge
+
+
+class DummyLLMAdapter(LLMInterface):
+    nlp = spacy.load('en_core_web_sm')
+    async def acreate_structured_output(self,
+                                        text_input: str,
+                                        system_prompt: str,
+                                        response_model: Type[BaseModel]) -> BaseModel:
+
+        if isinstance(response_model, SummarizedContent):
+            return(dummy_summarize_content(text_input))
+        elif isinstance(response_model, KnowledgeGraph):
+            return(dummy_extract_knowledge_graph(text_input, nlp))
+        else:
+            raise Exception("Currently dummy acreate_structured_input is only implemented for SummarizedContent and KnowledgeGraph")
+
+
+def dummy_extract_knowledge_graph(text, nlp):
+    doc = nlp(text)
+    triples = list(textacy.extract.subject_verb_object_triples(doc))
+
+    nodes = {}
+    edges = []
+    for triple in triples:
+        source = "_".join([str(e) for e in triple.subject])
+        target = "_".join([str(e) for e in triple.object])
+        nodes[source] = nodes.get(source, Node(id=str(uuid4()), name=source, type="object", description="") )
+        nodes[target] = nodes.get(target, Node(id=str(uuid4()), name=target, type="object", description="") )
+        edge_type = "_".join([str(e) for e in triple.verb])
+        edges.append(Edge(source_node_id=nodes[source].id, target_node_id=nodes[target].id, relationship_name=edge_type))
+    return(KnowledgeGraph(nodes=list(nodes.keys()), edges=edges))
+
+
+def dummy_summarize_content(text):
+    words = [(word, len(word)) for word in set(text.split(" "))]
+    words = sorted(words, key=lambda x: x[1], reverse=True)
+    summary = " ".join([word for word, _ in words[:100]])
+    description = " ".join([word for word, _ in words[:10]])
+    return(SummarizedContent(summary=summary, description=description))
\ No newline at end of file
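Review note: as the WIP subject signals, this first cut has three bugs, all addressed in the next patch: isinstance() is handed the response_model class rather than an instance, so both branches are always False; dummy_extract_knowledge_graph receives the bare name nlp, which is undefined in the method scope, instead of self.nlp; and KnowledgeGraph is built from nodes.keys() (name strings) where Node objects are expected. For comparison, a minimal sketch of dispatching on the model class directly, assuming callers always pass the exact classes:

    # response_model is a class (Type[BaseModel]), so compare classes, not instances.
    if response_model is SummarizedContent:
        return dummy_summarize_content(text_input)
    elif response_model is KnowledgeGraph:
        return dummy_extract_knowledge_graph(text_input, self.nlp)

issubclass(response_model, SummarizedContent) would additionally accept subclasses.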
From 5c9fd44680730be365438131b9224b0d2423ac15 Mon Sep 17 00:00:00 2001
From: Leon Luithlen
Date: Thu, 28 Nov 2024 12:26:01 +0100
Subject: [PATCH 2/3] Fix DummyLLMAdapter

---
 cognee/infrastructure/llm/llm_interface.py |  1 -
 profiling/util/DummyLLMAdapter.py          | 66 ++++++++++++++--------
 2 files changed, 42 insertions(+), 25 deletions(-)

diff --git a/cognee/infrastructure/llm/llm_interface.py b/cognee/infrastructure/llm/llm_interface.py
index 069efb22d..4a5bd79b6 100644
--- a/cognee/infrastructure/llm/llm_interface.py
+++ b/cognee/infrastructure/llm/llm_interface.py
@@ -16,7 +16,6 @@ class LLMInterface(Protocol):
         """To get structured output, import/call this function"""
         raise NotImplementedError
 
-    @abstractmethod
     def show_prompt(self, text_input: str, system_prompt: str) -> str:
         """Format and display the prompt for a user query."""
         if not text_input:
diff --git a/profiling/util/DummyLLMAdapter.py b/profiling/util/DummyLLMAdapter.py
index 40698b938..df81ce123 100644
--- a/profiling/util/DummyLLMAdapter.py
+++ b/profiling/util/DummyLLMAdapter.py
@@ -1,26 +1,34 @@
-import spacy
-import textacy
 from typing import Type
 from uuid import uuid4
+
+import spacy
+import textacy
 from pydantic import BaseModel
+
 from cognee.infrastructure.llm.llm_interface import LLMInterface
-from cognee.shared.data_models import SummarizedContent
-from cognee.shared.data_models import KnowledgeGraph, Node, Edge
+from cognee.shared.data_models import Edge, KnowledgeGraph, Node, SummarizedContent
 
 
 class DummyLLMAdapter(LLMInterface):
-    nlp = spacy.load('en_core_web_sm')
-    async def acreate_structured_output(self,
-                                        text_input: str,
-                                        system_prompt: str,
-                                        response_model: Type[BaseModel]) -> BaseModel:
-
-        if isinstance(response_model, SummarizedContent):
-            return(dummy_summarize_content(text_input))
-        elif isinstance(response_model, KnowledgeGraph):
-            return(dummy_extract_knowledge_graph(text_input, nlp))
+    nlp = spacy.load("en_core_web_sm")
+
+    async def acreate_structured_output(
+        self, text_input: str, system_prompt: str, response_model: Type[BaseModel]
+    ) -> BaseModel:
+
+        if (
+            str(response_model)
+            == "<class 'cognee.shared.data_models.SummarizedContent'>"
+        ):
+            return dummy_summarize_content(text_input)
+        elif (
+            str(response_model) == "<class 'cognee.shared.data_models.KnowledgeGraph'>"
+        ):
+            return dummy_extract_knowledge_graph(text_input, self.nlp)
         else:
-            raise Exception("Currently dummy acreate_structured_input is only implemented for SummarizedContent and KnowledgeGraph")
+            raise Exception(
+                "Currently dummy acreate_structured_input is only implemented for SummarizedContent and KnowledgeGraph"
+            )
 
 
 def dummy_extract_knowledge_graph(text, nlp):
@@ -31,17 +39,27 @@ def dummy_extract_knowledge_graph(text, nlp):
     edges = []
     for triple in triples:
         source = "_".join([str(e) for e in triple.subject])
-        target = "_".join([str(e) for e in triple.object])
-        nodes[source] = nodes.get(source, Node(id=str(uuid4()), name=source, type="object", description="") )
-        nodes[target] = nodes.get(target, Node(id=str(uuid4()), name=target, type="object", description="") )
+        target = "_".join([str(e) for e in triple.object])
+        nodes[source] = nodes.get(
+            source, Node(id=str(uuid4()), name=source, type="object", description="")
+        )
+        nodes[target] = nodes.get(
+            target, Node(id=str(uuid4()), name=target, type="object", description="")
+        )
         edge_type = "_".join([str(e) for e in triple.verb])
-        edges.append(Edge(source_node_id=nodes[source].id, target_node_id=nodes[target].id, relationship_name=edge_type))
-    return(KnowledgeGraph(nodes=list(nodes.keys()), edges=edges))
-
+        edges.append(
+            Edge(
+                source_node_id=nodes[source].id,
+                target_node_id=nodes[target].id,
+                relationship_name=edge_type,
+            )
+        )
+    return KnowledgeGraph(nodes=list(nodes.values()), edges=edges)
+
 
 def dummy_summarize_content(text):
     words = [(word, len(word)) for word in set(text.split(" "))]
     words = sorted(words, key=lambda x: x[1], reverse=True)
-    summary = " ".join([word for word, _ in words[:100]])
-    description = " ".join([word for word, _ in words[:10]])
-    return(SummarizedContent(summary=summary, description=description))
\ No newline at end of file
+    summary = " ".join([word for word, _ in words[:50]])
+    description = " ".join([word for word, _ in words[:10]])
+    return SummarizedContent(summary=summary, description=description)
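Review note: the fix dispatches on str(response_model) against the classes' repr strings, which works when the exact classes are passed but misses subclasses and breaks if the models ever move modules; `response_model is SummarizedContent` would say the same thing more directly. Removing @abstractmethod from show_prompt is presumably what makes DummyLLMAdapter instantiable, since it inherits show_prompt without overriding it and Protocol's ABC metaclass blocks instantiation while abstract methods remain. A quick smoke test, assuming en_core_web_sm is downloaded and profiling/util is importable:

    import asyncio

    from profiling.util.DummyLLMAdapter import DummyLLMAdapter
    from cognee.shared.data_models import KnowledgeGraph

    adapter = DummyLLMAdapter()
    graph = asyncio.run(
        adapter.acreate_structured_output("Cats chase mice.", "", KnowledgeGraph)
    )
    # Expect something like: ['Cats', 'mice'] ['chase']
    print([n.name for n in graph.nodes], [e.relationship_name for e in graph.edges])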
From 3e1949d895f1450a3b8a436eb9491c11e60daad8 Mon Sep 17 00:00:00 2001
From: Leon Luithlen
Date: Thu, 28 Nov 2024 15:42:20 +0100
Subject: [PATCH 3/3] Remove unnecessary nesting in embed_text and add
 DummyEmbeddingEngine

---
 .../embeddings/LiteLLMEmbeddingEngine.py | 29 ++++++++-----------
 profiling/util/DummyEmbeddingEngine.py   |  9 ++++++
 2 files changed, 21 insertions(+), 17 deletions(-)
 create mode 100644 profiling/util/DummyEmbeddingEngine.py

diff --git a/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py b/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py
index de30640e5..edc8eb57f 100644
--- a/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py
+++ b/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py
@@ -28,24 +28,19 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine):
         self.dimensions = dimensions
 
     async def embed_text(self, text: List[str]) -> List[List[float]]:
-        async def get_embedding(text_):
-            try:
-                response = await litellm.aembedding(
-                    self.model,
-                    input = text_,
-                    api_key = self.api_key,
-                    api_base = self.endpoint,
-                    api_version = self.api_version
-                )
-            except litellm.exceptions.BadRequestError as error:
-                logger.error("Error embedding text: %s", str(error))
-                raise error
+        try:
+            response = await litellm.aembedding(
+                self.model,
+                input = text,
+                api_key = self.api_key,
+                api_base = self.endpoint,
+                api_version = self.api_version
+            )
+        except litellm.exceptions.BadRequestError as error:
+            logger.error("Error embedding text: %s", str(error))
+            raise error
 
-            return [data["embedding"] for data in response.data]
-
-        # tasks = [get_embedding(text_) for text_ in text]
-        result = await get_embedding(text)
-        return result
+        return [data["embedding"] for data in response.data]
 
     def get_vector_size(self) -> int:
         return self.dimensions
diff --git a/profiling/util/DummyEmbeddingEngine.py b/profiling/util/DummyEmbeddingEngine.py
new file mode 100644
index 000000000..7f5b3e847
--- /dev/null
+++ b/profiling/util/DummyEmbeddingEngine.py
@@ -0,0 +1,9 @@
+import numpy as np
+from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine
+
+class DummyEmbeddingEngine(EmbeddingEngine):
+    async def embed_text(self, text: list[str]) -> list[list[float]]:
+        return(list(list(np.random.randn(3072))))
+
+    def get_vector_size(self) -> int:
+        return(3072)
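Review note: in DummyEmbeddingEngine, list(list(np.random.randn(3072))) flattens to a single list of 3072 floats, so embed_text returns one vector no matter how many input strings it receives, despite the list[list[float]] annotation. Harmless for pure profiling, but anything that zips embeddings against inputs will misbehave. A sketch of a per-input variant under the same 3072-dimension assumption:

    import numpy as np

    from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine


    class DummyEmbeddingEngine(EmbeddingEngine):
        async def embed_text(self, text: list[str]) -> list[list[float]]:
            # One random 3072-dimensional vector per input string.
            return [list(np.random.randn(3072)) for _ in text]

        def get_vector_size(self) -> int:
            return 3072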