Added graph interface, added neo4j + networkx structure and updates to the notebook

This commit is contained in:
Vasilije 2024-03-06 21:42:50 +01:00
parent 5426f68d2c
commit 2433e4ed93
21 changed files with 618 additions and 21 deletions

View file

@ -4022,7 +4022,13 @@
"metadata": {},
"outputs": [],
"source": [
"\n"
"\n",
"#pre filtering\n",
"# each semantic layer -> make categories, dimensions, on semantic layer given on the LLM\n",
"# weights need to be used topk and cutoff\n",
"# entry through entities\n",
"# combine unstructured and structured\n",
"# address / entrypoint node/ "
]
},
{

View file

@ -1,4 +1,3 @@
import logging
import os
from neo4j import AsyncSession
@ -6,32 +5,23 @@ from neo4j.exceptions import Neo4jError
print(os.getcwd())
import networkx as nx
from langchain.graphs import Neo4jGraph
import os
import openai
import instructor
from openai import OpenAI
from openai import AsyncOpenAI
import pickle
from abc import ABC, abstractmethod
# Adds response_model to ChatCompletion
# Allows the return of Pydantic model rather than raw JSON
from pydantic import BaseModel, Field
from typing import List, Dict, Optional
from ...utils import (
format_dict,
append_uuid_to_variable_names,
create_edge_variable_mapping,
create_node_variable_mapping,
get_unsumarized_vector_db_namespace,
)
from ...llm.queries import generate_summary, generate_graph
from cognitive_architecture.infrastructure.llm.openai.queries import generate_summary, generate_graph
import logging
from neo4j import AsyncGraphDatabase
from contextlib import asynccontextmanager
@ -45,11 +35,8 @@ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
from ...config import Config
from ...shared.data_models import (
Node,
Edge,
KnowledgeGraph,
GraphQLQuery,
MemorySummary,
)
config = Config()

View file

@ -0,0 +1,10 @@
"""Get the LLM client."""
from cognitive_architecture.config import Config
from .openai.adapter import OpenAIAdapter
config = Config()
config.load()
def get_llm_client():
    """Build and return the default LLM client.

    Uses the module-level ``config`` (loaded at import time) to construct
    an :class:`OpenAIAdapter` with the configured API key and model.
    """
    adapter = OpenAIAdapter(config.openai_key, config.model)
    return adapter

View file

@ -0,0 +1,35 @@
""" LLM Interface """
from typing import List, Type, Protocol
from abc import abstractmethod
from pydantic import BaseModel
class LLMInterface(Protocol):
    """Structural interface that every LLM adapter must satisfy.

    Implementations (e.g. the OpenAI adapter) provide rate-limit-aware
    embedding helpers plus structured (Pydantic-validated) completion.
    Removed: commented-out completion stubs and a stray bare-string
    statement that was not a comment.
    """

    @abstractmethod
    async def async_get_embedding_with_backoff(self, text, model="text-embedding-ada-002"):
        """Return the embedding vector for *text* (async, rate-limit aware)."""
        raise NotImplementedError

    @abstractmethod
    def get_embedding_with_backoff(self, text: str, model: str = "text-embedding-ada-002"):
        """Return the embedding vector for *text* (sync, rate-limit aware)."""
        raise NotImplementedError

    @abstractmethod
    async def async_get_batch_embeddings_with_backoff(self, texts: List[str], models: List[str]):
        """Return embeddings for several texts, gathered in parallel."""
        raise NotImplementedError

    # Structured output
    @abstractmethod
    async def acreate_structured_output(self,
                                        text_input: str,
                                        system_prompt_path: str,
                                        response_model: Type[BaseModel]) -> BaseModel:
        """Return a *response_model* instance extracted from *text_input*."""
        raise NotImplementedError

View file

@ -0,0 +1,197 @@
"""Adapter for OpenAI's GPT-3, GPT=4 API."""
import os
import time
import random
import asyncio
from typing import List, Type
import openai
import instructor
from openai import OpenAI,AsyncOpenAI
from pydantic import BaseModel
from cognitive_architecture.config import Config
from cognitive_architecture.utils import read_query_prompt
from ..llm_interface import LLMInterface
#
# config = Config()
# config.load()
# aclient = instructor.apatch(AsyncOpenAI())
# OPENAI_API_KEY = config.openai_key
class OpenAIAdapter(LLMInterface):
    """Adapter for OpenAI's GPT-3 / GPT-4 chat-completion and embedding APIs.

    Wraps the ``openai`` v1 client with exponential-backoff retry helpers
    and implements the methods required by :class:`LLMInterface`.
    """

    def __init__(self, api_key: str, model: str):
        """Configure the OpenAI clients and remember the default chat model.

        :param api_key: OpenAI API key (installed module-wide via ``openai.api_key``).
        :param model: default chat model used when callers do not pass one.
        """
        openai.api_key = api_key
        # instructor.apatch adds `response_model=` support to the async client.
        self.aclient = instructor.apatch(AsyncOpenAI())
        self.model = model

    @staticmethod
    def retry_with_exponential_backoff(
        func,
        initial_delay: float = 1,
        exponential_base: float = 2,
        jitter: bool = True,
        max_retries: int = 20,
        errors: tuple = (openai.RateLimitError,),
    ):
        """Retry a sync function with exponential backoff on the given errors.

        NOTE: used as a decorator inside this class body; directly callable
        staticmethods require Python >= 3.10.
        """
        def wrapper(*args, **kwargs):
            num_retries = 0
            delay = initial_delay
            # Loop until success, max_retries exceeded, or a non-retried error.
            while True:
                try:
                    return func(*args, **kwargs)
                except errors:
                    num_retries += 1
                    if num_retries > max_retries:
                        raise Exception(
                            f"Maximum number of retries ({max_retries}) exceeded."
                        )
                    # Jittered exponential growth of the delay.
                    delay *= exponential_base * (1 + jitter * random.random())
                    time.sleep(delay)
                # Any other exception propagates unchanged.
        return wrapper

    @staticmethod
    def aretry_with_exponential_backoff(
        func,
        initial_delay: float = 1,
        exponential_base: float = 2,
        jitter: bool = True,
        max_retries: int = 20,
        errors: tuple = (openai.RateLimitError,),
    ):
        """Retry an async function with exponential backoff on the given errors.

        FIX: this was previously declared ``async def``; applying it as a
        decorator therefore produced a coroutine object instead of a wrapper,
        which made every decorated coroutine method uncallable. It is now a
        plain function that returns an async wrapper.
        """
        async def wrapper(*args, **kwargs):
            num_retries = 0
            delay = initial_delay
            while True:
                try:
                    return await func(*args, **kwargs)
                except errors as e:
                    print(f"acreate (backoff): caught error: {e}")
                    num_retries += 1
                    if num_retries > max_retries:
                        raise Exception(
                            f"Maximum number of retries ({max_retries}) exceeded."
                        )
                    delay *= exponential_base * (1 + jitter * random.random())
                    await asyncio.sleep(delay)
                # Any other exception propagates unchanged.
        return wrapper

    @retry_with_exponential_backoff
    def completions_with_backoff(self, **kwargs):
        """Wrapper around chat.completions.create w/ backoff."""
        return openai.chat.completions.create(**kwargs)

    @aretry_with_exponential_backoff
    async def acompletions_with_backoff(self, **kwargs):
        """Async wrapper around chat completions w/ backoff.

        FIX: openai v1 has no ``chat.completions.acreate``; async calls go
        through an ``AsyncOpenAI`` client instead.
        """
        client = openai.AsyncOpenAI(
            api_key=os.environ.get("OPENAI_API_KEY"),
        )
        return await client.chat.completions.create(**kwargs)

    @aretry_with_exponential_backoff
    async def acreate_embedding_with_backoff(self, **kwargs):
        """Async wrapper around embeddings.create w/ backoff."""
        client = openai.AsyncOpenAI(
            # This is the default and can be omitted
            api_key=os.environ.get("OPENAI_API_KEY"),
        )
        return await client.embeddings.create(**kwargs)

    async def async_get_embedding_with_backoff(self, text, model="text-embedding-ada-002"):
        """To get text embeddings, import/call this function.

        It specifies defaults + handles rate-limiting + is async.
        """
        # Newlines degrade embedding quality per OpenAI guidance; flatten them.
        text = text.replace("\n", " ")
        response = await self.acreate_embedding_with_backoff(input=[text], model=model)
        embedding = response.data[0].embedding
        return embedding

    @retry_with_exponential_backoff
    def create_embedding_with_backoff(self, **kwargs):
        """Wrapper around embeddings.create w/ backoff."""
        return openai.embeddings.create(**kwargs)

    def get_embedding_with_backoff(self, text: str, model: str = "text-embedding-ada-002"):
        """To get text embeddings, import/call this function.

        It specifies defaults + handles rate-limiting.

        FIX: the method was named ``get_embedding_with_backoffself`` (typo fused
        "self" into the name), which broke the LLMInterface contract.

        :param text: str
        :param model: str
        """
        text = text.replace("\n", " ")
        response = self.create_embedding_with_backoff(input=[text], model=model)
        embedding = response.data[0].embedding
        return embedding

    async def async_get_batch_embeddings_with_backoff(self, texts: List[str], models: List[str]):
        """To get multiple text embeddings in parallel, import/call this function.

        It specifies defaults + handles rate-limiting + is async.
        """
        # Create a generator of coroutines, one per (text, model) pair.
        coroutines = (self.async_get_embedding_with_backoff(text, model)
                      for text, model in zip(texts, models))
        # Run the coroutines in parallel and gather the results.
        embeddings = await asyncio.gather(*coroutines)
        return embeddings

    async def acreate_structured_output(self, text_input: str, system_prompt_path: str,
                                        response_model: Type[BaseModel],
                                        model: str = None) -> BaseModel:
        """Generate a Pydantic-validated response from a user query.

        :param text_input: raw text to extract information from.
        :param system_prompt_path: path to the system prompt file.
        :param response_model: Pydantic model class for the structured result.
        :param model: chat model override; defaults to the adapter's configured model.

        FIX: the instructor-patched call is now awaited (previously the
        un-awaited coroutine was returned), and ``model`` is optional so the
        signature is compatible with ``LLMInterface.acreate_structured_output``.
        """
        system_prompt = read_query_prompt(system_prompt_path)
        return await self.aclient.chat.completions.create(
            model=model or self.model,
            messages=[
                {
                    "role": "user",
                    "content": f"""Use the given format to
                    extract information from the following input: {text_input}. """,
                },
                {"role": "system", "content": system_prompt},
            ],
            response_model=response_model,
        )

View file

@ -158,7 +158,7 @@ def get_embedding_with_backoff(text:str, model:str="text-embedding-ada-002"):
async def async_get_multiple_embeddings_with_backoff(texts: List[str], models: List[str]) :
async def async_get_batch_embeddings_with_backoff(texts: List[str], models: List[str]) :
"""To get multiple text embeddings in parallel, import/call this function
It specifies defaults + handles rate-limiting + is async"""
# Create a generator of coroutines

View file

@ -3,8 +3,8 @@ import os
import instructor
from openai import OpenAI
import logging
from ..shared.data_models import KnowledgeGraph, MemorySummary
from ..config import Config
from cognitive_architecture.shared.data_models import KnowledgeGraph, MemorySummary
from cognitive_architecture.config import Config

View file

@ -0,0 +1,175 @@
{
"Natural Language Text": {
"type": "TEXT",
"subclass": [
"Articles, essays, and reports",
"Books and manuscripts",
"News stories and blog posts",
"Research papers and academic publications",
"Social media posts and comments",
"Website content and product descriptions",
"Personal narratives and stories"
]
},
"Structured Documents": {
"type": "TEXT",
"subclass": [
"Spreadsheets and tables",
"Forms and surveys",
"Databases and CSV files"
]
},
"Code and Scripts": {
"type": "TEXT",
"subclass": [
"Source code in various programming languages",
"Shell commands and scripts",
"Markup languages (HTML, XML)",
"Stylesheets (CSS) and configuration files (YAML, JSON, INI)"
]
},
"Conversational Data": {
"type": "TEXT",
"subclass": [
"Chat transcripts and messaging history",
"Customer service logs and interactions",
"Conversational AI training data"
]
},
"Educational Content": {
"type": "TEXT",
"subclass": [
"Textbook content and lecture notes",
"Exam questions and academic exercises",
"E-learning course materials"
]
},
"Creative Writing": {
"type": "TEXT",
"subclass": [
"Poetry and prose",
"Scripts for plays, movies, and television",
"Song lyrics"
]
},
"Technical Documentation": {
"type": "TEXT",
"subclass": [
"Manuals and user guides",
"Technical specifications and API documentation",
"Helpdesk articles and FAQs"
]
},
"Legal and Regulatory Documents": {
"type": "TEXT",
"subclass": [
"Contracts and agreements",
"Laws, regulations, and legal case documents",
"Policy documents and compliance materials"
]
},
"Medical and Scientific Texts": {
"type": "TEXT",
"subclass": [
"Clinical trial reports",
"Patient records and case notes",
"Scientific journal articles"
]
},
"Financial and Business Documents": {
"type": "TEXT",
"subclass": [
"Financial reports and statements",
"Business plans and proposals",
"Market research and analysis reports"
]
},
"Advertising and Marketing Materials": {
"type": "TEXT",
"subclass": [
"Ad copies and marketing slogans",
"Product catalogs and brochures",
"Press releases and promotional content"
]
},
"Emails and Correspondence": {
"type": "TEXT",
"subclass": [
"Professional and formal correspondence",
"Personal emails and letters"
]
},
"Metadata and Annotations": {
"type": "TEXT",
"subclass": [
"Image and video captions",
"Annotations and metadata for various media"
]
},
"Language Learning Materials": {
"type": "TEXT",
"subclass": [
"Vocabulary lists and grammar rules",
"Language exercises and quizzes"
]
},
"Audio Content": {
"type": "AUDIO",
"subclass": [
"Music tracks and albums",
"Podcasts and radio broadcasts",
"Audiobooks and audio guides",
"Recorded interviews and speeches",
"Sound effects and ambient sounds"
]
},
"Image Content": {
"type": "IMAGE",
"subclass": [
"Photographs and digital images",
"Illustrations, diagrams, and charts",
"Infographics and visual data representations",
"Artwork and paintings",
"Screenshots and graphical user interfaces"
]
},
"Video Content": {
"type": "VIDEO",
"subclass": [
"Movies and short films",
"Documentaries and educational videos",
"Video tutorials and how-to guides",
"Animated features and cartoons",
"Live event recordings and sports broadcasts"
]
},
"Multimedia Content": {
"type": "MULTIMEDIA",
"subclass": [
"Interactive web content and games",
"Virtual reality (VR) and augmented reality (AR) experiences",
"Mixed media presentations and slide decks",
"E-learning modules with integrated multimedia",
"Digital exhibitions and virtual tours"
]
},
"3D Models and CAD Content": {
"type": "3D_MODEL",
"subclass": [
"Architectural renderings and building plans",
"Product design models and prototypes",
"3D animations and character models",
"Scientific simulations and visualizations",
"Virtual objects for AR/VR environments"
]
},
"Procedural Content": {
"type": "PROCEDURAL",
"subclass": [
"Tutorials and step-by-step guides",
"Workflow and process descriptions",
"Simulation and training exercises",
"Recipes and crafting instructions"
]
}
}

View file

@ -0,0 +1,8 @@
You are tasked with analyzing {{data_type}} files, especially in a multilayer network context, for tasks such as analysis, categorization, and feature extraction. Various layers can be incorporated to capture the depth and breadth of information contained within the {{data_type}}.
These layers can help in understanding the content, context, and characteristics of the {{data_type}}
Your objective is to extract meaningful layers of information that will contribute to constructing a detailed multilayer network or knowledge graph.
Approach this task by considering the unique characteristics and inherent properties of the data at hand.
VERY IMPORTANT: The context you are working in is {required_layers.dict()['name']} and the specific domain you are extracting data on is {{layer_name}}.
Guidelines for Layer Extraction:
Take into account that the content type, which in this case is {{layer_name}}, should play a major role in how you decompose the data into layers.
Based on your analysis, define and describe the layers you've identified, explaining their relevance and contribution to understanding the dataset. Your independent identification of layers will enable a nuanced and multifaceted representation of the data, enhancing applications in knowledge discovery, content analysis, and information retrieval.

View file

@ -1,8 +1,10 @@
You are a top-tier algorithm
designed for extracting information in structured formats to build a knowledge graph.
- **Nodes** represent entities and concepts. They're akin to Wikipedia nodes.
- **Edges** represent relationships between concepts. They're akin to Wikipedia links.
- The aim is to achieve simplicity and clarity in the
knowledge graph, making it accessible for a vast audience.
YOU ARE ONLY EXTRACTING DATA FOR COGNITIVE LAYER {{layer}}
## 2. Labeling Nodes
- **Consistency**: Ensure you use basic or elementary types for node labels.
- For example, when you identify an entity representing a person,
@ -31,4 +33,4 @@ always use the most complete identifier for that entity throughout the knowledge
Remember, the knowledge graph should be coherent and easily understandable,
so maintaining consistency in entity references is crucial.
## 5. Strict Compliance
Adhere to the rules strictly. Non-compliance will result in termination
Adhere to the rules strictly. Non-compliance will result in termination

View file

@ -0,0 +1 @@
from .create_vector_memory import create_vector_memory

View file

@ -0,0 +1 @@
from .create_vector_memory import create_vector_memory

View file

@ -0,0 +1,13 @@
from cognitive_architecture.infrastructure.llm.get_llm_client import get_llm_client
async def content_to_cog_layers(memory_name: str, payload: list):
    """Request cognitive-layer structured output from the configured LLM client.

    NOTE(review): this call does not match OpenAIAdapter.acreate_structured_output,
    whose parameters are (text_input, system_prompt_path, response_model, model):
    *memory_name* is passed as the text input, *payload* (a list) as the system
    prompt path, the required response_model argument is missing, and
    "text-embedding-ada-002" is an embedding model, not a chat model — this
    will fail at runtime. Confirm the intended arguments.
    """
    llm_client = get_llm_client()
    # data_points = list()
    # for point in map(create_data_point, payload):
    #     data_points.append(await point)
    return await llm_client.acreate_structured_output(memory_name, payload, model="text-embedding-ada-002")

View file

@ -0,0 +1,15 @@
from cognitive_architecture.infrastructure.llm.get_llm_client import get_llm_client
async def content_to_cog_layers(memory_name: str, payload: list):
    """Request cognitive-layer structured output from the configured LLM client.

    NOTE(review): this file duplicates another module in this commit with the
    same function; consider keeping one copy. The call also does not match
    OpenAIAdapter.acreate_structured_output (text_input, system_prompt_path,
    response_model, model): *payload* (a list) is passed as the system prompt
    path, response_model is missing, and "text-embedding-ada-002" is an
    embedding model, not a chat model — this will fail at runtime.
    """
    llm_client = get_llm_client()
    # data_points = list()
    # for point in map(create_data_point, payload):
    #     data_points.append(await point)
    return await llm_client.acreate_structured_output(memory_name, payload, model="text-embedding-ada-002")

View file

@ -0,0 +1,9 @@
""" Content to Propositions"""
from cognitive_architecture.infrastructure.llm.get_llm_client import get_llm_client
async def generate_graph(memory_name: str, payload: str):
    """Generate a knowledge graph from *payload* using the graph-generation prompt.

    NOTE(review): OpenAIAdapter does not define a ``generate_graph`` method, so
    this call will raise AttributeError at runtime; presumably it should go
    through ``acreate_structured_output`` with a KnowledgeGraph response model
    and this prompt path — confirm against the adapter's API.
    """
    doc_path = "cognitive_architecture/infrastructure/llm/prompts/generate_graph_prompt.txt"
    llm_client = get_llm_client()
    return await llm_client.generate_graph(memory_name, doc_path=doc_path,payload= payload)

View file

@ -2,7 +2,7 @@ import uuid
from typing import List
from qdrant_client.models import PointStruct
from cognitive_architecture.infrastructure.databases.vector.get_vector_database import get_vector_database
from cognitive_architecture.openai_tools import async_get_embedding_with_backoff
from cognitive_architecture.infrastructure.llm.openai.openai_tools import async_get_embedding_with_backoff
async def create_information_points(memory_name: str, payload: List[str]):
vector_db = get_vector_database()

View file

@ -1,5 +1,6 @@
"""Data models for the cognitive architecture."""
from typing import Optional, List
from enum import Enum
from typing import Optional, List, Union
from pydantic import BaseModel, Field
@ -39,3 +40,129 @@ class MemorySummary(BaseModel):
""" Memory summary. """
nodes: List[Node] = Field(..., default_factory=list)
edges: List[Edge] = Field(..., default_factory=list)
class TextSubclass(str, Enum):
    """Sub-categories of TEXT content (mirrors the data-type taxonomy JSON)."""
    ARTICLES = "Articles, essays, and reports"
    BOOKS = "Books and manuscripts"
    NEWS_STORIES = "News stories and blog posts"
    RESEARCH_PAPERS = "Research papers and academic publications"
    SOCIAL_MEDIA = "Social media posts and comments"
    WEBSITE_CONTENT = "Website content and product descriptions"
    PERSONAL_NARRATIVES = "Personal narratives and stories"
    SPREADSHEETS = "Spreadsheets and tables"
    FORMS = "Forms and surveys"
    DATABASES = "Databases and CSV files"
    SOURCE_CODE = "Source code in various programming languages"
    SHELL_SCRIPTS = "Shell commands and scripts"
    MARKUP_LANGUAGES = "Markup languages (HTML, XML)"
    STYLESHEETS = "Stylesheets (CSS) and configuration files (YAML, JSON, INI)"
    CHAT_TRANSCRIPTS = "Chat transcripts and messaging history"
    CUSTOMER_SERVICE_LOGS = "Customer service logs and interactions"
    CONVERSATIONAL_AI = "Conversational AI training data"
    TEXTBOOK_CONTENT = "Textbook content and lecture notes"
    EXAM_QUESTIONS = "Exam questions and academic exercises"
    E_LEARNING_MATERIALS = "E-learning course materials"
    POETRY = "Poetry and prose"
    SCRIPTS = "Scripts for plays, movies, and television"
    SONG_LYRICS = "Song lyrics"
    MANUALS = "Manuals and user guides"
    TECH_SPECS = "Technical specifications and API documentation"
    HELPDESK_ARTICLES = "Helpdesk articles and FAQs"
    LEGAL_CONTRACTS = "Contracts and agreements"
    LAWS = "Laws, regulations, and legal case documents"
    POLICY_DOCUMENTS = "Policy documents and compliance materials"
    CLINICAL_TRIALS = "Clinical trial reports"
    PATIENT_RECORDS = "Patient records and case notes"
    SCIENTIFIC_ARTICLES = "Scientific journal articles"
    FINANCIAL_REPORTS = "Financial reports and statements"
    BUSINESS_PLANS = "Business plans and proposals"
    MARKET_RESEARCH = "Market research and analysis reports"
    AD_COPIES = "Ad copies and marketing slogans"
    PRODUCT_CATALOGS = "Product catalogs and brochures"
    PRESS_RELEASES = "Press releases and promotional content"
    PROFESSIONAL_EMAILS = "Professional and formal correspondence"
    PERSONAL_EMAILS = "Personal emails and letters"
    IMAGE_CAPTIONS = "Image and video captions"
    ANNOTATIONS = "Annotations and metadata for various media"
    VOCAB_LISTS = "Vocabulary lists and grammar rules"
    LANGUAGE_EXERCISES = "Language exercises and quizzes"
class AudioSubclass(str, Enum):
    """Sub-categories of AUDIO content (mirrors the data-type taxonomy JSON)."""
    MUSIC_TRACKS = "Music tracks and albums"
    PODCASTS = "Podcasts and radio broadcasts"
    AUDIOBOOKS = "Audiobooks and audio guides"
    INTERVIEWS = "Recorded interviews and speeches"
    SOUND_EFFECTS = "Sound effects and ambient sounds"
class ImageSubclass(str, Enum):
    """Sub-categories of IMAGE content (mirrors the data-type taxonomy JSON)."""
    PHOTOGRAPHS = "Photographs and digital images"
    ILLUSTRATIONS = "Illustrations, diagrams, and charts"
    INFOGRAPHICS = "Infographics and visual data representations"
    ARTWORK = "Artwork and paintings"
    SCREENSHOTS = "Screenshots and graphical user interfaces"
class VideoSubclass(str, Enum):
    """Sub-categories of VIDEO content (mirrors the data-type taxonomy JSON)."""
    MOVIES = "Movies and short films"
    DOCUMENTARIES = "Documentaries and educational videos"
    TUTORIALS = "Video tutorials and how-to guides"
    ANIMATED_FEATURES = "Animated features and cartoons"
    LIVE_EVENTS = "Live event recordings and sports broadcasts"
class MultimediaSubclass(str, Enum):
    """Sub-categories of MULTIMEDIA content (mirrors the data-type taxonomy JSON)."""
    WEB_CONTENT = "Interactive web content and games"
    VR_EXPERIENCES = "Virtual reality (VR) and augmented reality (AR) experiences"
    MIXED_MEDIA = "Mixed media presentations and slide decks"
    E_LEARNING_MODULES = "E-learning modules with integrated multimedia"
    DIGITAL_EXHIBITIONS = "Digital exhibitions and virtual tours"
class Model3DSubclass(str, Enum):
    """Sub-categories of 3D_MODEL content (mirrors the data-type taxonomy JSON)."""
    ARCHITECTURAL_RENDERINGS = "Architectural renderings and building plans"
    PRODUCT_MODELS = "Product design models and prototypes"
    ANIMATIONS = "3D animations and character models"
    SCIENTIFIC_VISUALIZATIONS = "Scientific simulations and visualizations"
    # FIX: aligned with the taxonomy JSON shipped in this commit, which reads
    # "Virtual objects for AR/VR environments"; the enum previously said
    # "...AR/VR applications" and would never match the taxonomy string.
    VR_OBJECTS = "Virtual objects for AR/VR environments"
class ProceduralSubclass(str, Enum):
    """Sub-categories of PROCEDURAL content (mirrors the data-type taxonomy JSON)."""
    TUTORIALS_GUIDES = "Tutorials and step-by-step guides"
    WORKFLOW_DESCRIPTIONS = "Workflow and process descriptions"
    SIMULATIONS = "Simulation and training exercises"
    RECIPES = "Recipes and crafting instructions"
class ContentType(BaseModel):
    """Base class for different types of content."""
    # Discriminator string, e.g. "TEXT" or "AUDIO"; each subclass pins a default.
    type: str
class TextContent(ContentType):
    """TEXT content with its subclass labels."""
    # Annotation added: pydantic v2 rejects non-annotated field overrides.
    type: str = "TEXT"
    subclass: List[TextSubclass]
class AudioContent(ContentType):
    """AUDIO content with its subclass labels."""
    # Annotation added: pydantic v2 rejects non-annotated field overrides.
    type: str = "AUDIO"
    subclass: List[AudioSubclass]
class ImageContent(ContentType):
    """IMAGE content with its subclass labels."""
    # Annotation added: pydantic v2 rejects non-annotated field overrides.
    type: str = "IMAGE"
    subclass: List[ImageSubclass]
class VideoContent(ContentType):
    """VIDEO content with its subclass labels."""
    # Annotation added: pydantic v2 rejects non-annotated field overrides.
    type: str = "VIDEO"
    subclass: List[VideoSubclass]
class MultimediaContent(ContentType):
    """MULTIMEDIA content with its subclass labels."""
    # Annotation added: pydantic v2 rejects non-annotated field overrides.
    type: str = "MULTIMEDIA"
    subclass: List[MultimediaSubclass]
class Model3DContent(ContentType):
    """3D_MODEL content with its subclass labels."""
    # Annotation added: pydantic v2 rejects non-annotated field overrides.
    type: str = "3D_MODEL"
    subclass: List[Model3DSubclass]
class ProceduralContent(ContentType):
    """PROCEDURAL content with its subclass labels."""
    # Annotation added: pydantic v2 rejects non-annotated field overrides.
    type: str = "PROCEDURAL"
    subclass: List[ProceduralSubclass]
class SinglePrediction(BaseModel):
    """Class for a single class label prediction."""
    # NOTE(review): plain Union — pydantic tries members left-to-right; a
    # discriminated union on `type` would make validation unambiguous. Confirm.
    label: Union[TextContent, AudioContent, ImageContent, VideoContent, MultimediaContent, Model3DContent, ProceduralContent]

View file

@ -4,6 +4,7 @@ import os
import random
import string
import uuid
from pathlib import Path
from graphviz import Digraph
from sqlalchemy import or_
@ -284,3 +285,13 @@ async def get_memory_name_by_doc_id(session: AsyncSession, docs_id: str):
return None
def read_query_prompt(filename: str) -> str:
    """Read a query prompt from a file.

    :param filename: path to the prompt file.
    :return: the file's text content, or None if it could not be read.
    """
    file_path = Path(filename)
    try:
        return file_path.read_text()
    except FileNotFoundError:
        # Missing prompt files are logged, not fatal; callers receive None.
        logging.error("File not found: %s", file_path.absolute())
    except Exception as e:
        # Best-effort: log unexpected read errors (permissions, encoding)
        # lazily via %-args instead of eager f-strings.
        logging.error(
            "An error of type %s occurred while reading file: %s. Error message: %s",
            type(e).__name__, file_path.absolute(), e,
        )
    return None