From 05aaee69b3c88662080b6c10fc14924ef2e9d4c9 Mon Sep 17 00:00:00 2001
From: Vasilije <8619304+Vasilije1990@users.noreply.github.com>
Date: Sat, 16 Dec 2023 15:15:30 +0100
Subject: [PATCH] Build the docker and push

---
 level_4/api.py                                  |   7 +-
 level_4/bin/dockerize                           |  36 +++++
 .../database/vectordb/vectordb.py               |  10 +-
 level_4/cognitive_architecture/utils.py         |  50 ++++---
 level_4/main.py                                 | 137 +++++++++---------
 5 files changed, 146 insertions(+), 94 deletions(-)
 create mode 100755 level_4/bin/dockerize

diff --git a/level_4/api.py b/level_4/api.py
index bb2e06d97..051507c98 100644
--- a/level_4/api.py
+++ b/level_4/api.py
@@ -115,10 +115,15 @@ async def user_query_to_graph(payload: Payload):
 
 @app.post("/document-to-graph-db")
 async def document_to_graph_db(payload: Payload):
+    logging.info("Adding documents to graph db")
     try:
         decoded_payload = payload.payload
+        if 'settings' in decoded_payload and decoded_payload['settings'] is not None:
+            settings_for_loader = decoded_payload['settings']
+        else:
+            settings_for_loader = None
         async with session_scope(session=AsyncSessionLocal()) as session:
-            result = await add_documents_to_graph_db(session =session, user_id = decoded_payload['user_id'], loader_settings =decoded_payload['settings'])
+            result = await add_documents_to_graph_db(session =session, user_id = decoded_payload['user_id'], loader_settings =settings_for_loader)
         return result
 
     except Exception as e:
diff --git a/level_4/bin/dockerize b/level_4/bin/dockerize
new file mode 100755
index 000000000..30cfede3c
--- /dev/null
+++ b/level_4/bin/dockerize
@@ -0,0 +1,36 @@
+set -euo pipefail
+
+AWS_REGION=${region:-eu-west-1}
+AWS_DEPLOYMENT_ACCOUNT=${account:-463722570299}
+AWS_REPOSITORY=${repo:-"${AWS_DEPLOYMENT_ACCOUNT}.dkr.ecr.${AWS_REGION}.amazonaws.com"}
+
+STAGE=${stage:-"dev"}
+SHA_SHORT="$(git rev-parse --short HEAD)"
+CUR_DATE="$(date +%Y%m%d%H%M%S)"
+VERSION="$STAGE-$CUR_DATE-$SHA_SHORT"
+IMAGE_NAME=${image_name:-promethai-${STAGE}-promethai-backend}
+
+REPO_NAME="${AWS_REPOSITORY}/${IMAGE_NAME}"
+FULL_IMAGE_NAME="${REPO_NAME}:${VERSION}"
+APP_DIR=${app_dir:-"."}
+
+PUBLISH=${publish:-false}
+
+echo "Building docker image ${FULL_IMAGE_NAME} located in dir ${APP_DIR}"
+
+pushd "${APP_DIR}" &&
+  docker buildx build --platform linux/amd64 \
+    --build-arg STAGE=${STAGE} \
+    -t "${FULL_IMAGE_NAME}" . &&
+  echo "${VERSION}" >/tmp/.DOCKER_IMAGE_VERSION &&
+  echo "Successfully built docker image ${FULL_IMAGE_NAME}"
+
+if [ "${PUBLISH}" = true ]; then
+  echo "Pushing docker image ${FULL_IMAGE_NAME} to ECR repository in AWS account ${AWS_DEPLOYMENT_ACCOUNT}"
+  if [ "${PUBLISH}" = true ]; then
+    echo "logging in"
+    aws ecr get-login-password --region "${AWS_REGION}" | docker login --username AWS --password-stdin "${AWS_REPOSITORY}"
+  fi
+  docker push "${FULL_IMAGE_NAME}" &&
+    echo "Successfully pushed docker image ${FULL_IMAGE_NAME} to ECR repository"
+fi
\ No newline at end of file
diff --git a/level_4/cognitive_architecture/database/vectordb/vectordb.py b/level_4/cognitive_architecture/database/vectordb/vectordb.py
index 54708346b..4b988d223 100644
--- a/level_4/cognitive_architecture/database/vectordb/vectordb.py
+++ b/level_4/cognitive_architecture/database/vectordb/vectordb.py
@@ -151,7 +151,7 @@ class WeaviateVectorDB(VectorDB):
         # Update Weaviate memories here
         if namespace is None:
             namespace = self.namespace
-        retriever = self.init_weaviate(embeddings=embeddings,namespace = namespace, retriever_type="single_document_context")
+        retriever = self.init_weaviate(embeddings=OpenAIEmbeddings(),namespace = namespace, retriever_type="single_document_context")
         if loader_settings:
             # Assuming _document_loader returns a list of documents
             documents = await _document_loader(observation, loader_settings)
@@ -167,15 +167,19 @@ class WeaviateVectorDB(VectorDB):
                         Document(metadata=params, page_content=doc.page_content)])
         else:
             chunk_count = 0
-            documents = await _document_loader(observation, loader_settings)
+            from cognitive_architecture.database.vectordb.chunkers.chunkers import chunk_data
+            documents = [chunk_data(chunk_strategy="VANILLA", source_data=observation, chunk_size=50,
+                                    chunk_overlap=20)]
 
             for doc in documents[0]:
                 chunk_count += 1
                 params['chunk_order'] = chunk_count
 
                 # document_to_load = self._stuct(observation, params, metadata_schema_class)
+                logging.info("Loading document with default loader settings %s", str(doc))
 
+                # logging.info("Loading document with default loader settings %s", str(document_to_load))
                 retriever.add_documents([
-                    Document(metadata=params, page_content=doc)])
+                    Document(metadata=params, page_content=doc.page_content)])
 
     async def fetch_memories(self, observation: str, namespace: str = None, search_type: str = 'hybrid', **kwargs):
         """
diff --git a/level_4/cognitive_architecture/utils.py b/level_4/cognitive_architecture/utils.py
index b2dc63536..5617ef291 100644
--- a/level_4/cognitive_architecture/utils.py
+++ b/level_4/cognitive_architecture/utils.py
@@ -4,6 +4,7 @@ import string
 import uuid
 
 from graphviz import Digraph
+from sqlalchemy import or_
 from sqlalchemy.orm import contains_eager
 
 
@@ -194,34 +195,37 @@ async def get_unsumarized_vector_db_namespace(session: AsyncSession, user_id: st
     Example Usage:
 
     """
-    try:
-        result = await session.execute(
-            select(Operation)
-            .join(Operation.docs)  # Explicit join with docs table
-            .join(Operation.memories)  # Explicit join with memories table
-            .options(
-                contains_eager(Operation.docs),  # Informs ORM of the join for docs
-                contains_eager(Operation.memories)  # Informs ORM of the join for memories
-            )
-            .where(
-                (Operation.user_id == user_id) &  # Filter by user_id
-                (Operation.docs.graph_summary == False)  # Filter by user_id
-            )
-            .order_by(Operation.created_at.desc())  # Order by creation date
+    # try:
+    result = await session.execute(
+        select(Operation)
+        .join(Operation.docs)  # Explicit join with docs table
+        .join(Operation.memories)  # Explicit join with memories table
+        .options(
+            contains_eager(Operation.docs),  # Informs ORM of the join for docs
+            contains_eager(Operation.memories)  # Informs ORM of the join for memories
         )
+        .where(
+            (Operation.user_id == user_id) &  # Filter by user_id
+            or_(
+                DocsModel.graph_summary == False,  # Condition 1: graph_summary is False
+                DocsModel.graph_summary == None  # Condition 2: graph_summary is None
+            )
+        )
+        .order_by(Operation.created_at.desc())  # Order by creation date
+    )
 
-        operations = result.unique().scalars().all()
+    operations = result.unique().scalars().all()
 
-        # Extract memory names and document names and IDs
-        memory_names = [memory.memory_name for op in operations for memory in op.memories]
-        docs = [(doc.doc_name, doc.id) for op in operations for doc in op.docs]
+    # Extract memory names and document names and IDs
+    memory_names = [memory.memory_name for op in operations for memory in op.memories]
+    docs = [(doc.doc_name, doc.id) for op in operations for doc in op.docs]
 
-        return memory_names, docs
+    return memory_names, docs
 
-    except Exception as e:
-        # Handle the exception as needed
-        print(f"An error occurred: {e}")
-        return None
+    # except Exception as e:
+    #     # Handle the exception as needed
+    #     print(f"An error occurred: {e}")
+    #     return None
 async def get_memory_name_by_doc_id(session: AsyncSession, docs_id: str):
     """
     Asynchronously retrieves memory names associated with a specific document ID.
diff --git a/level_4/main.py b/level_4/main.py
index 655613bb1..ac0492f95 100644
--- a/level_4/main.py
+++ b/level_4/main.py
@@ -100,6 +100,7 @@ async def load_documents_to_vectorstore(session: AsyncSession, user_id: str, con
             DocsModel(
                 id=str(uuid.uuid4()),
                 operation_id=job_id,
+                graph_summary= False,
                 doc_name=doc
             )
         )
@@ -139,7 +140,7 @@ async def load_documents_to_vectorstore(session: AsyncSession, user_id: str, con
         observation=content, params=params, loader_settings=loader_settings)
     await update_entity(session, Operation, job_id, "SUCCESS")
 
-    return result, namespace_id
+    # return result, namespace_id
 
 
 async def user_query_to_graph_db(session: AsyncSession, user_id: str, query_input: str):
@@ -175,74 +176,76 @@ async def user_query_to_graph_db(session: AsyncSession, user_id: str, query_inpu
 
 
 async def add_documents_to_graph_db(session: AsyncSession, user_id: str= None, loader_settings:dict=None, stupid_local_testing_flag=False): #clean this up Vasilije, don't be sloppy
     """"""
-    try:
-        # await update_document_vectordb_namespace(postgres_session, user_id)
-        memory_names, docs = await get_unsumarized_vector_db_namespace(session, user_id)
-        logging.info("Memory names are", memory_names)
-        logging.info("Docs are", docs)
-        for doc, memory_name in zip(docs, memory_names):
-            doc_name, doc_id = doc
-            if stupid_local_testing_flag:
-                classification = [{
-                    "DocumentCategory": "Literature",
-                    "Title": "Bartleby, the Scrivener",
-                    "Summary": "The document is a narrative about an enigmatic copyist named Bartleby who works in a law office. Despite initially being a diligent employee, Bartleby begins to refuse tasks with the phrase 'I would prefer not to' and eventually stops working altogether. His passive resistance and mysterious behavior confound the narrator, who is also his employer. Bartleby's refusal to leave the office leads to various complications, and he is eventually taken to the Tombs as a vagrant. The story ends with Bartleby's death and the revelation that he may have previously worked in the Dead Letter Office, which adds a layer of poignancy to his character.",
-                    "d_id": "2a5c571f-bad6-4649-a4ac-36e4bb4f34cd"
-                },
-                    {
-                        "DocumentCategory": "Science",
-                        "Title": "The Mysterious World of Quantum Mechanics",
-                        "Summary": "This article delves into the fundamentals of quantum mechanics, exploring its paradoxical nature where particles can exist in multiple states simultaneously. It discusses key experiments and theories that have shaped our understanding of the quantum world, such as the double-slit experiment, Schrödinger's cat, and quantum entanglement. The piece also touches upon the implications of quantum mechanics for future technology, including quantum computing and cryptography.",
-                        "d_id": "f4e2c3b1-4567-8910-11a2-b3c4d5e6f7g8"
-                    },
-                    {
-                        "DocumentCategory": "History",
-                        "Title": "The Rise and Fall of the Roman Empire",
-                        "Summary": "This essay provides an overview of the Roman Empire's history, from its foundation to its eventual decline. It examines the political, social, and economic factors that contributed to the empire's expansion and success, as well as those that led to its downfall. Key events and figures such as Julius Caesar, the Punic Wars, and the transition from republic to empire are discussed. The essay concludes with an analysis of the empire's lasting impact on Western civilization.",
-                        "d_id": "8h7g6f5e-4d3c-2b1a-09e8-d7c6b5a4f3e2"
-                    },
-                    {
-                        "DocumentCategory": "Technology",
-                        "Title": "The Future of Artificial Intelligence",
-                        "Summary": "This report explores the current state and future prospects of artificial intelligence (AI). It covers the evolution of AI from simple algorithms to advanced neural networks capable of deep learning. The document discusses various applications of AI in industries such as healthcare, finance, and transportation, as well as ethical considerations and potential risks associated with AI development. Predictions for future advancements and their societal impact are also presented.",
-                        "d_id": "3c2b1a09-d8e7-f6g5-h4i3-j1k2l3m4n5o6"
-                    },
-                    {
-                        "DocumentCategory": "Economics",
-                        "Title": "Global Economic Trends and Predictions",
-                        "Summary": "This analysis examines major trends in the global economy, including the rise of emerging markets, the impact of technology on job markets, and shifts in international trade. It delves into the economic effects of recent global events, such as pandemics and geopolitical conflicts, and discusses how these might shape future economic policies and practices. The document provides predictions for economic growth, inflation rates, and currency fluctuations in the coming years.",
-                        "d_id": "7k6j5h4g-3f2e-1d0c-b8a9-m7n6o5p4q3r2"
-                    }
-                ]
-                for classification in classification:
-                    neo4j_graph_db = Neo4jGraphDB(url=config.graph_database_url, username=config.graph_database_username,
-                                                  password=config.graph_database_password)
-                    rs = neo4j_graph_db.create_document_node_cypher(classification, user_id)
-                    neo4j_graph_db.query(rs, classification)
-                    # select doc from the store
-                    neo4j_graph_db.update_document_node_with_namespace(user_id, vectordb_namespace=memory_name, document_id=doc_id)
-            else:
-                try:
-                    classification_content = fetch_document_vectordb_namespace(session, user_id, memory_name)
-                except:
-                    classification_content = "None"
-
-                classification = await classify_documents(doc_name, document_id =doc_id, content=classification_content)
-
-                logging.info("Classification is", str(classification))
-                neo4j_graph_db = Neo4jGraphDB(url=config.graph_database_url, username=config.graph_database_username,
-                                              password=config.graph_database_password)
-                rs = neo4j_graph_db.create_document_node_cypher(classification, user_id)
-                neo4j_graph_db.query(rs, classification)
-
-                # select doc from the store
-                neo4j_graph_db.update_document_node_with_namespace(user_id, vectordb_namespace=memory_name,
-                                                                   document_id=doc_id)
-                await update_entity(session, DocsModel, doc_id, True)
-    except:
-        pass
+    # try:
+    # await update_document_vectordb_namespace(postgres_session, user_id)
+    memory_names, docs = await get_unsumarized_vector_db_namespace(session, user_id)
+    logging.info("Memory names are %s", memory_names)
+    logging.info("Docs are %s", docs)
+    for doc, memory_name in zip(docs, memory_names):
+        doc_name, doc_id = doc
+        # if stupid_local_testing_flag:
+        #     classification = [{
+        #         "DocumentCategory": "Literature",
+        #         "Title": "Bartleby, the Scrivener",
+        #         "Summary": "The document is a narrative about an enigmatic copyist named Bartleby who works in a law office. Despite initially being a diligent employee, Bartleby begins to refuse tasks with the phrase 'I would prefer not to' and eventually stops working altogether. His passive resistance and mysterious behavior confound the narrator, who is also his employer. Bartleby's refusal to leave the office leads to various complications, and he is eventually taken to the Tombs as a vagrant. The story ends with Bartleby's death and the revelation that he may have previously worked in the Dead Letter Office, which adds a layer of poignancy to his character.",
+        #         "d_id": "2a5c571f-bad6-4649-a4ac-36e4bb4f34cd"
+        #     },
+        #         {
+        #             "DocumentCategory": "Science",
+        #             "Title": "The Mysterious World of Quantum Mechanics",
+        #             "Summary": "This article delves into the fundamentals of quantum mechanics, exploring its paradoxical nature where particles can exist in multiple states simultaneously. It discusses key experiments and theories that have shaped our understanding of the quantum world, such as the double-slit experiment, Schrödinger's cat, and quantum entanglement. The piece also touches upon the implications of quantum mechanics for future technology, including quantum computing and cryptography.",
+        #             "d_id": "f4e2c3b1-4567-8910-11a2-b3c4d5e6f7g8"
+        #         },
+        #         {
+        #             "DocumentCategory": "History",
+        #             "Title": "The Rise and Fall of the Roman Empire",
+        #             "Summary": "This essay provides an overview of the Roman Empire's history, from its foundation to its eventual decline. It examines the political, social, and economic factors that contributed to the empire's expansion and success, as well as those that led to its downfall. Key events and figures such as Julius Caesar, the Punic Wars, and the transition from republic to empire are discussed. The essay concludes with an analysis of the empire's lasting impact on Western civilization.",
+        #             "d_id": "8h7g6f5e-4d3c-2b1a-09e8-d7c6b5a4f3e2"
+        #         },
+        #         {
+        #             "DocumentCategory": "Technology",
+        #             "Title": "The Future of Artificial Intelligence",
+        #             "Summary": "This report explores the current state and future prospects of artificial intelligence (AI). It covers the evolution of AI from simple algorithms to advanced neural networks capable of deep learning. The document discusses various applications of AI in industries such as healthcare, finance, and transportation, as well as ethical considerations and potential risks associated with AI development. Predictions for future advancements and their societal impact are also presented.",
+        #             "d_id": "3c2b1a09-d8e7-f6g5-h4i3-j1k2l3m4n5o6"
+        #         },
+        #         {
+        #             "DocumentCategory": "Economics",
+        #             "Title": "Global Economic Trends and Predictions",
+        #             "Summary": "This analysis examines major trends in the global economy, including the rise of emerging markets, the impact of technology on job markets, and shifts in international trade. It delves into the economic effects of recent global events, such as pandemics and geopolitical conflicts, and discusses how these might shape future economic policies and practices. The document provides predictions for economic growth, inflation rates, and currency fluctuations in the coming years.",
+        #             "d_id": "7k6j5h4g-3f2e-1d0c-b8a9-m7n6o5p4q3r2"
+        #         }
+        #     ]
+        #     for classification in classification:
+        #
+        #         neo4j_graph_db = Neo4jGraphDB(url=config.graph_database_url, username=config.graph_database_username,
+        #                                       password=config.graph_database_password)
+        #         rs = neo4j_graph_db.create_document_node_cypher(classification, user_id)
+        #         neo4j_graph_db.query(rs, classification)
+        #
+        #         # select doc from the store
+        #         neo4j_graph_db.update_document_node_with_namespace(user_id, vectordb_namespace=memory_name, document_id=doc_id)
+        # else:
+        try:
+            classification_content = await fetch_document_vectordb_namespace(session, user_id, memory_name)
+        except:
+            classification_content = "None"
+        #
+        # classification = await classify_documents(doc_name, document_id =doc_id, content=classification_content)
+        #
+        # logging.info("Classification is", str(classification))
+        # neo4j_graph_db = Neo4jGraphDB(url=config.graph_database_url, username=config.graph_database_username,
+        #                               password=config.graph_database_password)
+        # rs = neo4j_graph_db.create_document_node_cypher(classification, user_id)
+        # neo4j_graph_db.query(rs, classification)
+        #
+        # # select doc from the store
+        # neo4j_graph_db.update_document_node_with_namespace(user_id, vectordb_namespace=memory_name,
+        #                                                    document_id=doc_id)
+        await update_entity(session, DocsModel, doc_id, True)
+    # except:
+    #     pass
 
 
 class ResponseString(BaseModel):
     response: str = Field(..., default_factory=list)