Build the Docker image and push it

Vasilije 2023-12-16 15:15:30 +01:00
parent 7d0ee16d46
commit 05aaee69b3
5 changed files with 146 additions and 94 deletions


@@ -115,10 +115,15 @@ async def user_query_to_graph(payload: Payload):
@app.post("/document-to-graph-db")
async def document_to_graph_db(payload: Payload):
logging.info("Adding documents to graph db")
try:
decoded_payload = payload.payload
if 'settings' in decoded_payload and decoded_payload['settings'] is not None:
settings_for_loader = decoded_payload['settings']
else:
settings_for_loader = None
async with session_scope(session=AsyncSessionLocal()) as session:
result = await add_documents_to_graph_db(session =session, user_id = decoded_payload['user_id'], loader_settings =decoded_payload['settings'])
result = await add_documents_to_graph_db(session =session, user_id = decoded_payload['user_id'], loader_settings =settings_for_loader)
return result
except Exception as e:

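Reviewer note: the new guard only forwards `settings` when the client actually sent a non-null value. A minimal equivalent sketch using `dict.get` (assuming `payload.payload` behaves like a plain dict, as the handler above treats it; `extract_loader_settings` is a hypothetical helper name):

```python
from typing import Any, Optional

def extract_loader_settings(decoded_payload: dict) -> Optional[Any]:
    # dict.get returns None when the key is absent and the stored value
    # (which may itself be None) otherwise -- same outcome as the if/else above.
    return decoded_payload.get("settings")
```
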
level_4/bin/dockerize Executable file

@@ -0,0 +1,36 @@
set -euo pipefail
AWS_REGION=${region:-eu-west-1}
AWS_DEPLOYMENT_ACCOUNT=${account:-463722570299}
AWS_REPOSITORY=${repo:-"${AWS_DEPLOYMENT_ACCOUNT}.dkr.ecr.${AWS_REGION}.amazonaws.com"}
STAGE=${stage:-"dev"}
SHA_SHORT="$(git rev-parse --short HEAD)"
CUR_DATE="$(date +%Y%m%d%H%M%S)"
VERSION="$STAGE-$CUR_DATE-$SHA_SHORT"
IMAGE_NAME=${image_name:-promethai-${STAGE}-promethai-backend}
REPO_NAME="${AWS_REPOSITORY}/${IMAGE_NAME}"
FULL_IMAGE_NAME="${REPO_NAME}:${VERSION}"
APP_DIR=${app_dir:-"."}
PUBLISH=${publish:-false}
echo "Building docker image ${FULL_IMAGE_NAME} located in dir ${APP_DIR}"
pushd "${APP_DIR}" &&
docker buildx build --platform linux/amd64 \
--build-arg STAGE=${STAGE} \
-t "${FULL_IMAGE_NAME}" . &&
echo "${VERSION}" >/tmp/.DOCKER_IMAGE_VERSION &&
echo "Successfully built docker image ${FULL_IMAGE_NAME}"
if [ "${PUBLISH}" = true ]; then
echo "Pushing docker image ${FULL_IMAGE_NAME} to the ECR repository in AWS account ${AWS_DEPLOYMENT_ACCOUNT}"
echo "logging in"
aws ecr get-login-password --region "${AWS_REGION}" | docker login --username AWS --password-stdin "${AWS_REPOSITORY}"
docker push "${FULL_IMAGE_NAME}" &&
echo "Successfully pushed docker image ${FULL_IMAGE_NAME} to ECR repository"
fi
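
Usage note for the script above: every setting is overridable through the lowercase environment variables it expands (`region`, `account`, `repo`, `stage`, `image_name`, `app_dir`, `publish`), so a production build-and-push would presumably be invoked as `stage=prod publish=true ./level_4/bin/dockerize`, while a bare `./level_4/bin/dockerize` only builds a `dev-<timestamp>-<sha>` tag locally and skips the ECR login and push.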


@@ -151,7 +151,7 @@ class WeaviateVectorDB(VectorDB):
# Update Weaviate memories here
if namespace is None:
namespace = self.namespace
retriever = self.init_weaviate(embeddings=embeddings,namespace = namespace, retriever_type="single_document_context")
retriever = self.init_weaviate(embeddings=OpenAIEmbeddings(),namespace = namespace, retriever_type="single_document_context")
if loader_settings:
# Assuming _document_loader returns a list of documents
documents = await _document_loader(observation, loader_settings)
@@ -167,15 +167,19 @@ class WeaviateVectorDB(VectorDB):
Document(metadata=params, page_content=doc.page_content)])
else:
chunk_count = 0
documents = await _document_loader(observation, loader_settings)
from cognitive_architecture.database.vectordb.chunkers.chunkers import chunk_data
documents = [chunk_data(chunk_strategy="VANILLA", source_data=observation, chunk_size=50,
chunk_overlap=20)]
for doc in documents[0]:
chunk_count += 1
params['chunk_order'] = chunk_count
# document_to_load = self._stuct(observation, params, metadata_schema_class)
logging.info("Loading document with default loader settings %s", str(doc))
# logging.info("Loading document with default loader settings %s", str(document_to_load))
retriever.add_documents([
Document(metadata=params, page_content=doc)])
Document(metadata=params, page_content=doc.page_content)])
async def fetch_memories(self, observation: str, namespace: str = None, search_type: str = 'hybrid', **kwargs):
"""

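Reviewer note on the fallback branch above: every chunk shares the single `params` dict and only `params['chunk_order']` is rewritten in place, so whether each stored `Document` keeps its own chunk number depends on whether `Document` copies the metadata it is handed. A defensive sketch that gives each chunk an independent metadata dict (`build_chunk_documents` is a hypothetical helper; it assumes, as the loop above implies, that `chunk_data` yields chunk objects exposing `page_content`):

```python
from langchain.schema import Document

def build_chunk_documents(chunks, params: dict) -> list:
    """Attach a per-chunk copy of params so chunk_order is never shared."""
    docs = []
    for order, chunk in enumerate(chunks, start=1):
        metadata = {**params, "chunk_order": order}  # copy first, then set the order
        docs.append(Document(metadata=metadata, page_content=chunk.page_content))
    return docs
```
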

@@ -4,6 +4,7 @@ import string
import uuid
from graphviz import Digraph
from sqlalchemy import or_
from sqlalchemy.orm import contains_eager
@@ -194,34 +195,37 @@ async def get_unsumarized_vector_db_namespace(session: AsyncSession, user_id: st
Example Usage:
"""
try:
result = await session.execute(
select(Operation)
.join(Operation.docs) # Explicit join with docs table
.join(Operation.memories) # Explicit join with memories table
.options(
contains_eager(Operation.docs), # Informs ORM of the join for docs
contains_eager(Operation.memories) # Informs ORM of the join for memories
)
.where(
(Operation.user_id == user_id) & # Filter by user_id
(Operation.docs.graph_summary == False) # Filter by user_id
)
.order_by(Operation.created_at.desc()) # Order by creation date
# try:
result = await session.execute(
select(Operation)
.join(Operation.docs) # Explicit join with docs table
.join(Operation.memories) # Explicit join with memories table
.options(
contains_eager(Operation.docs), # Informs ORM of the join for docs
contains_eager(Operation.memories) # Informs ORM of the join for memories
)
.where(
(Operation.user_id == user_id) & # Filter by user_id
or_(
DocsModel.graph_summary == False, # Condition 1: graph_summary is False
DocsModel.graph_summary == None # Condition 2: graph_summary is None
) # Filter docs whose graph summary has not been produced yet
)
.order_by(Operation.created_at.desc()) # Order by creation date
)
operations = result.unique().scalars().all()
# Extract memory names and document names and IDs
memory_names = [memory.memory_name for op in operations for memory in op.memories]
docs = [(doc.doc_name, doc.id) for op in operations for doc in op.docs]
return memory_names, docs
except Exception as e:
# Handle the exception as needed
print(f"An error occurred: {e}")
return None
# except Exception as e:
# # Handle the exception as needed
# print(f"An error occurred: {e}")
# return None
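
Reviewer note: the widened filter now also catches documents whose `graph_summary` was never set, which a plain `== False` misses under SQL's three-valued logic. An equivalent, slightly tighter form is a single `IS NOT TRUE` predicate; a minimal sketch against the same `Operation`/`DocsModel` models used above (`unsummarized_ops_stmt` is a hypothetical name):

```python
from sqlalchemy import select
from sqlalchemy.orm import contains_eager

def unsummarized_ops_stmt(user_id: str):
    # graph_summary IS NOT TRUE matches both FALSE and NULL,
    # i.e. exactly what the or_() above spells out.
    return (
        select(Operation)
        .join(Operation.docs)
        .join(Operation.memories)
        .options(contains_eager(Operation.docs), contains_eager(Operation.memories))
        .where(Operation.user_id == user_id, DocsModel.graph_summary.isnot(True))
        .order_by(Operation.created_at.desc())
    )
```
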
async def get_memory_name_by_doc_id(session: AsyncSession, docs_id: str):
"""
Asynchronously retrieves memory names associated with a specific document ID.


@@ -100,6 +100,7 @@ async def load_documents_to_vectorstore(session: AsyncSession, user_id: str, con
DocsModel(
id=str(uuid.uuid4()),
operation_id=job_id,
graph_summary= False,
doc_name=doc
)
)
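
Reviewer note: only rows created after this change carry `graph_summary=False`; earlier rows stay NULL, which is presumably why the namespace query above also matches `None`. If the older rows should be normalised instead, a one-off backfill could look roughly like this (a sketch against the same `DocsModel`; `backfill_graph_summary` is a hypothetical helper):

```python
from sqlalchemy import update
from sqlalchemy.ext.asyncio import AsyncSession

async def backfill_graph_summary(session: AsyncSession) -> None:
    # Mark every previously-inserted doc whose flag was never populated.
    await session.execute(
        update(DocsModel)
        .where(DocsModel.graph_summary.is_(None))
        .values(graph_summary=False)
    )
    await session.commit()
```
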
@@ -139,7 +140,7 @@ async def load_documents_to_vectorstore(session: AsyncSession, user_id: str, con
observation=content, params=params, loader_settings=loader_settings)
await update_entity(session, Operation, job_id, "SUCCESS")
return result, namespace_id
# return result, namespace_id
async def user_query_to_graph_db(session: AsyncSession, user_id: str, query_input: str):
@@ -175,74 +176,76 @@ async def user_query_to_graph_db(session: AsyncSession, user_id: str, query_inpu
async def add_documents_to_graph_db(session: AsyncSession, user_id: str= None, loader_settings:dict=None, stupid_local_testing_flag=False): #clean this up Vasilije, don't be sloppy
""""""
try:
# await update_document_vectordb_namespace(postgres_session, user_id)
memory_names, docs = await get_unsumarized_vector_db_namespace(session, user_id)
logging.info("Memory names are", memory_names)
logging.info("Docs are", docs)
for doc, memory_name in zip(docs, memory_names):
doc_name, doc_id = doc
if stupid_local_testing_flag:
classification = [{
"DocumentCategory": "Literature",
"Title": "Bartleby, the Scrivener",
"Summary": "The document is a narrative about an enigmatic copyist named Bartleby who works in a law office. Despite initially being a diligent employee, Bartleby begins to refuse tasks with the phrase 'I would prefer not to' and eventually stops working altogether. His passive resistance and mysterious behavior confound the narrator, who is also his employer. Bartleby's refusal to leave the office leads to various complications, and he is eventually taken to the Tombs as a vagrant. The story ends with Bartleby's death and the revelation that he may have previously worked in the Dead Letter Office, which adds a layer of poignancy to his character.",
"d_id": "2a5c571f-bad6-4649-a4ac-36e4bb4f34cd"
},
{
"DocumentCategory": "Science",
"Title": "The Mysterious World of Quantum Mechanics",
"Summary": "This article delves into the fundamentals of quantum mechanics, exploring its paradoxical nature where particles can exist in multiple states simultaneously. It discusses key experiments and theories that have shaped our understanding of the quantum world, such as the double-slit experiment, Schrödinger's cat, and quantum entanglement. The piece also touches upon the implications of quantum mechanics for future technology, including quantum computing and cryptography.",
"d_id": "f4e2c3b1-4567-8910-11a2-b3c4d5e6f7g8"
},
{
"DocumentCategory": "History",
"Title": "The Rise and Fall of the Roman Empire",
"Summary": "This essay provides an overview of the Roman Empire's history, from its foundation to its eventual decline. It examines the political, social, and economic factors that contributed to the empire's expansion and success, as well as those that led to its downfall. Key events and figures such as Julius Caesar, the Punic Wars, and the transition from republic to empire are discussed. The essay concludes with an analysis of the empire's lasting impact on Western civilization.",
"d_id": "8h7g6f5e-4d3c-2b1a-09e8-d7c6b5a4f3e2"
},
{
"DocumentCategory": "Technology",
"Title": "The Future of Artificial Intelligence",
"Summary": "This report explores the current state and future prospects of artificial intelligence (AI). It covers the evolution of AI from simple algorithms to advanced neural networks capable of deep learning. The document discusses various applications of AI in industries such as healthcare, finance, and transportation, as well as ethical considerations and potential risks associated with AI development. Predictions for future advancements and their societal impact are also presented.",
"d_id": "3c2b1a09-d8e7-f6g5-h4i3-j1k2l3m4n5o6"
},
{
"DocumentCategory": "Economics",
"Title": "Global Economic Trends and Predictions",
"Summary": "This analysis examines major trends in the global economy, including the rise of emerging markets, the impact of technology on job markets, and shifts in international trade. It delves into the economic effects of recent global events, such as pandemics and geopolitical conflicts, and discusses how these might shape future economic policies and practices. The document provides predictions for economic growth, inflation rates, and currency fluctuations in the coming years.",
"d_id": "7k6j5h4g-3f2e-1d0c-b8a9-m7n6o5p4q3r2"
}
]
for classification in classification:
neo4j_graph_db = Neo4jGraphDB(url=config.graph_database_url, username=config.graph_database_username,
password=config.graph_database_password)
rs = neo4j_graph_db.create_document_node_cypher(classification, user_id)
neo4j_graph_db.query(rs, classification)
# select doc from the store
neo4j_graph_db.update_document_node_with_namespace(user_id, vectordb_namespace=memory_name, document_id=doc_id)
else:
try:
classification_content = fetch_document_vectordb_namespace(session, user_id, memory_name)
except:
classification_content = "None"
classification = await classify_documents(doc_name, document_id =doc_id, content=classification_content)
logging.info("Classification is", str(classification))
neo4j_graph_db = Neo4jGraphDB(url=config.graph_database_url, username=config.graph_database_username,
password=config.graph_database_password)
rs = neo4j_graph_db.create_document_node_cypher(classification, user_id)
neo4j_graph_db.query(rs, classification)
# select doc from the store
neo4j_graph_db.update_document_node_with_namespace(user_id, vectordb_namespace=memory_name,
document_id=doc_id)
await update_entity(session, DocsModel, doc_id, True)
except:
pass
# try:
# await update_document_vectordb_namespace(postgres_session, user_id)
memory_names, docs = await get_unsumarized_vector_db_namespace(session, user_id)
logging.info("Memory names are %s", memory_names)
logging.info("Docs are %s", docs)
for doc, memory_name in zip(docs, memory_names):
doc_name, doc_id = doc
# if stupid_local_testing_flag:
# classification = [{
# "DocumentCategory": "Literature",
# "Title": "Bartleby, the Scrivener",
# "Summary": "The document is a narrative about an enigmatic copyist named Bartleby who works in a law office. Despite initially being a diligent employee, Bartleby begins to refuse tasks with the phrase 'I would prefer not to' and eventually stops working altogether. His passive resistance and mysterious behavior confound the narrator, who is also his employer. Bartleby's refusal to leave the office leads to various complications, and he is eventually taken to the Tombs as a vagrant. The story ends with Bartleby's death and the revelation that he may have previously worked in the Dead Letter Office, which adds a layer of poignancy to his character.",
# "d_id": "2a5c571f-bad6-4649-a4ac-36e4bb4f34cd"
# },
# {
# "DocumentCategory": "Science",
# "Title": "The Mysterious World of Quantum Mechanics",
# "Summary": "This article delves into the fundamentals of quantum mechanics, exploring its paradoxical nature where particles can exist in multiple states simultaneously. It discusses key experiments and theories that have shaped our understanding of the quantum world, such as the double-slit experiment, Schrödinger's cat, and quantum entanglement. The piece also touches upon the implications of quantum mechanics for future technology, including quantum computing and cryptography.",
# "d_id": "f4e2c3b1-4567-8910-11a2-b3c4d5e6f7g8"
# },
# {
# "DocumentCategory": "History",
# "Title": "The Rise and Fall of the Roman Empire",
# "Summary": "This essay provides an overview of the Roman Empire's history, from its foundation to its eventual decline. It examines the political, social, and economic factors that contributed to the empire's expansion and success, as well as those that led to its downfall. Key events and figures such as Julius Caesar, the Punic Wars, and the transition from republic to empire are discussed. The essay concludes with an analysis of the empire's lasting impact on Western civilization.",
# "d_id": "8h7g6f5e-4d3c-2b1a-09e8-d7c6b5a4f3e2"
# },
# {
# "DocumentCategory": "Technology",
# "Title": "The Future of Artificial Intelligence",
# "Summary": "This report explores the current state and future prospects of artificial intelligence (AI). It covers the evolution of AI from simple algorithms to advanced neural networks capable of deep learning. The document discusses various applications of AI in industries such as healthcare, finance, and transportation, as well as ethical considerations and potential risks associated with AI development. Predictions for future advancements and their societal impact are also presented.",
# "d_id": "3c2b1a09-d8e7-f6g5-h4i3-j1k2l3m4n5o6"
# },
# {
# "DocumentCategory": "Economics",
# "Title": "Global Economic Trends and Predictions",
# "Summary": "This analysis examines major trends in the global economy, including the rise of emerging markets, the impact of technology on job markets, and shifts in international trade. It delves into the economic effects of recent global events, such as pandemics and geopolitical conflicts, and discusses how these might shape future economic policies and practices. The document provides predictions for economic growth, inflation rates, and currency fluctuations in the coming years.",
# "d_id": "7k6j5h4g-3f2e-1d0c-b8a9-m7n6o5p4q3r2"
# }
# ]
# for classification in classification:
#
# neo4j_graph_db = Neo4jGraphDB(url=config.graph_database_url, username=config.graph_database_username,
# password=config.graph_database_password)
# rs = neo4j_graph_db.create_document_node_cypher(classification, user_id)
# neo4j_graph_db.query(rs, classification)
#
# # select doc from the store
# neo4j_graph_db.update_document_node_with_namespace(user_id, vectordb_namespace=memory_name, document_id=doc_id)
# else:
try:
classification_content = await fetch_document_vectordb_namespace(session, user_id, memory_name)
except:
classification_content = "None"
#
# classification = await classify_documents(doc_name, document_id =doc_id, content=classification_content)
#
# logging.info("Classification is", str(classification))
# neo4j_graph_db = Neo4jGraphDB(url=config.graph_database_url, username=config.graph_database_username,
# password=config.graph_database_password)
# rs = neo4j_graph_db.create_document_node_cypher(classification, user_id)
# neo4j_graph_db.query(rs, classification)
#
# # select doc from the store
# neo4j_graph_db.update_document_node_with_namespace(user_id, vectordb_namespace=memory_name,
# document_id=doc_id)
await update_entity(session, DocsModel, doc_id, True)
# except:
# pass
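
Reviewer note: with the local-testing branch and the classification/Neo4j calls commented out, the loop above effectively reduces to reading each namespace and flagging the document row. A distilled sketch of the surviving path (`mark_documents_processed` is a hypothetical name; it reuses only the helpers called above):

```python
async def mark_documents_processed(session, user_id: str) -> None:
    # Pair each (doc_name, doc_id) with the vector-db namespace it was loaded into.
    memory_names, docs = await get_unsumarized_vector_db_namespace(session, user_id)
    for (doc_name, doc_id), memory_name in zip(docs, memory_names):
        try:
            # Fetched but currently unused while classification stays commented out.
            await fetch_document_vectordb_namespace(session, user_id, memory_name)
        except Exception:
            pass
        # The only remaining live write: flag the document as handled.
        await update_entity(session, DocsModel, doc_id, True)
```
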
class ResponseString(BaseModel):
response: str = Field(..., default_factory=list)