cognee/cognitive_architecture/utils.py

384 lines
13 KiB
Python

""" This module contains utility functions for the cognitive architecture. """
import os
import uuid
import random
import string
import logging
import graphistry
from pathlib import Path
from jinja2 import Environment, FileSystemLoader, select_autoescape
from sqlalchemy import or_
from sqlalchemy.future import select
from sqlalchemy.orm import contains_eager
from sqlalchemy.ext.asyncio import AsyncSession
from cognitive_architecture.database.relationaldb.models.docs import DocsModel
from cognitive_architecture.database.relationaldb.models.memory import MemoryModel
from cognitive_architecture.database.relationaldb.models.operation import Operation
from cognitive_architecture.config import Config
# Module-level configuration: instantiated and loaded once, when this module is imported.
config = Config()
config.load()
class Node:
    """Plain attribute holder describing a single graph node."""

    def __init__(self, id, description, color):
        """Store the node attributes exactly as given.

        :param id: Identifier of the node.
        :param description: Description attached to the node.
        :param color: Color value attached to the node.
        """
        self.id, self.description, self.color = id, description, color
class Edge:
    """Plain attribute holder describing a connection between two nodes."""

    def __init__(self, source, target, label, color):
        """Store the edge attributes exactly as given.

        :param source: Value identifying the edge's source end.
        :param target: Value identifying the edge's target end.
        :param label: Label attached to the edge.
        :param color: Color value attached to the edge.
        """
        self.source, self.target = source, target
        self.label, self.color = label, color
def get_document_names(doc_input):
    """
    Get a list of document names.

    This function takes doc_input, which can be a list of names, a folder path,
    a single document file path, or a document name as a string.
    It returns a list of document names based on the doc_input.

    Args:
        doc_input: A list (returned unchanged), a folder path, a single document
            file path, or a document name as a string.

    Returns:
        list: A list of document names. For a folder, the names of the regular
            files directly inside it (sub-directories are skipped). For an
            existing file, its base name. For any other string, the string
            itself. An empty list for anything else (e.g. None).

    Example usage:
        - Folder path: get_document_names(".data")
        - Single document file path: get_document_names(".data/example.pdf")
        - Document name provided as a string: get_document_names("example.docx")
    """
    if isinstance(doc_input, list):
        return doc_input

    # os.path.isdir/isfile raise TypeError for objects that are not path-like
    # (e.g. None), so reject those up front instead of crashing before the
    # "invalid input" fallback can be reached.
    if not isinstance(doc_input, (str, bytes, os.PathLike)):
        return []

    if os.path.isdir(doc_input):
        # doc_input is a folder: keep only the regular files directly inside it
        return [
            entry
            for entry in os.listdir(doc_input)
            if os.path.isfile(os.path.join(doc_input, entry))
        ]

    if os.path.isfile(doc_input):
        # doc_input is a single document file
        return [os.path.basename(doc_input)]

    if isinstance(doc_input, str):
        # doc_input is a document name provided as a string
        return [doc_input]

    # A bytes/PathLike value that points at nothing on disk is not a valid name
    return []
def format_dict(d):
    """ Render a dictionary as a brace-wrapped, comma-separated ``key: value`` string.

    String values are wrapped in single quotes; every other value is
    interpolated as-is. Keys are never quoted.
    """

    def _render(key, value):
        # Only string values get quotes around them
        if isinstance(value, str):
            return f"{key}: '{value}'"
        return f"{key}: {value}"

    body = ", ".join(_render(key, value) for key, value in d.items())
    # Wrap in curly braces to mimic a dictionary literal
    return "{" + body + "}"
def append_uuid_to_variable_names(variable_mapping):
    """ Build unique variable names by suffixing each one with a fresh UUID.

    Only the *values* of ``variable_mapping`` are used. The result maps each
    original variable name to ``"<name>_<32 hex chars>"``.
    """
    return {
        name: f"{name}_{uuid.uuid4().hex}"
        for name in variable_mapping.values()
    }
# Update the functions to use the unique variable names
def create_node_variable_mapping(nodes):
    """ Map each node's ``id`` to a lower-cased ``<category><id>`` variable name."""

    def _entry(node):
        # Build the name first, then read the key, mirroring the lookup order
        name = f"{node['category']}{node['id']}".lower()
        return node["id"], name

    return dict(_entry(node) for node in nodes)
def create_edge_variable_mapping(edges):
    """ Map each ``(source, target)`` pair to a lower-cased ``edge<source>to<target>`` name."""
    return {
        (edge["source"], edge["target"]): f"edge{edge['source']}to{edge['target']}".lower()
        for edge in edges
    }
def generate_letter_uuid(length=8):
    """Return ``length`` randomly chosen uppercase ASCII letters joined into one string."""
    picked = []
    for _ in range(length):
        # One independent draw from A-Z per position
        picked.append(random.choice(string.ascii_uppercase))
    return "".join(picked)
async def get_vectordb_namespace(session: AsyncSession, user_id: str):
    """ Fetch the memory names stored for a user, newest first.

    :param session: Async database session used to run the query.
    :param user_id: Identifier of the user whose memories are listed.
    :return: List of memory names ordered by ``created_at`` descending, or
        None when anything in the lookup raises (the error is logged).
    """
    try:
        newest_first = MemoryModel.created_at.desc()
        query = (
            select(MemoryModel.memory_name)
            .where(MemoryModel.user_id == user_id)
            .order_by(newest_first)
        )
        rows = (await session.execute(query)).fetchall()
        # Each row is a one-column tuple; unwrap the memory name
        return [row[0] for row in rows]
    except Exception as e:
        logging.error(
            f"An error occurred while retrieving the Vectordb_namespace: {str(e)}"
        )
        return None
async def get_vectordb_document_name(session: AsyncSession, user_id: str):
    """ Asynchronously retrieves the document names for a given user, newest first.

    :param session: Async database session used to run the query.
    :param user_id: Identifier of the user whose documents are listed.
    :return: List of document names ordered by ``created_at`` descending, or
        None when the lookup raises (the error is logged).
    """
    try:
        result = await session.execute(
            select(DocsModel.doc_name)
            .where(DocsModel.user_id == user_id)
            .order_by(DocsModel.created_at.desc())
        )
        # Each row is a one-column tuple; unwrap the document name
        doc_names = [row[0] for row in result.fetchall()]
        return doc_names
    except Exception as e:
        # Best-effort lookup: report the failure and signal it with None
        logging.error(
            f"An error occurred while retrieving the document names: {str(e)}"
        )
        return None
async def get_model_id_name(session: AsyncSession, id: str):
    """ Asynchronously retrieves the memory name(s) stored under a given memory id.

    :param session: Async database session used to run the query.
    :param id: Value matched against ``MemoryModel.id``.
    :return: List of matching memory names ordered by ``created_at``
        descending, or None when the lookup raises (the error is logged).
    """
    try:
        result = await session.execute(
            select(MemoryModel.memory_name)
            .where(MemoryModel.id == id)
            .order_by(MemoryModel.created_at.desc())
        )
        # Each row is a one-column tuple; unwrap the memory name
        memory_names = [row[0] for row in result.fetchall()]
        return memory_names
    except Exception as e:
        # Best-effort lookup: report the failure and signal it with None
        logging.error(
            f"An error occurred while retrieving the memory name by id: {str(e)}"
        )
        return None
async def get_unsumarized_vector_db_namespace(session: AsyncSession, user_id: str):
    """
    Asynchronously retrieves memory and document details for a user's unsummarized documents.

    This function selects the operations of a specific user, explicitly joined with
    their docs and memories, keeping only rows whose document has ``graph_summary``
    equal to False or unset (NULL). Results are ordered by operation creation date,
    newest first. ``contains_eager`` tells the ORM to populate ``Operation.docs`` and
    ``Operation.memories`` from those same joins.

    Parameters:
    - session (AsyncSession): The database session for executing the query.
    - user_id (str): The unique identifier of the user.

    Returns:
    - Tuple[List[Tuple], List[Tuple]]: ``(memory_details, docs)`` where
      ``memory_details`` holds ``(memory_name, memory_category)`` tuples and
      ``docs`` holds ``(doc_name, doc_id)`` tuples, collected across all
      matching operations.

    Raises:
    - Exception: Nothing is caught here, so any exception raised during query
      execution propagates to the caller.
    """
    # SQLAlchemy overloads ``==`` on columns to build SQL expressions, so these
    # comparisons must not be rewritten as ``is False`` / ``is None``.
    not_summarized = or_(
        DocsModel.graph_summary == False,  # noqa: E712 - graph_summary is False
        DocsModel.graph_summary == None,  # noqa: E711 - graph_summary is unset
    )
    result = await session.execute(
        select(Operation)
        .join(Operation.docs)  # Explicit join with docs table
        .join(Operation.memories)  # Explicit join with memories table
        .options(
            contains_eager(Operation.docs),  # Informs ORM of the join for docs
            contains_eager(Operation.memories),  # Informs ORM of the join for memories
        )
        .where((Operation.user_id == user_id) & not_summarized)
        .order_by(Operation.created_at.desc())  # Newest operations first
    )
    # unique() collapses the duplicate Operation rows produced by the joins
    operations = result.unique().scalars().all()

    memory_details = [
        (memory.memory_name, memory.memory_category)
        for op in operations
        for memory in op.memories
    ]
    docs = [(doc.doc_name, doc.id) for op in operations for doc in op.docs]
    return memory_details, docs
async def get_memory_name_by_doc_id(session: AsyncSession, docs_id: str):
    """
    Asynchronously retrieves memory names associated with a specific document ID.

    This function executes a database query to fetch memory names linked to a document
    through operations. The query is filtered based on a given document ID and retrieves
    only the memory names without loading the entire Operation entity.

    Parameters:
    - session (AsyncSession): The database session for executing the query.
    - docs_id (str): The unique identifier of the document.

    Returns:
    - List[str]: The distinct memory names associated with the given document ID.
      Returns None if an exception occurs (the error is logged, not re-raised).
    """
    try:
        result = await session.execute(
            select(MemoryModel.memory_name)
            .join(
                Operation, Operation.id == MemoryModel.operation_id
            )  # Join with Operation
            .join(
                DocsModel, DocsModel.operation_id == Operation.id
            )  # Join with DocsModel
            .where(DocsModel.id == docs_id)  # Filtering based on the passed document ID
            .distinct()  # To avoid duplicate memory names
        )
        # Each row is a one-column tuple; unwrap the memory name
        memory_names = [row[0] for row in result.fetchall()]
        return memory_names
    except Exception as e:
        # Best-effort lookup: report through logging, like the other query
        # helpers in this module, and signal the failure with None
        logging.error(f"An error occurred: {e}")
        return None
async def read_query_prompt(filename: str) -> str:
    """Load the text of a prompt file from the prompts directory.

    :param filename: Name of the prompt file, resolved against the prompts directory.
    :return: The file's content as a string, or None when the file is missing
        or cannot be read (the failure is logged).
    """
    # The prompts directory is located relative to this module's parent package
    prompts_dir = Path(__file__).parent.parent / "cognitive_architecture/infrastructure/llm/prompts"
    prompt_path = prompts_dir / filename
    try:
        content = prompt_path.read_text()
    except FileNotFoundError:
        logging.error(f"File not found: {prompt_path.absolute()}")
        return None
    except Exception as e:
        logging.error(f"An error of type {type(e).__name__} occurred while reading file: {prompt_path.absolute()}. Error message: {e}")
        return None
    return content
async def print_file_content(file_path):
    """Print a file's text to stdout, or a notice when the path is not an existing file.

    :param file_path: Location of the file to print.
    """
    target = Path(file_path)
    if not target.is_file():
        # Nothing to read: report the missing file and stop
        print(f"The file '{file_path}' does not exist.")
        return
    with target.open('r') as handle:
        print(handle.read())
async def async_render_template(filename: str, context: dict) -> str:
    """Render a Jinja2 template from the prompts directory.

    Although this is a coroutine, the template is rendered with the
    synchronous ``Template.render`` call; nothing is awaited inside.

    :param filename: The name of the template file to render, looked up by the
        filesystem loader inside the prompts directory.
    :param context: The context to render the template with.
    :return: The rendered template as a string.
    """
    # The prompts directory is located relative to this module's parent package
    script_directory = Path(__file__).parent
    base_directory = script_directory.parent / "cognitive_architecture/infrastructure/llm/prompts"

    # Jinja2 environment that loads templates from the prompts directory;
    # autoescaping is selected for html, xml and txt templates
    env = Environment(
        loader=FileSystemLoader(base_directory),
        autoescape=select_autoescape(['html', 'xml', 'txt'])
    )

    # Load the template by name and render it with the provided context
    template = env.get_template(filename)
    return template.render(context)
async def render_graph(graph, graph_type):
    """Upload a NetworkX graph to Graphistry and print the resulting URL.

    :param graph: NetworkX graph whose edge list is visualized.
    :param graph_type: Currently unused; kept so existing callers keep working.
    :return: None. The value returned by ``plot`` is only printed.
    """
    # Imported lazily so networkx is only required when a graph is rendered
    import networkx as nx

    # Authenticate with Graphistry using the credentials from the module config
    graphistry.register(api=3, username=config.graphistry_username, password=config.graphistry_password)

    # Convert the NetworkX graph to a pandas edge list
    edges = nx.to_pandas_edgelist(graph)

    # Bind the edge list, naming the source and target columns
    plotter = graphistry.edges(edges, 'source', 'target')

    # render=False: presumably returns the visualization URL instead of
    # displaying it -- confirm against the Graphistry documentation
    url = plotter.plot(render=False, as_files=True)
    print(f"Graph is visualized at: {url}")
# import networkx as nx
# # Create a simple NetworkX graph
# G = nx.Graph()
#
# # Add nodes
# G.add_node(1)
# G.add_node(2)
#
# # Add an edge between nodes
# G.add_edge(1, 2)
#
# import asyncio
#
# # Define the graph type (for this example, it's just a placeholder as the function doesn't use it yet)
# graph_type = "simple"
#
# # Call the render_graph function
# asyncio.run(render_graph(G, graph_type))