Merge c0df9447e8 into f83b475ab1
This commit is contained in:
commit
98f5b401f4
1 changed files with 178 additions and 38 deletions
|
|
@ -1,3 +1,4 @@
|
|||
==========controled ingestion in batches=============
|
||||
import os
|
||||
import asyncio
|
||||
from lightrag import LightRAG, QueryParam
|
||||
|
|
@ -6,8 +7,8 @@ import numpy as np
|
|||
from dotenv import load_dotenv
|
||||
import logging
|
||||
from openai import AzureOpenAI
|
||||
from lightrag.kg.shared_storage import initialize_pipeline_status
|
||||
|
||||
import time
|
||||
start_time = time.time()
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
load_dotenv()
|
||||
|
|
@ -20,14 +21,15 @@ AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
|
|||
AZURE_EMBEDDING_DEPLOYMENT = os.getenv("AZURE_EMBEDDING_DEPLOYMENT")
|
||||
AZURE_EMBEDDING_API_VERSION = os.getenv("AZURE_EMBEDDING_API_VERSION")
|
||||
|
||||
WORKING_DIR = "./dickens"
|
||||
WORKING_DIR = "C:\\Users\\user\\testfolder"
|
||||
|
||||
if os.path.exists(WORKING_DIR):
|
||||
import shutil
|
||||
# The code below removes the working_dir and creates a new one!
|
||||
# if os.path.exists(WORKING_DIR):
|
||||
# import shutil
|
||||
|
||||
shutil.rmtree(WORKING_DIR)
|
||||
# shutil.rmtree(WORKING_DIR)
|
||||
|
||||
os.mkdir(WORKING_DIR)
|
||||
# os.mkdir(WORKING_DIR)
|
||||
|
||||
|
||||
async def llm_model_func(
|
||||
|
|
@ -70,19 +72,149 @@ async def embedding_func(texts: list[str]) -> np.ndarray:
|
|||
|
||||
async def test_funcs():
|
||||
result = await llm_model_func("How are you?")
|
||||
print("Resposta do llm_model_func: ", result)
|
||||
print("Response from llm_model_func: ", result)
|
||||
|
||||
result = await embedding_func(["How are you?"])
|
||||
print("Resultado do embedding_func: ", result.shape)
|
||||
print("Dimensão da embedding: ", result.shape[1])
|
||||
print("Result from embedding_func: ", result.shape)
|
||||
print("Embedding dimension: ", result.shape[1])
|
||||
|
||||
|
||||
asyncio.run(test_funcs())
|
||||
|
||||
embedding_dimension = 3072
|
||||
embedding_dimension = 1536
|
||||
|
||||
rag = LightRAG(
|
||||
working_dir=WORKING_DIR,
|
||||
addon_params={"insert_batch_size": 3},
|
||||
llm_model_func=llm_model_func,
|
||||
embedding_func=EmbeddingFunc(
|
||||
embedding_dim=embedding_dimension,
|
||||
max_token_size=8192,
|
||||
func=embedding_func,
|
||||
),
|
||||
)
|
||||
|
||||
folder_path = 'C:/Users/example/test/LightRAG/my_docs' # With os, this specification of the documents folder is not a problem.
|
||||
|
||||
def normalize_path(path):
|
||||
# Normalize the path
|
||||
normalized_path = os.path.normpath(path)
|
||||
# Replace backslashes with forward slashes
|
||||
return normalized_path.replace('\\', '/')
|
||||
|
||||
# Output file where we store filenames
|
||||
output_file = 'processed.txt'
|
||||
# The maximum number of files to process
|
||||
batch_files = 5
|
||||
|
||||
# Function to include document in vector-store
|
||||
def process_doc(file_path):
|
||||
with open(file_path, "r", encoding="utf-8") as f:
|
||||
content = f.read()
|
||||
return content
|
||||
|
||||
# logging.info(f"Adding document: {file_path}")
|
||||
# rag.insert(content)
|
||||
|
||||
# Function to read existing filenames from the output file
|
||||
def read_existing_files(output_file):
|
||||
if os.path.exists(output_file):
|
||||
with open(output_file, 'r') as f:
|
||||
return set(line.strip() for line in f.readlines())
|
||||
return set()
|
||||
|
||||
input_docs = []
|
||||
# Check if the folder exists
|
||||
if os.path.exists(folder_path) and os.path.isdir(folder_path):
|
||||
# Read existing filenames from the output file
|
||||
existing_files = read_existing_files(output_file)
|
||||
# Open the output
|
||||
|
||||
|
||||
async def initialize_rag():
|
||||
==========Querier==============================================
|
||||
|
||||
import os
|
||||
import asyncio
|
||||
from lightrag import LightRAG, QueryParam
|
||||
from lightrag.utils import EmbeddingFunc
|
||||
import numpy as np
|
||||
from dotenv import load_dotenv
|
||||
import logging
|
||||
from openai import AzureOpenAI
|
||||
import time
|
||||
from multiprocessing import freeze_support
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
def main():
|
||||
start_time = time.time()
|
||||
load_dotenv()
|
||||
|
||||
# Version of the script
|
||||
VERSION = "0.2"
|
||||
|
||||
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")
|
||||
AZURE_OPENAI_DEPLOYMENT = os.getenv("AZURE_OPENAI_DEPLOYMENT")
|
||||
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
|
||||
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
|
||||
|
||||
AZURE_EMBEDDING_DEPLOYMENT = os.getenv("AZURE_EMBEDDING_DEPLOYMENT")
|
||||
AZURE_EMBEDDING_API_VERSION = os.getenv("AZURE_EMBEDDING_API_VERSION")
|
||||
|
||||
WORKING_DIR = "C:/Users/user/testfolder"
|
||||
|
||||
async def llm_model_func(
|
||||
prompt, system_prompt=None, history_messages=[], keyword_extraction=False, **kwargs
|
||||
) -> str:
|
||||
client = AzureOpenAI(
|
||||
api_key=AZURE_OPENAI_API_KEY,
|
||||
api_version=AZURE_OPENAI_API_VERSION,
|
||||
azure_endpoint=AZURE_OPENAI_ENDPOINT,
|
||||
)
|
||||
|
||||
messages = []
|
||||
if system_prompt:
|
||||
messages.append({"role": "system", "content": system_prompt})
|
||||
if history_messages:
|
||||
messages.extend(history_messages)
|
||||
messages.append({"role": "user", "content": prompt})
|
||||
|
||||
chat_completion = client.chat.completions.create(
|
||||
model=AZURE_OPENAI_DEPLOYMENT,
|
||||
messages=messages,
|
||||
temperature=kwargs.get("temperature", 0.5),
|
||||
max_tokens=kwargs.get("max_tokens", 1500)
|
||||
)
|
||||
|
||||
return chat_completion.choices[0].message.content
|
||||
|
||||
async def embedding_func(texts: list[str]) -> np.ndarray:
|
||||
client = AzureOpenAI(
|
||||
api_key=AZURE_OPENAI_API_KEY,
|
||||
api_version=AZURE_EMBEDDING_API_VERSION,
|
||||
azure_endpoint=AZURE_OPENAI_ENDPOINT,
|
||||
)
|
||||
embedding = client.embeddings.create(model=AZURE_EMBEDDING_DEPLOYMENT, input=texts)
|
||||
|
||||
embeddings = [item.embedding for item in embedding.data]
|
||||
return np.array(embeddings)
|
||||
|
||||
# async def test_funcs():
|
||||
# try:
|
||||
# result_llm = await llm_model_func("How are you?", system_prompt="Act as a friendly assistant.")
|
||||
# print("Response from llm_model_func: ", result_llm)
|
||||
|
||||
# result_embedding = await embedding_func(["How are you?"])
|
||||
# print("Result from embedding_func: ", result_embedding.shape)
|
||||
# print("Embedding dimension: ", result_embedding.shape[1])
|
||||
# except Exception as e:
|
||||
# print(f"An error occurred in test_funcs: {e}")
|
||||
|
||||
# asyncio.run(test_funcs())
|
||||
|
||||
embedding_dimension = 1536
|
||||
|
||||
rag = LightRAG(
|
||||
working_dir=WORKING_DIR,
|
||||
llm_model_func=llm_model_func,
|
||||
|
|
@ -93,34 +225,42 @@ async def initialize_rag():
|
|||
),
|
||||
)
|
||||
|
||||
await rag.initialize_storages()
|
||||
await initialize_pipeline_status()
|
||||
def query_rag(question):
|
||||
"""
|
||||
Execute a query in hybrid mode.
|
||||
"""
|
||||
try:
|
||||
start_time = time.time()
|
||||
response = rag.query(
|
||||
question,
|
||||
param=QueryParam(mode="hybrid") # Set hybrid mode
|
||||
)
|
||||
if response is None or not response.strip():
|
||||
print("No relevant answer found. Check if the database is correctly populated.")
|
||||
return
|
||||
|
||||
return rag
|
||||
print("\n--- Answer ---")
|
||||
print(response)
|
||||
print("\n")
|
||||
duration = time.time() - start_time
|
||||
print(f"The answer took {duration} seconds.")
|
||||
except Exception as e:
|
||||
print(f"An error occurred while executing the query: {e}")
|
||||
|
||||
print("LightRAG - Interactive Questioning")
|
||||
print(f"Version: {VERSION}")
|
||||
print("Type 'exit' to terminate the program.\n")
|
||||
|
||||
def main():
|
||||
rag = asyncio.run(initialize_rag())
|
||||
while True:
|
||||
try:
|
||||
question = input("Ask your question: ")
|
||||
|
||||
book1 = open("./book_1.txt", encoding="utf-8")
|
||||
book2 = open("./book_2.txt", encoding="utf-8")
|
||||
if question.lower() in ["exit", "quit"]:
|
||||
print("Program terminated.")
|
||||
break
|
||||
query_rag(question)
|
||||
except:
|
||||
print("Error answering the question")
|
||||
|
||||
rag.insert([book1.read(), book2.read()])
|
||||
|
||||
query_text = "What are the main themes?"
|
||||
|
||||
print("Result (Naive):")
|
||||
print(rag.query(query_text, param=QueryParam(mode="naive")))
|
||||
|
||||
print("\nResult (Local):")
|
||||
print(rag.query(query_text, param=QueryParam(mode="local")))
|
||||
|
||||
print("\nResult (Global):")
|
||||
print(rag.query(query_text, param=QueryParam(mode="global")))
|
||||
|
||||
print("\nResult (Hybrid):")
|
||||
print(rag.query(query_text, param=QueryParam(mode="hybrid")))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
if __name__ == '__main__':
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue