This commit is contained in:
Hammton 2025-11-18 01:58:22 +08:00 committed by GitHub
commit 98f5b401f4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -1,3 +1,4 @@
==========controled ingestion in batches=============
import os
import asyncio
from lightrag import LightRAG, QueryParam
@ -6,8 +7,8 @@ import numpy as np
from dotenv import load_dotenv
import logging
from openai import AzureOpenAI
from lightrag.kg.shared_storage import initialize_pipeline_status
import time
start_time = time.time()
logging.basicConfig(level=logging.INFO)
load_dotenv()
@ -20,14 +21,15 @@ AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_EMBEDDING_DEPLOYMENT = os.getenv("AZURE_EMBEDDING_DEPLOYMENT")
AZURE_EMBEDDING_API_VERSION = os.getenv("AZURE_EMBEDDING_API_VERSION")
WORKING_DIR = "./dickens"
WORKING_DIR = "C:\\Users\\user\\testfolder"
if os.path.exists(WORKING_DIR):
import shutil
# The code below removes the working_dir and creates a new one!
# if os.path.exists(WORKING_DIR):
# import shutil
shutil.rmtree(WORKING_DIR)
# shutil.rmtree(WORKING_DIR)
os.mkdir(WORKING_DIR)
# os.mkdir(WORKING_DIR)
async def llm_model_func(
@ -70,19 +72,149 @@ async def embedding_func(texts: list[str]) -> np.ndarray:
async def test_funcs():
result = await llm_model_func("How are you?")
print("Resposta do llm_model_func: ", result)
print("Response from llm_model_func: ", result)
result = await embedding_func(["How are you?"])
print("Resultado do embedding_func: ", result.shape)
print("Dimensão da embedding: ", result.shape[1])
print("Result from embedding_func: ", result.shape)
print("Embedding dimension: ", result.shape[1])
asyncio.run(test_funcs())
embedding_dimension = 3072
embedding_dimension = 1536
rag = LightRAG(
working_dir=WORKING_DIR,
addon_params={"insert_batch_size": 3},
llm_model_func=llm_model_func,
embedding_func=EmbeddingFunc(
embedding_dim=embedding_dimension,
max_token_size=8192,
func=embedding_func,
),
)
folder_path = 'C:/Users/example/test/LightRAG/my_docs' # With os, this specification of the documents folder is not a problem.
def normalize_path(path):
# Normalize the path
normalized_path = os.path.normpath(path)
# Replace backslashes with forward slashes
return normalized_path.replace('\\', '/')
# Output file where we store filenames
output_file = 'processed.txt'
# The maximum number of files to process
batch_files = 5
# Function to include document in vector-store
def process_doc(file_path):
with open(file_path, "r", encoding="utf-8") as f:
content = f.read()
return content
# logging.info(f"Adding document: {file_path}")
# rag.insert(content)
# Function to read existing filenames from the output file
def read_existing_files(output_file):
if os.path.exists(output_file):
with open(output_file, 'r') as f:
return set(line.strip() for line in f.readlines())
return set()
input_docs = []
# Check if the folder exists
if os.path.exists(folder_path) and os.path.isdir(folder_path):
# Read existing filenames from the output file
existing_files = read_existing_files(output_file)
# Open the output
async def initialize_rag():
==========Querier==============================================
import os
import asyncio
from lightrag import LightRAG, QueryParam
from lightrag.utils import EmbeddingFunc
import numpy as np
from dotenv import load_dotenv
import logging
from openai import AzureOpenAI
import time
from multiprocessing import freeze_support
# Configure logging
logging.basicConfig(level=logging.INFO)
def main():
start_time = time.time()
load_dotenv()
# Version of the script
VERSION = "0.2"
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")
AZURE_OPENAI_DEPLOYMENT = os.getenv("AZURE_OPENAI_DEPLOYMENT")
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_EMBEDDING_DEPLOYMENT = os.getenv("AZURE_EMBEDDING_DEPLOYMENT")
AZURE_EMBEDDING_API_VERSION = os.getenv("AZURE_EMBEDDING_API_VERSION")
WORKING_DIR = "C:/Users/user/testfolder"
async def llm_model_func(
prompt, system_prompt=None, history_messages=[], keyword_extraction=False, **kwargs
) -> str:
client = AzureOpenAI(
api_key=AZURE_OPENAI_API_KEY,
api_version=AZURE_OPENAI_API_VERSION,
azure_endpoint=AZURE_OPENAI_ENDPOINT,
)
messages = []
if system_prompt:
messages.append({"role": "system", "content": system_prompt})
if history_messages:
messages.extend(history_messages)
messages.append({"role": "user", "content": prompt})
chat_completion = client.chat.completions.create(
model=AZURE_OPENAI_DEPLOYMENT,
messages=messages,
temperature=kwargs.get("temperature", 0.5),
max_tokens=kwargs.get("max_tokens", 1500)
)
return chat_completion.choices[0].message.content
async def embedding_func(texts: list[str]) -> np.ndarray:
client = AzureOpenAI(
api_key=AZURE_OPENAI_API_KEY,
api_version=AZURE_EMBEDDING_API_VERSION,
azure_endpoint=AZURE_OPENAI_ENDPOINT,
)
embedding = client.embeddings.create(model=AZURE_EMBEDDING_DEPLOYMENT, input=texts)
embeddings = [item.embedding for item in embedding.data]
return np.array(embeddings)
# async def test_funcs():
# try:
# result_llm = await llm_model_func("How are you?", system_prompt="Act as a friendly assistant.")
# print("Response from llm_model_func: ", result_llm)
# result_embedding = await embedding_func(["How are you?"])
# print("Result from embedding_func: ", result_embedding.shape)
# print("Embedding dimension: ", result_embedding.shape[1])
# except Exception as e:
# print(f"An error occurred in test_funcs: {e}")
# asyncio.run(test_funcs())
embedding_dimension = 1536
rag = LightRAG(
working_dir=WORKING_DIR,
llm_model_func=llm_model_func,
@ -93,34 +225,42 @@ async def initialize_rag():
),
)
await rag.initialize_storages()
await initialize_pipeline_status()
def query_rag(question):
"""
Execute a query in hybrid mode.
"""
try:
start_time = time.time()
response = rag.query(
question,
param=QueryParam(mode="hybrid") # Set hybrid mode
)
if response is None or not response.strip():
print("No relevant answer found. Check if the database is correctly populated.")
return
return rag
print("\n--- Answer ---")
print(response)
print("\n")
duration = time.time() - start_time
print(f"The answer took {duration} seconds.")
except Exception as e:
print(f"An error occurred while executing the query: {e}")
print("LightRAG - Interactive Questioning")
print(f"Version: {VERSION}")
print("Type 'exit' to terminate the program.\n")
def main():
rag = asyncio.run(initialize_rag())
while True:
try:
question = input("Ask your question: ")
book1 = open("./book_1.txt", encoding="utf-8")
book2 = open("./book_2.txt", encoding="utf-8")
if question.lower() in ["exit", "quit"]:
print("Program terminated.")
break
query_rag(question)
except:
print("Error answering the question")
rag.insert([book1.read(), book2.read()])
query_text = "What are the main themes?"
print("Result (Naive):")
print(rag.query(query_text, param=QueryParam(mode="naive")))
print("\nResult (Local):")
print(rag.query(query_text, param=QueryParam(mode="local")))
print("\nResult (Global):")
print(rag.query(query_text, param=QueryParam(mode="global")))
print("\nResult (Hybrid):")
print(rag.query(query_text, param=QueryParam(mode="hybrid")))
if __name__ == "__main__":
main()
if __name__ == '__main__':