Merge c0df9447e8 into f83b475ab1

2025-11-18 01:58:22 +08:00 · 2025-11-18 01:58:22 +08:00 · 98f5b401f4
commit 98f5b401f4
parent f83b475ab1 c0df9447e8
1 changed files with 178 additions and 38 deletions
--- a/examples/lightrag_azure_openai_demo.py
+++ b/examples/lightrag_azure_openai_demo.py
@ -1,3 +1,4 @@
+==========controled ingestion in batches=============
 import os
 import asyncio
 from lightrag import LightRAG, QueryParam
@ -6,8 +7,8 @@ import numpy as np
 from dotenv import load_dotenv
 import logging
 from openai import AzureOpenAI
-from lightrag.kg.shared_storage import initialize_pipeline_status
-
+import time
+start_time = time.time() 
 logging.basicConfig(level=logging.INFO)

 load_dotenv()
@ -20,14 +21,15 @@ AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
 AZURE_EMBEDDING_DEPLOYMENT = os.getenv("AZURE_EMBEDDING_DEPLOYMENT")
 AZURE_EMBEDDING_API_VERSION = os.getenv("AZURE_EMBEDDING_API_VERSION")

-WORKING_DIR = "./dickens"
+WORKING_DIR = "C:\\Users\\user\\testfolder" 

-if os.path.exists(WORKING_DIR):
-    import shutil
+# The code below removes the working_dir and creates a new one!
+# if os.path.exists(WORKING_DIR):
+#     import shutil

-    shutil.rmtree(WORKING_DIR)
+#     shutil.rmtree(WORKING_DIR)

-os.mkdir(WORKING_DIR)
+# os.mkdir(WORKING_DIR)


 async def llm_model_func(
@ -70,19 +72,149 @@ async def embedding_func(texts: list[str]) -> np.ndarray:

 async def test_funcs():
    result = await llm_model_func("How are you?")
-    print("Resposta do llm_model_func: ", result)
+    print("Response from llm_model_func: ", result)

    result = await embedding_func(["How are you?"])
-    print("Resultado do embedding_func: ", result.shape)
-    print("Dimensão da embedding: ", result.shape[1])
+    print("Result from embedding_func: ", result.shape)
+    print("Embedding dimension: ", result.shape[1])


 asyncio.run(test_funcs())

-embedding_dimension = 3072
+embedding_dimension = 1536
+
+rag = LightRAG(
+    working_dir=WORKING_DIR,
+    addon_params={"insert_batch_size": 3},
+    llm_model_func=llm_model_func,
+    embedding_func=EmbeddingFunc(
+        embedding_dim=embedding_dimension,
+        max_token_size=8192,
+        func=embedding_func,
+    ),
+)
+
+folder_path = 'C:/Users/example/test/LightRAG/my_docs' # With os, this specification of the documents folder is not a problem.
+
+def normalize_path(path):
+    # Normalize the path
+    normalized_path = os.path.normpath(path)
+    # Replace backslashes with forward slashes
+    return normalized_path.replace('\\', '/')
+
+# Output file where we store filenames
+output_file = 'processed.txt'
+# The maximum number of files to process
+batch_files = 5 
+
+# Function to include document in vector-store
+def process_doc(file_path):
+    with open(file_path, "r", encoding="utf-8") as f:
+        content = f.read()
+    return content
+
+#     logging.info(f"Adding document: {file_path}")
+#     rag.insert(content)
+
+# Function to read existing filenames from the output file
+def read_existing_files(output_file):
+    if os.path.exists(output_file):
+        with open(output_file, 'r') as f:
+            return set(line.strip() for line in f.readlines())
+    return set()
+
+input_docs = []
+# Check if the folder exists
+if os.path.exists(folder_path) and os.path.isdir(folder_path):
+    # Read existing filenames from the output file
+    existing_files = read_existing_files(output_file)
+    # Open the output


-async def initialize_rag():
+==========Querier==============================================
+
+import os
+import asyncio
+from lightrag import LightRAG, QueryParam
+from lightrag.utils import EmbeddingFunc
+import numpy as np
+from dotenv import load_dotenv
+import logging
+from openai import AzureOpenAI
+import time
+from multiprocessing import freeze_support
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+
+def main():
+    start_time = time.time()
+    load_dotenv()
+
+    # Version of the script
+    VERSION = "0.2"
+
+    AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")
+    AZURE_OPENAI_DEPLOYMENT = os.getenv("AZURE_OPENAI_DEPLOYMENT")
+    AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
+    AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
+
+    AZURE_EMBEDDING_DEPLOYMENT = os.getenv("AZURE_EMBEDDING_DEPLOYMENT")
+    AZURE_EMBEDDING_API_VERSION = os.getenv("AZURE_EMBEDDING_API_VERSION")
+
+    WORKING_DIR = "C:/Users/user/testfolder" 
+
+    async def llm_model_func(
+        prompt, system_prompt=None, history_messages=[], keyword_extraction=False, **kwargs
+    ) -> str:
+        client = AzureOpenAI(
+            api_key=AZURE_OPENAI_API_KEY,
+            api_version=AZURE_OPENAI_API_VERSION,
+            azure_endpoint=AZURE_OPENAI_ENDPOINT,
+        )
+
+        messages = []
+        if system_prompt:
+            messages.append({"role": "system", "content": system_prompt})
+        if history_messages:
+            messages.extend(history_messages)
+        messages.append({"role": "user", "content": prompt})
+
+        chat_completion = client.chat.completions.create(
+            model=AZURE_OPENAI_DEPLOYMENT,
+            messages=messages,
+            temperature=kwargs.get("temperature", 0.5),
+            max_tokens=kwargs.get("max_tokens", 1500)
+        )
+
+        return chat_completion.choices[0].message.content
+
+    async def embedding_func(texts: list[str]) -> np.ndarray:
+        client = AzureOpenAI(
+            api_key=AZURE_OPENAI_API_KEY,
+            api_version=AZURE_EMBEDDING_API_VERSION,
+            azure_endpoint=AZURE_OPENAI_ENDPOINT,
+        )
+        embedding = client.embeddings.create(model=AZURE_EMBEDDING_DEPLOYMENT, input=texts)
+
+        embeddings = [item.embedding for item in embedding.data]
+        return np.array(embeddings)
+
+    # async def test_funcs():
+    #     try:
+    #         result_llm = await llm_model_func("How are you?", system_prompt="Act as a friendly assistant.")
+    #         print("Response from llm_model_func: ", result_llm)
+
+    #         result_embedding = await embedding_func(["How are you?"])
+    #         print("Result from embedding_func: ", result_embedding.shape)
+    #         print("Embedding dimension: ", result_embedding.shape[1])
+    #     except Exception as e:
+    #         print(f"An error occurred in test_funcs: {e}")
+
+    # asyncio.run(test_funcs())
+
+    embedding_dimension = 1536
+
    rag = LightRAG(
        working_dir=WORKING_DIR,
        llm_model_func=llm_model_func,
@ -93,34 +225,42 @@ async def initialize_rag():
        ),
    )

-    await rag.initialize_storages()
-    await initialize_pipeline_status()
+    def query_rag(question):
+        """
+        Execute a query in hybrid mode.
+        """
+        try:
+            start_time = time.time()
+            response = rag.query(
+                question,
+                param=QueryParam(mode="hybrid")  # Set hybrid mode
+            )
+            if response is None or not response.strip():
+                print("No relevant answer found. Check if the database is correctly populated.")
+                return

-    return rag
+            print("\n--- Answer ---")
+            print(response)
+            print("\n")
+            duration = time.time() - start_time  
+            print(f"The answer took {duration} seconds.")      
+        except Exception as e:
+            print(f"An error occurred while executing the query: {e}")

+    print("LightRAG - Interactive Questioning")
+    print(f"Version: {VERSION}")
+    print("Type 'exit' to terminate the program.\n")

-def main():
-    rag = asyncio.run(initialize_rag())
+    while True:
+        try:
+            question = input("Ask your question: ")

-    book1 = open("./book_1.txt", encoding="utf-8")
-    book2 = open("./book_2.txt", encoding="utf-8")
+            if question.lower() in ["exit", "quit"]:
+                print("Program terminated.")
+                break
+            query_rag(question)    
+        except:
+            print("Error answering the question")

-    rag.insert([book1.read(), book2.read()])
-
-    query_text = "What are the main themes?"
-
-    print("Result (Naive):")
-    print(rag.query(query_text, param=QueryParam(mode="naive")))
-
-    print("\nResult (Local):")
-    print(rag.query(query_text, param=QueryParam(mode="local")))
-
-    print("\nResult (Global):")
-    print(rag.query(query_text, param=QueryParam(mode="global")))
-
-    print("\nResult (Hybrid):")
-    print(rag.query(query_text, param=QueryParam(mode="hybrid")))
-
-
-if __name__ == "__main__":
-    main()
+if __name__ == '__main__':
+