diff --git a/env.example b/env.example
deleted file mode 100644
index 2f28b99d..00000000
--- a/env.example
+++ /dev/null
@@ -1,494 +0,0 @@
-### This is a sample .env file
-
-###########################
-### Server Configuration
-###########################
-HOST=0.0.0.0
-PORT=9621
-WEBUI_TITLE='My Graph KB'
-WEBUI_DESCRIPTION="Simple and Fast Graph Based RAG System"
-# WORKERS=2
-### Gunicorn worker timeout (used as the default LLM request timeout if LLM_TIMEOUT is not set)
-# TIMEOUT=150
-# CORS_ORIGINS=http://localhost:3000,http://localhost:8080
-
-### Optional SSL Configuration
-# SSL=true
-# SSL_CERTFILE=/path/to/cert.pem
-# SSL_KEYFILE=/path/to/key.pem
-
-### Directory Configuration (defaults to current working directory)
-### Default values are ./inputs and ./rag_storage
-# INPUT_DIR=
-# WORKING_DIR=
-
-### Tiktoken cache directory (store cached files in this folder for offline deployment)
-# TIKTOKEN_CACHE_DIR=/app/data/tiktoken
-
-### Ollama Emulating Model and Tag
-# OLLAMA_EMULATING_MODEL_NAME=lightrag
-OLLAMA_EMULATING_MODEL_TAG=latest
-
-### Max nodes for graph retrieval (ensure the WebUI local setting is also updated; it is capped by this value)
-# MAX_GRAPH_NODES=1000
-
-### Logging level
-# LOG_LEVEL=INFO
-# VERBOSE=False
-# LOG_MAX_BYTES=10485760
-# LOG_BACKUP_COUNT=5
-### Logfile location (defaults to current working directory)
-# LOG_DIR=/path/to/log/directory
-
-#####################################
-### Login and API-Key Configuration
-#####################################
-# AUTH_ACCOUNTS='admin:admin123,user1:pass456'
-# TOKEN_SECRET=Your-Key-For-LightRAG-API-Server
-# TOKEN_EXPIRE_HOURS=48
-# GUEST_TOKEN_EXPIRE_HOURS=24
-# JWT_ALGORITHM=HS256
-
-### API key to access the LightRAG Server API
-### Use this key in HTTP requests with the 'X-API-Key' header
-### Example: curl -H "X-API-Key: your-secure-api-key-here" http://localhost:9621/query
-# LIGHTRAG_API_KEY=your-secure-api-key-here
-# WHITELIST_PATHS=/health,/api/*
-
-######################################################################################
-### Query Configuration
-###
-### How to control the context length sent to the LLM:
-### MAX_ENTITY_TOKENS + MAX_RELATION_TOKENS < MAX_TOTAL_TOKENS
-### Chunk_Tokens = MAX_TOTAL_TOKENS - Actual_Entity_Tokens - Actual_Relation_Tokens
-######################################################################################
-# LLM response cache for queries (not applied to streaming responses)
-ENABLE_LLM_CACHE=true
-# COSINE_THRESHOLD=0.2
-### Number of entities or relations retrieved from the KG
-# TOP_K=40
-### Maximum number of chunks for naive vector search
-# CHUNK_TOP_K=20
-### Controls the actual entity tokens sent to the LLM
-# MAX_ENTITY_TOKENS=6000
-### Controls the actual relation tokens sent to the LLM
-# MAX_RELATION_TOKENS=8000
-### Controls the maximum tokens sent to the LLM (including entities, relations and chunks)
-# MAX_TOTAL_TOKENS=30000
-
-### Chunk selection strategies
-### VECTOR: Pick KG chunks by vector similarity; delivers chunks to the LLM that align more closely with naive retrieval
-### WEIGHT: Pick KG chunks by entity and chunk weight; delivers chunks more strongly tied to the KG to the LLM
-### If reranking is enabled, the impact of the chunk selection strategy is diminished.
-# KG_CHUNK_PICK_METHOD=VECTOR
-
-#########################################################
-### Reranking configuration
-### RERANK_BINDING type: null, cohere, jina, aliyun
-### For a rerank model deployed with vLLM, use the cohere binding
-#########################################################
-RERANK_BINDING=null
-### Enable rerank by default in query params when RERANK_BINDING is not null
-# RERANK_BY_DEFAULT=True
-### Rerank score chunk filter (set to 0.0 to keep all chunks; use 0.6 or above if the LLM is not strong enough)
-# MIN_RERANK_SCORE=0.0
-
-### For local deployment with vLLM
-# RERANK_MODEL=BAAI/bge-reranker-v2-m3
-# RERANK_BINDING_HOST=http://localhost:8000/v1/rerank
-# RERANK_BINDING_API_KEY=your_rerank_api_key_here
-
-### Default values for Cohere AI
-# RERANK_MODEL=rerank-v3.5
-# RERANK_BINDING_HOST=https://api.cohere.com/v2/rerank
-# RERANK_BINDING_API_KEY=your_rerank_api_key_here
-### Cohere rerank chunking configuration (useful for models with token limits like ColBERT)
-# RERANK_ENABLE_CHUNKING=true
-# RERANK_MAX_TOKENS_PER_DOC=480
-
-### Default values for Jina AI
-# RERANK_MODEL=jina-reranker-v2-base-multilingual
-# RERANK_BINDING_HOST=https://api.jina.ai/v1/rerank
-# RERANK_BINDING_API_KEY=your_rerank_api_key_here
-
-### Default values for Aliyun
-# RERANK_MODEL=gte-rerank-v2
-# RERANK_BINDING_HOST=https://dashscope.aliyuncs.com/api/v1/services/rerank/text-rerank/text-rerank
-# RERANK_BINDING_API_KEY=your_rerank_api_key_here
-
-########################################
-### Document processing configuration
-########################################
-ENABLE_LLM_CACHE_FOR_EXTRACT=true
-
-### Document processing output language: English, Chinese, French, German ...
-SUMMARY_LANGUAGE=English
-
-### PDF decryption password for protected PDF files
-# PDF_DECRYPT_PASSWORD=your_pdf_password_here
-
-### Entity types that the LLM will attempt to recognize
-# ENTITY_TYPES='["Person", "Creature", "Organization", "Location", "Event", "Concept", "Method", "Content", "Data", "Artifact", "NaturalObject"]'
-
-### Chunk size for document splitting; 500~1500 is recommended
-# CHUNK_SIZE=1200
-# CHUNK_OVERLAP_SIZE=100
-
-### Number of summary segments or tokens that triggers an LLM summary on entity/relation merge (at least 3 is recommended)
-# FORCE_LLM_SUMMARY_ON_MERGE=8
-### Max description token size that triggers an LLM summary
-# SUMMARY_MAX_TOKENS=1200
-### Recommended LLM summary output length in tokens
-# SUMMARY_LENGTH_RECOMMENDED=600
-### Maximum context size sent to the LLM for description summary
-# SUMMARY_CONTEXT_SIZE=12000
-
-### Controls the maximum number of chunk_ids stored in the vector and graph DB
-# MAX_SOURCE_IDS_PER_ENTITY=300
-# MAX_SOURCE_IDS_PER_RELATION=300
-### Controls the chunk_ids limitation method: FIFO, KEEP
-### FIFO: First in, first out
-### KEEP: Keep the oldest (fewer merge actions and faster)
-# SOURCE_IDS_LIMIT_METHOD=FIFO
-
-# Maximum number of file paths stored in the entity/relation file_path field (for display only; does not affect query performance)
-# MAX_FILE_PATHS=100
-
-### Maximum number of related chunks per source entity or relation
-### The chunk picker uses this value to determine the total number of chunks selected from the KG (knowledge graph)
-### Higher values increase re-ranking time
-# RELATED_CHUNK_NUMBER=5
-
-###############################
-### Concurrency Configuration
-###############################
-### Max concurrent LLM requests (for both query and document processing)
-MAX_ASYNC=4
-### Number of documents processed in parallel (between 2 and 10; MAX_ASYNC/3 is recommended)
-MAX_PARALLEL_INSERT=2
-### Max concurrent requests for Embedding
-# EMBEDDING_FUNC_MAX_ASYNC=8
-### Number of chunks sent to Embedding in a single request
-# EMBEDDING_BATCH_NUM=10
-
-###########################################################################
-### LLM Configuration
-### LLM_BINDING type: openai, ollama, lollms, azure_openai, aws_bedrock, gemini
-### LLM_BINDING_HOST: host only for Ollama; endpoint for other LLM services
-### If LightRAG is deployed in Docker:
-### use host.docker.internal instead of localhost in LLM_BINDING_HOST
-###########################################################################
-### LLM request timeout setting for all LLMs (0 means no timeout for Ollama)
-# LLM_TIMEOUT=180
-
-LLM_BINDING=openai
-LLM_MODEL=gpt-4o
-LLM_BINDING_HOST=https://api.openai.com/v1
-LLM_BINDING_API_KEY=your_api_key
-
-### Azure OpenAI example
-### Use the deployment name as the model name, or set AZURE_OPENAI_DEPLOYMENT instead
-# AZURE_OPENAI_API_VERSION=2024-08-01-preview
-# LLM_BINDING=azure_openai
-# LLM_BINDING_HOST=https://xxxx.openai.azure.com/
-# LLM_BINDING_API_KEY=your_api_key
-# LLM_MODEL=my-gpt-mini-deployment
-
-### OpenRouter example
-# LLM_MODEL=google/gemini-2.5-flash
-# LLM_BINDING_HOST=https://openrouter.ai/api/v1
-# LLM_BINDING_API_KEY=your_api_key
-# LLM_BINDING=openai
-
-### Gemini example
-# LLM_BINDING=gemini
-# LLM_MODEL=gemini-flash-latest
-# LLM_BINDING_API_KEY=your_gemini_api_key
-# LLM_BINDING_HOST=https://generativelanguage.googleapis.com
-
-### Use the following command to see all supported options for Gemini
-### lightrag-server --llm-binding gemini --help
-### Gemini Specific Parameters
-# GEMINI_LLM_MAX_OUTPUT_TOKENS=9000
-# GEMINI_LLM_TEMPERATURE=0.7
-### Enable Thinking
-# GEMINI_LLM_THINKING_CONFIG='{"thinking_budget": -1, "include_thoughts": true}'
-### Disable Thinking
-# GEMINI_LLM_THINKING_CONFIG='{"thinking_budget": 0, "include_thoughts": false}'
-
-### Use the following command to see all supported options for OpenAI, azure_openai or OpenRouter
-### lightrag-server --llm-binding openai --help
-### OpenAI Specific Parameters
-# OPENAI_LLM_REASONING_EFFORT=minimal
-### OpenRouter Specific Parameters
-# OPENAI_LLM_EXTRA_BODY='{"reasoning": {"enabled": false}}'
-### Qwen3 Specific Parameters (deployed with vLLM)
-# OPENAI_LLM_EXTRA_BODY='{"chat_template_kwargs": {"enable_thinking": false}}'
-
-### OpenAI Compatible API Specific Parameters
-### Increased temperature values may mitigate infinite inference loops in certain LLMs, such as Qwen3-30B.
-# OPENAI_LLM_TEMPERATURE=0.9
-### Set max_tokens to mitigate endless output from some LLMs (less than LLM_TIMEOUT * llm_output_tokens/second, e.g. 9000 = 180s * 50 tokens/s)
-### Typically, max_tokens does not include prompt content
-### For vLLM/SGLang-deployed models, and most OpenAI-compatible API providers
-# OPENAI_LLM_MAX_TOKENS=9000
-### OpenAI o1-mini and newer models use max_completion_tokens instead of max_tokens
-OPENAI_LLM_MAX_COMPLETION_TOKENS=9000
-
-### Use the following command to see all supported options for the Ollama LLM
-### lightrag-server --llm-binding ollama --help
-### Ollama Server Specific Parameters
-### OLLAMA_LLM_NUM_CTX must be provided and should be at least MAX_TOTAL_TOKENS + 2000
-OLLAMA_LLM_NUM_CTX=32768
-### Set max_output_tokens to mitigate endless output from some LLMs (less than LLM_TIMEOUT * llm_output_tokens/second, e.g. 9000 = 180s * 50 tokens/s)
-# OLLAMA_LLM_NUM_PREDICT=9000
-### Stop sequences for the Ollama LLM
-# OLLAMA_LLM_STOP='["", "<|EOT|>"]'
-
-### Bedrock Specific Parameters
-# BEDROCK_LLM_TEMPERATURE=1.0
-
-#######################################################################################
-### Embedding Configuration (should not be changed after the first file is processed)
-### EMBEDDING_BINDING: ollama, openai, azure_openai, jina, lollms, aws_bedrock
-### EMBEDDING_BINDING_HOST: host only for Ollama; endpoint for other Embedding services
-### If LightRAG is deployed in Docker:
-### use host.docker.internal instead of localhost in EMBEDDING_BINDING_HOST
-#######################################################################################
-# EMBEDDING_TIMEOUT=30
-
-### Controls whether to send the embedding_dim parameter to the embedding API
-### IMPORTANT: Jina ALWAYS sends the dimension parameter (API requirement) - this setting is ignored for Jina
-### For OpenAI: Set to 'true' to enable dynamic dimension adjustment
-### For OpenAI: Set to 'false' (default) to disable sending the dimension parameter
-### Note: Automatically ignored for backends that don't support the dimension parameter (e.g., Ollama)
-
-# Ollama embedding
-# EMBEDDING_BINDING=ollama
-# EMBEDDING_MODEL=bge-m3:latest
-# EMBEDDING_DIM=1024
-# EMBEDDING_BINDING_API_KEY=your_api_key
-### If LightRAG is deployed in Docker, use host.docker.internal instead of localhost
-# EMBEDDING_BINDING_HOST=http://localhost:11434
-
-### OpenAI compatible embedding
-EMBEDDING_BINDING=openai
-EMBEDDING_MODEL=text-embedding-3-large
-EMBEDDING_DIM=3072
-EMBEDDING_SEND_DIM=false
-EMBEDDING_TOKEN_LIMIT=8192
-EMBEDDING_BINDING_HOST=https://api.openai.com/v1
-EMBEDDING_BINDING_API_KEY=your_api_key
-
-### Optional for Azure embedding
-### Use the deployment name as the model name, or set AZURE_EMBEDDING_DEPLOYMENT instead
-# AZURE_EMBEDDING_API_VERSION=2024-08-01-preview
-# EMBEDDING_BINDING=azure_openai
-# EMBEDDING_BINDING_HOST=https://xxxx.openai.azure.com/
-# EMBEDDING_API_KEY=your_api_key
-# EMBEDDING_MODEL=my-text-embedding-3-large-deployment
-# EMBEDDING_DIM=3072
-
-### Gemini embedding
-# EMBEDDING_BINDING=gemini
-# EMBEDDING_MODEL=gemini-embedding-001
-# EMBEDDING_DIM=1536
-# EMBEDDING_TOKEN_LIMIT=2048
-# EMBEDDING_BINDING_HOST=https://generativelanguage.googleapis.com
-# EMBEDDING_BINDING_API_KEY=your_api_key
-### Gemini embedding requires sending the dimension to the server
-# EMBEDDING_SEND_DIM=true
-
-### Jina AI Embedding
-# EMBEDDING_BINDING=jina
-# EMBEDDING_BINDING_HOST=https://api.jina.ai/v1/embeddings
-# EMBEDDING_MODEL=jina-embeddings-v4
-# EMBEDDING_DIM=2048
-# EMBEDDING_BINDING_API_KEY=your_api_key
-
-### Optional for Ollama embedding
-OLLAMA_EMBEDDING_NUM_CTX=8192
-### Use the following command to see all supported options for Ollama embedding
-### lightrag-server --embedding-binding ollama --help
-
-####################################################################
-### WORKSPACE sets the workspace name for all storage types
-### for the purpose of isolating data between LightRAG instances.
-### Valid workspace name constraints: a-z, A-Z, 0-9, and _
-####################################################################
-# WORKSPACE=space1
-
-############################
-### Data storage selection
-############################
-### Default storage (Recommended for small-scale deployment)
-# LIGHTRAG_KV_STORAGE=JsonKVStorage
-# LIGHTRAG_DOC_STATUS_STORAGE=JsonDocStatusStorage
-# LIGHTRAG_GRAPH_STORAGE=NetworkXStorage
-# LIGHTRAG_VECTOR_STORAGE=NanoVectorDBStorage
-
-### Redis Storage (Recommended for production deployment)
-# LIGHTRAG_KV_STORAGE=RedisKVStorage
-# LIGHTRAG_DOC_STATUS_STORAGE=RedisDocStatusStorage
-
-### Vector Storage (Recommended for production deployment)
-# LIGHTRAG_VECTOR_STORAGE=MilvusVectorDBStorage
-# LIGHTRAG_VECTOR_STORAGE=QdrantVectorDBStorage
-# LIGHTRAG_VECTOR_STORAGE=FaissVectorDBStorage
-
-### Graph Storage (Recommended for production deployment)
-# LIGHTRAG_GRAPH_STORAGE=Neo4JStorage
-# LIGHTRAG_GRAPH_STORAGE=MemgraphStorage
-
-### PostgreSQL
-# LIGHTRAG_KV_STORAGE=PGKVStorage
-# LIGHTRAG_DOC_STATUS_STORAGE=PGDocStatusStorage
-# LIGHTRAG_GRAPH_STORAGE=PGGraphStorage
-# LIGHTRAG_VECTOR_STORAGE=PGVectorStorage
-
-### MongoDB (Vector storage only available on Atlas Cloud)
-# LIGHTRAG_KV_STORAGE=MongoKVStorage
-# LIGHTRAG_DOC_STATUS_STORAGE=MongoDocStatusStorage
-# LIGHTRAG_GRAPH_STORAGE=MongoGraphStorage
-# LIGHTRAG_VECTOR_STORAGE=MongoVectorDBStorage
-
-### PostgreSQL Configuration
-POSTGRES_HOST=localhost
-POSTGRES_PORT=5432
-POSTGRES_USER=your_username
-POSTGRES_PASSWORD='your_password'
-POSTGRES_DATABASE=your_database
-POSTGRES_MAX_CONNECTIONS=12
-### A DB-specific workspace should not be set; kept for compatibility only
-### POSTGRES_WORKSPACE=forced_workspace_name
-
-### PostgreSQL Vector Storage Configuration
-### Vector index type: HNSW, IVFFlat, VCHORDRQ
-POSTGRES_VECTOR_INDEX_TYPE=HNSW
-POSTGRES_HNSW_M=16
-POSTGRES_HNSW_EF=200
-POSTGRES_IVFFLAT_LISTS=100
-POSTGRES_VCHORDRQ_BUILD_OPTIONS=
-POSTGRES_VCHORDRQ_PROBES=
-POSTGRES_VCHORDRQ_EPSILON=1.9
-
-### PostgreSQL Connection Retry Configuration (Network Robustness)
-### Number of retry attempts (1-10, default: 3)
-### Initial retry backoff in seconds (0.1-5.0, default: 0.5)
-### Maximum retry backoff in seconds (backoff-60.0, default: 5.0)
-### Connection pool close timeout in seconds (1.0-30.0, default: 5.0)
-# POSTGRES_CONNECTION_RETRIES=3
-# POSTGRES_CONNECTION_RETRY_BACKOFF=0.5
-# POSTGRES_CONNECTION_RETRY_BACKOFF_MAX=5.0
-# POSTGRES_POOL_CLOSE_TIMEOUT=5.0
-
-### PostgreSQL SSL Configuration (Optional)
-# POSTGRES_SSL_MODE=require
-# POSTGRES_SSL_CERT=/path/to/client-cert.pem
-# POSTGRES_SSL_KEY=/path/to/client-key.pem
-# POSTGRES_SSL_ROOT_CERT=/path/to/ca-cert.pem
-# POSTGRES_SSL_CRL=/path/to/crl.pem
-
-### PostgreSQL Server Settings (for Supabase Supavisor)
-# Use this to pass extra options to the PostgreSQL connection string.
-# For Supabase, you might need to set it like this:
-# POSTGRES_SERVER_SETTINGS="options=reference%3D[project-ref]"
-
-# Default is 100; set to 0 to disable
-# POSTGRES_STATEMENT_CACHE_SIZE=100
-
-### Neo4j Configuration
-NEO4J_URI=neo4j+s://xxxxxxxx.databases.neo4j.io
-NEO4J_USERNAME=neo4j
-NEO4J_PASSWORD='your_password'
-NEO4J_DATABASE=neo4j
-NEO4J_MAX_CONNECTION_POOL_SIZE=100
-NEO4J_CONNECTION_TIMEOUT=30
-NEO4J_CONNECTION_ACQUISITION_TIMEOUT=30
-NEO4J_MAX_TRANSACTION_RETRY_TIME=30
-NEO4J_MAX_CONNECTION_LIFETIME=300
-NEO4J_LIVENESS_CHECK_TIMEOUT=30
-NEO4J_KEEP_ALIVE=true
-### A DB-specific workspace should not be set; kept for compatibility only
-### NEO4J_WORKSPACE=forced_workspace_name
-
-### MongoDB Configuration
-MONGO_URI=mongodb://root:root@localhost:27017/
-#MONGO_URI=mongodb+srv://xxxx
-MONGO_DATABASE=LightRAG
-# MONGODB_WORKSPACE=forced_workspace_name
-
-### Milvus Configuration
-MILVUS_URI=http://localhost:19530
-MILVUS_DB_NAME=lightrag
-# MILVUS_USER=root
-# MILVUS_PASSWORD=your_password
-# MILVUS_TOKEN=your_token
-### A DB-specific workspace should not be set; kept for compatibility only
-### MILVUS_WORKSPACE=forced_workspace_name
-
-### Qdrant
-QDRANT_URL=http://localhost:6333
-# QDRANT_API_KEY=your-api-key
-### A DB-specific workspace should not be set; kept for compatibility only
-### QDRANT_WORKSPACE=forced_workspace_name
-
-### Redis
-REDIS_URI=redis://localhost:6379
-REDIS_SOCKET_TIMEOUT=30
-REDIS_CONNECT_TIMEOUT=10
-REDIS_MAX_CONNECTIONS=100
-REDIS_RETRY_ATTEMPTS=3
-### A DB-specific workspace should not be set; kept for compatibility only
-### REDIS_WORKSPACE=forced_workspace_name
-
-### Memgraph Configuration
-MEMGRAPH_URI=bolt://localhost:7687
-MEMGRAPH_USERNAME=
-MEMGRAPH_PASSWORD=
-MEMGRAPH_DATABASE=memgraph
-### A DB-specific workspace should not be set; kept for compatibility only
-### MEMGRAPH_WORKSPACE=forced_workspace_name
-
-###########################################################
-### Langfuse Observability Configuration
-### Only works with LLMs served through an OpenAI-compatible API
-### Install with: pip install lightrag-hku[observability]
-### Sign up at: https://cloud.langfuse.com or self-host
-###########################################################
-# LANGFUSE_SECRET_KEY=""
-# LANGFUSE_PUBLIC_KEY=""
-# LANGFUSE_HOST="https://cloud.langfuse.com" # or your self-hosted instance address
-# LANGFUSE_ENABLE_TRACE=true
-
-############################
-### Evaluation Configuration
-############################
-### RAGAS evaluation models (used for RAG quality assessment)
-### ⚠️ IMPORTANT: Both LLM and Embedding endpoints MUST be OpenAI-compatible
-### Defaults to OpenAI models for evaluation
-
-### LLM Configuration for Evaluation
-# EVAL_LLM_MODEL=gpt-4o-mini
-### API key for LLM evaluation (falls back to OPENAI_API_KEY if not set)
-# EVAL_LLM_BINDING_API_KEY=your_api_key
-### Custom OpenAI-compatible endpoint for LLM evaluation (optional)
-# EVAL_LLM_BINDING_HOST=https://api.openai.com/v1
-
-### Embedding Configuration for Evaluation
-# EVAL_EMBEDDING_MODEL=text-embedding-3-large
-### API key for embeddings (fallback: EVAL_LLM_BINDING_API_KEY -> OPENAI_API_KEY)
-# EVAL_EMBEDDING_BINDING_API_KEY=your_embedding_api_key
-### Custom OpenAI-compatible endpoint for embeddings (fallback: EVAL_LLM_BINDING_HOST)
-# EVAL_EMBEDDING_BINDING_HOST=https://api.openai.com/v1
-
-### Performance Tuning
-### Number of concurrent test case evaluations
-### Lower values reduce API rate limit issues but increase evaluation time
-# EVAL_MAX_CONCURRENT=2
-### TOP_K query parameter of LightRAG (default: 10)
-### Number of entities or relations retrieved from the KG
-# EVAL_QUERY_TOP_K=10
-### LLM request retry and timeout settings for evaluation
-# EVAL_LLM_MAX_RETRIES=5
-# EVAL_LLM_TIMEOUT=180
diff --git a/examples/lightrag_openai_compatible_demo.py b/examples/lightrag_openai_compatible_demo.py
index abeb6347..db1e21e0 100644
--- a/examples/lightrag_openai_compatible_demo.py
+++ b/examples/lightrag_openai_compatible_demo.py
@@ -4,8 +4,7 @@ import inspect
 import logging
 import logging.config
 from lightrag import LightRAG, QueryParam
-from lightrag.llm.openai import openai_complete_if_cache
-from lightrag.llm.ollama import ollama_embed
+from lightrag.llm.openai import openai_complete_if_cache, openai_embed
 from lightrag.utils import EmbeddingFunc, logger, set_verbose_debug
 from dotenv import load_dotenv
 
@@ -109,14 +108,25 @@ async def initialize_rag():
         working_dir=WORKING_DIR,
         llm_model_func=llm_model_func,
         embedding_func=EmbeddingFunc(
-            embedding_dim=int(os.getenv("EMBEDDING_DIM", "1024")),
-            max_token_size=int(os.getenv("MAX_EMBED_TOKENS", "8192")),
-            func=lambda texts: ollama_embed(
+            embedding_dim=int(os.getenv("EMBEDDING_DIM", "3072")),
+            max_token_size=int(
+                os.getenv("EMBEDDING_TOKEN_LIMIT", os.getenv("MAX_EMBED_TOKENS", "8192"))
+            ),
+            func=lambda texts: openai_embed.func(
                 texts,
-                embed_model=os.getenv("EMBEDDING_MODEL", "bge-m3:latest"),
-                host=os.getenv("EMBEDDING_BINDING_HOST", "http://localhost:11434"),
+                model=os.getenv("EMBEDDING_MODEL", "text-embedding-3-large"),
+                base_url=os.getenv("EMBEDDING_BINDING_HOST"),
+                api_key=os.getenv("EMBEDDING_BINDING_API_KEY")
+                or os.getenv("OPENAI_API_KEY"),
+                embedding_dim=(
+                    int(os.getenv("EMBEDDING_DIM"))
+                    if os.getenv("EMBEDDING_SEND_DIM", "false").lower() == "true"
+                    else None
+                ),
             ),
         ),
+        chunk_token_size=120,
+        chunk_overlap_token_size=30
     )
 
     await rag.initialize_storages()  # Auto-initializes pipeline_status
@@ -155,15 +165,36 @@ async def main():
         print(f"Test dict: {test_text}")
         print(f"Detected embedding dimension: {embedding_dim}\n\n")
 
-        with open("./book.txt", "r", encoding="utf-8") as f:
-            await rag.ainsert(f.read())
+        # with open("./book.txt", "r", encoding="utf-8") as f:
+        #     await rag.ainsert(f.read())
+        text = """
+"Stuart Rosenberg (August 11, 1927 – March 15, 2007) was an American film and television director whose motion pictures include \"Cool Hand Luke\" (1967), \"Voyage of the Damned\" (1976), \"The Amityville Horror\" (1979), and \"The Pope of Greenwich Village\" (1984).",
+"He was noted for his work with actor Paul Newman."
+"Méditerranée is a 1963 French experimental film directed by Jean-Daniel Pollet with assistance from Volker Schlöndorff.",
+"It was written by Philippe Sollers and produced by Barbet Schroeder, with music by Antione Duhamel.",
+"The 45 minute film is cited as one of Pollet's most influential films, which according to Jonathan Rosenbaum directly influenced Jean-Luc Goddard's \"Contempt\", released later the same year.",
+"Footage for the film was shot around the Mediterranean, including at a Greek temple, a Sicilian garden, the sea, and also features a fisherman, a bullfighter, and a girl on an operating table."
+"Move is a 1970 American comedy film starring Elliott Gould, Paula Prentiss and Geneviève Waïte, and directed by Stuart Rosenberg.",
+"The screenplay was written by Joel Lieber and Stanley Hart, adapted from a novel by Lieber."
+"Ian Barry is an Australian director of film and TV."
+"Peter Levin is an American director of film, television and theatre."
+"Brian Johnson( born 1939 or 1940) is a British designer and director of film and television special effects." +"Rachel Feldman( born August 22, 1954) is an American director of film and television and screenwriter of television films." +"Hanro Smitsman, born in 1967 in Breda( Netherlands), is a writer and director of film and television." +"Jean-Daniel Pollet (1936–2004) was a French film director and screenwriter who was most active in the 1960s and 1970s.", +"He was associated with two approaches to filmmaking: comedies which blended burlesque and melancholic elements, and poetic films based on texts by writers such as the French poet Francis Ponge." +"Howard Winchel Koch( April 11, 1916 – February 16, 2001) was an American producer and director of film and television." +}""" + await rag.ainsert(text) + query = """Are director of film Move (1970 Film) and director of film Méditerranée (1963 Film) from the same country? +""" # Perform naive search print("\n=====================") print("Query mode: naive") print("=====================") resp = await rag.aquery( - "What are the top themes in this story?", + query, param=QueryParam(mode="naive", stream=True), ) if inspect.isasyncgen(resp): @@ -176,7 +207,7 @@ async def main(): print("Query mode: local") print("=====================") resp = await rag.aquery( - "What are the top themes in this story?", + query, param=QueryParam(mode="local", stream=True), ) if inspect.isasyncgen(resp): @@ -189,7 +220,7 @@ async def main(): print("Query mode: global") print("=====================") resp = await rag.aquery( - "What are the top themes in this story?", + query, param=QueryParam(mode="global", stream=True), ) if inspect.isasyncgen(resp): @@ -197,12 +228,12 @@ async def main(): else: print(resp) - # Perform hybrid search + #Perform hybrid search print("\n=====================") print("Query mode: hybrid") print("=====================") resp = await rag.aquery( - "What are the top themes in this story?", + query, param=QueryParam(mode="hybrid", stream=True), ) if inspect.isasyncgen(resp): @@ -210,6 +241,7 @@ async def main(): else: print(resp) + except Exception as e: print(f"An error occurred: {e}") finally: @@ -221,4 +253,4 @@ if __name__ == "__main__": # Configure logging before running the main function configure_logging() asyncio.run(main()) - print("\nDone!") + print("\nDone!") \ No newline at end of file