From f1dafa0d0135aae8a071bf6805c0b9012b5681fa Mon Sep 17 00:00:00 2001 From: yangdx Date: Wed, 13 Aug 2025 18:16:42 +0800 Subject: [PATCH 01/21] feat: KG related chunks selection by vector similarity - Add env switch to toggle weighted polling vs vector-similarity strategy - Implement similarity-based sorting with fallback to weighted - Introduce batch vector read API for vector storage - Implement vector store and retrive funtion for Nanovector DB - Preserve default behavior (weighted polling selection method) --- lightrag/base.py | 13 + lightrag/constants.py | 1 + lightrag/kg/nano_vector_db_impl.py | 44 ++- lightrag/lightrag.py | 6 + lightrag/operate.py | 414 +++++++++++++++++++---------- lightrag/utils.py | 111 +++++++- 6 files changed, 440 insertions(+), 149 deletions(-) diff --git a/lightrag/base.py b/lightrag/base.py index 0e651f7b..9ba34280 100644 --- a/lightrag/base.py +++ b/lightrag/base.py @@ -290,6 +290,19 @@ class BaseVectorStorage(StorageNameSpace, ABC): ids: List of vector IDs to be deleted """ + @abstractmethod + async def get_vectors_by_ids(self, ids: list[str]) -> dict[str, list[float]]: + """Get vectors by their IDs, returning only ID and vector data for efficiency + + Args: + ids: List of unique identifiers + + Returns: + Dictionary mapping IDs to their vector embeddings + Format: {id: [vector_values], ...} + """ + pass + @dataclass class BaseKVStorage(StorageNameSpace, ABC): diff --git a/lightrag/constants.py b/lightrag/constants.py index e66fe0ae..895852bd 100644 --- a/lightrag/constants.py +++ b/lightrag/constants.py @@ -27,6 +27,7 @@ DEFAULT_MAX_RELATION_TOKENS = 10000 DEFAULT_MAX_TOTAL_TOKENS = 30000 DEFAULT_COSINE_THRESHOLD = 0.2 DEFAULT_RELATED_CHUNK_NUMBER = 5 +DEFAULT_KG_CHUNK_PICK_METHOD = "WEIGHT" # Deprated: history message have negtive effect on query performance DEFAULT_HISTORY_TURNS = 0 diff --git a/lightrag/kg/nano_vector_db_impl.py b/lightrag/kg/nano_vector_db_impl.py index b5ce2aa3..5bec06f4 100644 --- a/lightrag/kg/nano_vector_db_impl.py +++ b/lightrag/kg/nano_vector_db_impl.py @@ -1,5 +1,7 @@ import asyncio +import base64 import os +import zlib from typing import Any, final from dataclasses import dataclass import numpy as np @@ -93,8 +95,7 @@ class NanoVectorDBStorage(BaseVectorStorage): 2. 
Only one process should updating the storage at a time before index_done_callback, KG-storage-log should be used to avoid data corruption """ - - logger.debug(f"[{self.workspace}] Inserting {len(data)} to {self.namespace}") + # logger.debug(f"[{self.workspace}] Inserting {len(data)} to {self.namespace}") if not data: return @@ -120,6 +121,11 @@ class NanoVectorDBStorage(BaseVectorStorage): embeddings = np.concatenate(embeddings_list) if len(embeddings) == len(list_data): for i, d in enumerate(list_data): + # Compress vector using Float16 + zlib + Base64 for storage optimization + vector_f16 = embeddings[i].astype(np.float16) + compressed_vector = zlib.compress(vector_f16.tobytes()) + encoded_vector = base64.b64encode(compressed_vector).decode("utf-8") + d["vector"] = encoded_vector d["__vector__"] = embeddings[i] client = await self._get_client() results = client.upsert(datas=list_data) @@ -147,7 +153,7 @@ class NanoVectorDBStorage(BaseVectorStorage): ) results = [ { - **dp, + **{k: v for k, v in dp.items() if k != "vector"}, "id": dp["__id__"], "distance": dp["__metrics__"], "created_at": dp.get("__created_at__"), @@ -296,7 +302,7 @@ class NanoVectorDBStorage(BaseVectorStorage): if result: dp = result[0] return { - **dp, + **{k: v for k, v in dp.items() if k != "vector"}, "id": dp.get("__id__"), "created_at": dp.get("__created_at__"), } @@ -318,13 +324,41 @@ class NanoVectorDBStorage(BaseVectorStorage): results = client.get(ids) return [ { - **dp, + **{k: v for k, v in dp.items() if k != "vector"}, "id": dp.get("__id__"), "created_at": dp.get("__created_at__"), } for dp in results ] + async def get_vectors_by_ids(self, ids: list[str]) -> dict[str, list[float]]: + """Get vectors by their IDs, returning only ID and vector data for efficiency + + Args: + ids: List of unique identifiers + + Returns: + Dictionary mapping IDs to their vector embeddings + Format: {id: [vector_values], ...} + """ + if not ids: + return {} + + client = await self._get_client() + results = client.get(ids) + + vectors_dict = {} + for result in results: + if result and "vector" in result and "__id__" in result: + # Decompress vector data (Base64 + zlib + Float16 compressed) + decoded = base64.b64decode(result["vector"]) + decompressed = zlib.decompress(decoded) + vector_f16 = np.frombuffer(decompressed, dtype=np.float16) + vector_f32 = vector_f16.astype(np.float32).tolist() + vectors_dict[result["__id__"]] = vector_f32 + + return vectors_dict + async def drop(self) -> dict[str, str]: """Drop all vector data from storage and clean up resources diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index d6822296..4ffea343 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -31,6 +31,7 @@ from lightrag.constants import ( DEFAULT_MAX_TOTAL_TOKENS, DEFAULT_COSINE_THRESHOLD, DEFAULT_RELATED_CHUNK_NUMBER, + DEFAULT_KG_CHUNK_PICK_METHOD, DEFAULT_MIN_RERANK_SCORE, DEFAULT_SUMMARY_MAX_TOKENS, DEFAULT_MAX_ASYNC, @@ -175,6 +176,11 @@ class LightRAG: ) """Number of related chunks to grab from single entity or relation.""" + kg_chunk_pick_method: str = field( + default=get_env_value("KG_CHUNK_PICK_METHOD", DEFAULT_KG_CHUNK_PICK_METHOD, str) + ) + """Method for selecting text chunks: 'WEIGHT' for weight-based selection, 'VECTOR' for embedding similarity-based selection.""" + # Entity extraction # --- diff --git a/lightrag/operate.py b/lightrag/operate.py index dd8d54be..60f5bc6c 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -28,6 +28,7 @@ from .utils import ( update_chunk_cache_list, 
remove_think_tags, linear_gradient_weighted_polling, + vector_similarity_sorting, process_chunks_unified, build_file_path, ) @@ -2349,19 +2350,23 @@ async def _build_query_context( # Get text chunks based on final filtered data if final_node_datas: - entity_chunks = await _find_most_related_text_unit_from_entities( + entity_chunks = await _find_related_text_unit_from_entities( final_node_datas, query_param, text_chunks_db, knowledge_graph_inst, + query, + chunks_vdb, ) if final_edge_datas: - relation_chunks = await _find_related_text_unit_from_relationships( + relation_chunks = await _find_related_text_unit_from_relations( final_edge_datas, query_param, text_chunks_db, entity_chunks, + query, + chunks_vdb, ) # Round-robin merge chunks from different sources with deduplication by chunk_id @@ -2410,7 +2415,7 @@ async def _build_query_context( } ) - logger.debug( + logger.info( f"Round-robin merged total chunks from {origin_len} to {len(merged_chunks)}" ) @@ -2611,104 +2616,6 @@ async def _get_node_data( return node_datas, use_relations -async def _find_most_related_text_unit_from_entities( - node_datas: list[dict], - query_param: QueryParam, - text_chunks_db: BaseKVStorage, - knowledge_graph_inst: BaseGraphStorage, -): - """ - Find text chunks related to entities using linear gradient weighted polling algorithm. - - This function implements the optimized text chunk selection strategy: - 1. Sort text chunks for each entity by occurrence count in other entities - 2. Use linear gradient weighted polling to select chunks fairly - """ - logger.debug(f"Searching text chunks for {len(node_datas)} entities") - - if not node_datas: - return [] - - # Step 1: Collect all text chunks for each entity - entities_with_chunks = [] - for entity in node_datas: - if entity.get("source_id"): - chunks = split_string_by_multi_markers( - entity["source_id"], [GRAPH_FIELD_SEP] - ) - if chunks: - entities_with_chunks.append( - { - "entity_name": entity["entity_name"], - "chunks": chunks, - "entity_data": entity, - } - ) - - if not entities_with_chunks: - logger.warning("No entities with text chunks found") - return [] - - # Step 2: Count chunk occurrences and deduplicate (keep chunks from earlier positioned entities) - chunk_occurrence_count = {} - for entity_info in entities_with_chunks: - deduplicated_chunks = [] - for chunk_id in entity_info["chunks"]: - chunk_occurrence_count[chunk_id] = ( - chunk_occurrence_count.get(chunk_id, 0) + 1 - ) - - # If this is the first occurrence (count == 1), keep it; otherwise skip (duplicate from later position) - if chunk_occurrence_count[chunk_id] == 1: - deduplicated_chunks.append(chunk_id) - # count > 1 means this chunk appeared in an earlier entity, so skip it - - # Update entity's chunks to deduplicated chunks - entity_info["chunks"] = deduplicated_chunks - - # Step 3: Sort chunks for each entity by occurrence count (higher count = higher priority) - for entity_info in entities_with_chunks: - sorted_chunks = sorted( - entity_info["chunks"], - key=lambda chunk_id: chunk_occurrence_count.get(chunk_id, 0), - reverse=True, - ) - entity_info["sorted_chunks"] = sorted_chunks - - # Step 4: Apply linear gradient weighted polling algorithm - max_related_chunks = text_chunks_db.global_config.get( - "related_chunk_number", DEFAULT_RELATED_CHUNK_NUMBER - ) - - selected_chunk_ids = linear_gradient_weighted_polling( - entities_with_chunks, max_related_chunks, min_related_chunks=1 - ) - - logger.debug( - f"Found {len(selected_chunk_ids)} entity-related chunks using linear gradient 
weighted polling" - ) - - if not selected_chunk_ids: - return [] - - # Step 5: Batch retrieve chunk data - unique_chunk_ids = list( - dict.fromkeys(selected_chunk_ids) - ) # Remove duplicates while preserving order - chunk_data_list = await text_chunks_db.get_by_ids(unique_chunk_ids) - - # Step 6: Build result chunks with valid data - result_chunks = [] - for chunk_id, chunk_data in zip(unique_chunk_ids, chunk_data_list): - if chunk_data is not None and "content" in chunk_data: - chunk_data_copy = chunk_data.copy() - chunk_data_copy["source_type"] = "entity" - chunk_data_copy["chunk_id"] = chunk_id # Add chunk_id for deduplication - result_chunks.append(chunk_data_copy) - - return result_chunks - - async def _find_most_related_edges_from_entities( node_datas: list[dict], query_param: QueryParam, @@ -2765,6 +2672,162 @@ async def _find_most_related_edges_from_entities( return all_edges_data +async def _find_related_text_unit_from_entities( + node_datas: list[dict], + query_param: QueryParam, + text_chunks_db: BaseKVStorage, + knowledge_graph_inst: BaseGraphStorage, + query: str = None, + chunks_vdb: BaseVectorStorage = None, +): + """ + Find text chunks related to entities using configurable chunk selection method. + + This function supports two chunk selection strategies: + 1. WEIGHT: Linear gradient weighted polling based on chunk occurrence count + 2. VECTOR: Vector similarity-based selection using embedding cosine similarity + """ + logger.debug(f"Finding text chunks from {len(node_datas)} entities") + + if not node_datas: + return [] + + # Step 1: Collect all text chunks for each entity + entities_with_chunks = [] + for entity in node_datas: + if entity.get("source_id"): + chunks = split_string_by_multi_markers( + entity["source_id"], [GRAPH_FIELD_SEP] + ) + if chunks: + entities_with_chunks.append( + { + "entity_name": entity["entity_name"], + "chunks": chunks, + "entity_data": entity, + } + ) + + if not entities_with_chunks: + logger.warning("No entities with text chunks found") + return [] + + # Check chunk selection method from environment variable + kg_chunk_pick_method = os.getenv("KG_CHUNK_PICK_METHOD", "WEIGHT").upper() + + max_related_chunks = text_chunks_db.global_config.get( + "related_chunk_number", DEFAULT_RELATED_CHUNK_NUMBER + ) + # Step 2: Count chunk occurrences and deduplicate (keep chunks from earlier positioned entities) + chunk_occurrence_count = {} + for entity_info in entities_with_chunks: + deduplicated_chunks = [] + for chunk_id in entity_info["chunks"]: + chunk_occurrence_count[chunk_id] = ( + chunk_occurrence_count.get(chunk_id, 0) + 1 + ) + + # If this is the first occurrence (count == 1), keep it; otherwise skip (duplicate from later position) + if chunk_occurrence_count[chunk_id] == 1: + deduplicated_chunks.append(chunk_id) + # count > 1 means this chunk appeared in an earlier entity, so skip it + + # Update entity's chunks to deduplicated chunks + entity_info["chunks"] = deduplicated_chunks + + # Step 3: Sort chunks for each entity by occurrence count (higher count = higher priority) + total_entity_chunks = 0 + for entity_info in entities_with_chunks: + sorted_chunks = sorted( + entity_info["chunks"], + key=lambda chunk_id: chunk_occurrence_count.get(chunk_id, 0), + reverse=True, + ) + entity_info["sorted_chunks"] = sorted_chunks + total_entity_chunks += len(sorted_chunks) + + # Step 4: Apply the selected chunk selection algorithm + selected_chunk_ids = [] # Initialize to avoid UnboundLocalError + + if kg_chunk_pick_method == "VECTOR" and query and 
chunks_vdb: + # Calculate num_of_chunks = max_related_chunks * len(entity_info) / 2 + num_of_chunks = int(max_related_chunks * len(entities_with_chunks) / 2) + + # Get embedding function from global config + embedding_func_config = text_chunks_db.embedding_func + if not embedding_func_config: + logger.warning("No embedding function found, falling back to WEIGHT method") + kg_chunk_pick_method = "WEIGHT" + else: + try: + # Extract the actual callable function from EmbeddingFunc + if hasattr(embedding_func_config, "func"): + actual_embedding_func = embedding_func_config.func + else: + logger.warning( + "Invalid embedding function format, falling back to WEIGHT method" + ) + kg_chunk_pick_method = "WEIGHT" + actual_embedding_func = None + + selected_chunk_ids = None + if actual_embedding_func: + selected_chunk_ids = await vector_similarity_sorting( + query=query, + text_chunks_storage=text_chunks_db, + chunks_vdb=chunks_vdb, + num_of_chunks=num_of_chunks, + entity_info=entities_with_chunks, + embedding_func=actual_embedding_func, + ) + + if selected_chunk_ids == []: + kg_chunk_pick_method = "WEIGHT" + logger.warning( + "No entity-related chunks selected by vector similarity, falling back to WEIGHT method" + ) + else: + logger.info( + f"Selecting {len(selected_chunk_ids)} from {total_entity_chunks} entity-related chunks by vector similarity" + ) + + except Exception as e: + logger.error( + f"Error in vector similarity sorting: {e}, falling back to WEIGHT method" + ) + kg_chunk_pick_method = "WEIGHT" + + if kg_chunk_pick_method == "WEIGHT": + # Apply linear gradient weighted polling algorithm + selected_chunk_ids = linear_gradient_weighted_polling( + entities_with_chunks, max_related_chunks, min_related_chunks=1 + ) + + logger.debug( + f"Selecting {len(selected_chunk_ids)} from {total_entity_chunks} entity-related chunks by weighted polling" + ) + + if not selected_chunk_ids: + return [] + + # Step 5: Batch retrieve chunk data + unique_chunk_ids = list( + dict.fromkeys(selected_chunk_ids) + ) # Remove duplicates while preserving order + chunk_data_list = await text_chunks_db.get_by_ids(unique_chunk_ids) + + # Step 6: Build result chunks with valid data + result_chunks = [] + for chunk_id, chunk_data in zip(unique_chunk_ids, chunk_data_list): + if chunk_data is not None and "content" in chunk_data: + chunk_data_copy = chunk_data.copy() + chunk_data_copy["source_type"] = "entity" + chunk_data_copy["chunk_id"] = chunk_id # Add chunk_id for deduplication + result_chunks.append(chunk_data_copy) + + return result_chunks + + async def _get_edge_data( keywords, knowledge_graph_inst: BaseGraphStorage, @@ -2856,20 +2919,22 @@ async def _find_most_related_entities_from_relationships( return node_datas -async def _find_related_text_unit_from_relationships( +async def _find_related_text_unit_from_relations( edge_datas: list[dict], query_param: QueryParam, text_chunks_db: BaseKVStorage, entity_chunks: list[dict] = None, + query: str = None, + chunks_vdb: BaseVectorStorage = None, ): """ - Find text chunks related to relationships using linear gradient weighted polling algorithm. + Find text chunks related to relationships using configurable chunk selection method. - This function implements the optimized text chunk selection strategy: - 1. Sort text chunks for each relationship by occurrence count in other relationships - 2. Use linear gradient weighted polling to select chunks fairly + This function supports two chunk selection strategies: + 1. 
WEIGHT: Linear gradient weighted polling based on chunk occurrence count + 2. VECTOR: Vector similarity-based selection using embedding cosine similarity """ - logger.debug(f"Searching text chunks for {len(edge_datas)} relationships") + logger.debug(f"Finding text chunks from {len(edge_datas)} relations") if not edge_datas: return [] @@ -2899,14 +2964,40 @@ async def _find_related_text_unit_from_relationships( ) if not relations_with_chunks: - logger.warning("No relationships with text chunks found") + logger.warning("No relation-related chunks found") return [] + # Check chunk selection method from environment variable + kg_chunk_pick_method = os.getenv("KG_CHUNK_PICK_METHOD", "WEIGHT").upper() + + max_related_chunks = text_chunks_db.global_config.get( + "related_chunk_number", DEFAULT_RELATED_CHUNK_NUMBER + ) + # Step 2: Count chunk occurrences and deduplicate (keep chunks from earlier positioned relationships) + # Also remove duplicates with entity_chunks + + # Extract chunk IDs from entity_chunks for deduplication + entity_chunk_ids = set() + if entity_chunks: + for chunk in entity_chunks: + chunk_id = chunk.get("chunk_id") + if chunk_id: + entity_chunk_ids.add(chunk_id) + chunk_occurrence_count = {} + # Track unique chunk_ids that have been removed to avoid double counting + removed_entity_chunk_ids = set() + for relation_info in relations_with_chunks: deduplicated_chunks = [] for chunk_id in relation_info["chunks"]: + # Skip chunks that already exist in entity_chunks + if chunk_id in entity_chunk_ids: + # Only count each unique chunk_id once + removed_entity_chunk_ids.add(chunk_id) + continue + chunk_occurrence_count[chunk_id] = ( chunk_occurrence_count.get(chunk_id, 0) + 1 ) @@ -2919,6 +3010,20 @@ async def _find_related_text_unit_from_relationships( # Update relationship's chunks to deduplicated chunks relation_info["chunks"] = deduplicated_chunks + # Check if any relations still have chunks after deduplication + relations_with_chunks = [ + relation_info + for relation_info in relations_with_chunks + if relation_info["chunks"] + ] + + logger.info( + f"Find {len(relations_with_chunks)} additional relations-related chunks ({len(removed_entity_chunk_ids)} duplicated chunks removed)" + ) + + if not relations_with_chunks: + return [] + # Step 3: Sort chunks for each relationship by occurrence count (higher count = higher priority) for relation_info in relations_with_chunks: sorted_chunks = sorted( @@ -2928,50 +3033,73 @@ async def _find_related_text_unit_from_relationships( ) relation_info["sorted_chunks"] = sorted_chunks - # Step 4: Apply linear gradient weighted polling algorithm - max_related_chunks = text_chunks_db.global_config.get( - "related_chunk_number", DEFAULT_RELATED_CHUNK_NUMBER - ) + # Step 4: Apply the selected chunk selection algorithm + selected_chunk_ids = [] # Initialize to avoid UnboundLocalError - selected_chunk_ids = linear_gradient_weighted_polling( - relations_with_chunks, max_related_chunks, min_related_chunks=1 - ) + if kg_chunk_pick_method == "VECTOR" and query and chunks_vdb: + # Calculate num_of_chunks = max_related_chunks * len(entity_info) / 2 + num_of_chunks = int(max_related_chunks * len(relations_with_chunks) / 2) + + # Get embedding function from global config + embedding_func_config = text_chunks_db.embedding_func + if not embedding_func_config: + logger.warning("No embedding function found, falling back to WEIGHT method") + kg_chunk_pick_method = "WEIGHT" + else: + try: + # Extract the actual callable function from EmbeddingFunc + if 
hasattr(embedding_func_config, "func"): + actual_embedding_func = embedding_func_config.func + else: + logger.warning( + "Invalid embedding function format, falling back to WEIGHT method" + ) + kg_chunk_pick_method = "WEIGHT" + actual_embedding_func = None + + if actual_embedding_func: + selected_chunk_ids = await vector_similarity_sorting( + query=query, + text_chunks_storage=text_chunks_db, + chunks_vdb=chunks_vdb, + num_of_chunks=num_of_chunks, + entity_info=relations_with_chunks, + embedding_func=actual_embedding_func, + ) + + if selected_chunk_ids == []: + kg_chunk_pick_method = "WEIGHT" + logger.warning( + "No relation-related chunks selected by vector similarity, falling back to WEIGHT method" + ) + else: + logger.info( + f"Selecting {len(selected_chunk_ids)} relation-related chunks by vector similarity" + ) + + except Exception as e: + logger.error( + f"Error in vector similarity sorting: {e}, falling back to WEIGHT method" + ) + kg_chunk_pick_method = "WEIGHT" + + if kg_chunk_pick_method == "WEIGHT": + # Apply linear gradient weighted polling algorithm + selected_chunk_ids = linear_gradient_weighted_polling( + relations_with_chunks, max_related_chunks, min_related_chunks=1 + ) + + logger.info( + f"Selecting {len(selected_chunk_ids)} relation-related chunks by weighted polling" + ) logger.debug( - f"Found {len(selected_chunk_ids)} relationship-related chunks using linear gradient weighted polling" - ) - logger.info( f"KG related chunks: {len(entity_chunks)} from entitys, {len(selected_chunk_ids)} from relations" ) if not selected_chunk_ids: return [] - # Step 4.5: Remove duplicates with entity_chunks before batch retrieval - if entity_chunks: - # Extract chunk IDs from entity_chunks - entity_chunk_ids = set() - for chunk in entity_chunks: - chunk_id = chunk.get("chunk_id") - if chunk_id: - entity_chunk_ids.add(chunk_id) - - # Filter out duplicate chunk IDs - original_count = len(selected_chunk_ids) - selected_chunk_ids = [ - chunk_id - for chunk_id in selected_chunk_ids - if chunk_id not in entity_chunk_ids - ] - - logger.debug( - f"Deduplication relation-chunks with entity-chunks: {original_count} -> {len(selected_chunk_ids)} chunks " - ) - - # Early return if no chunks remain after deduplication - if not selected_chunk_ids: - return [] - # Step 5: Batch retrieve chunk data unique_chunk_ids = list( dict.fromkeys(selected_chunk_ids) diff --git a/lightrag/utils.py b/lightrag/utils.py index 96b7bdc3..a1d0dca9 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -60,7 +60,7 @@ def get_env_value( # Use TYPE_CHECKING to avoid circular imports if TYPE_CHECKING: - from lightrag.base import BaseKVStorage, QueryParam + from lightrag.base import BaseKVStorage, BaseVectorStorage, QueryParam # use the .env that is inside the current folder # allows to use different .env file for each lightrag instance @@ -1650,6 +1650,115 @@ def linear_gradient_weighted_polling( return selected_chunks +async def vector_similarity_sorting( + query: str, + text_chunks_storage: "BaseKVStorage", + chunks_vdb: "BaseVectorStorage", + num_of_chunks: int, + entity_info: list[dict[str, Any]], + embedding_func: callable, +) -> list[str]: + """ + Vector similarity-based text chunk selection algorithm. + + This algorithm selects text chunks based on cosine similarity between + the query embedding and text chunk embeddings. 
+ + Args: + query: User's original query string + text_chunks_storage: Text chunks storage instance + chunks_vdb: Vector database storage for chunks + num_of_chunks: Number of chunks to select + entity_info: List of entity information containing chunk IDs + embedding_func: Embedding function to compute query embedding + + Returns: + List of selected text chunk IDs sorted by similarity (highest first) + """ + logger.debug( + f"Vector similarity chunk selection: num_of_chunks={num_of_chunks}, entity_info_count={len(entity_info) if entity_info else 0}" + ) + + if not entity_info or num_of_chunks <= 0: + return [] + + # Collect all unique chunk IDs from entity info + all_chunk_ids = set() + for i, entity in enumerate(entity_info): + chunk_ids = entity.get("sorted_chunks", []) + all_chunk_ids.update(chunk_ids) + + if not all_chunk_ids: + logger.warning( + "Vector similarity chunk selection: no chunk IDs found in entity_info" + ) + return [] + + logger.debug( + f"Vector similarity chunk selection: {len(all_chunk_ids)} unique chunk IDs collected" + ) + + all_chunk_ids = list(all_chunk_ids) + + try: + # Get query embedding + query_embedding = await embedding_func([query]) + query_embedding = query_embedding[ + 0 + ] # Extract first embedding from batch result + + # Get chunk embeddings from vector database + chunk_vectors = await chunks_vdb.get_vectors_by_ids(all_chunk_ids) + logger.debug( + f"Vector similarity chunk selection: {len(chunk_vectors)} chunk vectors Retrieved" + ) + + if not chunk_vectors: + logger.warning( + "Vector similarity chunk selection: no vectors retrieved from chunks_vdb" + ) + return [] + + # Calculate cosine similarities + similarities = [] + valid_vectors = 0 + for chunk_id in all_chunk_ids: + if chunk_id in chunk_vectors: + chunk_embedding = chunk_vectors[chunk_id] + try: + # Calculate cosine similarity + similarity = cosine_similarity(query_embedding, chunk_embedding) + similarities.append((chunk_id, similarity)) + valid_vectors += 1 + except Exception as e: + logger.warning( + f"Vector similarity chunk selection: failed to calculate similarity for chunk {chunk_id}: {e}" + ) + else: + logger.warning( + f"Vector similarity chunk selection: no vector found for chunk {chunk_id}" + ) + + # Sort by similarity (highest first) and select top num_of_chunks + similarities.sort(key=lambda x: x[1], reverse=True) + selected_chunks = [chunk_id for chunk_id, _ in similarities[:num_of_chunks]] + + logger.debug( + f"Vector similarity chunk selection: {len(selected_chunks)} chunks from {len(all_chunk_ids)} candidates" + ) + + return selected_chunks + + except Exception as e: + logger.error(f"[VECTOR_SIMILARITY] Error in vector similarity sorting: {e}") + import traceback + + logger.error(f"[VECTOR_SIMILARITY] Traceback: {traceback.format_exc()}") + # Fallback to simple truncation + logger.debug("[VECTOR_SIMILARITY] Falling back to simple truncation") + return all_chunk_ids[:num_of_chunks] + + class TokenTracker: """Track token usage for LLM calls.""" From 947e826e6198564beb8cdc4f72dfb4fe53162fc1 Mon Sep 17 00:00:00 2001 From: yangdx Date: Wed, 13 Aug 2025 18:29:07 +0800 Subject: [PATCH 02/21] Bump api version to 0200 --- lightrag/api/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightrag/api/__init__.py b/lightrag/api/__init__.py index 48c39b83..e28380e1 100644 --- a/lightrag/api/__init__.py +++ b/lightrag/api/__init__.py @@ -1 +1 @@ -__api_version__ = "0199" +__api_version__ = "0200" From 5a40ff654e622e29db36e4180ed1ff096bb152de Mon Sep 17 00:00:00 2001 
From: yangdx Date: Wed, 13 Aug 2025 23:10:42 +0800 Subject: [PATCH 03/21] Change KG chunk selection default to VECTOR - Set KG_CHUNK_PICK_METHOD default to VECTOR - Update env.example with new config option --- env.example | 2 ++ lightrag/constants.py | 2 +- lightrag/operate.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/env.example b/env.example index be054576..43739d10 100644 --- a/env.example +++ b/env.example @@ -71,6 +71,8 @@ ENABLE_LLM_CACHE=true # MAX_RELATION_TOKENS=10000 ### control the maximum tokens send to LLM (include entities, raltions and chunks) # MAX_TOTAL_TOKENS=30000 +### chunk selection strategies for KG: WEIGHT or VECTOR +KG_CHUNK_PICK_METHOD=VECTOR ### maximum number of related chunks per source entity or relation (higher values increase re-ranking time) # RELATED_CHUNK_NUMBER=5 diff --git a/lightrag/constants.py b/lightrag/constants.py index 895852bd..1c7917c1 100644 --- a/lightrag/constants.py +++ b/lightrag/constants.py @@ -27,7 +27,7 @@ DEFAULT_MAX_RELATION_TOKENS = 10000 DEFAULT_MAX_TOTAL_TOKENS = 30000 DEFAULT_COSINE_THRESHOLD = 0.2 DEFAULT_RELATED_CHUNK_NUMBER = 5 -DEFAULT_KG_CHUNK_PICK_METHOD = "WEIGHT" +DEFAULT_KG_CHUNK_PICK_METHOD = "VECTOR" # Deprated: history message have negtive effect on query performance DEFAULT_HISTORY_TURNS = 0 diff --git a/lightrag/operate.py b/lightrag/operate.py index 60f5bc6c..1f94bd01 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -2803,7 +2803,7 @@ async def _find_related_text_unit_from_entities( entities_with_chunks, max_related_chunks, min_related_chunks=1 ) - logger.debug( + logger.info( f"Selecting {len(selected_chunk_ids)} from {total_entity_chunks} entity-related chunks by weighted polling" ) From edac10906c884e8d3daef213dc826eef8e40a275 Mon Sep 17 00:00:00 2001 From: yangdx Date: Wed, 13 Aug 2025 23:45:31 +0800 Subject: [PATCH 04/21] fix: Add total_relation_chunks statistics and improve logging in _find_related_text_unit_from_relations --- lightrag/operate.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index 1f94bd01..0b13a874 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -46,6 +46,7 @@ from .constants import ( DEFAULT_MAX_RELATION_TOKENS, DEFAULT_MAX_TOTAL_TOKENS, DEFAULT_RELATED_CHUNK_NUMBER, + DEFAULT_KG_CHUNK_PICK_METHOD, ) from .kg.shared_storage import get_storage_keyed_lock import time @@ -2712,12 +2713,13 @@ async def _find_related_text_unit_from_entities( logger.warning("No entities with text chunks found") return [] - # Check chunk selection method from environment variable - kg_chunk_pick_method = os.getenv("KG_CHUNK_PICK_METHOD", "WEIGHT").upper() - + kg_chunk_pick_method = text_chunks_db.global_config.get( + "kg_chunk_pick_method", DEFAULT_KG_CHUNK_PICK_METHOD + ) max_related_chunks = text_chunks_db.global_config.get( "related_chunk_number", DEFAULT_RELATED_CHUNK_NUMBER ) + # Step 2: Count chunk occurrences and deduplicate (keep chunks from earlier positioned entities) chunk_occurrence_count = {} for entity_info in entities_with_chunks: @@ -2967,9 +2969,9 @@ async def _find_related_text_unit_from_relations( logger.warning("No relation-related chunks found") return [] - # Check chunk selection method from environment variable - kg_chunk_pick_method = os.getenv("KG_CHUNK_PICK_METHOD", "WEIGHT").upper() - + kg_chunk_pick_method = text_chunks_db.global_config.get( + "kg_chunk_pick_method", DEFAULT_KG_CHUNK_PICK_METHOD + ) max_related_chunks = 
text_chunks_db.global_config.get( "related_chunk_number", DEFAULT_RELATED_CHUNK_NUMBER ) @@ -3025,6 +3027,7 @@ async def _find_related_text_unit_from_relations( return [] # Step 3: Sort chunks for each relationship by occurrence count (higher count = higher priority) + total_relation_chunks = 0 for relation_info in relations_with_chunks: sorted_chunks = sorted( relation_info["chunks"], @@ -3032,6 +3035,7 @@ async def _find_related_text_unit_from_relations( reverse=True, ) relation_info["sorted_chunks"] = sorted_chunks + total_relation_chunks += len(sorted_chunks) # Step 4: Apply the selected chunk selection algorithm selected_chunk_ids = [] # Initialize to avoid UnboundLocalError @@ -3074,7 +3078,7 @@ async def _find_related_text_unit_from_relations( ) else: logger.info( - f"Selecting {len(selected_chunk_ids)} relation-related chunks by vector similarity" + f"Selecting {len(selected_chunk_ids)} from {total_relation_chunks} relation-related chunks by vector similarity" ) except Exception as e: @@ -3090,7 +3094,7 @@ async def _find_related_text_unit_from_relations( ) logger.info( - f"Selecting {len(selected_chunk_ids)} relation-related chunks by weighted polling" + f"Selecting {len(selected_chunk_ids)} from {total_relation_chunks} relation-related chunks by weighted polling" ) logger.debug( From ac3b5605a14530be5f35e3322a5d20d264ec2fe4 Mon Sep 17 00:00:00 2001 From: yangdx Date: Thu, 14 Aug 2025 00:41:58 +0800 Subject: [PATCH 05/21] Refactor logging for relation chunk discovery with dedup info --- lightrag/operate.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index 0b13a874..468a03e4 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -3019,11 +3019,10 @@ async def _find_related_text_unit_from_relations( if relation_info["chunks"] ] - logger.info( - f"Find {len(relations_with_chunks)} additional relations-related chunks ({len(removed_entity_chunk_ids)} duplicated chunks removed)" - ) - if not relations_with_chunks: + logger.info( + f"Find no additional relations-related chunks from {len(edge_datas)} relations" + ) return [] # Step 3: Sort chunks for each relationship by occurrence count (higher count = higher priority) @@ -3037,6 +3036,13 @@ async def _find_related_text_unit_from_relations( relation_info["sorted_chunks"] = sorted_chunks total_relation_chunks += len(sorted_chunks) + logger.info( + f"Find {total_relation_chunks} additional relations-related chunks ({len(removed_entity_chunk_ids)} duplicated chunks removed)" + ) + logger.debug( + f"Only {len(relations_with_chunks)} out of {len(edge_datas)} relations have additional chunks after chunk deduplication" + ) + # Step 4: Apply the selected chunk selection algorithm selected_chunk_ids = [] # Initialize to avoid UnboundLocalError From bac09118d5ef4fc843b56b8aff1a65574a113d3d Mon Sep 17 00:00:00 2001 From: yangdx Date: Thu, 14 Aug 2025 01:09:18 +0800 Subject: [PATCH 06/21] Simplify embedding func extraction --- lightrag/operate.py | 27 +++------------------------ 1 file changed, 3 insertions(+), 24 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index 468a03e4..59d9b790 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -2752,7 +2752,6 @@ async def _find_related_text_unit_from_entities( selected_chunk_ids = [] # Initialize to avoid UnboundLocalError if kg_chunk_pick_method == "VECTOR" and query and chunks_vdb: - # Calculate num_of_chunks = max_related_chunks * len(entity_info) / 2 num_of_chunks = 
int(max_related_chunks * len(entities_with_chunks) / 2) # Get embedding function from global config @@ -2762,15 +2761,7 @@ async def _find_related_text_unit_from_entities( kg_chunk_pick_method = "WEIGHT" else: try: - # Extract the actual callable function from EmbeddingFunc - if hasattr(embedding_func_config, "func"): - actual_embedding_func = embedding_func_config.func - else: - logger.warning( - "Invalid embedding function format, falling back to WEIGHT method" - ) - kg_chunk_pick_method = "WEIGHT" - actual_embedding_func = None + actual_embedding_func = embedding_func_config.func selected_chunk_ids = None if actual_embedding_func: @@ -3037,17 +3028,13 @@ async def _find_related_text_unit_from_relations( total_relation_chunks += len(sorted_chunks) logger.info( - f"Find {total_relation_chunks} additional relations-related chunks ({len(removed_entity_chunk_ids)} duplicated chunks removed)" - ) - logger.debug( - f"Only {len(relations_with_chunks)} out of {len(edge_datas)} relations have additional chunks after chunk deduplication" + f"Find {total_relation_chunks} additional chunks in {len(relations_with_chunks)} relations ({len(removed_entity_chunk_ids)} duplicated chunks removed)" ) # Step 4: Apply the selected chunk selection algorithm selected_chunk_ids = [] # Initialize to avoid UnboundLocalError if kg_chunk_pick_method == "VECTOR" and query and chunks_vdb: - # Calculate num_of_chunks = max_related_chunks * len(entity_info) / 2 num_of_chunks = int(max_related_chunks * len(relations_with_chunks) / 2) # Get embedding function from global config @@ -3057,15 +3044,7 @@ async def _find_related_text_unit_from_relations( kg_chunk_pick_method = "WEIGHT" else: try: - # Extract the actual callable function from EmbeddingFunc - if hasattr(embedding_func_config, "func"): - actual_embedding_func = embedding_func_config.func - else: - logger.warning( - "Invalid embedding function format, falling back to WEIGHT method" - ) - kg_chunk_pick_method = "WEIGHT" - actual_embedding_func = None + actual_embedding_func = embedding_func_config.func if actual_embedding_func: selected_chunk_ids = await vector_similarity_sorting( From 2a46667ac97c292db9af69a4d5e992280b34db41 Mon Sep 17 00:00:00 2001 From: yangdx Date: Thu, 14 Aug 2025 01:50:27 +0800 Subject: [PATCH 07/21] Add OpenAI frequency penalty sample env params --- env.example | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/env.example b/env.example index 43739d10..2cc97d55 100644 --- a/env.example +++ b/env.example @@ -136,7 +136,13 @@ LLM_BINDING_API_KEY=your_api_key # LLM_BINDING_API_KEY=your_api_key # LLM_BINDING=openai -### Most Commont Parameters for Ollama Server +### OpenAI Specific Parameters +### Apply frequency penalty to prevent the LLM from generating repetitive or looping outputs +# OPENAI_LLM_FREQUENCY_PENALTY=1.1 +### use the following command to see all support options for openai and azure_openai +### lightrag-server --llm-binding openai --help + +### Ollama Server Specific Parameters ### Time out in seconds, None for infinite timeout TIMEOUT=240 ### OLLAMA_LLM_NUM_CTX must be larger than MAX_TOTAL_TOKENS + 2000 From 3343833571774620e464eb65a28481257e6f6cc1 Mon Sep 17 00:00:00 2001 From: yangdx Date: Thu, 14 Aug 2025 02:36:01 +0800 Subject: [PATCH 08/21] Remove query params from cache key generation for keyword extration --- lightrag/operate.py | 14 +++----------- lightrag/utils.py | 14 +++++++------- 2 files changed, 10 insertions(+), 18 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index 
59d9b790..3f1d31b6 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -1749,7 +1749,7 @@ async def kg_query( query_param.user_prompt or "", query_param.enable_rerank, ) - cached_response, quantized, min_val, max_val = await handle_cache( + cached_response = await handle_cache( hashing_kv, args_hash, query, query_param.mode, cache_type="query" ) if cached_response is not None: @@ -1924,18 +1924,10 @@ async def extract_keywords_only( args_hash = compute_args_hash( param.mode, text, - param.response_type, - param.top_k, - param.chunk_top_k, - param.max_entity_tokens, - param.max_relation_tokens, - param.max_total_tokens, param.hl_keywords or [], param.ll_keywords or [], - param.user_prompt or "", - param.enable_rerank, ) - cached_response, quantized, min_val, max_val = await handle_cache( + cached_response = await handle_cache( hashing_kv, args_hash, text, param.mode, cache_type="keywords" ) if cached_response is not None: @@ -3137,7 +3129,7 @@ async def naive_query( query_param.user_prompt or "", query_param.enable_rerank, ) - cached_response, quantized, min_val, max_val = await handle_cache( + cached_response = await handle_cache( hashing_kv, args_hash, query, query_param.mode, cache_type="query" ) if cached_response is not None: diff --git a/lightrag/utils.py b/lightrag/utils.py index a1d0dca9..fc5f78db 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -762,27 +762,27 @@ async def handle_cache( prompt, mode="default", cache_type=None, -): +) -> str|None: """Generic cache handling function with flattened cache keys""" if hashing_kv is None: - return None, None, None, None + return None if mode != "default": # handle cache for all type of query if not hashing_kv.global_config.get("enable_llm_cache"): - return None, None, None, None + return None else: # handle cache for entity extraction if not hashing_kv.global_config.get("enable_llm_cache_for_entity_extract"): - return None, None, None, None + return None # Use flattened cache key format: {mode}:{cache_type}:{hash} flattened_key = generate_cache_key(mode, cache_type, args_hash) cache_entry = await hashing_kv.get_by_id(flattened_key) if cache_entry: logger.debug(f"Flattened cache hit(key:{flattened_key})") - return cache_entry["return"], None, None, None + return cache_entry["return"] logger.debug(f"Cache missed(mode:{mode} type:{cache_type})") - return None, None, None, None + return None @dataclass @@ -1409,7 +1409,7 @@ async def use_llm_func_with_cache( # Generate cache key for this LLM call cache_key = generate_cache_key("default", cache_type, arg_hash) - cached_return, _1, _2, _3 = await handle_cache( + cached_return = await handle_cache( llm_response_cache, arg_hash, _prompt, From a11e8d77eb122f2d97528b08d8111e4058d45c60 Mon Sep 17 00:00:00 2001 From: yangdx Date: Thu, 14 Aug 2025 14:24:15 +0800 Subject: [PATCH 09/21] Improve missing-vector warning logic in vector similarity - Check for any missing vectors - Separate no-vector vs partial-vector warnings - Ensure early return on empty vectors --- lightrag/utils.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/lightrag/utils.py b/lightrag/utils.py index b0327842..249c1a38 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -1713,10 +1713,15 @@ async def vector_similarity_sorting( f"Vector similarity chunk selection: {len(chunk_vectors)} chunk vectors Retrieved" ) - if not chunk_vectors: - logger.warning( - "Vector similarity chunk selection: no vectors retrieved from chunks_vdb" - ) + if not chunk_vectors or len(chunk_vectors) 
!= len(all_chunk_ids): + if not chunk_vectors: + logger.warning( + "Vector similarity chunk selection: no vectors retrieved from chunks_vdb" + ) + else: + logger.warning( + f"Vector similarity chunk selection: found {len(chunk_vectors)} but expecting {len(all_chunk_ids)}" + ) return [] # Calculate cosine similarities From a8b7890470991bcdbd71d6ef4e1978e94d178b12 Mon Sep 17 00:00:00 2001 From: yangdx Date: Thu, 14 Aug 2025 16:01:13 +0800 Subject: [PATCH 10/21] Rename chunk selection functions for better clarity --- lightrag/operate.py | 12 ++++++------ lightrag/utils.py | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index 3f1d31b6..dafc9e84 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -27,8 +27,8 @@ from .utils import ( use_llm_func_with_cache, update_chunk_cache_list, remove_think_tags, - linear_gradient_weighted_polling, - vector_similarity_sorting, + pick_by_weighted_polling, + pick_by_vector_similarity, process_chunks_unified, build_file_path, ) @@ -2757,7 +2757,7 @@ async def _find_related_text_unit_from_entities( selected_chunk_ids = None if actual_embedding_func: - selected_chunk_ids = await vector_similarity_sorting( + selected_chunk_ids = await pick_by_vector_similarity( query=query, text_chunks_storage=text_chunks_db, chunks_vdb=chunks_vdb, @@ -2784,7 +2784,7 @@ async def _find_related_text_unit_from_entities( if kg_chunk_pick_method == "WEIGHT": # Apply linear gradient weighted polling algorithm - selected_chunk_ids = linear_gradient_weighted_polling( + selected_chunk_ids = pick_by_weighted_polling( entities_with_chunks, max_related_chunks, min_related_chunks=1 ) @@ -3039,7 +3039,7 @@ async def _find_related_text_unit_from_relations( actual_embedding_func = embedding_func_config.func if actual_embedding_func: - selected_chunk_ids = await vector_similarity_sorting( + selected_chunk_ids = await pick_by_vector_similarity( query=query, text_chunks_storage=text_chunks_db, chunks_vdb=chunks_vdb, @@ -3066,7 +3066,7 @@ async def _find_related_text_unit_from_relations( if kg_chunk_pick_method == "WEIGHT": # Apply linear gradient weighted polling algorithm - selected_chunk_ids = linear_gradient_weighted_polling( + selected_chunk_ids = pick_by_weighted_polling( relations_with_chunks, max_related_chunks, min_related_chunks=1 ) diff --git a/lightrag/utils.py b/lightrag/utils.py index 249c1a38..4d2a5289 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -1570,7 +1570,7 @@ def check_storage_env_vars(storage_name: str) -> None: ) -def linear_gradient_weighted_polling( +def pick_by_weighted_polling( entities_or_relations: list[dict], max_related_chunks: int, min_related_chunks: int = 1, @@ -1650,7 +1650,7 @@ def linear_gradient_weighted_polling( return selected_chunks -async def vector_similarity_sorting( +async def pick_by_vector_similarity( query: str, text_chunks_storage: "BaseKVStorage", chunks_vdb: "BaseVectorStorage", From 4a19d0de251e3a33dabec4eb624733e31d189193 Mon Sep 17 00:00:00 2001 From: yangdx Date: Thu, 14 Aug 2025 22:58:26 +0800 Subject: [PATCH 11/21] Add chunk tracking system to monitor chunk sources and frequencies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Track chunk sources (E/R/C types) • Log frequency and order metadata • Preserve chunk_id through processing • Add debug logging for chunk tracking • Handle rerank and truncation operations --- lightrag/operate.py | 65 +++++++++++++++++++++++++++++++++++++++++---- lightrag/utils.py | 26 
++++++++++++++++++ 2 files changed, 86 insertions(+), 5 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index dafc9e84..d14b6919 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -2107,6 +2107,9 @@ async def _build_query_context( global_entities = [] global_relations = [] + # Track chunk sources and metadata for final logging + chunk_tracking = {} # chunk_id -> {source, frequency, order} + # Handle local and global modes if query_param.mode == "local": local_entities, local_relations = await _get_node_data( @@ -2145,6 +2148,15 @@ async def _build_query_context( chunks_vdb, query_param, ) + # Track vector chunks with source metadata + for i, chunk in enumerate(vector_chunks): + chunk_id = chunk.get("chunk_id") or chunk.get("id") + if chunk_id: + chunk_tracking[chunk_id] = { + "source": "C", + "frequency": 1, # Vector chunks always have frequency 1 + "order": i + 1, # 1-based order in vector search results + } # Use round-robin merge to combine local and global data fairly final_entities = [] @@ -2342,6 +2354,7 @@ async def _build_query_context( seen_edges.add(pair) # Get text chunks based on final filtered data + logger.info(f"chunk_tracking: {chunk_tracking}") if final_node_datas: entity_chunks = await _find_related_text_unit_from_entities( final_node_datas, @@ -2350,8 +2363,9 @@ async def _build_query_context( knowledge_graph_inst, query, chunks_vdb, + chunk_tracking=chunk_tracking, ) - + logger.info(f"chunk_tracking: {chunk_tracking}") if final_edge_datas: relation_chunks = await _find_related_text_unit_from_relations( final_edge_datas, @@ -2360,7 +2374,9 @@ async def _build_query_context( entity_chunks, query, chunks_vdb, + chunk_tracking=chunk_tracking, ) + logger.info(f"chunk_tracking: {chunk_tracking}") # Round-robin merge chunks from different sources with deduplication by chunk_id merged_chunks = [] @@ -2379,6 +2395,7 @@ async def _build_query_context( { "content": chunk["content"], "file_path": chunk.get("file_path", "unknown_source"), + "chunk_id": chunk_id, } ) @@ -2392,6 +2409,7 @@ async def _build_query_context( { "content": chunk["content"], "file_path": chunk.get("file_path", "unknown_source"), + "chunk_id": chunk_id, } ) @@ -2405,6 +2423,7 @@ async def _build_query_context( { "content": chunk["content"], "file_path": chunk.get("file_path", "unknown_source"), + "chunk_id": chunk_id, } ) @@ -2523,6 +2542,24 @@ async def _build_query_context( if not entities_context and not relations_context: return None + # output chunks tracking infomations + # format: / (e.g., E5/2 R2/1 C1/1) + if truncated_chunks and chunk_tracking: + chunk_tracking_log = [] + for chunk in truncated_chunks: + chunk_id = chunk.get("chunk_id") + if chunk_id and chunk_id in chunk_tracking: + tracking_info = chunk_tracking[chunk_id] + source = tracking_info["source"] + frequency = tracking_info["frequency"] + order = tracking_info["order"] + chunk_tracking_log.append(f"{source}{frequency}/{order}") + else: + chunk_tracking_log.append("?0/0") + + if chunk_tracking_log: + logger.info(f"chunks: {' '.join(chunk_tracking_log)}") + entities_str = json.dumps(entities_context, ensure_ascii=False) relations_str = json.dumps(relations_context, ensure_ascii=False) text_units_str = json.dumps(text_units_context, ensure_ascii=False) @@ -2672,6 +2709,7 @@ async def _find_related_text_unit_from_entities( knowledge_graph_inst: BaseGraphStorage, query: str = None, chunks_vdb: BaseVectorStorage = None, + chunk_tracking: dict = None, ): """ Find text chunks related to entities using configurable 
chunk selection method.
@@ -2801,15 +2839,23 @@ async def _find_related_text_unit_from_entities( ) # Remove duplicates while preserving order chunk_data_list = await text_chunks_db.get_by_ids(unique_chunk_ids)
- # Step 6: Build result chunks with valid data
+ # Step 6: Build result chunks with valid data and update chunk tracking
 result_chunks = []
- for chunk_id, chunk_data in zip(unique_chunk_ids, chunk_data_list):
+ for i, (chunk_id, chunk_data) in enumerate(zip(unique_chunk_ids, chunk_data_list)):
 if chunk_data is not None and "content" in chunk_data: chunk_data_copy = chunk_data.copy() chunk_data_copy["source_type"] = "entity" chunk_data_copy["chunk_id"] = chunk_id # Add chunk_id for deduplication result_chunks.append(chunk_data_copy)
+ # Update chunk tracking if provided
+ if chunk_tracking is not None:
+ chunk_tracking[chunk_id] = {
+ "source": "E",
+ "frequency": chunk_occurrence_count.get(chunk_id, 1),
+ "order": i + 1, # 1-based order in final entity-related results
+ }
+
 return result_chunks
@@ -2911,6 +2957,7 @@ async def _find_related_text_unit_from_relations( entity_chunks: list[dict] = None, query: str = None, chunks_vdb: BaseVectorStorage = None,
+ chunk_tracking: dict = None,
 ): """ Find text chunks related to relationships using configurable chunk selection method.
@@ -3087,15 +3134,23 @@ async def _find_related_text_unit_from_relations( ) # Remove duplicates while preserving order chunk_data_list = await text_chunks_db.get_by_ids(unique_chunk_ids)
- # Step 6: Build result chunks with valid data
+ # Step 6: Build result chunks with valid data and update chunk tracking
 result_chunks = []
- for chunk_id, chunk_data in zip(unique_chunk_ids, chunk_data_list):
+ for i, (chunk_id, chunk_data) in enumerate(zip(unique_chunk_ids, chunk_data_list)):
 if chunk_data is not None and "content" in chunk_data: chunk_data_copy = chunk_data.copy() chunk_data_copy["source_type"] = "relationship" chunk_data_copy["chunk_id"] = chunk_id # Add chunk_id for deduplication result_chunks.append(chunk_data_copy)
+ # Update chunk tracking if provided
+ if chunk_tracking is not None:
+ chunk_tracking[chunk_id] = {
+ "source": "R",
+ "frequency": chunk_occurrence_count.get(chunk_id, 1),
+ "order": i + 1, # 1-based order in final relation-related results
+ }
+
 return result_chunks

diff --git a/lightrag/utils.py b/lightrag/utils.py
index 4d2a5289..340e4251 100644
--- a/lightrag/utils.py
+++ b/lightrag/utils.py
@@ -1901,6 +1901,13 @@ async def process_chunks_unified( # 1. Apply reranking if enabled and query is provided if query_param.enable_rerank and query and unique_chunks:
+ # Save the chunk_id field, because rerank may lose it
+ chunk_ids = {}
+ for chunk in unique_chunks:
+ chunk_id = chunk.get("chunk_id")
+ if chunk_id:
+ chunk_ids[id(chunk)] = chunk_id
+
 rerank_top_k = query_param.chunk_top_k or len(unique_chunks) unique_chunks = await apply_rerank_if_enabled( query=query,
@@ -1910,6 +1917,11 @@ top_n=rerank_top_k, )
+ # Restore the chunk_id field
+ for chunk in unique_chunks:
+ if id(chunk) in chunk_ids:
+ chunk["chunk_id"] = chunk_ids[id(chunk)]
+
 # 2. Filter by minimum rerank score if reranking is enabled if query_param.enable_rerank and unique_chunks: min_rerank_score = global_config.get("min_rerank_score", 0.5)
@@ -1956,12 +1968,26 @@ async def process_chunks_unified( ) original_count = len(unique_chunks)
+
+ # Keep the chunk_id field, because truncate_list_by_token_size will lose it
+ chunk_ids_map = {}
+ for i, chunk in enumerate(unique_chunks):
+ chunk_id = chunk.get("chunk_id")
+ if chunk_id:
+ chunk_ids_map[i] = chunk_id
+
 unique_chunks = truncate_list_by_token_size( unique_chunks, key=lambda x: x.get("content", ""), max_token_size=chunk_token_limit, tokenizer=tokenizer, )
+
+ # Restore the chunk_id field
+ for i, chunk in enumerate(unique_chunks):
+ if i in chunk_ids_map:
+ chunk["chunk_id"] = chunk_ids_map[i]
+
 logger.debug( f"Token truncation: {len(unique_chunks)} chunks from {original_count} " f"(chunk available tokens: {chunk_token_limit}, source: {source_type})" )

From f733ac829c668afd39eda49d19d71290865797a2 Mon Sep 17 00:00:00 2001
From: yangdx
Date: Thu, 14 Aug 2025 23:44:34 +0800
Subject: [PATCH 12/21] Remove debug logging statements from query context building

---
 lightrag/operate.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/lightrag/operate.py b/lightrag/operate.py
index d14b6919..ddd70872 100644
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@@ -2354,7 +2354,6 @@ async def _build_query_context( seen_edges.add(pair) # Get text chunks based on final filtered data
- logger.info(f"chunk_tracking: {chunk_tracking}")
 if final_node_datas: entity_chunks = await _find_related_text_unit_from_entities( final_node_datas,
@@ -2365,7 +2364,7 @@ async def _build_query_context( chunks_vdb, chunk_tracking=chunk_tracking, )
- logger.info(f"chunk_tracking: {chunk_tracking}")
+
 if final_edge_datas: relation_chunks = await _find_related_text_unit_from_relations( final_edge_datas,
@@ -2376,7 +2375,6 @@ async def _build_query_context( chunks_vdb, chunk_tracking=chunk_tracking, )
- logger.info(f"chunk_tracking: {chunk_tracking}")
 # Round-robin merge chunks from different sources with deduplication by chunk_id merged_chunks = []

From 0b45d463dfd191492be59acbab58bc40a4959fe5 Mon Sep 17 00:00:00 2001
From: yangdx
Date: Fri, 15 Aug 2025 00:43:45 +0800
Subject: [PATCH 13/21] Add .clinerules to .gitignore

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 2773b704..cb9f3049 100644
--- a/.gitignore
+++ b/.gitignore
@@ -72,3 +72,4 @@ test_* # Cline files memory-bank memory-bank/
+.clinerules

From 3acb32f5479d7827f1477dd6b5a2857c02ce44c8 Mon Sep 17 00:00:00 2001
From: yangdx
Date: Fri, 15 Aug 2025 02:19:01 +0800
Subject: [PATCH 14/21] Add comments explaining chunk deduplication behavior in query context

---
 lightrag/operate.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/lightrag/operate.py b/lightrag/operate.py
index ddd70872..1096f28d 100644
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@@ -2354,6 +2354,7 @@ async def _build_query_context( seen_edges.add(pair) # Get text chunks based on final filtered data
+ # To preserve the influence of entity order, entity-based chunks should not be deduplicated against vector_chunks
 if final_node_datas: entity_chunks = await _find_related_text_unit_from_entities( final_node_datas,
@@ -2365,6 +2366,8 @@ async def _build_query_context( chunk_tracking=chunk_tracking, )
+ # Find deduplicated chunks from edges
+ # Deduplication causes solely relation-based chunks to be prioritized and sent to the LLM when re-ranking is disabled
 if final_edge_datas: relation_chunks = await _find_related_text_unit_from_relations( final_edge_datas,

From 6cab68bb47fb0d1ff60ca23f788b6efad4ab17ee Mon Sep 17 00:00:00 2001
From: yangdx
Date: Fri, 15 Aug 2025 10:09:44 +0800
Subject: [PATCH 15/21] Improve KG chunk selection documentation and configuration clarity

---
 env.example | 16 ++++++++++++----
 lightrag/operate.py | 8 ++++++--
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/env.example b/env.example
index 2cc97d55..39d6da68 100644
--- a/env.example
+++ b/env.example
@@ -71,12 +71,20 @@ ENABLE_LLM_CACHE=true # MAX_RELATION_TOKENS=10000 ### control the maximum tokens send to LLM (include entities, raltions and chunks) # MAX_TOTAL_TOKENS=30000
-### chunk selection strategies for KG: WEIGHT or VECTOR
-KG_CHUNK_PICK_METHOD=VECTOR
-### maximum number of related chunks per source entity or relation (higher values increase re-ranking time)
+
+### maximum number of related chunks per source entity or relation
+### The chunk picker uses this value to determine the total number of chunks selected from the KG (knowledge graph)
+### Higher values increase re-ranking time
 # RELATED_CHUNK_NUMBER=5
-### Reranker configuration (Set ENABLE_RERANK to true in reranking model is configed)
+### chunk selection strategies
+### VECTOR: Pick KG chunks by vector similarity, delivering chunks to the LLM that align more closely with naive retrieval
+### WEIGHT: Pick KG chunks by entity and chunk weight, delivering more purely KG-related chunks to the LLM
+### If reranking is enabled, the impact of chunk selection strategies will be diminished.
+# KG_CHUNK_PICK_METHOD=VECTOR
+
+### Reranking configuration
+### Reranker: Set ENABLE_RERANK to true if a reranking model is configured
 # ENABLE_RERANK=True ### Minimum rerank score for document chunk exclusion (set to 0.0 to keep all chunks, 0.6 or above if LLM is not strong enought) # MIN_RERANK_SCORE=0.0

diff --git a/lightrag/operate.py b/lightrag/operate.py
index 1096f28d..c9be89f7 100644
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@@ -2779,9 +2779,12 @@ async def _find_related_text_unit_from_entities( entity_info["sorted_chunks"] = sorted_chunks total_entity_chunks += len(sorted_chunks)
- # Step 4: Apply the selected chunk selection algorithm
 selected_chunk_ids = [] # Initialize to avoid UnboundLocalError
+ # Step 4: Apply the selected chunk selection algorithm
+ # Pick by vector similarity:
+ # The order of text chunks aligns with the naive retrieval's destination.
+ # When reranking is disabled, the text chunks delivered to the LLM tend to favor naive retrieval.
    if kg_chunk_pick_method == "VECTOR" and query and chunks_vdb:
         num_of_chunks = int(max_related_chunks * len(entities_with_chunks) / 2)
@@ -2822,7 +2825,8 @@
             kg_chunk_pick_method = "WEIGHT"
 
     if kg_chunk_pick_method == "WEIGHT":
-        # Apply linear gradient weighted polling algorithm
+        # Pick by entity and chunk weight:
+        #     When reranking is disabled, more purely KG-related chunks are delivered to the LLM
         selected_chunk_ids = pick_by_weighted_polling(
             entities_with_chunks, max_related_chunks, min_related_chunks=1
         )

From 1e2d5252d7572371f0e190bc4666143a590e6ced Mon Sep 17 00:00:00 2001
From: yangdx
Date: Fri, 15 Aug 2025 16:32:26 +0800
Subject: [PATCH 16/21] Add get_vectors_by_ids method and filter out vector data from query results

---
 lightrag/kg/faiss_impl.py | 39 ++++++++++++++++++++++++++++++++++++---
 lightrag/operate.py       |  4 ++--
 2 files changed, 38 insertions(+), 5 deletions(-)

diff --git a/lightrag/kg/faiss_impl.py b/lightrag/kg/faiss_impl.py
index 0ab95c43..5098ebf7 100644
--- a/lightrag/kg/faiss_impl.py
+++ b/lightrag/kg/faiss_impl.py
@@ -210,9 +210,11 @@ class FaissVectorDBStorage(BaseVectorStorage):
                 continue
             meta = self._id_to_meta.get(idx, {})
+            # Filter out __vector__ from query results to avoid returning large vector data
+            filtered_meta = {k: v for k, v in meta.items() if k != "__vector__"}
             results.append(
                 {
-                    **meta,
+                    **filtered_meta,
                     "id": meta.get("__id__"),
                     "distance": float(dist),
                     "created_at": meta.get("__created_at__"),
@@ -424,8 +426,10 @@ class FaissVectorDBStorage(BaseVectorStorage):
         if not metadata:
             return None
 
+        # Filter out __vector__ from metadata to avoid returning large vector data
+        filtered_metadata = {k: v for k, v in metadata.items() if k != "__vector__"}
         return {
-            **metadata,
+            **filtered_metadata,
             "id": metadata.get("__id__"),
             "created_at": metadata.get("__created_at__"),
         }
@@ -448,9 +452,13 @@ class FaissVectorDBStorage(BaseVectorStorage):
             if fid is not None:
                 metadata = self._id_to_meta.get(fid, {})
                 if metadata:
+                    # Filter out __vector__ from metadata to avoid returning large vector data
+                    filtered_metadata = {
+                        k: v for k, v in metadata.items() if k != "__vector__"
+                    }
                     results.append(
                         {
-                            **metadata,
+                            **filtered_metadata,
                             "id": metadata.get("__id__"),
                             "created_at": metadata.get("__created_at__"),
                         }
@@ -458,6 +466,31 @@ class FaissVectorDBStorage(BaseVectorStorage):
 
         return results
 
+    async def get_vectors_by_ids(self, ids: list[str]) -> dict[str, list[float]]:
+        """Get vectors by their IDs, returning only ID and vector data for efficiency
+
+        Args:
+            ids: List of unique identifiers
+
+        Returns:
+            Dictionary mapping IDs to their vector embeddings
+            Format: {id: [vector_values], ...}
+        """
+        if not ids:
+            return {}
+
+        vectors_dict = {}
+        for id in ids:
+            # Find the Faiss internal ID for the custom ID
+            fid = self._find_faiss_id_by_custom_id(id)
+            if fid is not None and fid in self._id_to_meta:
+                metadata = self._id_to_meta[fid]
+                # Get the stored vector from metadata
+                if "__vector__" in metadata:
+                    vectors_dict[id] = metadata["__vector__"]
+
+        return vectors_dict
+
     async def drop(self) -> dict[str, str]:
         """Drop all vector data from storage and clean up resources
 
diff --git a/lightrag/operate.py b/lightrag/operate.py
index c9be89f7..acb75f0f 100644
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@@ -2782,8 +2782,8 @@ async def _find_related_text_unit_from_entities(
     selected_chunk_ids = []  # Initialize to avoid UnboundLocalError
 
     # Step 4: Apply the selected chunk selection algorithm
-    # Pick by vector 
similarity: - # The order of text chunks aligns with the naive retrieval's destination. + # Pick by vector similarity: + # The order of text chunks aligns with the naive retrieval's destination. # When reranking is disabled, the text chunks delivered to the LLM tend to favor naive retrieval. if kg_chunk_pick_method == "VECTOR" and query and chunks_vdb: num_of_chunks = int(max_related_chunks * len(entities_with_chunks) / 2) From a71499a18070155bb70a26cb0ccaf90b353038a9 Mon Sep 17 00:00:00 2001 From: yangdx Date: Fri, 15 Aug 2025 16:36:50 +0800 Subject: [PATCH 17/21] Add get_vectors_by_ids method to MilvusVectorDBStorage --- lightrag/kg/milvus_impl.py | 44 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/lightrag/kg/milvus_impl.py b/lightrag/kg/milvus_impl.py index 1aa41021..4d927353 100644 --- a/lightrag/kg/milvus_impl.py +++ b/lightrag/kg/milvus_impl.py @@ -1018,6 +1018,50 @@ class MilvusVectorDBStorage(BaseVectorStorage): ) return [] + async def get_vectors_by_ids(self, ids: list[str]) -> dict[str, list[float]]: + """Get vectors by their IDs, returning only ID and vector data for efficiency + + Args: + ids: List of unique identifiers + + Returns: + Dictionary mapping IDs to their vector embeddings + Format: {id: [vector_values], ...} + """ + if not ids: + return {} + + try: + # Ensure collection is loaded before querying + self._ensure_collection_loaded() + + # Prepare the ID filter expression + id_list = '", "'.join(ids) + filter_expr = f'id in ["{id_list}"]' + + # Query Milvus with the filter, requesting only vector field + result = self._client.query( + collection_name=self.final_namespace, + filter=filter_expr, + output_fields=["vector"], + ) + + vectors_dict = {} + for item in result: + if item and "vector" in item and "id" in item: + # Convert numpy array to list if needed + vector_data = item["vector"] + if isinstance(vector_data, np.ndarray): + vector_data = vector_data.tolist() + vectors_dict[item["id"]] = vector_data + + return vectors_dict + except Exception as e: + logger.error( + f"[{self.workspace}] Error retrieving vectors by IDs from {self.namespace}: {e}" + ) + return {} + async def drop(self) -> dict[str, str]: """Drop all vector data from storage and clean up resources From 8f7031b882d5c3581fd6bae14cfef83cfbed5191 Mon Sep 17 00:00:00 2001 From: yangdx Date: Fri, 15 Aug 2025 16:46:52 +0800 Subject: [PATCH 18/21] Add get_vectors_by_ids method to QdrantVectorDBStorage --- lightrag/kg/qdrant_impl.py | 44 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/lightrag/kg/qdrant_impl.py b/lightrag/kg/qdrant_impl.py index 1686cba6..4ece163c 100644 --- a/lightrag/kg/qdrant_impl.py +++ b/lightrag/kg/qdrant_impl.py @@ -402,6 +402,50 @@ class QdrantVectorDBStorage(BaseVectorStorage): ) return [] + async def get_vectors_by_ids(self, ids: list[str]) -> dict[str, list[float]]: + """Get vectors by their IDs, returning only ID and vector data for efficiency + + Args: + ids: List of unique identifiers + + Returns: + Dictionary mapping IDs to their vector embeddings + Format: {id: [vector_values], ...} + """ + if not ids: + return {} + + try: + # Convert to Qdrant compatible IDs + qdrant_ids = [compute_mdhash_id_for_qdrant(id) for id in ids] + + # Retrieve the points by IDs with vectors + results = self._client.retrieve( + collection_name=self.final_namespace, + ids=qdrant_ids, + with_vectors=True, # Important: request vectors + with_payload=True, + ) + + vectors_dict = {} + for point in results: + if point and 
point.vector is not None and point.payload: + # Get original ID from payload + original_id = point.payload.get("id") + if original_id: + # Convert numpy array to list if needed + vector_data = point.vector + if isinstance(vector_data, np.ndarray): + vector_data = vector_data.tolist() + vectors_dict[original_id] = vector_data + + return vectors_dict + except Exception as e: + logger.error( + f"[{self.workspace}] Error retrieving vectors by IDs from {self.namespace}: {e}" + ) + return {} + async def drop(self) -> dict[str, str]: """Drop all vector data from storage and clean up resources From 7a7385a200642531a540a4f74c6bdb49d4174508 Mon Sep 17 00:00:00 2001 From: yangdx Date: Fri, 15 Aug 2025 16:51:41 +0800 Subject: [PATCH 19/21] Add efficient vector retrieval by IDs to PGVectorStorage --- lightrag/kg/postgres_impl.py | 47 ++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py index 88292db5..e50128c9 100644 --- a/lightrag/kg/postgres_impl.py +++ b/lightrag/kg/postgres_impl.py @@ -2158,6 +2158,53 @@ class PGVectorStorage(BaseVectorStorage): ) return [] + async def get_vectors_by_ids(self, ids: list[str]) -> dict[str, list[float]]: + """Get vectors by their IDs, returning only ID and vector data for efficiency + + Args: + ids: List of unique identifiers + + Returns: + Dictionary mapping IDs to their vector embeddings + Format: {id: [vector_values], ...} + """ + if not ids: + return {} + + table_name = namespace_to_table_name(self.namespace) + if not table_name: + logger.error( + f"[{self.workspace}] Unknown namespace for vector lookup: {self.namespace}" + ) + return {} + + ids_str = ",".join([f"'{id}'" for id in ids]) + query = f"SELECT id, content_vector FROM {table_name} WHERE workspace=$1 AND id IN ({ids_str})" + params = {"workspace": self.workspace} + + try: + results = await self.db.query(query, params, multirows=True) + vectors_dict = {} + + for result in results: + if result and "content_vector" in result and "id" in result: + try: + # Parse JSON string to get vector as list of floats + vector_data = json.loads(result["content_vector"]) + if isinstance(vector_data, list): + vectors_dict[result["id"]] = vector_data + except (json.JSONDecodeError, TypeError) as e: + logger.warning( + f"[{self.workspace}] Failed to parse vector data for ID {result['id']}: {e}" + ) + + return vectors_dict + except Exception as e: + logger.error( + f"[{self.workspace}] Error retrieving vectors by IDs from {self.namespace}: {e}" + ) + return {} + async def drop(self) -> dict[str, str]: """Drop the storage""" async with get_storage_lock(): From 3a227e37b89b51dd004ee91e4adb1f3e602ac338 Mon Sep 17 00:00:00 2001 From: yangdx Date: Fri, 15 Aug 2025 16:53:14 +0800 Subject: [PATCH 20/21] Add get_vectors_by_ids method to MongoVectorDBStorage --- lightrag/kg/mongo_impl.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/lightrag/kg/mongo_impl.py b/lightrag/kg/mongo_impl.py index ff671c56..8fa53c60 100644 --- a/lightrag/kg/mongo_impl.py +++ b/lightrag/kg/mongo_impl.py @@ -1967,6 +1967,37 @@ class MongoVectorDBStorage(BaseVectorStorage): ) return [] + async def get_vectors_by_ids(self, ids: list[str]) -> dict[str, list[float]]: + """Get vectors by their IDs, returning only ID and vector data for efficiency + + Args: + ids: List of unique identifiers + + Returns: + Dictionary mapping IDs to their vector embeddings + Format: {id: [vector_values], ...} + """ + if not ids: + return {} + + try: + # 
Query MongoDB for the specified IDs, only retrieving the vector field + cursor = self._data.find({"_id": {"$in": ids}}, {"vector": 1}) + results = await cursor.to_list(length=None) + + vectors_dict = {} + for result in results: + if result and "vector" in result and "_id" in result: + # MongoDB stores vectors as arrays, so they should already be lists + vectors_dict[result["_id"]] = result["vector"] + + return vectors_dict + except PyMongoError as e: + logger.error( + f"[{self.workspace}] Error retrieving vectors by IDs from {self.namespace}: {e}" + ) + return {} + async def drop(self) -> dict[str, str]: """Drop the storage by removing all documents in the collection and recreating vector index. From 2a781dfb9160c942647f2ccfe6c60034450d0242 Mon Sep 17 00:00:00 2001 From: yangdx Date: Fri, 15 Aug 2025 19:14:38 +0800 Subject: [PATCH 21/21] Update Neo4j database naming in env.example --- env.example | 2 +- lightrag/kg/neo4j_impl.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/env.example b/env.example index 39d6da68..187a7064 100644 --- a/env.example +++ b/env.example @@ -268,7 +268,7 @@ POSTGRES_IVFFLAT_LISTS=100 NEO4J_URI=neo4j+s://xxxxxxxx.databases.neo4j.io NEO4J_USERNAME=neo4j NEO4J_PASSWORD='your_password' -# NEO4J_DATABASE=chunk_entity_relation +# NEO4J_DATABASE=chunk-entity-relation NEO4J_MAX_CONNECTION_POOL_SIZE=100 NEO4J_CONNECTION_TIMEOUT=30 NEO4J_CONNECTION_ACQUISITION_TIMEOUT=30 diff --git a/lightrag/kg/neo4j_impl.py b/lightrag/kg/neo4j_impl.py index febd07e5..2a32307a 100644 --- a/lightrag/kg/neo4j_impl.py +++ b/lightrag/kg/neo4j_impl.py @@ -155,7 +155,7 @@ class Neo4JStorage(BaseGraphStorage): except neo4jExceptions.ServiceUnavailable as e: logger.error( f"[{self.workspace}] " - + f"{database} at {URI} is not available".capitalize() + + f"Database {database} at {URI} is not available" ) raise e except neo4jExceptions.AuthError as e: @@ -167,7 +167,7 @@ class Neo4JStorage(BaseGraphStorage): if e.code == "Neo.ClientError.Database.DatabaseNotFound": logger.info( f"[{self.workspace}] " - + f"{database} at {URI} not found. Try to create specified database.".capitalize() + + f"Database {database} at {URI} not found. Try to create specified database." ) try: async with self._driver.session() as session: @@ -177,7 +177,7 @@ class Neo4JStorage(BaseGraphStorage): await result.consume() # Ensure result is consumed logger.info( f"[{self.workspace}] " - + f"{database} at {URI} created".capitalize() + + f"Database {database} at {URI} created" ) connected = True except (