From 1e2d5252d7572371f0e190bc4666143a590e6ced Mon Sep 17 00:00:00 2001
From: yangdx <gzdaniel@me.com>
Date: Fri, 15 Aug 2025 16:32:26 +0800
Subject: [PATCH] Add get_vectors_by_ids method and filter out vector data from
 query results

---
 lightrag/kg/faiss_impl.py | 39 ++++++++++++++++++++++++++++++++++++---
 lightrag/operate.py       |  4 ++--
 2 files changed, 38 insertions(+), 5 deletions(-)

diff --git a/lightrag/kg/faiss_impl.py b/lightrag/kg/faiss_impl.py
index 0ab95c43..5098ebf7 100644
--- a/lightrag/kg/faiss_impl.py
+++ b/lightrag/kg/faiss_impl.py
@@ -210,9 +210,11 @@ class FaissVectorDBStorage(BaseVectorStorage):
                 continue
 
             meta = self._id_to_meta.get(idx, {})
+            # Filter out __vector__ from query results to avoid returning large vector data
+            filtered_meta = {k: v for k, v in meta.items() if k != "__vector__"}
             results.append(
                 {
-                    **meta,
+                    **filtered_meta,
                     "id": meta.get("__id__"),
                     "distance": float(dist),
                     "created_at": meta.get("__created_at__"),
@@ -424,8 +426,10 @@ class FaissVectorDBStorage(BaseVectorStorage):
         if not metadata:
             return None
 
+        # Filter out __vector__ from metadata to avoid returning large vector data
+        filtered_metadata = {k: v for k, v in metadata.items() if k != "__vector__"}
         return {
-            **metadata,
+            **filtered_metadata,
             "id": metadata.get("__id__"),
             "created_at": metadata.get("__created_at__"),
         }
@@ -448,9 +452,13 @@ class FaissVectorDBStorage(BaseVectorStorage):
             if fid is not None:
                 metadata = self._id_to_meta.get(fid, {})
                 if metadata:
+                    # Filter out __vector__ from metadata to avoid returning large vector data
+                    filtered_metadata = {
+                        k: v for k, v in metadata.items() if k != "__vector__"
+                    }
                     results.append(
                         {
-                            **metadata,
+                            **filtered_metadata,
                             "id": metadata.get("__id__"),
                             "created_at": metadata.get("__created_at__"),
                         }
@@ -458,6 +466,31 @@ class FaissVectorDBStorage(BaseVectorStorage):
 
         return results
 
+    async def get_vectors_by_ids(self, ids: list[str]) -> dict[str, list[float]]:
+        """Look up the stored embedding vector for each of the given custom IDs.
+
+        Args:
+            ids: Custom IDs whose vectors are wanted.
+
+        Returns:
+            Mapping of custom ID to the "__vector__" value held in its metadata.
+            IDs that do not resolve, or whose metadata lacks a vector, are omitted.
+        """
+        if not ids:
+            return {}
+
+        found: dict[str, list[float]] = {}
+        for custom_id in ids:
+            # Translate the custom ID into the internal Faiss ID first
+            faiss_id = self._find_faiss_id_by_custom_id(custom_id)
+            if faiss_id is None or faiss_id not in self._id_to_meta:
+                continue
+            entry = self._id_to_meta[faiss_id]
+            if "__vector__" in entry:
+                found[custom_id] = entry["__vector__"]
+
+        return found
+
     async def drop(self) -> dict[str, str]:
         """Drop all vector data from storage and clean up resources
 
diff --git a/lightrag/operate.py b/lightrag/operate.py
index c9be89f7..acb75f0f 100644
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@@ -2782,8 +2782,8 @@ async def _find_related_text_unit_from_entities(
     selected_chunk_ids = []  # Initialize to avoid UnboundLocalError
 
     # Step 4: Apply the selected chunk selection algorithm
-    # Pick by vector similarity: 
-    #     The order of text chunks aligns with the naive retrieval's destination. 
+    # Pick by vector similarity:
+    #     The order of text chunks aligns with the naive retrieval's destination.
     #     When reranking is disabled, the text chunks delivered to the LLM tend to favor naive retrieval.
     if kg_chunk_pick_method == "VECTOR" and query and chunks_vdb:
         num_of_chunks = int(max_related_chunks * len(entities_with_chunks) / 2)