Add get_vectors_by_ids method and filter out vector data from query results

This commit is contained in:
yangdx 2025-08-15 16:32:26 +08:00
parent 6cab68bb47
commit 1e2d5252d7
2 changed files with 38 additions and 5 deletions

View file

@ -210,9 +210,11 @@ class FaissVectorDBStorage(BaseVectorStorage):
continue
meta = self._id_to_meta.get(idx, {})
# Filter out __vector__ from query results to avoid returning large vector data
filtered_meta = {k: v for k, v in meta.items() if k != "__vector__"}
results.append(
{
**meta,
**filtered_meta,
"id": meta.get("__id__"),
"distance": float(dist),
"created_at": meta.get("__created_at__"),
@ -424,8 +426,10 @@ class FaissVectorDBStorage(BaseVectorStorage):
if not metadata:
return None
# Filter out __vector__ from metadata to avoid returning large vector data
filtered_metadata = {k: v for k, v in metadata.items() if k != "__vector__"}
return {
**metadata,
**filtered_metadata,
"id": metadata.get("__id__"),
"created_at": metadata.get("__created_at__"),
}
@ -448,9 +452,13 @@ class FaissVectorDBStorage(BaseVectorStorage):
if fid is not None:
metadata = self._id_to_meta.get(fid, {})
if metadata:
# Filter out __vector__ from metadata to avoid returning large vector data
filtered_metadata = {
k: v for k, v in metadata.items() if k != "__vector__"
}
results.append(
{
**metadata,
**filtered_metadata,
"id": metadata.get("__id__"),
"created_at": metadata.get("__created_at__"),
}
@ -458,6 +466,31 @@ class FaissVectorDBStorage(BaseVectorStorage):
return results
async def get_vectors_by_ids(self, ids: list[str]) -> dict[str, list[float]]:
    """Fetch the stored embedding vectors for the given custom IDs.

    Unlike the regular query/get paths (which strip ``__vector__`` out of
    their results to avoid returning large vector payloads), this method
    returns ONLY the vector data, keyed by custom ID.

    Args:
        ids: List of unique custom identifiers (not Faiss internal IDs).

    Returns:
        Dictionary mapping each found ID to its vector embedding:
        ``{id: [vector_values], ...}``. IDs that are unknown, or whose
        metadata has no stored ``__vector__`` entry, are silently omitted
        rather than raising.
    """
    if not ids:
        return {}

    vectors_dict: dict[str, list[float]] = {}
    # Loop variable renamed from ``id`` to avoid shadowing the builtin.
    for custom_id in ids:
        # Resolve the custom ID to the Faiss internal ID.
        fid = self._find_faiss_id_by_custom_id(custom_id)
        if fid is None:
            continue
        # Single lookup instead of ``fid in meta`` followed by ``meta[fid]``.
        metadata = self._id_to_meta.get(fid)
        # Vectors are stored alongside metadata under "__vector__";
        # entries without it are skipped (best-effort semantics preserved).
        if metadata and "__vector__" in metadata:
            vectors_dict[custom_id] = metadata["__vector__"]
    return vectors_dict
async def drop(self) -> dict[str, str]:
"""Drop all vector data from storage and clean up resources

View file

@ -2782,8 +2782,8 @@ async def _find_related_text_unit_from_entities(
selected_chunk_ids = [] # Initialize to avoid UnboundLocalError
# Step 4: Apply the selected chunk selection algorithm
# Pick by vector similarity:
# The order of text chunks aligns with the naive retrieval's destination.
# Pick by vector similarity:
# The order of text chunks aligns with the naive retrieval's destination.
# When reranking is disabled, the text chunks delivered to the LLM tend to favor naive retrieval.
if kg_chunk_pick_method == "VECTOR" and query and chunks_vdb:
num_of_chunks = int(max_related_chunks * len(entities_with_chunks) / 2)