From 1e2d5252d7572371f0e190bc4666143a590e6ced Mon Sep 17 00:00:00 2001 From: yangdx Date: Fri, 15 Aug 2025 16:32:26 +0800 Subject: [PATCH] Add get_vectors_by_ids method and filter out vector data from query results --- lightrag/kg/faiss_impl.py | 39 ++++++++++++++++++++++++++++++++++++--- lightrag/operate.py | 4 ++-- 2 files changed, 38 insertions(+), 5 deletions(-) diff --git a/lightrag/kg/faiss_impl.py b/lightrag/kg/faiss_impl.py index 0ab95c43..5098ebf7 100644 --- a/lightrag/kg/faiss_impl.py +++ b/lightrag/kg/faiss_impl.py @@ -210,9 +210,11 @@ class FaissVectorDBStorage(BaseVectorStorage): continue meta = self._id_to_meta.get(idx, {}) + # Filter out __vector__ from query results to avoid returning large vector data + filtered_meta = {k: v for k, v in meta.items() if k != "__vector__"} results.append( { - **meta, + **filtered_meta, "id": meta.get("__id__"), "distance": float(dist), "created_at": meta.get("__created_at__"), @@ -424,8 +426,10 @@ class FaissVectorDBStorage(BaseVectorStorage): if not metadata: return None + # Filter out __vector__ from metadata to avoid returning large vector data + filtered_metadata = {k: v for k, v in metadata.items() if k != "__vector__"} return { - **metadata, + **filtered_metadata, "id": metadata.get("__id__"), "created_at": metadata.get("__created_at__"), } @@ -448,9 +452,13 @@ class FaissVectorDBStorage(BaseVectorStorage): if fid is not None: metadata = self._id_to_meta.get(fid, {}) if metadata: + # Filter out __vector__ from metadata to avoid returning large vector data + filtered_metadata = { + k: v for k, v in metadata.items() if k != "__vector__" + } results.append( { - **metadata, + **filtered_metadata, "id": metadata.get("__id__"), "created_at": metadata.get("__created_at__"), } @@ -458,6 +466,31 @@ class FaissVectorDBStorage(BaseVectorStorage): return results + async def get_vectors_by_ids(self, ids: list[str]) -> dict[str, list[float]]: + """Get vectors by their IDs, returning only ID and vector data for efficiency + + Args: + ids: List of unique identifiers + + Returns: + Dictionary mapping IDs to their vector embeddings + Format: {id: [vector_values], ...} + """ + if not ids: + return {} + + vectors_dict = {} + for id in ids: + # Find the Faiss internal ID for the custom ID + fid = self._find_faiss_id_by_custom_id(id) + if fid is not None and fid in self._id_to_meta: + metadata = self._id_to_meta[fid] + # Get the stored vector from metadata + if "__vector__" in metadata: + vectors_dict[id] = metadata["__vector__"] + + return vectors_dict + async def drop(self) -> dict[str, str]: """Drop all vector data from storage and clean up resources diff --git a/lightrag/operate.py b/lightrag/operate.py index c9be89f7..acb75f0f 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -2782,8 +2782,8 @@ async def _find_related_text_unit_from_entities( selected_chunk_ids = [] # Initialize to avoid UnboundLocalError # Step 4: Apply the selected chunk selection algorithm - # Pick by vector similarity: - # The order of text chunks aligns with the naive retrieval's destination. + # Pick by vector similarity: + # The order of text chunks aligns with the naive retrieval's destination. # When reranking is disabled, the text chunks delivered to the LLM tend to favor naive retrieval. if kg_chunk_pick_method == "VECTOR" and query and chunks_vdb: num_of_chunks = int(max_related_chunks * len(entities_with_chunks) / 2)