Add get_vectors_by_ids method and filter out vector data from query results

This commit is contained in:
yangdx 2025-08-15 16:32:26 +08:00
parent 6cab68bb47
commit 1e2d5252d7
2 changed files with 38 additions and 5 deletions

View file

@ -210,9 +210,11 @@ class FaissVectorDBStorage(BaseVectorStorage):
continue
meta = self._id_to_meta.get(idx, {})
# Filter out __vector__ from query results to avoid returning large vector data
filtered_meta = {k: v for k, v in meta.items() if k != "__vector__"}
results.append(
{
**meta,
**filtered_meta,
"id": meta.get("__id__"),
"distance": float(dist),
"created_at": meta.get("__created_at__"),
@ -424,8 +426,10 @@ class FaissVectorDBStorage(BaseVectorStorage):
if not metadata:
return None
# Filter out __vector__ from metadata to avoid returning large vector data
filtered_metadata = {k: v for k, v in metadata.items() if k != "__vector__"}
return {
**metadata,
**filtered_metadata,
"id": metadata.get("__id__"),
"created_at": metadata.get("__created_at__"),
}
@ -448,9 +452,13 @@ class FaissVectorDBStorage(BaseVectorStorage):
if fid is not None:
metadata = self._id_to_meta.get(fid, {})
if metadata:
# Filter out __vector__ from metadata to avoid returning large vector data
filtered_metadata = {
k: v for k, v in metadata.items() if k != "__vector__"
}
results.append(
{
**metadata,
**filtered_metadata,
"id": metadata.get("__id__"),
"created_at": metadata.get("__created_at__"),
}
@ -458,6 +466,31 @@ class FaissVectorDBStorage(BaseVectorStorage):
return results
async def get_vectors_by_ids(self, ids: list[str]) -> dict[str, list[float]]:
    """Fetch the stored embedding vectors for the given custom IDs.

    Unlike the regular query/get paths (which strip ``__vector__`` out of
    their results to avoid returning large vector payloads), this method
    returns ONLY the vector data, keyed by custom ID.

    Args:
        ids: List of unique custom identifiers (not Faiss internal IDs).

    Returns:
        Dictionary mapping each found ID to its vector embedding:
        ``{id: [vector_values], ...}``. IDs that are unknown, or whose
        metadata has no stored ``__vector__`` entry, are silently omitted
        rather than raising.
    """
    if not ids:
        return {}

    vectors_dict: dict[str, list[float]] = {}
    # Loop variable renamed from ``id`` to avoid shadowing the builtin.
    for custom_id in ids:
        # Resolve the custom ID to the Faiss internal ID.
        fid = self._find_faiss_id_by_custom_id(custom_id)
        if fid is None:
            continue
        # Single lookup instead of ``fid in meta`` followed by ``meta[fid]``.
        metadata = self._id_to_meta.get(fid)
        # Vectors are stored alongside metadata under "__vector__";
        # entries without it are skipped (best-effort semantics preserved).
        if metadata and "__vector__" in metadata:
            vectors_dict[custom_id] = metadata["__vector__"]
    return vectors_dict
async def drop(self) -> dict[str, str]:
"""Drop all vector data from storage and clean up resources

View file

@ -2782,8 +2782,8 @@ async def _find_related_text_unit_from_entities(
selected_chunk_ids = [] # Initialize to avoid UnboundLocalError
# Step 4: Apply the selected chunk selection algorithm
# Pick by vector similarity:
# The order of text chunks aligns with the naive retrieval's destination.
# Pick by vector similarity:
# The order of text chunks aligns with the naive retrieval's destination.
# When reranking is disabled, the text chunks delivered to the LLM tend to favor naive retrieval.
if kg_chunk_pick_method == "VECTOR" and query and chunks_vdb:
num_of_chunks = int(max_related_chunks * len(entities_with_chunks) / 2)