diff --git a/lightrag/base.py b/lightrag/base.py index 9ba34280..dacfbd90 100644 --- a/lightrag/base.py +++ b/lightrag/base.py @@ -219,7 +219,7 @@ class BaseVectorStorage(StorageNameSpace, ABC): @abstractmethod async def query( - self, query: str, top_k: int, ids: list[str] | None = None + self, query: str, top_k: int ) -> list[dict[str, Any]]: """Query the vector storage and retrieve top_k results.""" diff --git a/lightrag/kg/deprecated/chroma_impl.py b/lightrag/kg/deprecated/chroma_impl.py index ebdd4593..a6c43504 100644 --- a/lightrag/kg/deprecated/chroma_impl.py +++ b/lightrag/kg/deprecated/chroma_impl.py @@ -165,7 +165,7 @@ class ChromaVectorDBStorage(BaseVectorStorage): raise async def query( - self, query: str, top_k: int, ids: list[str] | None = None + self, query: str, top_k: int ) -> list[dict[str, Any]]: try: embedding = await self.embedding_func( diff --git a/lightrag/kg/faiss_impl.py b/lightrag/kg/faiss_impl.py index 5098ebf7..5687834d 100644 --- a/lightrag/kg/faiss_impl.py +++ b/lightrag/kg/faiss_impl.py @@ -180,7 +180,7 @@ class FaissVectorDBStorage(BaseVectorStorage): return [m["__id__"] for m in list_data] async def query( - self, query: str, top_k: int, ids: list[str] | None = None + self, query: str, top_k: int ) -> list[dict[str, Any]]: """ Search by a textual query; returns top_k results with their metadata + similarity distance. diff --git a/lightrag/kg/milvus_impl.py b/lightrag/kg/milvus_impl.py index 4d927353..6747bb2d 100644 --- a/lightrag/kg/milvus_impl.py +++ b/lightrag/kg/milvus_impl.py @@ -810,7 +810,7 @@ class MilvusVectorDBStorage(BaseVectorStorage): return results async def query( - self, query: str, top_k: int, ids: list[str] | None = None + self, query: str, top_k: int ) -> list[dict[str, Any]]: # Ensure collection is loaded before querying self._ensure_collection_loaded() diff --git a/lightrag/kg/mongo_impl.py b/lightrag/kg/mongo_impl.py index 8fa53c60..0c164bd2 100644 --- a/lightrag/kg/mongo_impl.py +++ b/lightrag/kg/mongo_impl.py @@ -1771,7 +1771,7 @@ class MongoVectorDBStorage(BaseVectorStorage): return list_data async def query( - self, query: str, top_k: int, ids: list[str] | None = None + self, query: str, top_k: int ) -> list[dict[str, Any]]: """Queries the vector database using Atlas Vector Search.""" # Generate the embedding diff --git a/lightrag/kg/nano_vector_db_impl.py b/lightrag/kg/nano_vector_db_impl.py index 5bec06f4..19352a4a 100644 --- a/lightrag/kg/nano_vector_db_impl.py +++ b/lightrag/kg/nano_vector_db_impl.py @@ -137,7 +137,7 @@ class NanoVectorDBStorage(BaseVectorStorage): ) async def query( - self, query: str, top_k: int, ids: list[str] | None = None + self, query: str, top_k: int ) -> list[dict[str, Any]]: # Execute embedding outside of lock to avoid improve cocurrent embedding = await self.embedding_func( diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py index 3ab8bfb8..46e8e6e6 100644 --- a/lightrag/kg/postgres_impl.py +++ b/lightrag/kg/postgres_impl.py @@ -2005,7 +2005,7 @@ class PGVectorStorage(BaseVectorStorage): #################### query method ############### async def query( - self, query: str, top_k: int, ids: list[str] | None = None + self, query: str, top_k: int ) -> list[dict[str, Any]]: embeddings = await self.embedding_func( [query], _priority=5 @@ -2016,7 +2016,6 @@ class PGVectorStorage(BaseVectorStorage): sql = SQL_TEMPLATES[self.namespace].format(embedding_string=embedding_string) params = { "workspace": self.workspace, - "doc_ids": ids, "closer_than_threshold": 1 - self.cosine_better_than_threshold, "top_k": top_k, } @@ -4578,85 +4577,31 @@ SQL_TEMPLATES = { update_time = EXCLUDED.update_time """, "relationships": """ - WITH relevant_chunks AS (SELECT id as chunk_id - FROM LIGHTRAG_VDB_CHUNKS - WHERE $2 - :: varchar [] IS NULL OR full_doc_id = ANY ($2:: varchar []) - ) - , rc AS ( - SELECT array_agg(chunk_id) AS chunk_arr - FROM relevant_chunks - ), cand AS ( - SELECT - r.id, r.source_id AS src_id, r.target_id AS tgt_id, r.chunk_ids, r.create_time, r.content_vector <=> '[{embedding_string}]'::vector AS dist - FROM LIGHTRAG_VDB_RELATION r - WHERE r.workspace = $1 - ORDER BY r.content_vector <=> '[{embedding_string}]'::vector - LIMIT ($4 * 50) - ) - SELECT c.src_id, - c.tgt_id, - EXTRACT(EPOCH FROM c.create_time) ::BIGINT AS created_at - FROM cand c - JOIN rc ON TRUE - WHERE c.dist < $3 - AND c.chunk_ids && (rc.chunk_arr::varchar[]) - ORDER BY c.dist, c.id - LIMIT $4; + SELECT r.source_id as src_id, r.target_id as tgt_id, + EXTRACT(EPOCH FROM r.create_time)::BIGINT as created_at + FROM LIGHTRAG_VDB_RELATION r + WHERE r.workspace = $1 + AND r.content_vector <=> '[{embedding_string}]'::vector < $2 + ORDER BY r.content_vector <=> '[{embedding_string}]'::vector + LIMIT $3 """, "entities": """ - WITH relevant_chunks AS (SELECT id as chunk_id - FROM LIGHTRAG_VDB_CHUNKS - WHERE $2 - :: varchar [] IS NULL OR full_doc_id = ANY ($2:: varchar []) - ) - , rc AS ( - SELECT array_agg(chunk_id) AS chunk_arr - FROM relevant_chunks - ), cand AS ( - SELECT - e.id, e.entity_name, e.chunk_ids, e.create_time, e.content_vector <=> '[{embedding_string}]'::vector AS dist + SELECT e.entity_name, + EXTRACT(EPOCH FROM e.create_time)::BIGINT as created_at FROM LIGHTRAG_VDB_ENTITY e WHERE e.workspace = $1 + AND e.content_vector <=> '[{embedding_string}]'::vector < $2 ORDER BY e.content_vector <=> '[{embedding_string}]'::vector - LIMIT ($4 * 50) - ) - SELECT c.entity_name, - EXTRACT(EPOCH FROM c.create_time) ::BIGINT AS created_at - FROM cand c - JOIN rc ON TRUE - WHERE c.dist < $3 - AND c.chunk_ids && (rc.chunk_arr::varchar[]) - ORDER BY c.dist, c.id - LIMIT $4; + LIMIT $3 """, "chunks": """ - WITH relevant_chunks AS (SELECT id as chunk_id - FROM LIGHTRAG_VDB_CHUNKS - WHERE $2 - :: varchar [] IS NULL OR full_doc_id = ANY ($2:: varchar []) - ) - , rc AS ( - SELECT array_agg(chunk_id) AS chunk_arr - FROM relevant_chunks - ), cand AS ( - SELECT - id, content, file_path, create_time, content_vector <=> '[{embedding_string}]'::vector AS dist - FROM LIGHTRAG_VDB_CHUNKS - WHERE workspace = $1 - ORDER BY content_vector <=> '[{embedding_string}]'::vector - LIMIT ($4 * 50) - ) - SELECT c.id, - c.content, - c.file_path, - EXTRACT(EPOCH FROM c.create_time) ::BIGINT AS created_at - FROM cand c - JOIN rc ON TRUE - WHERE c.dist < $3 - AND c.id = ANY (rc.chunk_arr) - ORDER BY c.dist, c.id - LIMIT $4; + SELECT id, content, file_path, + EXTRACT(EPOCH FROM create_time)::BIGINT as created_at + FROM LIGHTRAG_VDB_CHUNKS + WHERE workspace = $1 + AND content_vector <=> '[{embedding_string}]'::vector < $2 + ORDER BY content_vector <=> '[{embedding_string}]'::vector + LIMIT $3 """, # DROP tables "drop_specifiy_table_workspace": """ diff --git a/lightrag/kg/qdrant_impl.py b/lightrag/kg/qdrant_impl.py index 4ece163c..e8565ac7 100644 --- a/lightrag/kg/qdrant_impl.py +++ b/lightrag/kg/qdrant_impl.py @@ -200,7 +200,7 @@ class QdrantVectorDBStorage(BaseVectorStorage): return results async def query( - self, query: str, top_k: int, ids: list[str] | None = None + self, query: str, top_k: int ) -> list[dict[str, Any]]: embedding = await self.embedding_func( [query], _priority=5 diff --git a/lightrag/operate.py b/lightrag/operate.py index acb75f0f..0876e06c 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -2055,7 +2055,7 @@ async def _get_vector_context( # Use chunk_top_k if specified, otherwise fall back to top_k search_top_k = query_param.chunk_top_k or query_param.top_k - results = await chunks_vdb.query(query, top_k=search_top_k, ids=query_param.ids) + results = await chunks_vdb.query(query, top_k=search_top_k) if not results: return [] @@ -2599,7 +2599,7 @@ async def _get_node_data( ) results = await entities_vdb.query( - query, top_k=query_param.top_k, ids=query_param.ids + query, top_k=query_param.top_k ) if not len(results): @@ -2875,7 +2875,7 @@ async def _get_edge_data( ) results = await relationships_vdb.query( - keywords, top_k=query_param.top_k, ids=query_param.ids + keywords, top_k=query_param.top_k ) if not len(results):