perf(metadata): optimize metadata queries with GIN indexes on Postgres

This commit is contained in:
GGrassia 2025-10-09 10:56:34 +02:00
parent f4c2823c82
commit a57d4ec0cc

View file

@ -2104,75 +2104,42 @@ class PGVectorStorage(BaseVectorStorage):
############# Metadata building function ################# ############# Metadata building function #################
@staticmethod @staticmethod
def build_metadata_filter_clause(metadata_filter): def build_metadata_filter_clause(metadata_filter):
def escape_str(val: str) -> str: if not metadata_filter:
return str(val).replace("'", "''") # escape single quotes return ""
def build_single_condition(key, value): def build_single_condition(key, value):
if isinstance(value, list): if isinstance(value, (list, dict)):
# If list contains only scalars, use IN clause # Use GIN-optimized containment operator
if all(isinstance(v, (str, int, float, bool)) for v in value): json_value = json.dumps(value)
escaped_values = ", ".join(f"'{escape_str(str(v))}'" for v in value) return f"metadata @> '{{\"{ key}\" : {json_value}}}'"
return f"metadata->>'{key}' IN ({escaped_values})" else:
else: # Use containment for scalars too - faster with GIN index
# fallback to JSON matching for complex types json_value = json.dumps(value)
json_value = json.dumps(value).replace("'", "''") return f"metadata @> '{{\"{ key}\" : {json_value}}}'"
return f"metadata->'{key}' = '{json_value}'::jsonb"
elif isinstance(value, dict):
json_value = json.dumps(value).replace("'", "''")
return f"metadata->'{key}' = '{json_value}'::jsonb"
elif isinstance(value, (int, float, bool)):
return f"metadata->'{key}' = '{json.dumps(value)}'::jsonb"
else: # string
return f"metadata->>'{key}' = '{escape_str(value)}'"
def build_conditions(filter_dict): try:
conditions = [] if isinstance(metadata_filter, dict):
for key, value in filter_dict.items(): conditions = [build_single_condition(k, v) for k, v in metadata_filter.items()]
if isinstance(value, (list, tuple)): return " AND " + " AND ".join(conditions) if conditions else ""
conds = [build_single_condition(key, v) for v in value] elif hasattr(metadata_filter, 'operands'):
conditions.append("(" + " OR ".join(conds) + ")")
else:
conditions.append(build_single_condition(key, value))
return conditions
def recurse(filter_obj):
if isinstance(filter_obj, dict):
return build_conditions(filter_obj)
if isinstance(filter_obj, MetadataFilter):
sub_conditions = [] sub_conditions = []
for operand in filter_obj.operands: for operand in metadata_filter.operands:
if isinstance(operand, dict): if isinstance(operand, dict):
sub_conditions.append( conds = [build_single_condition(k, v) for k, v in operand.items()]
"(" + " AND ".join(build_conditions(operand)) + ")" if conds:
) sub_conditions.append("(" + " AND ".join(conds) + ")")
elif isinstance(operand, MetadataFilter):
nested = recurse(operand) if sub_conditions:
if nested: op = getattr(metadata_filter, 'operator', 'AND').upper()
sub_conditions.append("(" + " AND ".join(nested) + ")") connector = " OR " if op == "OR" else " AND "
prefix = " AND NOT (" if op == "NOT" else " AND ("
if not sub_conditions: return prefix + connector.join(sub_conditions) + ")"
return [] return ""
except Exception:
op = filter_obj.operator.upper() # Simple fallback
if op == "AND": if isinstance(metadata_filter, dict):
return [" AND ".join(sub_conditions)] return f" AND metadata @> '{json.dumps(metadata_filter)}'"
elif op == "OR": return ""
return [" OR ".join(sub_conditions)]
elif op == "NOT":
if len(sub_conditions) == 1:
return [f"NOT {sub_conditions[0]}"]
else:
return [f"NOT ({' AND '.join(sub_conditions)})"]
return []
conditions = recurse(metadata_filter)
clause = ""
if conditions:
clause = " AND " + " AND ".join(conditions)
return clause
#################### query method ############### #################### query method ###############
async def query( async def query(
@ -2191,7 +2158,7 @@ class PGVectorStorage(BaseVectorStorage):
embedding = embeddings[0] embedding = embeddings[0]
embedding_string = ",".join(map(str, embedding)) embedding_string = ",".join(map(str, embedding))
metadata_filter_clause = self.build_metadata_filter_clause(metadata_filter) metadata_filter_clause = self.build_metadata_filter_clause(metadata_filter) if metadata_filter else ""
sql = SQL_TEMPLATES[self.namespace].format( sql = SQL_TEMPLATES[self.namespace].format(
embedding_string=embedding_string, embedding_string=embedding_string,
metadata_filter_clause=metadata_filter_clause, metadata_filter_clause=metadata_filter_clause,
@ -4909,7 +4876,6 @@ SQL_TEMPLATES = {
""", """,
"relationships": """ "relationships": """
WITH filtered_chunks AS ( WITH filtered_chunks AS (
-- Step 1: Select only the chunk IDs that match the metadata filter
SELECT SELECT
c.id c.id
FROM FROM
@ -4918,7 +4884,6 @@ SQL_TEMPLATES = {
c.workspace = $1 c.workspace = $1
{metadata_filter_clause} {metadata_filter_clause}
) )
-- Step 2 & 3: Join relationships with the filtered chunks and rank by similarity
SELECT SELECT
r.source_id AS src_id, r.source_id AS src_id,
r.target_id AS tgt_id, r.target_id AS tgt_id,
@ -4936,7 +4901,6 @@ SQL_TEMPLATES = {
""", """,
"entities": """ "entities": """
WITH filtered_chunks AS ( WITH filtered_chunks AS (
-- Step 1: Select only the chunk IDs that match the metadata filter
SELECT SELECT
c.id c.id
FROM FROM
@ -4945,7 +4909,6 @@ SQL_TEMPLATES = {
c.workspace = $1 c.workspace = $1
{metadata_filter_clause} {metadata_filter_clause}
) )
-- Step 2 & 3: Join entities with the filtered chunks and rank by similarity
SELECT SELECT
e.entity_name, e.entity_name,
EXTRACT(EPOCH FROM e.create_time)::BIGINT AS created_at EXTRACT(EPOCH FROM e.create_time)::BIGINT AS created_at