perf (metadata): optimized metadata query with GIN indexes for PostgreSQL
This commit is contained in:
parent
f4c2823c82
commit
a57d4ec0cc
1 changed file with 33 additions and 70 deletions
|
|
@@ -2104,75 +2104,42 @@ class PGVectorStorage(BaseVectorStorage):
|
||||||
############# Metadata building function #################
|
############# Metadata building function #################
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def build_metadata_filter_clause(metadata_filter):
|
def build_metadata_filter_clause(metadata_filter):
|
||||||
def escape_str(val: str) -> str:
|
if not metadata_filter:
|
||||||
return str(val).replace("'", "''") # escape single quotes
|
return ""
|
||||||
|
|
||||||
def build_single_condition(key, value):
|
def build_single_condition(key, value):
|
||||||
if isinstance(value, list):
|
if isinstance(value, (list, dict)):
|
||||||
# If list contains only scalars, use IN clause
|
# Use GIN-optimized containment operator
|
||||||
if all(isinstance(v, (str, int, float, bool)) for v in value):
|
json_value = json.dumps(value)
|
||||||
escaped_values = ", ".join(f"'{escape_str(str(v))}'" for v in value)
|
return f"metadata @> '{{\"{ key}\" : {json_value}}}'"
|
||||||
return f"metadata->>'{key}' IN ({escaped_values})"
|
else:
|
||||||
else:
|
# Use containment for scalars too - faster with GIN index
|
||||||
# fallback to JSON matching for complex types
|
json_value = json.dumps(value)
|
||||||
json_value = json.dumps(value).replace("'", "''")
|
return f"metadata @> '{{\"{ key}\" : {json_value}}}'"
|
||||||
return f"metadata->'{key}' = '{json_value}'::jsonb"
|
|
||||||
elif isinstance(value, dict):
|
|
||||||
json_value = json.dumps(value).replace("'", "''")
|
|
||||||
return f"metadata->'{key}' = '{json_value}'::jsonb"
|
|
||||||
elif isinstance(value, (int, float, bool)):
|
|
||||||
return f"metadata->'{key}' = '{json.dumps(value)}'::jsonb"
|
|
||||||
else: # string
|
|
||||||
return f"metadata->>'{key}' = '{escape_str(value)}'"
|
|
||||||
|
|
||||||
def build_conditions(filter_dict):
|
try:
|
||||||
conditions = []
|
if isinstance(metadata_filter, dict):
|
||||||
for key, value in filter_dict.items():
|
conditions = [build_single_condition(k, v) for k, v in metadata_filter.items()]
|
||||||
if isinstance(value, (list, tuple)):
|
return " AND " + " AND ".join(conditions) if conditions else ""
|
||||||
conds = [build_single_condition(key, v) for v in value]
|
elif hasattr(metadata_filter, 'operands'):
|
||||||
conditions.append("(" + " OR ".join(conds) + ")")
|
|
||||||
else:
|
|
||||||
conditions.append(build_single_condition(key, value))
|
|
||||||
return conditions
|
|
||||||
|
|
||||||
def recurse(filter_obj):
|
|
||||||
if isinstance(filter_obj, dict):
|
|
||||||
return build_conditions(filter_obj)
|
|
||||||
|
|
||||||
if isinstance(filter_obj, MetadataFilter):
|
|
||||||
sub_conditions = []
|
sub_conditions = []
|
||||||
for operand in filter_obj.operands:
|
for operand in metadata_filter.operands:
|
||||||
if isinstance(operand, dict):
|
if isinstance(operand, dict):
|
||||||
sub_conditions.append(
|
conds = [build_single_condition(k, v) for k, v in operand.items()]
|
||||||
"(" + " AND ".join(build_conditions(operand)) + ")"
|
if conds:
|
||||||
)
|
sub_conditions.append("(" + " AND ".join(conds) + ")")
|
||||||
elif isinstance(operand, MetadataFilter):
|
|
||||||
nested = recurse(operand)
|
if sub_conditions:
|
||||||
if nested:
|
op = getattr(metadata_filter, 'operator', 'AND').upper()
|
||||||
sub_conditions.append("(" + " AND ".join(nested) + ")")
|
connector = " OR " if op == "OR" else " AND "
|
||||||
|
prefix = " AND NOT (" if op == "NOT" else " AND ("
|
||||||
if not sub_conditions:
|
return prefix + connector.join(sub_conditions) + ")"
|
||||||
return []
|
return ""
|
||||||
|
except Exception:
|
||||||
op = filter_obj.operator.upper()
|
# Simple fallback
|
||||||
if op == "AND":
|
if isinstance(metadata_filter, dict):
|
||||||
return [" AND ".join(sub_conditions)]
|
return f" AND metadata @> '{json.dumps(metadata_filter)}'"
|
||||||
elif op == "OR":
|
return ""
|
||||||
return [" OR ".join(sub_conditions)]
|
|
||||||
elif op == "NOT":
|
|
||||||
if len(sub_conditions) == 1:
|
|
||||||
return [f"NOT {sub_conditions[0]}"]
|
|
||||||
else:
|
|
||||||
return [f"NOT ({' AND '.join(sub_conditions)})"]
|
|
||||||
|
|
||||||
return []
|
|
||||||
|
|
||||||
conditions = recurse(metadata_filter)
|
|
||||||
clause = ""
|
|
||||||
if conditions:
|
|
||||||
clause = " AND " + " AND ".join(conditions)
|
|
||||||
|
|
||||||
return clause
|
|
||||||
|
|
||||||
#################### query method ###############
|
#################### query method ###############
|
||||||
async def query(
|
async def query(
|
||||||
|
|
@@ -2191,7 +2158,7 @@ class PGVectorStorage(BaseVectorStorage):
|
||||||
embedding = embeddings[0]
|
embedding = embeddings[0]
|
||||||
|
|
||||||
embedding_string = ",".join(map(str, embedding))
|
embedding_string = ",".join(map(str, embedding))
|
||||||
metadata_filter_clause = self.build_metadata_filter_clause(metadata_filter)
|
metadata_filter_clause = self.build_metadata_filter_clause(metadata_filter) if metadata_filter else ""
|
||||||
sql = SQL_TEMPLATES[self.namespace].format(
|
sql = SQL_TEMPLATES[self.namespace].format(
|
||||||
embedding_string=embedding_string,
|
embedding_string=embedding_string,
|
||||||
metadata_filter_clause=metadata_filter_clause,
|
metadata_filter_clause=metadata_filter_clause,
|
||||||
|
|
@@ -4909,7 +4876,6 @@ SQL_TEMPLATES = {
|
||||||
""",
|
""",
|
||||||
"relationships": """
|
"relationships": """
|
||||||
WITH filtered_chunks AS (
|
WITH filtered_chunks AS (
|
||||||
-- Step 1: Select only the chunk IDs that match the metadata filter
|
|
||||||
SELECT
|
SELECT
|
||||||
c.id
|
c.id
|
||||||
FROM
|
FROM
|
||||||
|
|
@@ -4918,7 +4884,6 @@ SQL_TEMPLATES = {
|
||||||
c.workspace = $1
|
c.workspace = $1
|
||||||
{metadata_filter_clause}
|
{metadata_filter_clause}
|
||||||
)
|
)
|
||||||
-- Step 2 & 3: Join relationships with the filtered chunks and rank by similarity
|
|
||||||
SELECT
|
SELECT
|
||||||
r.source_id AS src_id,
|
r.source_id AS src_id,
|
||||||
r.target_id AS tgt_id,
|
r.target_id AS tgt_id,
|
||||||
|
|
@@ -4936,7 +4901,6 @@ SQL_TEMPLATES = {
|
||||||
""",
|
""",
|
||||||
"entities": """
|
"entities": """
|
||||||
WITH filtered_chunks AS (
|
WITH filtered_chunks AS (
|
||||||
-- Step 1: Select only the chunk IDs that match the metadata filter
|
|
||||||
SELECT
|
SELECT
|
||||||
c.id
|
c.id
|
||||||
FROM
|
FROM
|
||||||
|
|
@@ -4945,7 +4909,6 @@ SQL_TEMPLATES = {
|
||||||
c.workspace = $1
|
c.workspace = $1
|
||||||
{metadata_filter_clause}
|
{metadata_filter_clause}
|
||||||
)
|
)
|
||||||
-- Step 2 & 3: Join entities with the filtered chunks and rank by similarity
|
|
||||||
SELECT
|
SELECT
|
||||||
e.entity_name,
|
e.entity_name,
|
||||||
EXTRACT(EPOCH FROM e.create_time)::BIGINT AS created_at
|
EXTRACT(EPOCH FROM e.create_time)::BIGINT AS created_at
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue