fix: renamed PostGreSQL options env variable and allowed LRU cache to be an optional env variable
(cherry picked from commit 22a7b482c5)
This commit is contained in:
parent
4e93c9c21d
commit
8f5af8199b
2 changed files with 50 additions and 239 deletions
|
|
@ -351,10 +351,13 @@ POSTGRES_IVFFLAT_LISTS=100
|
||||||
# POSTGRES_SSL_ROOT_CERT=/path/to/ca-cert.pem
|
# POSTGRES_SSL_ROOT_CERT=/path/to/ca-cert.pem
|
||||||
# POSTGRES_SSL_CRL=/path/to/crl.pem
|
# POSTGRES_SSL_CRL=/path/to/crl.pem
|
||||||
|
|
||||||
### PostgreSQL Server Options (for Supabase Supavisor)
|
### PostgreSQL Server Settings (for Supabase Supavisor)
|
||||||
# Use this to pass extra options to the PostgreSQL connection string.
|
# Use this to pass extra options to the PostgreSQL connection string.
|
||||||
# For Supabase, you might need to set it like this:
|
# For Supabase, you might need to set it like this:
|
||||||
# POSTGRES_SERVER_OPTIONS="options=reference%3D[project-ref]"
|
# POSTGRES_SERVER_SETTINGS="options=reference%3D[project-ref]"
|
||||||
|
|
||||||
|
# Default is 100 set to 0 to disable
|
||||||
|
# POSTGRES_STATEMENT_CACHE_SIZE=100
|
||||||
|
|
||||||
### Neo4j Configuration
|
### Neo4j Configuration
|
||||||
NEO4J_URI=neo4j+s://xxxxxxxx.databases.neo4j.io
|
NEO4J_URI=neo4j+s://xxxxxxxx.databases.neo4j.io
|
||||||
|
|
|
||||||
|
|
@ -10,7 +10,6 @@ import numpy as np
|
||||||
import configparser
|
import configparser
|
||||||
import ssl
|
import ssl
|
||||||
import itertools
|
import itertools
|
||||||
import hashlib
|
|
||||||
|
|
||||||
from lightrag.types import KnowledgeGraph, KnowledgeGraphNode, KnowledgeGraphEdge
|
from lightrag.types import KnowledgeGraph, KnowledgeGraphNode, KnowledgeGraphEdge
|
||||||
|
|
||||||
|
|
@ -33,7 +32,6 @@ from ..namespace import NameSpace, is_namespace
|
||||||
from ..utils import logger
|
from ..utils import logger
|
||||||
from ..constants import GRAPH_FIELD_SEP
|
from ..constants import GRAPH_FIELD_SEP
|
||||||
from ..kg.shared_storage import get_data_init_lock, get_graph_db_lock, get_storage_lock
|
from ..kg.shared_storage import get_data_init_lock, get_graph_db_lock, get_storage_lock
|
||||||
from ..utils_context import get_current_tenant_id
|
|
||||||
|
|
||||||
import pipmaster as pm
|
import pipmaster as pm
|
||||||
|
|
||||||
|
|
@ -78,6 +76,7 @@ class PostgreSQLDB:
|
||||||
|
|
||||||
# Server settings
|
# Server settings
|
||||||
self.server_settings = config.get("server_settings")
|
self.server_settings = config.get("server_settings")
|
||||||
|
self.statement_cache_size = int(config.get("statement_cache_size"))
|
||||||
|
|
||||||
if self.user is None or self.password is None or self.database is None:
|
if self.user is None or self.password is None or self.database is None:
|
||||||
raise ValueError("Missing database user, password, or database")
|
raise ValueError("Missing database user, password, or database")
|
||||||
|
|
@ -163,8 +162,13 @@ class PostgreSQLDB:
|
||||||
"port": self.port,
|
"port": self.port,
|
||||||
"min_size": 1,
|
"min_size": 1,
|
||||||
"max_size": self.max,
|
"max_size": self.max,
|
||||||
|
"statement_cache_size": self.statement_cache_size,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
f"PostgreSQL, statement LRU cache size set as: {self.statement_cache_size}"
|
||||||
|
)
|
||||||
|
|
||||||
# Add SSL configuration if provided
|
# Add SSL configuration if provided
|
||||||
ssl_context = self._create_ssl_context()
|
ssl_context = self._create_ssl_context()
|
||||||
if ssl_context is not None:
|
if ssl_context is not None:
|
||||||
|
|
@ -237,40 +241,23 @@ class PostgreSQLDB:
|
||||||
"""Set the Apache AGE environment and creates a graph if it does not exist.
|
"""Set the Apache AGE environment and creates a graph if it does not exist.
|
||||||
|
|
||||||
This method:
|
This method:
|
||||||
- Loads the AGE extension into the current session (required for Cypher functions).
|
|
||||||
- Sets the PostgreSQL `search_path` to include `ag_catalog`, ensuring that Apache AGE functions can be used without specifying the schema.
|
- Sets the PostgreSQL `search_path` to include `ag_catalog`, ensuring that Apache AGE functions can be used without specifying the schema.
|
||||||
- Attempts to create a new graph with the provided `graph_name` if it does not already exist.
|
- Attempts to create a new graph with the provided `graph_name` if it does not already exist.
|
||||||
- Silently ignores errors related to the graph already existing.
|
- Silently ignores errors related to the graph already existing.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
# Load AGE extension - required for Cypher functions to work
|
|
||||||
await connection.execute("LOAD 'age'") # type: ignore
|
|
||||||
await connection.execute( # type: ignore
|
await connection.execute( # type: ignore
|
||||||
'SET search_path = ag_catalog, "$user", public'
|
'SET search_path = ag_catalog, "$user", public'
|
||||||
)
|
)
|
||||||
|
await connection.execute( # type: ignore
|
||||||
# Check if graph exists first to avoid error logs
|
f"select create_graph('{graph_name}')"
|
||||||
exists = await connection.fetchval(
|
|
||||||
"SELECT count(*) FROM ag_catalog.ag_graph WHERE name = $1", graph_name
|
|
||||||
)
|
)
|
||||||
|
|
||||||
if exists == 0:
|
|
||||||
await connection.execute( # type: ignore
|
|
||||||
f"select create_graph('{graph_name}')"
|
|
||||||
)
|
|
||||||
except (
|
except (
|
||||||
asyncpg.exceptions.InvalidSchemaNameError,
|
asyncpg.exceptions.InvalidSchemaNameError,
|
||||||
asyncpg.exceptions.UniqueViolationError,
|
asyncpg.exceptions.UniqueViolationError,
|
||||||
asyncpg.exceptions.DuplicateObjectError, # Graph already exists
|
|
||||||
):
|
):
|
||||||
pass
|
pass
|
||||||
except Exception as e:
|
|
||||||
# Handle "already exists" error message for AGE graphs
|
|
||||||
if "already exists" in str(e):
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
raise
|
|
||||||
|
|
||||||
async def _migrate_llm_cache_schema(self):
|
async def _migrate_llm_cache_schema(self):
|
||||||
"""Migrate LLM cache schema: add new columns and remove deprecated mode field"""
|
"""Migrate LLM cache schema: add new columns and remove deprecated mode field"""
|
||||||
|
|
@ -1089,7 +1076,9 @@ class PostgreSQLDB:
|
||||||
try:
|
try:
|
||||||
# Create index for id column
|
# Create index for id column
|
||||||
index_name = f"idx_{table_name.lower()}_id"
|
index_name = f"idx_{table_name.lower()}_id"
|
||||||
create_index_sql = f"CREATE INDEX {index_name} ON {table_name}(id)"
|
create_index_sql = (
|
||||||
|
f"CREATE INDEX {index_name} ON {table_name}(id)"
|
||||||
|
)
|
||||||
await self.execute(create_index_sql)
|
await self.execute(create_index_sql)
|
||||||
logger.info(f"Created index {index_name} on table {table_name}")
|
logger.info(f"Created index {index_name} on table {table_name}")
|
||||||
|
|
||||||
|
|
@ -1145,11 +1134,6 @@ class PostgreSQLDB:
|
||||||
"sql": "CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_lightrag_doc_status_workspace_file_path ON LIGHTRAG_DOC_STATUS (workspace, file_path)",
|
"sql": "CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_lightrag_doc_status_workspace_file_path ON LIGHTRAG_DOC_STATUS (workspace, file_path)",
|
||||||
"description": "Index for workspace + file_path sorting",
|
"description": "Index for workspace + file_path sorting",
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"name": "idx_lightrag_doc_status_workspace_external_id",
|
|
||||||
"sql": "CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_lightrag_doc_status_workspace_external_id ON LIGHTRAG_DOC_STATUS (workspace, (metadata->>'external_id')) WHERE metadata->>'external_id' IS NOT NULL",
|
|
||||||
"description": "Index for workspace + external_id for idempotency lookups",
|
|
||||||
},
|
|
||||||
]
|
]
|
||||||
|
|
||||||
for index in indexes:
|
for index in indexes:
|
||||||
|
|
@ -1265,11 +1249,6 @@ class PostgreSQLDB:
|
||||||
graph_name: str | None = None,
|
graph_name: str | None = None,
|
||||||
) -> dict[str, Any] | None | list[dict[str, Any]]:
|
) -> dict[str, Any] | None | list[dict[str, Any]]:
|
||||||
async with self.pool.acquire() as connection: # type: ignore
|
async with self.pool.acquire() as connection: # type: ignore
|
||||||
# Set tenant context if available
|
|
||||||
tenant_id = get_current_tenant_id()
|
|
||||||
if tenant_id:
|
|
||||||
await connection.execute(f"SET app.current_tenant = '{tenant_id}'")
|
|
||||||
|
|
||||||
if with_age and graph_name:
|
if with_age and graph_name:
|
||||||
await self.configure_age(connection, graph_name) # type: ignore
|
await self.configure_age(connection, graph_name) # type: ignore
|
||||||
elif with_age and not graph_name:
|
elif with_age and not graph_name:
|
||||||
|
|
@ -1277,15 +1256,24 @@ class PostgreSQLDB:
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if params:
|
if params:
|
||||||
if multirows:
|
rows = await connection.fetch(sql, *params)
|
||||||
return await connection.fetch(sql, *params)
|
|
||||||
else:
|
|
||||||
return await connection.fetchrow(sql, *params)
|
|
||||||
else:
|
else:
|
||||||
if multirows:
|
rows = await connection.fetch(sql)
|
||||||
return await connection.fetch(sql)
|
|
||||||
|
if multirows:
|
||||||
|
if rows:
|
||||||
|
columns = [col for col in rows[0].keys()]
|
||||||
|
data = [dict(zip(columns, row)) for row in rows]
|
||||||
else:
|
else:
|
||||||
return await connection.fetchrow(sql)
|
data = []
|
||||||
|
else:
|
||||||
|
if rows:
|
||||||
|
columns = rows[0].keys()
|
||||||
|
data = dict(zip(columns, rows[0]))
|
||||||
|
else:
|
||||||
|
data = None
|
||||||
|
|
||||||
|
return data
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"PostgreSQL database, error:{e}")
|
logger.error(f"PostgreSQL database, error:{e}")
|
||||||
raise
|
raise
|
||||||
|
|
@ -1301,11 +1289,6 @@ class PostgreSQLDB:
|
||||||
):
|
):
|
||||||
try:
|
try:
|
||||||
async with self.pool.acquire() as connection: # type: ignore
|
async with self.pool.acquire() as connection: # type: ignore
|
||||||
# Set tenant context if available
|
|
||||||
tenant_id = get_current_tenant_id()
|
|
||||||
if tenant_id:
|
|
||||||
await connection.execute(f"SET app.current_tenant = '{tenant_id}'")
|
|
||||||
|
|
||||||
if with_age and graph_name:
|
if with_age and graph_name:
|
||||||
await self.configure_age(connection, graph_name)
|
await self.configure_age(connection, graph_name)
|
||||||
elif with_age and not graph_name:
|
elif with_age and not graph_name:
|
||||||
|
|
@ -1320,7 +1303,6 @@ class PostgreSQLDB:
|
||||||
asyncpg.exceptions.DuplicateTableError,
|
asyncpg.exceptions.DuplicateTableError,
|
||||||
asyncpg.exceptions.DuplicateObjectError, # Catch "already exists" error
|
asyncpg.exceptions.DuplicateObjectError, # Catch "already exists" error
|
||||||
asyncpg.exceptions.InvalidSchemaNameError, # Also catch for AGE extension "already exists"
|
asyncpg.exceptions.InvalidSchemaNameError, # Also catch for AGE extension "already exists"
|
||||||
# asyncpg.exceptions.UndefinedTableError, # Catch "relation does not exist" for index creation
|
|
||||||
) as e:
|
) as e:
|
||||||
if ignore_if_exists:
|
if ignore_if_exists:
|
||||||
# If the flag is set, just ignore these specific errors
|
# If the flag is set, just ignore these specific errors
|
||||||
|
|
@ -1415,9 +1397,13 @@ class ClientManager:
|
||||||
),
|
),
|
||||||
# Server settings for Supabase
|
# Server settings for Supabase
|
||||||
"server_settings": os.environ.get(
|
"server_settings": os.environ.get(
|
||||||
"POSTGRES_SERVER_OPTIONS",
|
"POSTGRES_SERVER_SETTINGS",
|
||||||
config.get("postgres", "server_options", fallback=None),
|
config.get("postgres", "server_options", fallback=None),
|
||||||
),
|
),
|
||||||
|
"statement_cache_size": os.environ.get(
|
||||||
|
"POSTGRES_STATEMENT_CACHE_SIZE",
|
||||||
|
config.get("postgres", "statement_cache_size", fallback=None),
|
||||||
|
),
|
||||||
}
|
}
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
|
@ -1471,9 +1457,6 @@ class PGKVStorage(BaseKVStorage):
|
||||||
# Use "default" for compatibility (lowest priority)
|
# Use "default" for compatibility (lowest priority)
|
||||||
self.workspace = "default"
|
self.workspace = "default"
|
||||||
|
|
||||||
# Apply multi-tenant isolation
|
|
||||||
self.workspace = self._get_composite_workspace()
|
|
||||||
|
|
||||||
async def finalize(self):
|
async def finalize(self):
|
||||||
async with get_storage_lock():
|
async with get_storage_lock():
|
||||||
if self.db is not None:
|
if self.db is not None:
|
||||||
|
|
@ -1524,7 +1507,6 @@ class PGKVStorage(BaseKVStorage):
|
||||||
if is_namespace(self.namespace, NameSpace.KV_STORE_TEXT_CHUNKS):
|
if is_namespace(self.namespace, NameSpace.KV_STORE_TEXT_CHUNKS):
|
||||||
processed_results = {}
|
processed_results = {}
|
||||||
for row in results:
|
for row in results:
|
||||||
row = dict(row)
|
|
||||||
llm_cache_list = row.get("llm_cache_list", [])
|
llm_cache_list = row.get("llm_cache_list", [])
|
||||||
if isinstance(llm_cache_list, str):
|
if isinstance(llm_cache_list, str):
|
||||||
try:
|
try:
|
||||||
|
|
@ -1545,7 +1527,6 @@ class PGKVStorage(BaseKVStorage):
|
||||||
if is_namespace(self.namespace, NameSpace.KV_STORE_FULL_ENTITIES):
|
if is_namespace(self.namespace, NameSpace.KV_STORE_FULL_ENTITIES):
|
||||||
processed_results = {}
|
processed_results = {}
|
||||||
for row in results:
|
for row in results:
|
||||||
row = dict(row)
|
|
||||||
entity_names = row.get("entity_names", [])
|
entity_names = row.get("entity_names", [])
|
||||||
if isinstance(entity_names, str):
|
if isinstance(entity_names, str):
|
||||||
try:
|
try:
|
||||||
|
|
@ -1566,7 +1547,6 @@ class PGKVStorage(BaseKVStorage):
|
||||||
if is_namespace(self.namespace, NameSpace.KV_STORE_FULL_RELATIONS):
|
if is_namespace(self.namespace, NameSpace.KV_STORE_FULL_RELATIONS):
|
||||||
processed_results = {}
|
processed_results = {}
|
||||||
for row in results:
|
for row in results:
|
||||||
row = dict(row)
|
|
||||||
relation_pairs = row.get("relation_pairs", [])
|
relation_pairs = row.get("relation_pairs", [])
|
||||||
if isinstance(relation_pairs, str):
|
if isinstance(relation_pairs, str):
|
||||||
try:
|
try:
|
||||||
|
|
@ -1596,9 +1576,6 @@ class PGKVStorage(BaseKVStorage):
|
||||||
sql = SQL_TEMPLATES["get_by_id_" + self.namespace]
|
sql = SQL_TEMPLATES["get_by_id_" + self.namespace]
|
||||||
params = {"workspace": self.workspace, "id": id}
|
params = {"workspace": self.workspace, "id": id}
|
||||||
response = await self.db.query(sql, list(params.values()))
|
response = await self.db.query(sql, list(params.values()))
|
||||||
|
|
||||||
if response:
|
|
||||||
response = dict(response)
|
|
||||||
|
|
||||||
if response and is_namespace(self.namespace, NameSpace.KV_STORE_TEXT_CHUNKS):
|
if response and is_namespace(self.namespace, NameSpace.KV_STORE_TEXT_CHUNKS):
|
||||||
# Parse llm_cache_list JSON string back to list
|
# Parse llm_cache_list JSON string back to list
|
||||||
|
|
@ -1679,9 +1656,6 @@ class PGKVStorage(BaseKVStorage):
|
||||||
)
|
)
|
||||||
params = {"workspace": self.workspace}
|
params = {"workspace": self.workspace}
|
||||||
results = await self.db.query(sql, list(params.values()), multirows=True)
|
results = await self.db.query(sql, list(params.values()), multirows=True)
|
||||||
|
|
||||||
if results:
|
|
||||||
results = [dict(r) for r in results]
|
|
||||||
|
|
||||||
if results and is_namespace(self.namespace, NameSpace.KV_STORE_TEXT_CHUNKS):
|
if results and is_namespace(self.namespace, NameSpace.KV_STORE_TEXT_CHUNKS):
|
||||||
# Parse llm_cache_list JSON string back to list for each result
|
# Parse llm_cache_list JSON string back to list for each result
|
||||||
|
|
@ -1862,28 +1836,6 @@ class PGKVStorage(BaseKVStorage):
|
||||||
"update_time": current_time,
|
"update_time": current_time,
|
||||||
}
|
}
|
||||||
await self.db.execute(upsert_sql, _data)
|
await self.db.execute(upsert_sql, _data)
|
||||||
elif is_namespace(self.namespace, NameSpace.KV_STORE_TENANTS):
|
|
||||||
for k, v in data.items():
|
|
||||||
upsert_sql = SQL_TEMPLATES["upsert_tenants"]
|
|
||||||
_data = {
|
|
||||||
"workspace": self.workspace,
|
|
||||||
"id": k,
|
|
||||||
"data": json.dumps(v),
|
|
||||||
"create_time": v.get("create_time"),
|
|
||||||
"update_time": v.get("update_time"),
|
|
||||||
}
|
|
||||||
await self.db.execute(upsert_sql, _data)
|
|
||||||
elif is_namespace(self.namespace, NameSpace.KV_STORE_KNOWLEDGE_BASES):
|
|
||||||
for k, v in data.items():
|
|
||||||
upsert_sql = SQL_TEMPLATES["upsert_knowledge_bases"]
|
|
||||||
_data = {
|
|
||||||
"workspace": self.workspace,
|
|
||||||
"id": k,
|
|
||||||
"data": json.dumps(v),
|
|
||||||
"create_time": v.get("create_time"),
|
|
||||||
"update_time": v.get("update_time"),
|
|
||||||
}
|
|
||||||
await self.db.execute(upsert_sql, _data)
|
|
||||||
|
|
||||||
async def index_done_callback(self) -> None:
|
async def index_done_callback(self) -> None:
|
||||||
# PG handles persistence automatically
|
# PG handles persistence automatically
|
||||||
|
|
@ -1971,9 +1923,6 @@ class PGVectorStorage(BaseVectorStorage):
|
||||||
# Use "default" for compatibility (lowest priority)
|
# Use "default" for compatibility (lowest priority)
|
||||||
self.workspace = "default"
|
self.workspace = "default"
|
||||||
|
|
||||||
# Apply multi-tenant isolation
|
|
||||||
self.workspace = self._get_composite_workspace()
|
|
||||||
|
|
||||||
async def finalize(self):
|
async def finalize(self):
|
||||||
async with get_storage_lock():
|
async with get_storage_lock():
|
||||||
if self.db is not None:
|
if self.db is not None:
|
||||||
|
|
@ -2521,62 +2470,6 @@ class PGDocStatusStorage(DocStatusStorage):
|
||||||
track_id=result[0].get("track_id"),
|
track_id=result[0].get("track_id"),
|
||||||
)
|
)
|
||||||
|
|
||||||
async def get_doc_by_external_id(self, external_id: str) -> Union[dict[str, Any], None]:
|
|
||||||
"""Get document by external_id for idempotency support.
|
|
||||||
|
|
||||||
Uses indexed lookup on metadata->>'external_id' for efficient retrieval.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
external_id: The external unique identifier to search for
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Union[dict[str, Any], None]: Document data if found, None otherwise
|
|
||||||
"""
|
|
||||||
sql = """
|
|
||||||
SELECT * FROM LIGHTRAG_DOC_STATUS
|
|
||||||
WHERE workspace=$1 AND metadata->>'external_id' = $2
|
|
||||||
"""
|
|
||||||
params = {"workspace": self.workspace, "external_id": external_id}
|
|
||||||
result = await self.db.query(sql, list(params.values()), True)
|
|
||||||
|
|
||||||
if result is None or result == []:
|
|
||||||
return None
|
|
||||||
else:
|
|
||||||
# Parse chunks_list JSON string back to list
|
|
||||||
chunks_list = result[0].get("chunks_list", [])
|
|
||||||
if isinstance(chunks_list, str):
|
|
||||||
try:
|
|
||||||
chunks_list = json.loads(chunks_list)
|
|
||||||
except json.JSONDecodeError:
|
|
||||||
chunks_list = []
|
|
||||||
|
|
||||||
# Parse metadata JSON string back to dict
|
|
||||||
metadata = result[0].get("metadata", {})
|
|
||||||
if isinstance(metadata, str):
|
|
||||||
try:
|
|
||||||
metadata = json.loads(metadata)
|
|
||||||
except json.JSONDecodeError:
|
|
||||||
metadata = {}
|
|
||||||
|
|
||||||
# Convert datetime objects to ISO format strings with timezone info
|
|
||||||
created_at = self._format_datetime_with_timezone(result[0]["created_at"])
|
|
||||||
updated_at = self._format_datetime_with_timezone(result[0]["updated_at"])
|
|
||||||
|
|
||||||
return dict(
|
|
||||||
id=result[0]["id"],
|
|
||||||
content_length=result[0]["content_length"],
|
|
||||||
content_summary=result[0]["content_summary"],
|
|
||||||
status=result[0]["status"],
|
|
||||||
chunks_count=result[0]["chunks_count"],
|
|
||||||
created_at=created_at,
|
|
||||||
updated_at=updated_at,
|
|
||||||
file_path=result[0]["file_path"],
|
|
||||||
chunks_list=chunks_list,
|
|
||||||
metadata=metadata,
|
|
||||||
error_msg=result[0].get("error_msg"),
|
|
||||||
track_id=result[0].get("track_id"),
|
|
||||||
)
|
|
||||||
|
|
||||||
async def get_status_counts(self) -> dict[str, int]:
|
async def get_status_counts(self) -> dict[str, int]:
|
||||||
"""Get counts of documents in each status"""
|
"""Get counts of documents in each status"""
|
||||||
sql = """SELECT status as "status", COUNT(1) as "count"
|
sql = """SELECT status as "status", COUNT(1) as "count"
|
||||||
|
|
@ -2700,7 +2593,7 @@ class PGDocStatusStorage(DocStatusStorage):
|
||||||
|
|
||||||
async def get_docs_paginated(
|
async def get_docs_paginated(
|
||||||
self,
|
self,
|
||||||
status_filter: DocStatus | None,
|
status_filter: DocStatus | None = None,
|
||||||
page: int = 1,
|
page: int = 1,
|
||||||
page_size: int = 50,
|
page_size: int = 50,
|
||||||
sort_field: str = "updated_at",
|
sort_field: str = "updated_at",
|
||||||
|
|
@ -3013,19 +2906,7 @@ class PGGraphStorage(BaseGraphStorage):
|
||||||
# Ensure names comply with PostgreSQL identifier specifications
|
# Ensure names comply with PostgreSQL identifier specifications
|
||||||
safe_workspace = re.sub(r"[^a-zA-Z0-9_]", "_", workspace.strip())
|
safe_workspace = re.sub(r"[^a-zA-Z0-9_]", "_", workspace.strip())
|
||||||
safe_namespace = re.sub(r"[^a-zA-Z0-9_]", "_", namespace)
|
safe_namespace = re.sub(r"[^a-zA-Z0-9_]", "_", namespace)
|
||||||
graph_name = f"{safe_workspace}_{safe_namespace}"
|
return f"{safe_workspace}_{safe_namespace}"
|
||||||
|
|
||||||
# Ensure graph name starts with a letter (AGE requirement)
|
|
||||||
if not graph_name[0].isalpha():
|
|
||||||
graph_name = f"g_{graph_name}"
|
|
||||||
|
|
||||||
# PostgreSQL identifier limit is 63 bytes
|
|
||||||
if len(graph_name) > 63:
|
|
||||||
# Use MD5 hash to ensure uniqueness and fit within limit
|
|
||||||
hash_object = hashlib.md5(graph_name.encode())
|
|
||||||
graph_name = f"g_{hash_object.hexdigest()}"
|
|
||||||
|
|
||||||
return graph_name
|
|
||||||
else:
|
else:
|
||||||
# When the workspace is "default", use the namespace directly (for backward compatibility with legacy implementations)
|
# When the workspace is "default", use the namespace directly (for backward compatibility with legacy implementations)
|
||||||
return re.sub(r"[^a-zA-Z0-9_]", "_", namespace)
|
return re.sub(r"[^a-zA-Z0-9_]", "_", namespace)
|
||||||
|
|
@ -3108,35 +2989,6 @@ class PGGraphStorage(BaseGraphStorage):
|
||||||
graph_name=self.graph_name,
|
graph_name=self.graph_name,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Verify that essential labels exist by checking the ag_label catalog
|
|
||||||
# This helps catch cases where label creation silently failed
|
|
||||||
try:
|
|
||||||
async with self.db.pool.acquire() as connection:
|
|
||||||
await connection.execute("LOAD 'age'") # Required for AGE functions
|
|
||||||
await connection.execute('SET search_path = ag_catalog, "$user", public')
|
|
||||||
# Check if 'base' label exists for this graph
|
|
||||||
result = await connection.fetchrow(
|
|
||||||
"""
|
|
||||||
SELECT l.name
|
|
||||||
FROM ag_catalog.ag_label l
|
|
||||||
JOIN ag_catalog.ag_graph g ON l.graph = g.graphid
|
|
||||||
WHERE l.name = 'base' AND g.name = $1
|
|
||||||
""",
|
|
||||||
self.graph_name
|
|
||||||
)
|
|
||||||
if result is None:
|
|
||||||
logger.warning(
|
|
||||||
f"[{self.workspace}] 'base' vlabel not found for graph '{self.graph_name}', attempting to create..."
|
|
||||||
)
|
|
||||||
# Retry creating the vlabel
|
|
||||||
await connection.execute(
|
|
||||||
f"SELECT create_vlabel('{self.graph_name}', 'base')"
|
|
||||||
)
|
|
||||||
logger.info(f"[{self.workspace}] Successfully created 'base' vlabel for graph '{self.graph_name}'")
|
|
||||||
except Exception as e:
|
|
||||||
if "already exists" not in str(e):
|
|
||||||
logger.error(f"[{self.workspace}] Failed to verify/create 'base' vlabel: {e}")
|
|
||||||
|
|
||||||
async def finalize(self):
|
async def finalize(self):
|
||||||
async with get_graph_db_lock():
|
async with get_graph_db_lock():
|
||||||
if self.db is not None:
|
if self.db is not None:
|
||||||
|
|
@ -3303,7 +3155,6 @@ class PGGraphStorage(BaseGraphStorage):
|
||||||
Returns:
|
Returns:
|
||||||
list[dict[str, Any]]: a list of dictionaries containing the result set
|
list[dict[str, Any]]: a list of dictionaries containing the result set
|
||||||
"""
|
"""
|
||||||
logger.info(f"[{self.workspace}] Executing query: {query}")
|
|
||||||
try:
|
try:
|
||||||
if readonly:
|
if readonly:
|
||||||
data = await self.db.query(
|
data = await self.db.query(
|
||||||
|
|
@ -3494,6 +3345,7 @@ class PGGraphStorage(BaseGraphStorage):
|
||||||
raise
|
raise
|
||||||
|
|
||||||
@retry(
|
@retry(
|
||||||
|
stop=stop_after_attempt(3),
|
||||||
wait=wait_exponential(multiplier=1, min=4, max=10),
|
wait=wait_exponential(multiplier=1, min=4, max=10),
|
||||||
retry=retry_if_exception_type((PGGraphQueryException,)),
|
retry=retry_if_exception_type((PGGraphQueryException,)),
|
||||||
)
|
)
|
||||||
|
|
@ -3749,10 +3601,8 @@ class PGGraphStorage(BaseGraphStorage):
|
||||||
node_id = row["node_id"]
|
node_id = row["node_id"]
|
||||||
if not node_id:
|
if not node_id:
|
||||||
continue
|
continue
|
||||||
out_degree = out_degrees.get(node_id, 0)
|
out_degrees[node_id] = int(row.get("out_degree", 0) or 0)
|
||||||
in_degree = in_degrees.get(node_id, 0)
|
in_degrees[node_id] = int(row.get("in_degree", 0) or 0)
|
||||||
out_degrees[node_id] = out_degree + int(row.get("out_degree", 0) or 0)
|
|
||||||
in_degrees[node_id] = in_degree + int(row.get("in_degree", 0) or 0)
|
|
||||||
|
|
||||||
degrees_dict = {}
|
degrees_dict = {}
|
||||||
for node_id in node_ids:
|
for node_id in node_ids:
|
||||||
|
|
@ -3856,13 +3706,15 @@ class PGGraphStorage(BaseGraphStorage):
|
||||||
SELECT * FROM cypher({dollar_quote(self.graph_name)}::name,
|
SELECT * FROM cypher({dollar_quote(self.graph_name)}::name,
|
||||||
{dollar_quote(forward_cypher)}::cstring,
|
{dollar_quote(forward_cypher)}::cstring,
|
||||||
$1::agtype)
|
$1::agtype)
|
||||||
AS (source text, target text, edge_properties agtype)"""
|
AS (source text, target text, edge_properties agtype)
|
||||||
|
"""
|
||||||
|
|
||||||
sql_bwd = f"""
|
sql_bwd = f"""
|
||||||
SELECT * FROM cypher({dollar_quote(self.graph_name)}::name,
|
SELECT * FROM cypher({dollar_quote(self.graph_name)}::name,
|
||||||
{dollar_quote(backward_cypher)}::cstring,
|
{dollar_quote(backward_cypher)}::cstring,
|
||||||
$1::agtype)
|
$1::agtype)
|
||||||
AS (source text, target text, edge_properties agtype)"""
|
AS (source text, target text, edge_properties agtype)
|
||||||
|
"""
|
||||||
|
|
||||||
pg_params = {"params": json.dumps({"pairs": pairs}, ensure_ascii=False)}
|
pg_params = {"params": json.dumps({"pairs": pairs}, ensure_ascii=False)}
|
||||||
|
|
||||||
|
|
@ -4034,8 +3886,7 @@ class PGGraphStorage(BaseGraphStorage):
|
||||||
f"[{self.workspace}] Failed to parse node string in batch: {node_dict}"
|
f"[{self.workspace}] Failed to parse node string in batch: {node_dict}"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Add node id (entity_id) to the dictionary for easier access
|
node_dict["id"] = node_dict["entity_id"]
|
||||||
node_dict["id"] = node_dict.get("entity_id")
|
|
||||||
nodes.append(node_dict)
|
nodes.append(node_dict)
|
||||||
|
|
||||||
return nodes
|
return nodes
|
||||||
|
|
@ -4069,7 +3920,6 @@ class PGGraphStorage(BaseGraphStorage):
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"[{self.workspace}] Failed to parse edge string in batch: {edge_agtype}"
|
f"[{self.workspace}] Failed to parse edge string in batch: {edge_agtype}"
|
||||||
)
|
)
|
||||||
edge_agtype = {}
|
|
||||||
|
|
||||||
source_agtype = item["source"]["properties"]
|
source_agtype = item["source"]["properties"]
|
||||||
# Process string result, parse it to JSON dictionary
|
# Process string result, parse it to JSON dictionary
|
||||||
|
|
@ -4374,10 +4224,6 @@ class PGGraphStorage(BaseGraphStorage):
|
||||||
$$) AS (a AGTYPE, r AGTYPE, b AGTYPE)"""
|
$$) AS (a AGTYPE, r AGTYPE, b AGTYPE)"""
|
||||||
results = await self._query(query)
|
results = await self._query(query)
|
||||||
|
|
||||||
logger.info(f"[{self.workspace}] Query results count: {len(results)}")
|
|
||||||
if results:
|
|
||||||
logger.info(f"[{self.workspace}] First result sample: {results[0]}")
|
|
||||||
|
|
||||||
# Process query results, deduplicate nodes and edges
|
# Process query results, deduplicate nodes and edges
|
||||||
nodes_dict = {}
|
nodes_dict = {}
|
||||||
edges_dict = {}
|
edges_dict = {}
|
||||||
|
|
@ -4519,13 +4365,13 @@ class PGGraphStorage(BaseGraphStorage):
|
||||||
GROUP BY node_id
|
GROUP BY node_id
|
||||||
)
|
)
|
||||||
SELECT
|
SELECT
|
||||||
(ag_catalog.agtype_access_operator(VARIADIC ARRAY[properties, '"entity_id"'::agtype]))::text AS label
|
(ag_catalog.agtype_access_operator(VARIADIC ARRAY[v.properties, '"entity_id"'::agtype]))::text AS label
|
||||||
FROM
|
FROM
|
||||||
node_degrees d
|
node_degrees d
|
||||||
JOIN
|
JOIN
|
||||||
{self.graph_name}._ag_label_vertex v ON d.node_id = v.id
|
{self.graph_name}._ag_label_vertex v ON d.node_id = v.id
|
||||||
WHERE
|
WHERE
|
||||||
ag_catalog.agtype_access_operator(VARIADIC ARRAY[properties, '"entity_id"'::agtype]) IS NOT NULL
|
ag_catalog.agtype_access_operator(VARIADIC ARRAY[v.properties, '"entity_id"'::agtype]) IS NOT NULL
|
||||||
ORDER BY
|
ORDER BY
|
||||||
d.degree DESC,
|
d.degree DESC,
|
||||||
label ASC
|
label ASC
|
||||||
|
|
@ -4615,7 +4461,7 @@ class PGGraphStorage(BaseGraphStorage):
|
||||||
drop_query = f"""SELECT * FROM cypher('{self.graph_name}', $$
|
drop_query = f"""SELECT * FROM cypher('{self.graph_name}', $$
|
||||||
MATCH (n)
|
MATCH (n)
|
||||||
DETACH DELETE n
|
DETACH DELETE n
|
||||||
$$) AS (n agtype)"""
|
$$) AS (result agtype)"""
|
||||||
|
|
||||||
await self._query(drop_query, readonly=False)
|
await self._query(drop_query, readonly=False)
|
||||||
return {
|
return {
|
||||||
|
|
@ -4639,8 +4485,6 @@ NAMESPACE_TABLE_MAP = {
|
||||||
NameSpace.VECTOR_STORE_ENTITIES: "LIGHTRAG_VDB_ENTITY",
|
NameSpace.VECTOR_STORE_ENTITIES: "LIGHTRAG_VDB_ENTITY",
|
||||||
NameSpace.VECTOR_STORE_RELATIONSHIPS: "LIGHTRAG_VDB_RELATION",
|
NameSpace.VECTOR_STORE_RELATIONSHIPS: "LIGHTRAG_VDB_RELATION",
|
||||||
NameSpace.DOC_STATUS: "LIGHTRAG_DOC_STATUS",
|
NameSpace.DOC_STATUS: "LIGHTRAG_DOC_STATUS",
|
||||||
NameSpace.KV_STORE_TENANTS: "LIGHTRAG_TENANTS",
|
|
||||||
NameSpace.KV_STORE_KNOWLEDGE_BASES: "LIGHTRAG_KNOWLEDGE_BASES",
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -4651,26 +4495,6 @@ def namespace_to_table_name(namespace: str) -> str:
|
||||||
|
|
||||||
|
|
||||||
TABLES = {
|
TABLES = {
|
||||||
"LIGHTRAG_TENANTS": {
|
|
||||||
"ddl": """CREATE TABLE LIGHTRAG_TENANTS (
|
|
||||||
id VARCHAR(255),
|
|
||||||
workspace VARCHAR(255),
|
|
||||||
data JSONB,
|
|
||||||
create_time TIMESTAMP(0) DEFAULT CURRENT_TIMESTAMP,
|
|
||||||
update_time TIMESTAMP(0) DEFAULT CURRENT_TIMESTAMP,
|
|
||||||
CONSTRAINT LIGHTRAG_TENANTS_PK PRIMARY KEY (workspace, id)
|
|
||||||
)"""
|
|
||||||
},
|
|
||||||
"LIGHTRAG_KNOWLEDGE_BASES": {
|
|
||||||
"ddl": """CREATE TABLE LIGHTRAG_KNOWLEDGE_BASES (
|
|
||||||
id VARCHAR(255),
|
|
||||||
workspace VARCHAR(255),
|
|
||||||
data JSONB,
|
|
||||||
create_time TIMESTAMP(0) DEFAULT CURRENT_TIMESTAMP,
|
|
||||||
update_time TIMESTAMP(0) DEFAULT CURRENT_TIMESTAMP,
|
|
||||||
CONSTRAINT LIGHTRAG_KNOWLEDGE_BASES_PK PRIMARY KEY (workspace, id)
|
|
||||||
)"""
|
|
||||||
},
|
|
||||||
"LIGHTRAG_DOC_FULL": {
|
"LIGHTRAG_DOC_FULL": {
|
||||||
"ddl": """CREATE TABLE LIGHTRAG_DOC_FULL (
|
"ddl": """CREATE TABLE LIGHTRAG_DOC_FULL (
|
||||||
id VARCHAR(255),
|
id VARCHAR(255),
|
||||||
|
|
@ -4853,10 +4677,6 @@ SQL_TEMPLATES = {
|
||||||
EXTRACT(EPOCH FROM update_time)::BIGINT as update_time
|
EXTRACT(EPOCH FROM update_time)::BIGINT as update_time
|
||||||
FROM LIGHTRAG_FULL_RELATIONS WHERE workspace=$1 AND id IN ({ids})
|
FROM LIGHTRAG_FULL_RELATIONS WHERE workspace=$1 AND id IN ({ids})
|
||||||
""",
|
""",
|
||||||
"get_by_id_tenants": """SELECT data FROM LIGHTRAG_TENANTS WHERE workspace=$1 AND id=$2""",
|
|
||||||
"get_by_id_knowledge_bases": """SELECT data FROM LIGHTRAG_KNOWLEDGE_BASES WHERE workspace=$1 AND id=$2""",
|
|
||||||
"get_by_ids_tenants": """SELECT data FROM LIGHTRAG_TENANTS WHERE workspace=$1 AND id IN ({ids})""",
|
|
||||||
"get_by_ids_knowledge_bases": """SELECT data FROM LIGHTRAG_KNOWLEDGE_BASES WHERE workspace=$1 AND id IN ({ids})""",
|
|
||||||
"filter_keys": "SELECT id FROM {table_name} WHERE workspace=$1 AND id IN ({ids})",
|
"filter_keys": "SELECT id FROM {table_name} WHERE workspace=$1 AND id IN ({ids})",
|
||||||
"upsert_doc_full": """INSERT INTO LIGHTRAG_DOC_FULL (id, content, doc_name, workspace)
|
"upsert_doc_full": """INSERT INTO LIGHTRAG_DOC_FULL (id, content, doc_name, workspace)
|
||||||
VALUES ($1, $2, $3, $4)
|
VALUES ($1, $2, $3, $4)
|
||||||
|
|
@ -4904,18 +4724,6 @@ SQL_TEMPLATES = {
|
||||||
count=EXCLUDED.count,
|
count=EXCLUDED.count,
|
||||||
update_time = EXCLUDED.update_time
|
update_time = EXCLUDED.update_time
|
||||||
""",
|
""",
|
||||||
"upsert_tenants": """INSERT INTO LIGHTRAG_TENANTS (workspace, id, data, create_time, update_time)
|
|
||||||
VALUES ($1, $2, $3, $4, $5)
|
|
||||||
ON CONFLICT (workspace,id) DO UPDATE
|
|
||||||
SET data=EXCLUDED.data,
|
|
||||||
update_time = EXCLUDED.update_time
|
|
||||||
""",
|
|
||||||
"upsert_knowledge_bases": """INSERT INTO LIGHTRAG_KNOWLEDGE_BASES (workspace, id, data, create_time, update_time)
|
|
||||||
VALUES ($1, $2, $3, $4, $5)
|
|
||||||
ON CONFLICT (workspace,id) DO UPDATE
|
|
||||||
SET data=EXCLUDED.data,
|
|
||||||
update_time = EXCLUDED.update_time
|
|
||||||
""",
|
|
||||||
# SQL for VectorStorage
|
# SQL for VectorStorage
|
||||||
"upsert_chunk": """INSERT INTO LIGHTRAG_VDB_CHUNKS (workspace, id, tokens,
|
"upsert_chunk": """INSERT INTO LIGHTRAG_VDB_CHUNKS (workspace, id, tokens,
|
||||||
chunk_order_index, full_doc_id, content, content_vector, file_path,
|
chunk_order_index, full_doc_id, content, content_vector, file_path,
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue