Merge pull request #1743 from danielaskdd/add-workspace-env
Feat: Implement data isolation for all storage types via WORKSPACE environment variable
This commit is contained in:
commit
d7d59ed973
31 changed files with 672 additions and 206 deletions
53
env.example
53
env.example
|
|
@ -111,25 +111,37 @@ EMBEDDING_BINDING_HOST=http://localhost:11434
|
||||||
###########################
|
###########################
|
||||||
### Data storage selection
|
### Data storage selection
|
||||||
###########################
|
###########################
|
||||||
|
### In-memory database with data persistence to local files
|
||||||
|
# LIGHTRAG_KV_STORAGE=JsonKVStorage
|
||||||
|
# LIGHTRAG_DOC_STATUS_STORAGE=JsonDocStatusStorage
|
||||||
|
# LIGHTRAG_GRAPH_STORAGE=NetworkXStorage
|
||||||
|
# LIGHTRAG_VECTOR_STORAGE=NanoVectorDBStorage
|
||||||
|
# LIGHTRAG_VECTOR_STORAGE=FaissVectorDBStorage
|
||||||
### PostgreSQL
|
### PostgreSQL
|
||||||
# LIGHTRAG_KV_STORAGE=PGKVStorage
|
# LIGHTRAG_KV_STORAGE=PGKVStorage
|
||||||
# LIGHTRAG_DOC_STATUS_STORAGE=PGDocStatusStorage
|
# LIGHTRAG_DOC_STATUS_STORAGE=PGDocStatusStorage
|
||||||
# LIGHTRAG_VECTOR_STORAGE=PGVectorStorage
|
|
||||||
# LIGHTRAG_GRAPH_STORAGE=PGGraphStorage
|
# LIGHTRAG_GRAPH_STORAGE=PGGraphStorage
|
||||||
### MongoDB
|
# LIGHTRAG_VECTOR_STORAGE=PGVectorStorage
|
||||||
|
### MongoDB (recommended for production deploy)
|
||||||
# LIGHTRAG_KV_STORAGE=MongoKVStorage
|
# LIGHTRAG_KV_STORAGE=MongoKVStorage
|
||||||
# LIGHTRAG_DOC_STATUS_STORAGE=MongoDocStatusStorage
|
# LIGHTRAG_DOC_STATUS_STORAGE=MongoDocStatusStorage
|
||||||
# LIGHTRAG_VECTOR_STORAGE=MongoVectorDBStorage
|
|
||||||
# LIGHTRAG_GRAPH_STORAGE=MongoGraphStorage
|
# LIGHTRAG_GRAPH_STORAGE=MongoGraphStorage
|
||||||
### KV Storage
|
# LIGHTRAG_VECTOR_STORAGE=MongoVectorDBStorage
|
||||||
|
### Redis Storage (recommended for production deploy)
|
||||||
# LIGHTRAG_KV_STORAGE=RedisKVStorage
|
# LIGHTRAG_KV_STORAGE=RedisKVStorage
|
||||||
# LIGHTRAG_DOC_STATUS_STORAGE=RedisDocStatusStorage
|
# LIGHTRAG_DOC_STATUS_STORAGE=RedisDocStatusStorage
|
||||||
### Vector Storage
|
### Vector Storage (recommended for production deploy)
|
||||||
# LIGHTRAG_VECTOR_STORAGE=FaissVectorDBStorage
|
|
||||||
# LIGHTRAG_VECTOR_STORAGE=MilvusVectorDBStorage
|
# LIGHTRAG_VECTOR_STORAGE=MilvusVectorDBStorage
|
||||||
### Graph Storage
|
# LIGHTRAG_VECTOR_STORAGE=QdrantVectorDBStorage
|
||||||
|
### Graph Storage (recommended for production deploy)
|
||||||
# LIGHTRAG_GRAPH_STORAGE=Neo4JStorage
|
# LIGHTRAG_GRAPH_STORAGE=Neo4JStorage
|
||||||
# LIGHTRAG_GRAPH_STORAGE=MemgraphStorage
|
|
||||||
|
####################################################################
|
||||||
|
### Default workspace for all storage types
|
||||||
|
### For the purpose of isolation of data for each LightRAG instance
|
||||||
|
### Valid characters: a-z, A-Z, 0-9, and _
|
||||||
|
####################################################################
|
||||||
|
# WORKSPACE=doc—
|
||||||
|
|
||||||
### PostgreSQL Configuration
|
### PostgreSQL Configuration
|
||||||
POSTGRES_HOST=localhost
|
POSTGRES_HOST=localhost
|
||||||
|
|
@ -138,31 +150,19 @@ POSTGRES_USER=your_username
|
||||||
POSTGRES_PASSWORD='your_password'
|
POSTGRES_PASSWORD='your_password'
|
||||||
POSTGRES_DATABASE=your_database
|
POSTGRES_DATABASE=your_database
|
||||||
POSTGRES_MAX_CONNECTIONS=12
|
POSTGRES_MAX_CONNECTIONS=12
|
||||||
### separating all data from difference Lightrag instances
|
# POSTGRES_WORKSPACE=forced_workspace_name
|
||||||
# POSTGRES_WORKSPACE=default
|
|
||||||
|
|
||||||
### Neo4j Configuration
|
### Neo4j Configuration
|
||||||
NEO4J_URI=neo4j+s://xxxxxxxx.databases.neo4j.io
|
NEO4J_URI=neo4j+s://xxxxxxxx.databases.neo4j.io
|
||||||
NEO4J_USERNAME=neo4j
|
NEO4J_USERNAME=neo4j
|
||||||
NEO4J_PASSWORD='your_password'
|
NEO4J_PASSWORD='your_password'
|
||||||
|
# NEO4J_WORKSPACE=forced_workspace_name
|
||||||
### Independent AGM Configuration(not for AMG embedded in PostreSQL)
|
|
||||||
# AGE_POSTGRES_DB=
|
|
||||||
# AGE_POSTGRES_USER=
|
|
||||||
# AGE_POSTGRES_PASSWORD=
|
|
||||||
# AGE_POSTGRES_HOST=
|
|
||||||
# AGE_POSTGRES_PORT=8529
|
|
||||||
|
|
||||||
# AGE Graph Name(apply to PostgreSQL and independent AGM)
|
|
||||||
### AGE_GRAPH_NAME is deprecated
|
|
||||||
# AGE_GRAPH_NAME=lightrag
|
|
||||||
|
|
||||||
### MongoDB Configuration
|
### MongoDB Configuration
|
||||||
MONGO_URI=mongodb://root:root@localhost:27017/
|
MONGO_URI=mongodb://root:root@localhost:27017/
|
||||||
|
#MONGO_URI=mongodb+srv://root:rooot@cluster0.xxxx.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0
|
||||||
MONGO_DATABASE=LightRAG
|
MONGO_DATABASE=LightRAG
|
||||||
### separating all data from difference Lightrag instances(deprecating)
|
# MONGODB_WORKSPACE=forced_workspace_name
|
||||||
### separating all data from difference Lightrag instances
|
|
||||||
# MONGODB_WORKSPACE=default
|
|
||||||
|
|
||||||
### Milvus Configuration
|
### Milvus Configuration
|
||||||
MILVUS_URI=http://localhost:19530
|
MILVUS_URI=http://localhost:19530
|
||||||
|
|
@ -170,10 +170,13 @@ MILVUS_DB_NAME=lightrag
|
||||||
# MILVUS_USER=root
|
# MILVUS_USER=root
|
||||||
# MILVUS_PASSWORD=your_password
|
# MILVUS_PASSWORD=your_password
|
||||||
# MILVUS_TOKEN=your_token
|
# MILVUS_TOKEN=your_token
|
||||||
|
# MILVUS_WORKSPACE=forced_workspace_name
|
||||||
|
|
||||||
### Qdrant
|
### Qdrant
|
||||||
QDRANT_URL=http://localhost:16333
|
QDRANT_URL=http://localhost:6333
|
||||||
# QDRANT_API_KEY=your-api-key
|
# QDRANT_API_KEY=your-api-key
|
||||||
|
# QDRANT_WORKSPACE=forced_workspace_name
|
||||||
|
|
||||||
### Redis
|
### Redis
|
||||||
REDIS_URI=redis://localhost:6379
|
REDIS_URI=redis://localhost:6379
|
||||||
|
# REDIS_WORKSPACE=forced_workspace_name
|
||||||
|
|
|
||||||
|
|
@ -1 +1 @@
|
||||||
__api_version__ = "0178"
|
__api_version__ = "0179"
|
||||||
|
|
|
||||||
|
|
@ -184,10 +184,10 @@ def parse_args() -> argparse.Namespace:
|
||||||
|
|
||||||
# Namespace
|
# Namespace
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--namespace-prefix",
|
"--workspace",
|
||||||
type=str,
|
type=str,
|
||||||
default=get_env_value("NAMESPACE_PREFIX", ""),
|
default=get_env_value("WORKSPACE", ""),
|
||||||
help="Prefix of the namespace",
|
help="Default workspace for all storage",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
|
|
|
||||||
|
|
@ -112,8 +112,8 @@ def create_app(args):
|
||||||
# Check if API key is provided either through env var or args
|
# Check if API key is provided either through env var or args
|
||||||
api_key = os.getenv("LIGHTRAG_API_KEY") or args.key
|
api_key = os.getenv("LIGHTRAG_API_KEY") or args.key
|
||||||
|
|
||||||
# Initialize document manager
|
# Initialize document manager with workspace support for data isolation
|
||||||
doc_manager = DocumentManager(args.input_dir)
|
doc_manager = DocumentManager(args.input_dir, workspace=args.workspace)
|
||||||
|
|
||||||
@asynccontextmanager
|
@asynccontextmanager
|
||||||
async def lifespan(app: FastAPI):
|
async def lifespan(app: FastAPI):
|
||||||
|
|
@ -295,6 +295,7 @@ def create_app(args):
|
||||||
if args.llm_binding in ["lollms", "ollama", "openai"]:
|
if args.llm_binding in ["lollms", "ollama", "openai"]:
|
||||||
rag = LightRAG(
|
rag = LightRAG(
|
||||||
working_dir=args.working_dir,
|
working_dir=args.working_dir,
|
||||||
|
workspace=args.workspace,
|
||||||
llm_model_func=lollms_model_complete
|
llm_model_func=lollms_model_complete
|
||||||
if args.llm_binding == "lollms"
|
if args.llm_binding == "lollms"
|
||||||
else ollama_model_complete
|
else ollama_model_complete
|
||||||
|
|
@ -330,6 +331,7 @@ def create_app(args):
|
||||||
else: # azure_openai
|
else: # azure_openai
|
||||||
rag = LightRAG(
|
rag = LightRAG(
|
||||||
working_dir=args.working_dir,
|
working_dir=args.working_dir,
|
||||||
|
workspace=args.workspace,
|
||||||
llm_model_func=azure_openai_model_complete,
|
llm_model_func=azure_openai_model_complete,
|
||||||
chunk_token_size=int(args.chunk_size),
|
chunk_token_size=int(args.chunk_size),
|
||||||
chunk_overlap_token_size=int(args.chunk_overlap_size),
|
chunk_overlap_token_size=int(args.chunk_overlap_size),
|
||||||
|
|
@ -472,6 +474,8 @@ def create_app(args):
|
||||||
"vector_storage": args.vector_storage,
|
"vector_storage": args.vector_storage,
|
||||||
"enable_llm_cache_for_extract": args.enable_llm_cache_for_extract,
|
"enable_llm_cache_for_extract": args.enable_llm_cache_for_extract,
|
||||||
"enable_llm_cache": args.enable_llm_cache,
|
"enable_llm_cache": args.enable_llm_cache,
|
||||||
|
"workspace": args.workspace,
|
||||||
|
"max_graph_nodes": os.getenv("MAX_GRAPH_NODES"),
|
||||||
},
|
},
|
||||||
"auth_mode": auth_mode,
|
"auth_mode": auth_mode,
|
||||||
"pipeline_busy": pipeline_status.get("busy", False),
|
"pipeline_busy": pipeline_status.get("busy", False),
|
||||||
|
|
|
||||||
|
|
@ -475,6 +475,7 @@ class DocumentManager:
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
input_dir: str,
|
input_dir: str,
|
||||||
|
workspace: str = "", # New parameter for workspace isolation
|
||||||
supported_extensions: tuple = (
|
supported_extensions: tuple = (
|
||||||
".txt",
|
".txt",
|
||||||
".md",
|
".md",
|
||||||
|
|
@ -515,10 +516,19 @@ class DocumentManager:
|
||||||
".less", # LESS CSS
|
".less", # LESS CSS
|
||||||
),
|
),
|
||||||
):
|
):
|
||||||
self.input_dir = Path(input_dir)
|
# Store the base input directory and workspace
|
||||||
|
self.base_input_dir = Path(input_dir)
|
||||||
|
self.workspace = workspace
|
||||||
self.supported_extensions = supported_extensions
|
self.supported_extensions = supported_extensions
|
||||||
self.indexed_files = set()
|
self.indexed_files = set()
|
||||||
|
|
||||||
|
# Create workspace-specific input directory
|
||||||
|
# If workspace is provided, create a subdirectory for data isolation
|
||||||
|
if workspace:
|
||||||
|
self.input_dir = self.base_input_dir / workspace
|
||||||
|
else:
|
||||||
|
self.input_dir = self.base_input_dir
|
||||||
|
|
||||||
# Create input directory if it doesn't exist
|
# Create input directory if it doesn't exist
|
||||||
self.input_dir.mkdir(parents=True, exist_ok=True)
|
self.input_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
|
@ -716,7 +726,9 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
|
||||||
if content:
|
if content:
|
||||||
# Check if content contains only whitespace characters
|
# Check if content contains only whitespace characters
|
||||||
if not content.strip():
|
if not content.strip():
|
||||||
logger.warning(f"File contains only whitespace characters. file_paths={file_path.name}")
|
logger.warning(
|
||||||
|
f"File contains only whitespace characters. file_paths={file_path.name}"
|
||||||
|
)
|
||||||
|
|
||||||
await rag.apipeline_enqueue_documents(content, file_paths=file_path.name)
|
await rag.apipeline_enqueue_documents(content, file_paths=file_path.name)
|
||||||
logger.info(f"Successfully fetched and enqueued file: {file_path.name}")
|
logger.info(f"Successfully fetched and enqueued file: {file_path.name}")
|
||||||
|
|
|
||||||
|
|
@ -284,8 +284,10 @@ def display_splash_screen(args: argparse.Namespace) -> None:
|
||||||
ASCIIColors.yellow(f"{args.vector_storage}")
|
ASCIIColors.yellow(f"{args.vector_storage}")
|
||||||
ASCIIColors.white(" ├─ Graph Storage: ", end="")
|
ASCIIColors.white(" ├─ Graph Storage: ", end="")
|
||||||
ASCIIColors.yellow(f"{args.graph_storage}")
|
ASCIIColors.yellow(f"{args.graph_storage}")
|
||||||
ASCIIColors.white(" └─ Document Status Storage: ", end="")
|
ASCIIColors.white(" ├─ Document Status Storage: ", end="")
|
||||||
ASCIIColors.yellow(f"{args.doc_status_storage}")
|
ASCIIColors.yellow(f"{args.doc_status_storage}")
|
||||||
|
ASCIIColors.white(" └─ Workspace: ", end="")
|
||||||
|
ASCIIColors.yellow(f"{args.workspace if args.workspace else '-'}")
|
||||||
|
|
||||||
# Server Status
|
# Server Status
|
||||||
ASCIIColors.green("\n✨ Server starting up...\n")
|
ASCIIColors.green("\n✨ Server starting up...\n")
|
||||||
|
|
|
||||||
File diff suppressed because one or more lines are too long
2
lightrag/api/webui/index.html
generated
2
lightrag/api/webui/index.html
generated
|
|
@ -8,7 +8,7 @@
|
||||||
<link rel="icon" type="image/png" href="favicon.png" />
|
<link rel="icon" type="image/png" href="favicon.png" />
|
||||||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||||
<title>Lightrag</title>
|
<title>Lightrag</title>
|
||||||
<script type="module" crossorigin src="/webui/assets/index-CRWiopRc.js"></script>
|
<script type="module" crossorigin src="/webui/assets/index-BlAjHenV.js"></script>
|
||||||
<link rel="modulepreload" crossorigin href="/webui/assets/react-vendor-DEwriMA6.js">
|
<link rel="modulepreload" crossorigin href="/webui/assets/react-vendor-DEwriMA6.js">
|
||||||
<link rel="modulepreload" crossorigin href="/webui/assets/ui-vendor-CeCm8EER.js">
|
<link rel="modulepreload" crossorigin href="/webui/assets/ui-vendor-CeCm8EER.js">
|
||||||
<link rel="modulepreload" crossorigin href="/webui/assets/graph-vendor-B-X5JegA.js">
|
<link rel="modulepreload" crossorigin href="/webui/assets/graph-vendor-B-X5JegA.js">
|
||||||
|
|
|
||||||
|
|
@ -103,6 +103,7 @@ class QueryParam:
|
||||||
@dataclass
|
@dataclass
|
||||||
class StorageNameSpace(ABC):
|
class StorageNameSpace(ABC):
|
||||||
namespace: str
|
namespace: str
|
||||||
|
workspace: str
|
||||||
global_config: dict[str, Any]
|
global_config: dict[str, Any]
|
||||||
|
|
||||||
async def initialize(self):
|
async def initialize(self):
|
||||||
|
|
|
||||||
|
|
@ -38,9 +38,19 @@ class FaissVectorDBStorage(BaseVectorStorage):
|
||||||
self.cosine_better_than_threshold = cosine_threshold
|
self.cosine_better_than_threshold = cosine_threshold
|
||||||
|
|
||||||
# Where to save index file if you want persistent storage
|
# Where to save index file if you want persistent storage
|
||||||
self._faiss_index_file = os.path.join(
|
working_dir = self.global_config["working_dir"]
|
||||||
self.global_config["working_dir"], f"faiss_index_{self.namespace}.index"
|
if self.workspace:
|
||||||
)
|
# Include workspace in the file path for data isolation
|
||||||
|
workspace_dir = os.path.join(working_dir, self.workspace)
|
||||||
|
os.makedirs(workspace_dir, exist_ok=True)
|
||||||
|
self._faiss_index_file = os.path.join(
|
||||||
|
workspace_dir, f"faiss_index_{self.namespace}.index"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Default behavior when workspace is empty
|
||||||
|
self._faiss_index_file = os.path.join(
|
||||||
|
working_dir, f"faiss_index_{self.namespace}.index"
|
||||||
|
)
|
||||||
self._meta_file = self._faiss_index_file + ".meta.json"
|
self._meta_file = self._faiss_index_file + ".meta.json"
|
||||||
|
|
||||||
self._max_batch_size = self.global_config["embedding_batch_num"]
|
self._max_batch_size = self.global_config["embedding_batch_num"]
|
||||||
|
|
|
||||||
|
|
@ -30,7 +30,18 @@ class JsonDocStatusStorage(DocStatusStorage):
|
||||||
|
|
||||||
def __post_init__(self):
|
def __post_init__(self):
|
||||||
working_dir = self.global_config["working_dir"]
|
working_dir = self.global_config["working_dir"]
|
||||||
self._file_name = os.path.join(working_dir, f"kv_store_{self.namespace}.json")
|
if self.workspace:
|
||||||
|
# Include workspace in the file path for data isolation
|
||||||
|
workspace_dir = os.path.join(working_dir, self.workspace)
|
||||||
|
os.makedirs(workspace_dir, exist_ok=True)
|
||||||
|
self._file_name = os.path.join(
|
||||||
|
workspace_dir, f"kv_store_{self.namespace}.json"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Default behavior when workspace is empty
|
||||||
|
self._file_name = os.path.join(
|
||||||
|
working_dir, f"kv_store_{self.namespace}.json"
|
||||||
|
)
|
||||||
self._data = None
|
self._data = None
|
||||||
self._storage_lock = None
|
self._storage_lock = None
|
||||||
self.storage_updated = None
|
self.storage_updated = None
|
||||||
|
|
|
||||||
|
|
@ -26,7 +26,18 @@ from .shared_storage import (
|
||||||
class JsonKVStorage(BaseKVStorage):
|
class JsonKVStorage(BaseKVStorage):
|
||||||
def __post_init__(self):
|
def __post_init__(self):
|
||||||
working_dir = self.global_config["working_dir"]
|
working_dir = self.global_config["working_dir"]
|
||||||
self._file_name = os.path.join(working_dir, f"kv_store_{self.namespace}.json")
|
if self.workspace:
|
||||||
|
# Include workspace in the file path for data isolation
|
||||||
|
workspace_dir = os.path.join(working_dir, self.workspace)
|
||||||
|
os.makedirs(workspace_dir, exist_ok=True)
|
||||||
|
self._file_name = os.path.join(
|
||||||
|
workspace_dir, f"kv_store_{self.namespace}.json"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Default behavior when workspace is empty
|
||||||
|
self._file_name = os.path.join(
|
||||||
|
working_dir, f"kv_store_{self.namespace}.json"
|
||||||
|
)
|
||||||
self._data = None
|
self._data = None
|
||||||
self._storage_lock = None
|
self._storage_lock = None
|
||||||
self.storage_updated = None
|
self.storage_updated = None
|
||||||
|
|
|
||||||
|
|
@ -7,10 +7,6 @@ from lightrag.utils import logger, compute_mdhash_id
|
||||||
from ..base import BaseVectorStorage
|
from ..base import BaseVectorStorage
|
||||||
import pipmaster as pm
|
import pipmaster as pm
|
||||||
|
|
||||||
|
|
||||||
if not pm.is_installed("configparser"):
|
|
||||||
pm.install("configparser")
|
|
||||||
|
|
||||||
if not pm.is_installed("pymilvus"):
|
if not pm.is_installed("pymilvus"):
|
||||||
pm.install("pymilvus")
|
pm.install("pymilvus")
|
||||||
|
|
||||||
|
|
@ -660,6 +656,29 @@ class MilvusVectorDBStorage(BaseVectorStorage):
|
||||||
raise
|
raise
|
||||||
|
|
||||||
def __post_init__(self):
|
def __post_init__(self):
|
||||||
|
# Check for MILVUS_WORKSPACE environment variable first (higher priority)
|
||||||
|
# This allows administrators to force a specific workspace for all Milvus storage instances
|
||||||
|
milvus_workspace = os.environ.get("MILVUS_WORKSPACE")
|
||||||
|
if milvus_workspace and milvus_workspace.strip():
|
||||||
|
# Use environment variable value, overriding the passed workspace parameter
|
||||||
|
effective_workspace = milvus_workspace.strip()
|
||||||
|
logger.info(
|
||||||
|
f"Using MILVUS_WORKSPACE environment variable: '{effective_workspace}' (overriding passed workspace: '{self.workspace}')"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Use the workspace parameter passed during initialization
|
||||||
|
effective_workspace = self.workspace
|
||||||
|
if effective_workspace:
|
||||||
|
logger.debug(
|
||||||
|
f"Using passed workspace parameter: '{effective_workspace}'"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Build namespace with workspace prefix for data isolation
|
||||||
|
if effective_workspace:
|
||||||
|
self.namespace = f"{effective_workspace}_{self.namespace}"
|
||||||
|
logger.debug(f"Final namespace with workspace prefix: '{self.namespace}'")
|
||||||
|
# When workspace is empty, keep the original namespace unchanged
|
||||||
|
|
||||||
kwargs = self.global_config.get("vector_db_storage_cls_kwargs", {})
|
kwargs = self.global_config.get("vector_db_storage_cls_kwargs", {})
|
||||||
cosine_threshold = kwargs.get("cosine_better_than_threshold")
|
cosine_threshold = kwargs.get("cosine_better_than_threshold")
|
||||||
if cosine_threshold is None:
|
if cosine_threshold is None:
|
||||||
|
|
|
||||||
|
|
@ -25,6 +25,7 @@ if not pm.is_installed("pymongo"):
|
||||||
pm.install("pymongo")
|
pm.install("pymongo")
|
||||||
|
|
||||||
from pymongo import AsyncMongoClient # type: ignore
|
from pymongo import AsyncMongoClient # type: ignore
|
||||||
|
from pymongo import UpdateOne # type: ignore
|
||||||
from pymongo.asynchronous.database import AsyncDatabase # type: ignore
|
from pymongo.asynchronous.database import AsyncDatabase # type: ignore
|
||||||
from pymongo.asynchronous.collection import AsyncCollection # type: ignore
|
from pymongo.asynchronous.collection import AsyncCollection # type: ignore
|
||||||
from pymongo.operations import SearchIndexModel # type: ignore
|
from pymongo.operations import SearchIndexModel # type: ignore
|
||||||
|
|
@ -81,7 +82,39 @@ class MongoKVStorage(BaseKVStorage):
|
||||||
db: AsyncDatabase = field(default=None)
|
db: AsyncDatabase = field(default=None)
|
||||||
_data: AsyncCollection = field(default=None)
|
_data: AsyncCollection = field(default=None)
|
||||||
|
|
||||||
|
def __init__(self, namespace, global_config, embedding_func, workspace=None):
|
||||||
|
super().__init__(
|
||||||
|
namespace=namespace,
|
||||||
|
workspace=workspace or "",
|
||||||
|
global_config=global_config,
|
||||||
|
embedding_func=embedding_func,
|
||||||
|
)
|
||||||
|
self.__post_init__()
|
||||||
|
|
||||||
def __post_init__(self):
|
def __post_init__(self):
|
||||||
|
# Check for MONGODB_WORKSPACE environment variable first (higher priority)
|
||||||
|
# This allows administrators to force a specific workspace for all MongoDB storage instances
|
||||||
|
mongodb_workspace = os.environ.get("MONGODB_WORKSPACE")
|
||||||
|
if mongodb_workspace and mongodb_workspace.strip():
|
||||||
|
# Use environment variable value, overriding the passed workspace parameter
|
||||||
|
effective_workspace = mongodb_workspace.strip()
|
||||||
|
logger.info(
|
||||||
|
f"Using MONGODB_WORKSPACE environment variable: '{effective_workspace}' (overriding passed workspace: '{self.workspace}')"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Use the workspace parameter passed during initialization
|
||||||
|
effective_workspace = self.workspace
|
||||||
|
if effective_workspace:
|
||||||
|
logger.debug(
|
||||||
|
f"Using passed workspace parameter: '{effective_workspace}'"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Build namespace with workspace prefix for data isolation
|
||||||
|
if effective_workspace:
|
||||||
|
self.namespace = f"{effective_workspace}_{self.namespace}"
|
||||||
|
logger.debug(f"Final namespace with workspace prefix: '{self.namespace}'")
|
||||||
|
# When workspace is empty, keep the original namespace unchanged
|
||||||
|
|
||||||
self._collection_name = self.namespace
|
self._collection_name = self.namespace
|
||||||
|
|
||||||
async def initialize(self):
|
async def initialize(self):
|
||||||
|
|
@ -142,7 +175,6 @@ class MongoKVStorage(BaseKVStorage):
|
||||||
|
|
||||||
# Unified handling for all namespaces with flattened keys
|
# Unified handling for all namespaces with flattened keys
|
||||||
# Use bulk_write for better performance
|
# Use bulk_write for better performance
|
||||||
from pymongo import UpdateOne
|
|
||||||
|
|
||||||
operations = []
|
operations = []
|
||||||
current_time = int(time.time()) # Get current Unix timestamp
|
current_time = int(time.time()) # Get current Unix timestamp
|
||||||
|
|
@ -252,7 +284,39 @@ class MongoDocStatusStorage(DocStatusStorage):
|
||||||
db: AsyncDatabase = field(default=None)
|
db: AsyncDatabase = field(default=None)
|
||||||
_data: AsyncCollection = field(default=None)
|
_data: AsyncCollection = field(default=None)
|
||||||
|
|
||||||
|
def __init__(self, namespace, global_config, embedding_func, workspace=None):
|
||||||
|
super().__init__(
|
||||||
|
namespace=namespace,
|
||||||
|
workspace=workspace or "",
|
||||||
|
global_config=global_config,
|
||||||
|
embedding_func=embedding_func,
|
||||||
|
)
|
||||||
|
self.__post_init__()
|
||||||
|
|
||||||
def __post_init__(self):
|
def __post_init__(self):
|
||||||
|
# Check for MONGODB_WORKSPACE environment variable first (higher priority)
|
||||||
|
# This allows administrators to force a specific workspace for all MongoDB storage instances
|
||||||
|
mongodb_workspace = os.environ.get("MONGODB_WORKSPACE")
|
||||||
|
if mongodb_workspace and mongodb_workspace.strip():
|
||||||
|
# Use environment variable value, overriding the passed workspace parameter
|
||||||
|
effective_workspace = mongodb_workspace.strip()
|
||||||
|
logger.info(
|
||||||
|
f"Using MONGODB_WORKSPACE environment variable: '{effective_workspace}' (overriding passed workspace: '{self.workspace}')"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Use the workspace parameter passed during initialization
|
||||||
|
effective_workspace = self.workspace
|
||||||
|
if effective_workspace:
|
||||||
|
logger.debug(
|
||||||
|
f"Using passed workspace parameter: '{effective_workspace}'"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Build namespace with workspace prefix for data isolation
|
||||||
|
if effective_workspace:
|
||||||
|
self.namespace = f"{effective_workspace}_{self.namespace}"
|
||||||
|
logger.debug(f"Final namespace with workspace prefix: '{self.namespace}'")
|
||||||
|
# When workspace is empty, keep the original namespace unchanged
|
||||||
|
|
||||||
self._collection_name = self.namespace
|
self._collection_name = self.namespace
|
||||||
|
|
||||||
async def initialize(self):
|
async def initialize(self):
|
||||||
|
|
@ -367,12 +431,36 @@ class MongoGraphStorage(BaseGraphStorage):
|
||||||
# edge collection storing source_node_id, target_node_id, and edge_properties
|
# edge collection storing source_node_id, target_node_id, and edge_properties
|
||||||
edgeCollection: AsyncCollection = field(default=None)
|
edgeCollection: AsyncCollection = field(default=None)
|
||||||
|
|
||||||
def __init__(self, namespace, global_config, embedding_func):
|
def __init__(self, namespace, global_config, embedding_func, workspace=None):
|
||||||
super().__init__(
|
super().__init__(
|
||||||
namespace=namespace,
|
namespace=namespace,
|
||||||
|
workspace=workspace or "",
|
||||||
global_config=global_config,
|
global_config=global_config,
|
||||||
embedding_func=embedding_func,
|
embedding_func=embedding_func,
|
||||||
)
|
)
|
||||||
|
# Check for MONGODB_WORKSPACE environment variable first (higher priority)
|
||||||
|
# This allows administrators to force a specific workspace for all MongoDB storage instances
|
||||||
|
mongodb_workspace = os.environ.get("MONGODB_WORKSPACE")
|
||||||
|
if mongodb_workspace and mongodb_workspace.strip():
|
||||||
|
# Use environment variable value, overriding the passed workspace parameter
|
||||||
|
effective_workspace = mongodb_workspace.strip()
|
||||||
|
logger.info(
|
||||||
|
f"Using MONGODB_WORKSPACE environment variable: '{effective_workspace}' (overriding passed workspace: '{self.workspace}')"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Use the workspace parameter passed during initialization
|
||||||
|
effective_workspace = self.workspace
|
||||||
|
if effective_workspace:
|
||||||
|
logger.debug(
|
||||||
|
f"Using passed workspace parameter: '{effective_workspace}'"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Build namespace with workspace prefix for data isolation
|
||||||
|
if effective_workspace:
|
||||||
|
self.namespace = f"{effective_workspace}_{self.namespace}"
|
||||||
|
logger.debug(f"Final namespace with workspace prefix: '{self.namespace}'")
|
||||||
|
# When workspace is empty, keep the original namespace unchanged
|
||||||
|
|
||||||
self._collection_name = self.namespace
|
self._collection_name = self.namespace
|
||||||
self._edge_collection_name = f"{self._collection_name}_edges"
|
self._edge_collection_name = f"{self._collection_name}_edges"
|
||||||
|
|
||||||
|
|
@ -1230,8 +1318,52 @@ class MongoGraphStorage(BaseGraphStorage):
|
||||||
class MongoVectorDBStorage(BaseVectorStorage):
|
class MongoVectorDBStorage(BaseVectorStorage):
|
||||||
db: AsyncDatabase | None = field(default=None)
|
db: AsyncDatabase | None = field(default=None)
|
||||||
_data: AsyncCollection | None = field(default=None)
|
_data: AsyncCollection | None = field(default=None)
|
||||||
|
_index_name: str = field(default="", init=False)
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self, namespace, global_config, embedding_func, workspace=None, meta_fields=None
|
||||||
|
):
|
||||||
|
super().__init__(
|
||||||
|
namespace=namespace,
|
||||||
|
workspace=workspace or "",
|
||||||
|
global_config=global_config,
|
||||||
|
embedding_func=embedding_func,
|
||||||
|
meta_fields=meta_fields or set(),
|
||||||
|
)
|
||||||
|
self.__post_init__()
|
||||||
|
|
||||||
def __post_init__(self):
|
def __post_init__(self):
|
||||||
|
# Check for MONGODB_WORKSPACE environment variable first (higher priority)
|
||||||
|
# This allows administrators to force a specific workspace for all MongoDB storage instances
|
||||||
|
mongodb_workspace = os.environ.get("MONGODB_WORKSPACE")
|
||||||
|
if mongodb_workspace and mongodb_workspace.strip():
|
||||||
|
# Use environment variable value, overriding the passed workspace parameter
|
||||||
|
effective_workspace = mongodb_workspace.strip()
|
||||||
|
logger.info(
|
||||||
|
f"Using MONGODB_WORKSPACE environment variable: '{effective_workspace}' (overriding passed workspace: '{self.workspace}')"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Use the workspace parameter passed during initialization
|
||||||
|
effective_workspace = self.workspace
|
||||||
|
if effective_workspace:
|
||||||
|
logger.debug(
|
||||||
|
f"Using passed workspace parameter: '{effective_workspace}'"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Build namespace with workspace prefix for data isolation
|
||||||
|
if effective_workspace:
|
||||||
|
self.namespace = f"{effective_workspace}_{self.namespace}"
|
||||||
|
logger.debug(f"Final namespace with workspace prefix: '{self.namespace}'")
|
||||||
|
# When workspace is empty, keep the original namespace unchanged
|
||||||
|
|
||||||
|
# Set index name based on workspace for backward compatibility
|
||||||
|
if effective_workspace:
|
||||||
|
# Use collection-specific index name for workspaced collections to avoid conflicts
|
||||||
|
self._index_name = f"vector_knn_index_{self.namespace}"
|
||||||
|
else:
|
||||||
|
# Keep original index name for backward compatibility with existing deployments
|
||||||
|
self._index_name = "vector_knn_index"
|
||||||
|
|
||||||
kwargs = self.global_config.get("vector_db_storage_cls_kwargs", {})
|
kwargs = self.global_config.get("vector_db_storage_cls_kwargs", {})
|
||||||
cosine_threshold = kwargs.get("cosine_better_than_threshold")
|
cosine_threshold = kwargs.get("cosine_better_than_threshold")
|
||||||
if cosine_threshold is None:
|
if cosine_threshold is None:
|
||||||
|
|
@ -1261,13 +1393,11 @@ class MongoVectorDBStorage(BaseVectorStorage):
|
||||||
async def create_vector_index_if_not_exists(self):
|
async def create_vector_index_if_not_exists(self):
|
||||||
"""Creates an Atlas Vector Search index."""
|
"""Creates an Atlas Vector Search index."""
|
||||||
try:
|
try:
|
||||||
index_name = "vector_knn_index"
|
|
||||||
|
|
||||||
indexes_cursor = await self._data.list_search_indexes()
|
indexes_cursor = await self._data.list_search_indexes()
|
||||||
indexes = await indexes_cursor.to_list(length=None)
|
indexes = await indexes_cursor.to_list(length=None)
|
||||||
for index in indexes:
|
for index in indexes:
|
||||||
if index["name"] == index_name:
|
if index["name"] == self._index_name:
|
||||||
logger.debug("vector index already exist")
|
logger.info(f"vector index {self._index_name} already exist")
|
||||||
return
|
return
|
||||||
|
|
||||||
search_index_model = SearchIndexModel(
|
search_index_model = SearchIndexModel(
|
||||||
|
|
@ -1281,15 +1411,15 @@ class MongoVectorDBStorage(BaseVectorStorage):
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
name=index_name,
|
name=self._index_name,
|
||||||
type="vectorSearch",
|
type="vectorSearch",
|
||||||
)
|
)
|
||||||
|
|
||||||
await self._data.create_search_index(search_index_model)
|
await self._data.create_search_index(search_index_model)
|
||||||
logger.info("Vector index created successfully.")
|
logger.info(f"Vector index {self._index_name} created successfully.")
|
||||||
|
|
||||||
except PyMongoError as _:
|
except PyMongoError as e:
|
||||||
logger.debug("vector index already exist")
|
logger.error(f"Error creating vector index {self._index_name}: {e}")
|
||||||
|
|
||||||
async def upsert(self, data: dict[str, dict[str, Any]]) -> None:
|
async def upsert(self, data: dict[str, dict[str, Any]]) -> None:
|
||||||
logger.debug(f"Inserting {len(data)} to {self.namespace}")
|
logger.debug(f"Inserting {len(data)} to {self.namespace}")
|
||||||
|
|
@ -1344,7 +1474,7 @@ class MongoVectorDBStorage(BaseVectorStorage):
|
||||||
pipeline = [
|
pipeline = [
|
||||||
{
|
{
|
||||||
"$vectorSearch": {
|
"$vectorSearch": {
|
||||||
"index": "vector_knn_index", # Ensure this matches the created index name
|
"index": self._index_name, # Use stored index name for consistency
|
||||||
"path": "vector",
|
"path": "vector",
|
||||||
"queryVector": query_vector,
|
"queryVector": query_vector,
|
||||||
"numCandidates": 100, # Adjust for performance
|
"numCandidates": 100, # Adjust for performance
|
||||||
|
|
|
||||||
|
|
@ -41,9 +41,19 @@ class NanoVectorDBStorage(BaseVectorStorage):
|
||||||
)
|
)
|
||||||
self.cosine_better_than_threshold = cosine_threshold
|
self.cosine_better_than_threshold = cosine_threshold
|
||||||
|
|
||||||
self._client_file_name = os.path.join(
|
working_dir = self.global_config["working_dir"]
|
||||||
self.global_config["working_dir"], f"vdb_{self.namespace}.json"
|
if self.workspace:
|
||||||
)
|
# Include workspace in the file path for data isolation
|
||||||
|
workspace_dir = os.path.join(working_dir, self.workspace)
|
||||||
|
os.makedirs(workspace_dir, exist_ok=True)
|
||||||
|
self._client_file_name = os.path.join(
|
||||||
|
workspace_dir, f"vdb_{self.namespace}.json"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Default behavior when workspace is empty
|
||||||
|
self._client_file_name = os.path.join(
|
||||||
|
working_dir, f"vdb_{self.namespace}.json"
|
||||||
|
)
|
||||||
self._max_batch_size = self.global_config["embedding_batch_num"]
|
self._max_batch_size = self.global_config["embedding_batch_num"]
|
||||||
|
|
||||||
self._client = NanoVectorDB(
|
self._client = NanoVectorDB(
|
||||||
|
|
|
||||||
|
|
@ -50,14 +50,25 @@ logging.getLogger("neo4j").setLevel(logging.ERROR)
|
||||||
@final
|
@final
|
||||||
@dataclass
|
@dataclass
|
||||||
class Neo4JStorage(BaseGraphStorage):
|
class Neo4JStorage(BaseGraphStorage):
|
||||||
def __init__(self, namespace, global_config, embedding_func):
|
def __init__(self, namespace, global_config, embedding_func, workspace=None):
|
||||||
|
# Check NEO4J_WORKSPACE environment variable and override workspace if set
|
||||||
|
neo4j_workspace = os.environ.get("NEO4J_WORKSPACE")
|
||||||
|
if neo4j_workspace and neo4j_workspace.strip():
|
||||||
|
workspace = neo4j_workspace
|
||||||
|
|
||||||
super().__init__(
|
super().__init__(
|
||||||
namespace=namespace,
|
namespace=namespace,
|
||||||
|
workspace=workspace or "",
|
||||||
global_config=global_config,
|
global_config=global_config,
|
||||||
embedding_func=embedding_func,
|
embedding_func=embedding_func,
|
||||||
)
|
)
|
||||||
self._driver = None
|
self._driver = None
|
||||||
|
|
||||||
|
def _get_workspace_label(self) -> str:
|
||||||
|
"""Get workspace label, return 'base' for compatibility when workspace is empty"""
|
||||||
|
workspace = getattr(self, "workspace", None)
|
||||||
|
return workspace if workspace else "base"
|
||||||
|
|
||||||
async def initialize(self):
|
async def initialize(self):
|
||||||
URI = os.environ.get("NEO4J_URI", config.get("neo4j", "uri", fallback=None))
|
URI = os.environ.get("NEO4J_URI", config.get("neo4j", "uri", fallback=None))
|
||||||
USERNAME = os.environ.get(
|
USERNAME = os.environ.get(
|
||||||
|
|
@ -153,13 +164,14 @@ class Neo4JStorage(BaseGraphStorage):
|
||||||
raise e
|
raise e
|
||||||
|
|
||||||
if connected:
|
if connected:
|
||||||
# Create index for base nodes on entity_id if it doesn't exist
|
# Create index for workspace nodes on entity_id if it doesn't exist
|
||||||
|
workspace_label = self._get_workspace_label()
|
||||||
try:
|
try:
|
||||||
async with self._driver.session(database=database) as session:
|
async with self._driver.session(database=database) as session:
|
||||||
# Check if index exists first
|
# Check if index exists first
|
||||||
check_query = """
|
check_query = f"""
|
||||||
CALL db.indexes() YIELD name, labelsOrTypes, properties
|
CALL db.indexes() YIELD name, labelsOrTypes, properties
|
||||||
WHERE labelsOrTypes = ['base'] AND properties = ['entity_id']
|
WHERE labelsOrTypes = ['{workspace_label}'] AND properties = ['entity_id']
|
||||||
RETURN count(*) > 0 AS exists
|
RETURN count(*) > 0 AS exists
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
|
|
@ -172,16 +184,16 @@ class Neo4JStorage(BaseGraphStorage):
|
||||||
if not index_exists:
|
if not index_exists:
|
||||||
# Create index only if it doesn't exist
|
# Create index only if it doesn't exist
|
||||||
result = await session.run(
|
result = await session.run(
|
||||||
"CREATE INDEX FOR (n:base) ON (n.entity_id)"
|
f"CREATE INDEX FOR (n:`{workspace_label}`) ON (n.entity_id)"
|
||||||
)
|
)
|
||||||
await result.consume()
|
await result.consume()
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Created index for base nodes on entity_id in {database}"
|
f"Created index for {workspace_label} nodes on entity_id in {database}"
|
||||||
)
|
)
|
||||||
except Exception:
|
except Exception:
|
||||||
# Fallback if db.indexes() is not supported in this Neo4j version
|
# Fallback if db.indexes() is not supported in this Neo4j version
|
||||||
result = await session.run(
|
result = await session.run(
|
||||||
"CREATE INDEX IF NOT EXISTS FOR (n:base) ON (n.entity_id)"
|
f"CREATE INDEX IF NOT EXISTS FOR (n:`{workspace_label}`) ON (n.entity_id)"
|
||||||
)
|
)
|
||||||
await result.consume()
|
await result.consume()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
@ -216,11 +228,12 @@ class Neo4JStorage(BaseGraphStorage):
|
||||||
ValueError: If node_id is invalid
|
ValueError: If node_id is invalid
|
||||||
Exception: If there is an error executing the query
|
Exception: If there is an error executing the query
|
||||||
"""
|
"""
|
||||||
|
workspace_label = self._get_workspace_label()
|
||||||
async with self._driver.session(
|
async with self._driver.session(
|
||||||
database=self._DATABASE, default_access_mode="READ"
|
database=self._DATABASE, default_access_mode="READ"
|
||||||
) as session:
|
) as session:
|
||||||
try:
|
try:
|
||||||
query = "MATCH (n:base {entity_id: $entity_id}) RETURN count(n) > 0 AS node_exists"
|
query = f"MATCH (n:`{workspace_label}` {{entity_id: $entity_id}}) RETURN count(n) > 0 AS node_exists"
|
||||||
result = await session.run(query, entity_id=node_id)
|
result = await session.run(query, entity_id=node_id)
|
||||||
single_result = await result.single()
|
single_result = await result.single()
|
||||||
await result.consume() # Ensure result is fully consumed
|
await result.consume() # Ensure result is fully consumed
|
||||||
|
|
@ -245,12 +258,13 @@ class Neo4JStorage(BaseGraphStorage):
|
||||||
ValueError: If either node_id is invalid
|
ValueError: If either node_id is invalid
|
||||||
Exception: If there is an error executing the query
|
Exception: If there is an error executing the query
|
||||||
"""
|
"""
|
||||||
|
workspace_label = self._get_workspace_label()
|
||||||
async with self._driver.session(
|
async with self._driver.session(
|
||||||
database=self._DATABASE, default_access_mode="READ"
|
database=self._DATABASE, default_access_mode="READ"
|
||||||
) as session:
|
) as session:
|
||||||
try:
|
try:
|
||||||
query = (
|
query = (
|
||||||
"MATCH (a:base {entity_id: $source_entity_id})-[r]-(b:base {entity_id: $target_entity_id}) "
|
f"MATCH (a:`{workspace_label}` {{entity_id: $source_entity_id}})-[r]-(b:`{workspace_label}` {{entity_id: $target_entity_id}}) "
|
||||||
"RETURN COUNT(r) > 0 AS edgeExists"
|
"RETURN COUNT(r) > 0 AS edgeExists"
|
||||||
)
|
)
|
||||||
result = await session.run(
|
result = await session.run(
|
||||||
|
|
@ -282,11 +296,14 @@ class Neo4JStorage(BaseGraphStorage):
|
||||||
ValueError: If node_id is invalid
|
ValueError: If node_id is invalid
|
||||||
Exception: If there is an error executing the query
|
Exception: If there is an error executing the query
|
||||||
"""
|
"""
|
||||||
|
workspace_label = self._get_workspace_label()
|
||||||
async with self._driver.session(
|
async with self._driver.session(
|
||||||
database=self._DATABASE, default_access_mode="READ"
|
database=self._DATABASE, default_access_mode="READ"
|
||||||
) as session:
|
) as session:
|
||||||
try:
|
try:
|
||||||
query = "MATCH (n:base {entity_id: $entity_id}) RETURN n"
|
query = (
|
||||||
|
f"MATCH (n:`{workspace_label}` {{entity_id: $entity_id}}) RETURN n"
|
||||||
|
)
|
||||||
result = await session.run(query, entity_id=node_id)
|
result = await session.run(query, entity_id=node_id)
|
||||||
try:
|
try:
|
||||||
records = await result.fetch(
|
records = await result.fetch(
|
||||||
|
|
@ -300,12 +317,12 @@ class Neo4JStorage(BaseGraphStorage):
|
||||||
if records:
|
if records:
|
||||||
node = records[0]["n"]
|
node = records[0]["n"]
|
||||||
node_dict = dict(node)
|
node_dict = dict(node)
|
||||||
# Remove base label from labels list if it exists
|
# Remove workspace label from labels list if it exists
|
||||||
if "labels" in node_dict:
|
if "labels" in node_dict:
|
||||||
node_dict["labels"] = [
|
node_dict["labels"] = [
|
||||||
label
|
label
|
||||||
for label in node_dict["labels"]
|
for label in node_dict["labels"]
|
||||||
if label != "base"
|
if label != workspace_label
|
||||||
]
|
]
|
||||||
# logger.debug(f"Neo4j query node {query} return: {node_dict}")
|
# logger.debug(f"Neo4j query node {query} return: {node_dict}")
|
||||||
return node_dict
|
return node_dict
|
||||||
|
|
@ -326,12 +343,13 @@ class Neo4JStorage(BaseGraphStorage):
|
||||||
Returns:
|
Returns:
|
||||||
A dictionary mapping each node_id to its node data (or None if not found).
|
A dictionary mapping each node_id to its node data (or None if not found).
|
||||||
"""
|
"""
|
||||||
|
workspace_label = self._get_workspace_label()
|
||||||
async with self._driver.session(
|
async with self._driver.session(
|
||||||
database=self._DATABASE, default_access_mode="READ"
|
database=self._DATABASE, default_access_mode="READ"
|
||||||
) as session:
|
) as session:
|
||||||
query = """
|
query = f"""
|
||||||
UNWIND $node_ids AS id
|
UNWIND $node_ids AS id
|
||||||
MATCH (n:base {entity_id: id})
|
MATCH (n:`{workspace_label}` {{entity_id: id}})
|
||||||
RETURN n.entity_id AS entity_id, n
|
RETURN n.entity_id AS entity_id, n
|
||||||
"""
|
"""
|
||||||
result = await session.run(query, node_ids=node_ids)
|
result = await session.run(query, node_ids=node_ids)
|
||||||
|
|
@ -340,10 +358,12 @@ class Neo4JStorage(BaseGraphStorage):
|
||||||
entity_id = record["entity_id"]
|
entity_id = record["entity_id"]
|
||||||
node = record["n"]
|
node = record["n"]
|
||||||
node_dict = dict(node)
|
node_dict = dict(node)
|
||||||
# Remove the 'base' label if present in a 'labels' property
|
# Remove the workspace label if present in a 'labels' property
|
||||||
if "labels" in node_dict:
|
if "labels" in node_dict:
|
||||||
node_dict["labels"] = [
|
node_dict["labels"] = [
|
||||||
label for label in node_dict["labels"] if label != "base"
|
label
|
||||||
|
for label in node_dict["labels"]
|
||||||
|
if label != workspace_label
|
||||||
]
|
]
|
||||||
nodes[entity_id] = node_dict
|
nodes[entity_id] = node_dict
|
||||||
await result.consume() # Make sure to consume the result fully
|
await result.consume() # Make sure to consume the result fully
|
||||||
|
|
@ -364,12 +384,13 @@ class Neo4JStorage(BaseGraphStorage):
|
||||||
ValueError: If node_id is invalid
|
ValueError: If node_id is invalid
|
||||||
Exception: If there is an error executing the query
|
Exception: If there is an error executing the query
|
||||||
"""
|
"""
|
||||||
|
workspace_label = self._get_workspace_label()
|
||||||
async with self._driver.session(
|
async with self._driver.session(
|
||||||
database=self._DATABASE, default_access_mode="READ"
|
database=self._DATABASE, default_access_mode="READ"
|
||||||
) as session:
|
) as session:
|
||||||
try:
|
try:
|
||||||
query = """
|
query = f"""
|
||||||
MATCH (n:base {entity_id: $entity_id})
|
MATCH (n:`{workspace_label}` {{entity_id: $entity_id}})
|
||||||
OPTIONAL MATCH (n)-[r]-()
|
OPTIONAL MATCH (n)-[r]-()
|
||||||
RETURN COUNT(r) AS degree
|
RETURN COUNT(r) AS degree
|
||||||
"""
|
"""
|
||||||
|
|
@ -403,13 +424,14 @@ class Neo4JStorage(BaseGraphStorage):
|
||||||
A dictionary mapping each node_id to its degree (number of relationships).
|
A dictionary mapping each node_id to its degree (number of relationships).
|
||||||
If a node is not found, its degree will be set to 0.
|
If a node is not found, its degree will be set to 0.
|
||||||
"""
|
"""
|
||||||
|
workspace_label = self._get_workspace_label()
|
||||||
async with self._driver.session(
|
async with self._driver.session(
|
||||||
database=self._DATABASE, default_access_mode="READ"
|
database=self._DATABASE, default_access_mode="READ"
|
||||||
) as session:
|
) as session:
|
||||||
query = """
|
query = f"""
|
||||||
UNWIND $node_ids AS id
|
UNWIND $node_ids AS id
|
||||||
MATCH (n:base {entity_id: id})
|
MATCH (n:`{workspace_label}` {{entity_id: id}})
|
||||||
RETURN n.entity_id AS entity_id, count { (n)--() } AS degree;
|
RETURN n.entity_id AS entity_id, count {{ (n)--() }} AS degree;
|
||||||
"""
|
"""
|
||||||
result = await session.run(query, node_ids=node_ids)
|
result = await session.run(query, node_ids=node_ids)
|
||||||
degrees = {}
|
degrees = {}
|
||||||
|
|
@ -489,12 +511,13 @@ class Neo4JStorage(BaseGraphStorage):
|
||||||
ValueError: If either node_id is invalid
|
ValueError: If either node_id is invalid
|
||||||
Exception: If there is an error executing the query
|
Exception: If there is an error executing the query
|
||||||
"""
|
"""
|
||||||
|
workspace_label = self._get_workspace_label()
|
||||||
try:
|
try:
|
||||||
async with self._driver.session(
|
async with self._driver.session(
|
||||||
database=self._DATABASE, default_access_mode="READ"
|
database=self._DATABASE, default_access_mode="READ"
|
||||||
) as session:
|
) as session:
|
||||||
query = """
|
query = f"""
|
||||||
MATCH (start:base {entity_id: $source_entity_id})-[r]-(end:base {entity_id: $target_entity_id})
|
MATCH (start:`{workspace_label}` {{entity_id: $source_entity_id}})-[r]-(end:`{workspace_label}` {{entity_id: $target_entity_id}})
|
||||||
RETURN properties(r) as edge_properties
|
RETURN properties(r) as edge_properties
|
||||||
"""
|
"""
|
||||||
result = await session.run(
|
result = await session.run(
|
||||||
|
|
@ -571,12 +594,13 @@ class Neo4JStorage(BaseGraphStorage):
|
||||||
Returns:
|
Returns:
|
||||||
A dictionary mapping (src, tgt) tuples to their edge properties.
|
A dictionary mapping (src, tgt) tuples to their edge properties.
|
||||||
"""
|
"""
|
||||||
|
workspace_label = self._get_workspace_label()
|
||||||
async with self._driver.session(
|
async with self._driver.session(
|
||||||
database=self._DATABASE, default_access_mode="READ"
|
database=self._DATABASE, default_access_mode="READ"
|
||||||
) as session:
|
) as session:
|
||||||
query = """
|
query = f"""
|
||||||
UNWIND $pairs AS pair
|
UNWIND $pairs AS pair
|
||||||
MATCH (start:base {entity_id: pair.src})-[r:DIRECTED]-(end:base {entity_id: pair.tgt})
|
MATCH (start:`{workspace_label}` {{entity_id: pair.src}})-[r:DIRECTED]-(end:`{workspace_label}` {{entity_id: pair.tgt}})
|
||||||
RETURN pair.src AS src_id, pair.tgt AS tgt_id, collect(properties(r)) AS edges
|
RETURN pair.src AS src_id, pair.tgt AS tgt_id, collect(properties(r)) AS edges
|
||||||
"""
|
"""
|
||||||
result = await session.run(query, pairs=pairs)
|
result = await session.run(query, pairs=pairs)
|
||||||
|
|
@ -627,8 +651,9 @@ class Neo4JStorage(BaseGraphStorage):
|
||||||
database=self._DATABASE, default_access_mode="READ"
|
database=self._DATABASE, default_access_mode="READ"
|
||||||
) as session:
|
) as session:
|
||||||
try:
|
try:
|
||||||
query = """MATCH (n:base {entity_id: $entity_id})
|
workspace_label = self._get_workspace_label()
|
||||||
OPTIONAL MATCH (n)-[r]-(connected:base)
|
query = f"""MATCH (n:`{workspace_label}` {{entity_id: $entity_id}})
|
||||||
|
OPTIONAL MATCH (n)-[r]-(connected:`{workspace_label}`)
|
||||||
WHERE connected.entity_id IS NOT NULL
|
WHERE connected.entity_id IS NOT NULL
|
||||||
RETURN n, r, connected"""
|
RETURN n, r, connected"""
|
||||||
results = await session.run(query, entity_id=source_node_id)
|
results = await session.run(query, entity_id=source_node_id)
|
||||||
|
|
@ -689,10 +714,11 @@ class Neo4JStorage(BaseGraphStorage):
|
||||||
database=self._DATABASE, default_access_mode="READ"
|
database=self._DATABASE, default_access_mode="READ"
|
||||||
) as session:
|
) as session:
|
||||||
# Query to get both outgoing and incoming edges
|
# Query to get both outgoing and incoming edges
|
||||||
query = """
|
workspace_label = self._get_workspace_label()
|
||||||
|
query = f"""
|
||||||
UNWIND $node_ids AS id
|
UNWIND $node_ids AS id
|
||||||
MATCH (n:base {entity_id: id})
|
MATCH (n:`{workspace_label}` {{entity_id: id}})
|
||||||
OPTIONAL MATCH (n)-[r]-(connected:base)
|
OPTIONAL MATCH (n)-[r]-(connected:`{workspace_label}`)
|
||||||
RETURN id AS queried_id, n.entity_id AS node_entity_id,
|
RETURN id AS queried_id, n.entity_id AS node_entity_id,
|
||||||
connected.entity_id AS connected_entity_id,
|
connected.entity_id AS connected_entity_id,
|
||||||
startNode(r).entity_id AS start_entity_id
|
startNode(r).entity_id AS start_entity_id
|
||||||
|
|
@ -727,12 +753,13 @@ class Neo4JStorage(BaseGraphStorage):
|
||||||
return edges_dict
|
return edges_dict
|
||||||
|
|
||||||
async def get_nodes_by_chunk_ids(self, chunk_ids: list[str]) -> list[dict]:
|
async def get_nodes_by_chunk_ids(self, chunk_ids: list[str]) -> list[dict]:
|
||||||
|
workspace_label = self._get_workspace_label()
|
||||||
async with self._driver.session(
|
async with self._driver.session(
|
||||||
database=self._DATABASE, default_access_mode="READ"
|
database=self._DATABASE, default_access_mode="READ"
|
||||||
) as session:
|
) as session:
|
||||||
query = """
|
query = f"""
|
||||||
UNWIND $chunk_ids AS chunk_id
|
UNWIND $chunk_ids AS chunk_id
|
||||||
MATCH (n:base)
|
MATCH (n:`{workspace_label}`)
|
||||||
WHERE n.source_id IS NOT NULL AND chunk_id IN split(n.source_id, $sep)
|
WHERE n.source_id IS NOT NULL AND chunk_id IN split(n.source_id, $sep)
|
||||||
RETURN DISTINCT n
|
RETURN DISTINCT n
|
||||||
"""
|
"""
|
||||||
|
|
@ -748,12 +775,13 @@ class Neo4JStorage(BaseGraphStorage):
|
||||||
return nodes
|
return nodes
|
||||||
|
|
||||||
async def get_edges_by_chunk_ids(self, chunk_ids: list[str]) -> list[dict]:
|
async def get_edges_by_chunk_ids(self, chunk_ids: list[str]) -> list[dict]:
|
||||||
|
workspace_label = self._get_workspace_label()
|
||||||
async with self._driver.session(
|
async with self._driver.session(
|
||||||
database=self._DATABASE, default_access_mode="READ"
|
database=self._DATABASE, default_access_mode="READ"
|
||||||
) as session:
|
) as session:
|
||||||
query = """
|
query = f"""
|
||||||
UNWIND $chunk_ids AS chunk_id
|
UNWIND $chunk_ids AS chunk_id
|
||||||
MATCH (a:base)-[r]-(b:base)
|
MATCH (a:`{workspace_label}`)-[r]-(b:`{workspace_label}`)
|
||||||
WHERE r.source_id IS NOT NULL AND chunk_id IN split(r.source_id, $sep)
|
WHERE r.source_id IS NOT NULL AND chunk_id IN split(r.source_id, $sep)
|
||||||
RETURN DISTINCT a.entity_id AS source, b.entity_id AS target, properties(r) AS properties
|
RETURN DISTINCT a.entity_id AS source, b.entity_id AS target, properties(r) AS properties
|
||||||
"""
|
"""
|
||||||
|
|
@ -787,6 +815,7 @@ class Neo4JStorage(BaseGraphStorage):
|
||||||
node_id: The unique identifier for the node (used as label)
|
node_id: The unique identifier for the node (used as label)
|
||||||
node_data: Dictionary of node properties
|
node_data: Dictionary of node properties
|
||||||
"""
|
"""
|
||||||
|
workspace_label = self._get_workspace_label()
|
||||||
properties = node_data
|
properties = node_data
|
||||||
entity_type = properties["entity_type"]
|
entity_type = properties["entity_type"]
|
||||||
if "entity_id" not in properties:
|
if "entity_id" not in properties:
|
||||||
|
|
@ -796,14 +825,11 @@ class Neo4JStorage(BaseGraphStorage):
|
||||||
async with self._driver.session(database=self._DATABASE) as session:
|
async with self._driver.session(database=self._DATABASE) as session:
|
||||||
|
|
||||||
async def execute_upsert(tx: AsyncManagedTransaction):
|
async def execute_upsert(tx: AsyncManagedTransaction):
|
||||||
query = (
|
query = f"""
|
||||||
"""
|
MERGE (n:`{workspace_label}` {{entity_id: $entity_id}})
|
||||||
MERGE (n:base {entity_id: $entity_id})
|
|
||||||
SET n += $properties
|
SET n += $properties
|
||||||
SET n:`%s`
|
SET n:`{entity_type}`
|
||||||
"""
|
"""
|
||||||
% entity_type
|
|
||||||
)
|
|
||||||
result = await tx.run(
|
result = await tx.run(
|
||||||
query, entity_id=node_id, properties=properties
|
query, entity_id=node_id, properties=properties
|
||||||
)
|
)
|
||||||
|
|
@ -847,10 +873,11 @@ class Neo4JStorage(BaseGraphStorage):
|
||||||
async with self._driver.session(database=self._DATABASE) as session:
|
async with self._driver.session(database=self._DATABASE) as session:
|
||||||
|
|
||||||
async def execute_upsert(tx: AsyncManagedTransaction):
|
async def execute_upsert(tx: AsyncManagedTransaction):
|
||||||
query = """
|
workspace_label = self._get_workspace_label()
|
||||||
MATCH (source:base {entity_id: $source_entity_id})
|
query = f"""
|
||||||
|
MATCH (source:`{workspace_label}` {{entity_id: $source_entity_id}})
|
||||||
WITH source
|
WITH source
|
||||||
MATCH (target:base {entity_id: $target_entity_id})
|
MATCH (target:`{workspace_label}` {{entity_id: $target_entity_id}})
|
||||||
MERGE (source)-[r:DIRECTED]-(target)
|
MERGE (source)-[r:DIRECTED]-(target)
|
||||||
SET r += $properties
|
SET r += $properties
|
||||||
RETURN r, source, target
|
RETURN r, source, target
|
||||||
|
|
@ -889,6 +916,7 @@ class Neo4JStorage(BaseGraphStorage):
|
||||||
KnowledgeGraph object containing nodes and edges, with an is_truncated flag
|
KnowledgeGraph object containing nodes and edges, with an is_truncated flag
|
||||||
indicating whether the graph was truncated due to max_nodes limit
|
indicating whether the graph was truncated due to max_nodes limit
|
||||||
"""
|
"""
|
||||||
|
workspace_label = self._get_workspace_label()
|
||||||
result = KnowledgeGraph()
|
result = KnowledgeGraph()
|
||||||
seen_nodes = set()
|
seen_nodes = set()
|
||||||
seen_edges = set()
|
seen_edges = set()
|
||||||
|
|
@ -899,7 +927,9 @@ class Neo4JStorage(BaseGraphStorage):
|
||||||
try:
|
try:
|
||||||
if node_label == "*":
|
if node_label == "*":
|
||||||
# First check total node count to determine if graph is truncated
|
# First check total node count to determine if graph is truncated
|
||||||
count_query = "MATCH (n) RETURN count(n) as total"
|
count_query = (
|
||||||
|
f"MATCH (n:`{workspace_label}`) RETURN count(n) as total"
|
||||||
|
)
|
||||||
count_result = None
|
count_result = None
|
||||||
try:
|
try:
|
||||||
count_result = await session.run(count_query)
|
count_result = await session.run(count_query)
|
||||||
|
|
@ -915,13 +945,13 @@ class Neo4JStorage(BaseGraphStorage):
|
||||||
await count_result.consume()
|
await count_result.consume()
|
||||||
|
|
||||||
# Run main query to get nodes with highest degree
|
# Run main query to get nodes with highest degree
|
||||||
main_query = """
|
main_query = f"""
|
||||||
MATCH (n)
|
MATCH (n:`{workspace_label}`)
|
||||||
OPTIONAL MATCH (n)-[r]-()
|
OPTIONAL MATCH (n)-[r]-()
|
||||||
WITH n, COALESCE(count(r), 0) AS degree
|
WITH n, COALESCE(count(r), 0) AS degree
|
||||||
ORDER BY degree DESC
|
ORDER BY degree DESC
|
||||||
LIMIT $max_nodes
|
LIMIT $max_nodes
|
||||||
WITH collect({node: n}) AS filtered_nodes
|
WITH collect({{node: n}}) AS filtered_nodes
|
||||||
UNWIND filtered_nodes AS node_info
|
UNWIND filtered_nodes AS node_info
|
||||||
WITH collect(node_info.node) AS kept_nodes, filtered_nodes
|
WITH collect(node_info.node) AS kept_nodes, filtered_nodes
|
||||||
OPTIONAL MATCH (a)-[r]-(b)
|
OPTIONAL MATCH (a)-[r]-(b)
|
||||||
|
|
@ -943,20 +973,21 @@ class Neo4JStorage(BaseGraphStorage):
|
||||||
else:
|
else:
|
||||||
# return await self._robust_fallback(node_label, max_depth, max_nodes)
|
# return await self._robust_fallback(node_label, max_depth, max_nodes)
|
||||||
# First try without limit to check if we need to truncate
|
# First try without limit to check if we need to truncate
|
||||||
full_query = """
|
full_query = f"""
|
||||||
MATCH (start)
|
MATCH (start:`{workspace_label}`)
|
||||||
WHERE start.entity_id = $entity_id
|
WHERE start.entity_id = $entity_id
|
||||||
WITH start
|
WITH start
|
||||||
CALL apoc.path.subgraphAll(start, {
|
CALL apoc.path.subgraphAll(start, {{
|
||||||
relationshipFilter: '',
|
relationshipFilter: '',
|
||||||
|
labelFilter: '{workspace_label}',
|
||||||
minLevel: 0,
|
minLevel: 0,
|
||||||
maxLevel: $max_depth,
|
maxLevel: $max_depth,
|
||||||
bfs: true
|
bfs: true
|
||||||
})
|
}})
|
||||||
YIELD nodes, relationships
|
YIELD nodes, relationships
|
||||||
WITH nodes, relationships, size(nodes) AS total_nodes
|
WITH nodes, relationships, size(nodes) AS total_nodes
|
||||||
UNWIND nodes AS node
|
UNWIND nodes AS node
|
||||||
WITH collect({node: node}) AS node_info, relationships, total_nodes
|
WITH collect({{node: node}}) AS node_info, relationships, total_nodes
|
||||||
RETURN node_info, relationships, total_nodes
|
RETURN node_info, relationships, total_nodes
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
@ -994,20 +1025,21 @@ class Neo4JStorage(BaseGraphStorage):
|
||||||
)
|
)
|
||||||
|
|
||||||
# Run limited query
|
# Run limited query
|
||||||
limited_query = """
|
limited_query = f"""
|
||||||
MATCH (start)
|
MATCH (start:`{workspace_label}`)
|
||||||
WHERE start.entity_id = $entity_id
|
WHERE start.entity_id = $entity_id
|
||||||
WITH start
|
WITH start
|
||||||
CALL apoc.path.subgraphAll(start, {
|
CALL apoc.path.subgraphAll(start, {{
|
||||||
relationshipFilter: '',
|
relationshipFilter: '',
|
||||||
|
labelFilter: '{workspace_label}',
|
||||||
minLevel: 0,
|
minLevel: 0,
|
||||||
maxLevel: $max_depth,
|
maxLevel: $max_depth,
|
||||||
limit: $max_nodes,
|
limit: $max_nodes,
|
||||||
bfs: true
|
bfs: true
|
||||||
})
|
}})
|
||||||
YIELD nodes, relationships
|
YIELD nodes, relationships
|
||||||
UNWIND nodes AS node
|
UNWIND nodes AS node
|
||||||
WITH collect({node: node}) AS node_info, relationships
|
WITH collect({{node: node}}) AS node_info, relationships
|
||||||
RETURN node_info, relationships
|
RETURN node_info, relationships
|
||||||
"""
|
"""
|
||||||
result_set = None
|
result_set = None
|
||||||
|
|
@ -1094,11 +1126,12 @@ class Neo4JStorage(BaseGraphStorage):
|
||||||
visited_edge_pairs = set()
|
visited_edge_pairs = set()
|
||||||
|
|
||||||
# Get the starting node's data
|
# Get the starting node's data
|
||||||
|
workspace_label = self._get_workspace_label()
|
||||||
async with self._driver.session(
|
async with self._driver.session(
|
||||||
database=self._DATABASE, default_access_mode="READ"
|
database=self._DATABASE, default_access_mode="READ"
|
||||||
) as session:
|
) as session:
|
||||||
query = """
|
query = f"""
|
||||||
MATCH (n:base {entity_id: $entity_id})
|
MATCH (n:`{workspace_label}` {{entity_id: $entity_id}})
|
||||||
RETURN id(n) as node_id, n
|
RETURN id(n) as node_id, n
|
||||||
"""
|
"""
|
||||||
node_result = await session.run(query, entity_id=node_label)
|
node_result = await session.run(query, entity_id=node_label)
|
||||||
|
|
@ -1156,8 +1189,9 @@ class Neo4JStorage(BaseGraphStorage):
|
||||||
async with self._driver.session(
|
async with self._driver.session(
|
||||||
database=self._DATABASE, default_access_mode="READ"
|
database=self._DATABASE, default_access_mode="READ"
|
||||||
) as session:
|
) as session:
|
||||||
query = """
|
workspace_label = self._get_workspace_label()
|
||||||
MATCH (a:base {entity_id: $entity_id})-[r]-(b)
|
query = f"""
|
||||||
|
MATCH (a:`{workspace_label}` {{entity_id: $entity_id}})-[r]-(b)
|
||||||
WITH r, b, id(r) as edge_id, id(b) as target_id
|
WITH r, b, id(r) as edge_id, id(b) as target_id
|
||||||
RETURN r, b, edge_id, target_id
|
RETURN r, b, edge_id, target_id
|
||||||
"""
|
"""
|
||||||
|
|
@ -1241,6 +1275,7 @@ class Neo4JStorage(BaseGraphStorage):
|
||||||
Returns:
|
Returns:
|
||||||
["Person", "Company", ...] # Alphabetically sorted label list
|
["Person", "Company", ...] # Alphabetically sorted label list
|
||||||
"""
|
"""
|
||||||
|
workspace_label = self._get_workspace_label()
|
||||||
async with self._driver.session(
|
async with self._driver.session(
|
||||||
database=self._DATABASE, default_access_mode="READ"
|
database=self._DATABASE, default_access_mode="READ"
|
||||||
) as session:
|
) as session:
|
||||||
|
|
@ -1248,8 +1283,8 @@ class Neo4JStorage(BaseGraphStorage):
|
||||||
# query = "CALL db.labels() YIELD label RETURN label"
|
# query = "CALL db.labels() YIELD label RETURN label"
|
||||||
|
|
||||||
# Method 2: Query compatible with older versions
|
# Method 2: Query compatible with older versions
|
||||||
query = """
|
query = f"""
|
||||||
MATCH (n:base)
|
MATCH (n:`{workspace_label}`)
|
||||||
WHERE n.entity_id IS NOT NULL
|
WHERE n.entity_id IS NOT NULL
|
||||||
RETURN DISTINCT n.entity_id AS label
|
RETURN DISTINCT n.entity_id AS label
|
||||||
ORDER BY label
|
ORDER BY label
|
||||||
|
|
@ -1285,8 +1320,9 @@ class Neo4JStorage(BaseGraphStorage):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
async def _do_delete(tx: AsyncManagedTransaction):
|
async def _do_delete(tx: AsyncManagedTransaction):
|
||||||
query = """
|
workspace_label = self._get_workspace_label()
|
||||||
MATCH (n:base {entity_id: $entity_id})
|
query = f"""
|
||||||
|
MATCH (n:`{workspace_label}` {{entity_id: $entity_id}})
|
||||||
DETACH DELETE n
|
DETACH DELETE n
|
||||||
"""
|
"""
|
||||||
result = await tx.run(query, entity_id=node_id)
|
result = await tx.run(query, entity_id=node_id)
|
||||||
|
|
@ -1342,8 +1378,9 @@ class Neo4JStorage(BaseGraphStorage):
|
||||||
for source, target in edges:
|
for source, target in edges:
|
||||||
|
|
||||||
async def _do_delete_edge(tx: AsyncManagedTransaction):
|
async def _do_delete_edge(tx: AsyncManagedTransaction):
|
||||||
query = """
|
workspace_label = self._get_workspace_label()
|
||||||
MATCH (source:base {entity_id: $source_entity_id})-[r]-(target:base {entity_id: $target_entity_id})
|
query = f"""
|
||||||
|
MATCH (source:`{workspace_label}` {{entity_id: $source_entity_id}})-[r]-(target:`{workspace_label}` {{entity_id: $target_entity_id}})
|
||||||
DELETE r
|
DELETE r
|
||||||
"""
|
"""
|
||||||
result = await tx.run(
|
result = await tx.run(
|
||||||
|
|
@ -1360,26 +1397,32 @@ class Neo4JStorage(BaseGraphStorage):
|
||||||
raise
|
raise
|
||||||
|
|
||||||
async def drop(self) -> dict[str, str]:
|
async def drop(self) -> dict[str, str]:
|
||||||
"""Drop all data from storage and clean up resources
|
"""Drop all data from current workspace storage and clean up resources
|
||||||
|
|
||||||
This method will delete all nodes and relationships in the Neo4j database.
|
This method will delete all nodes and relationships in the current workspace only.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
dict[str, str]: Operation status and message
|
dict[str, str]: Operation status and message
|
||||||
- On success: {"status": "success", "message": "data dropped"}
|
- On success: {"status": "success", "message": "workspace data dropped"}
|
||||||
- On failure: {"status": "error", "message": "<error details>"}
|
- On failure: {"status": "error", "message": "<error details>"}
|
||||||
"""
|
"""
|
||||||
|
workspace_label = self._get_workspace_label()
|
||||||
try:
|
try:
|
||||||
async with self._driver.session(database=self._DATABASE) as session:
|
async with self._driver.session(database=self._DATABASE) as session:
|
||||||
# Delete all nodes and relationships
|
# Delete all nodes and relationships in current workspace only
|
||||||
query = "MATCH (n) DETACH DELETE n"
|
query = f"MATCH (n:`{workspace_label}`) DETACH DELETE n"
|
||||||
result = await session.run(query)
|
result = await session.run(query)
|
||||||
await result.consume() # Ensure result is fully consumed
|
await result.consume() # Ensure result is fully consumed
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Process {os.getpid()} drop Neo4j database {self._DATABASE}"
|
f"Process {os.getpid()} drop Neo4j workspace '{workspace_label}' in database {self._DATABASE}"
|
||||||
)
|
)
|
||||||
return {"status": "success", "message": "data dropped"}
|
return {
|
||||||
|
"status": "success",
|
||||||
|
"message": f"workspace '{workspace_label}' data dropped",
|
||||||
|
}
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error dropping Neo4j database {self._DATABASE}: {e}")
|
logger.error(
|
||||||
|
f"Error dropping Neo4j workspace '{workspace_label}' in database {self._DATABASE}: {e}"
|
||||||
|
)
|
||||||
return {"status": "error", "message": str(e)}
|
return {"status": "error", "message": str(e)}
|
||||||
|
|
|
||||||
|
|
@ -46,9 +46,19 @@ class NetworkXStorage(BaseGraphStorage):
|
||||||
nx.write_graphml(graph, file_name)
|
nx.write_graphml(graph, file_name)
|
||||||
|
|
||||||
def __post_init__(self):
|
def __post_init__(self):
|
||||||
self._graphml_xml_file = os.path.join(
|
working_dir = self.global_config["working_dir"]
|
||||||
self.global_config["working_dir"], f"graph_{self.namespace}.graphml"
|
if self.workspace:
|
||||||
)
|
# Include workspace in the file path for data isolation
|
||||||
|
workspace_dir = os.path.join(working_dir, self.workspace)
|
||||||
|
os.makedirs(workspace_dir, exist_ok=True)
|
||||||
|
self._graphml_xml_file = os.path.join(
|
||||||
|
workspace_dir, f"graph_{self.namespace}.graphml"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Default behavior when workspace is empty
|
||||||
|
self._graphml_xml_file = os.path.join(
|
||||||
|
working_dir, f"graph_{self.namespace}.graphml"
|
||||||
|
)
|
||||||
self._storage_lock = None
|
self._storage_lock = None
|
||||||
self.storage_updated = None
|
self.storage_updated = None
|
||||||
self._graph = None
|
self._graph = None
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
import asyncio
|
import asyncio
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
import datetime
|
import datetime
|
||||||
from datetime import timezone
|
from datetime import timezone
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
|
|
@ -319,7 +320,7 @@ class PostgreSQLDB:
|
||||||
# Get all old format data
|
# Get all old format data
|
||||||
old_data_sql = """
|
old_data_sql = """
|
||||||
SELECT id, mode, original_prompt, return_value, chunk_id,
|
SELECT id, mode, original_prompt, return_value, chunk_id,
|
||||||
create_time, update_time
|
workspace, create_time, update_time
|
||||||
FROM LIGHTRAG_LLM_CACHE
|
FROM LIGHTRAG_LLM_CACHE
|
||||||
WHERE id NOT LIKE '%:%'
|
WHERE id NOT LIKE '%:%'
|
||||||
"""
|
"""
|
||||||
|
|
@ -364,7 +365,9 @@ class PostgreSQLDB:
|
||||||
await self.execute(
|
await self.execute(
|
||||||
insert_sql,
|
insert_sql,
|
||||||
{
|
{
|
||||||
"workspace": self.workspace,
|
"workspace": record[
|
||||||
|
"workspace"
|
||||||
|
], # Use original record's workspace
|
||||||
"id": new_key,
|
"id": new_key,
|
||||||
"mode": record["mode"],
|
"mode": record["mode"],
|
||||||
"original_prompt": record["original_prompt"],
|
"original_prompt": record["original_prompt"],
|
||||||
|
|
@ -384,7 +387,9 @@ class PostgreSQLDB:
|
||||||
await self.execute(
|
await self.execute(
|
||||||
delete_sql,
|
delete_sql,
|
||||||
{
|
{
|
||||||
"workspace": self.workspace,
|
"workspace": record[
|
||||||
|
"workspace"
|
||||||
|
], # Use original record's workspace
|
||||||
"mode": record["mode"],
|
"mode": record["mode"],
|
||||||
"id": record["id"], # Old id
|
"id": record["id"], # Old id
|
||||||
},
|
},
|
||||||
|
|
@ -505,6 +510,29 @@ class PostgreSQLDB:
|
||||||
f"PostgreSQL, Failed to create index on table {k}, Got: {e}"
|
f"PostgreSQL, Failed to create index on table {k}, Got: {e}"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Create composite index for (workspace, id) columns in each table
|
||||||
|
try:
|
||||||
|
composite_index_name = f"idx_{k.lower()}_workspace_id"
|
||||||
|
check_composite_index_sql = f"""
|
||||||
|
SELECT 1 FROM pg_indexes
|
||||||
|
WHERE indexname = '{composite_index_name}'
|
||||||
|
AND tablename = '{k.lower()}'
|
||||||
|
"""
|
||||||
|
composite_index_exists = await self.query(check_composite_index_sql)
|
||||||
|
|
||||||
|
if not composite_index_exists:
|
||||||
|
create_composite_index_sql = (
|
||||||
|
f"CREATE INDEX {composite_index_name} ON {k}(workspace, id)"
|
||||||
|
)
|
||||||
|
logger.info(
|
||||||
|
f"PostgreSQL, Creating composite index {composite_index_name} on table {k}"
|
||||||
|
)
|
||||||
|
await self.execute(create_composite_index_sql)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(
|
||||||
|
f"PostgreSQL, Failed to create composite index on table {k}, Got: {e}"
|
||||||
|
)
|
||||||
|
|
||||||
# After all tables are created, attempt to migrate timestamp fields
|
# After all tables are created, attempt to migrate timestamp fields
|
||||||
try:
|
try:
|
||||||
await self._migrate_timestamp_columns()
|
await self._migrate_timestamp_columns()
|
||||||
|
|
@ -670,7 +698,7 @@ class ClientManager:
|
||||||
),
|
),
|
||||||
"workspace": os.environ.get(
|
"workspace": os.environ.get(
|
||||||
"POSTGRES_WORKSPACE",
|
"POSTGRES_WORKSPACE",
|
||||||
config.get("postgres", "workspace", fallback="default"),
|
config.get("postgres", "workspace", fallback=None),
|
||||||
),
|
),
|
||||||
"max_connections": os.environ.get(
|
"max_connections": os.environ.get(
|
||||||
"POSTGRES_MAX_CONNECTIONS",
|
"POSTGRES_MAX_CONNECTIONS",
|
||||||
|
|
@ -716,6 +744,18 @@ class PGKVStorage(BaseKVStorage):
|
||||||
async def initialize(self):
|
async def initialize(self):
|
||||||
if self.db is None:
|
if self.db is None:
|
||||||
self.db = await ClientManager.get_client()
|
self.db = await ClientManager.get_client()
|
||||||
|
# Implement workspace priority: PostgreSQLDB.workspace > self.workspace > "default"
|
||||||
|
if self.db.workspace:
|
||||||
|
# Use PostgreSQLDB's workspace (highest priority)
|
||||||
|
final_workspace = self.db.workspace
|
||||||
|
elif hasattr(self, "workspace") and self.workspace:
|
||||||
|
# Use storage class's workspace (medium priority)
|
||||||
|
final_workspace = self.workspace
|
||||||
|
self.db.workspace = final_workspace
|
||||||
|
else:
|
||||||
|
# Use "default" for compatibility (lowest priority)
|
||||||
|
final_workspace = "default"
|
||||||
|
self.db.workspace = final_workspace
|
||||||
|
|
||||||
async def finalize(self):
|
async def finalize(self):
|
||||||
if self.db is not None:
|
if self.db is not None:
|
||||||
|
|
@ -1047,6 +1087,18 @@ class PGVectorStorage(BaseVectorStorage):
|
||||||
async def initialize(self):
|
async def initialize(self):
|
||||||
if self.db is None:
|
if self.db is None:
|
||||||
self.db = await ClientManager.get_client()
|
self.db = await ClientManager.get_client()
|
||||||
|
# Implement workspace priority: PostgreSQLDB.workspace > self.workspace > "default"
|
||||||
|
if self.db.workspace:
|
||||||
|
# Use PostgreSQLDB's workspace (highest priority)
|
||||||
|
final_workspace = self.db.workspace
|
||||||
|
elif hasattr(self, "workspace") and self.workspace:
|
||||||
|
# Use storage class's workspace (medium priority)
|
||||||
|
final_workspace = self.workspace
|
||||||
|
self.db.workspace = final_workspace
|
||||||
|
else:
|
||||||
|
# Use "default" for compatibility (lowest priority)
|
||||||
|
final_workspace = "default"
|
||||||
|
self.db.workspace = final_workspace
|
||||||
|
|
||||||
async def finalize(self):
|
async def finalize(self):
|
||||||
if self.db is not None:
|
if self.db is not None:
|
||||||
|
|
@ -1328,6 +1380,18 @@ class PGDocStatusStorage(DocStatusStorage):
|
||||||
async def initialize(self):
|
async def initialize(self):
|
||||||
if self.db is None:
|
if self.db is None:
|
||||||
self.db = await ClientManager.get_client()
|
self.db = await ClientManager.get_client()
|
||||||
|
# Implement workspace priority: PostgreSQLDB.workspace > self.workspace > "default"
|
||||||
|
if self.db.workspace:
|
||||||
|
# Use PostgreSQLDB's workspace (highest priority)
|
||||||
|
final_workspace = self.db.workspace
|
||||||
|
elif hasattr(self, "workspace") and self.workspace:
|
||||||
|
# Use storage class's workspace (medium priority)
|
||||||
|
final_workspace = self.workspace
|
||||||
|
self.db.workspace = final_workspace
|
||||||
|
else:
|
||||||
|
# Use "default" for compatibility (lowest priority)
|
||||||
|
final_workspace = "default"
|
||||||
|
self.db.workspace = final_workspace
|
||||||
|
|
||||||
async def finalize(self):
|
async def finalize(self):
|
||||||
if self.db is not None:
|
if self.db is not None:
|
||||||
|
|
@ -1606,9 +1670,34 @@ class PGGraphQueryException(Exception):
|
||||||
@dataclass
|
@dataclass
|
||||||
class PGGraphStorage(BaseGraphStorage):
|
class PGGraphStorage(BaseGraphStorage):
|
||||||
def __post_init__(self):
|
def __post_init__(self):
|
||||||
self.graph_name = self.namespace or os.environ.get("AGE_GRAPH_NAME", "lightrag")
|
# Graph name will be dynamically generated in initialize() based on workspace
|
||||||
self.db: PostgreSQLDB | None = None
|
self.db: PostgreSQLDB | None = None
|
||||||
|
|
||||||
|
def _get_workspace_graph_name(self) -> str:
|
||||||
|
"""
|
||||||
|
Generate graph name based on workspace and namespace for data isolation.
|
||||||
|
Rules:
|
||||||
|
- If workspace is empty: graph_name = namespace
|
||||||
|
- If workspace has value: graph_name = workspace_namespace
|
||||||
|
|
||||||
|
Args:
|
||||||
|
None
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str: The graph name for the current workspace
|
||||||
|
"""
|
||||||
|
workspace = getattr(self, "workspace", None)
|
||||||
|
namespace = self.namespace or os.environ.get("AGE_GRAPH_NAME", "lightrag")
|
||||||
|
|
||||||
|
if workspace and workspace.strip():
|
||||||
|
# Ensure names comply with PostgreSQL identifier specifications
|
||||||
|
safe_workspace = re.sub(r"[^a-zA-Z0-9_]", "_", workspace.strip())
|
||||||
|
safe_namespace = re.sub(r"[^a-zA-Z0-9_]", "_", namespace)
|
||||||
|
return f"{safe_workspace}_{safe_namespace}"
|
||||||
|
else:
|
||||||
|
# When workspace is empty, use namespace directly
|
||||||
|
return re.sub(r"[^a-zA-Z0-9_]", "_", namespace)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _normalize_node_id(node_id: str) -> str:
|
def _normalize_node_id(node_id: str) -> str:
|
||||||
"""
|
"""
|
||||||
|
|
@ -1629,6 +1718,27 @@ class PGGraphStorage(BaseGraphStorage):
|
||||||
async def initialize(self):
|
async def initialize(self):
|
||||||
if self.db is None:
|
if self.db is None:
|
||||||
self.db = await ClientManager.get_client()
|
self.db = await ClientManager.get_client()
|
||||||
|
# Implement workspace priority: PostgreSQLDB.workspace > self.workspace > None
|
||||||
|
if self.db.workspace:
|
||||||
|
# Use PostgreSQLDB's workspace (highest priority)
|
||||||
|
final_workspace = self.db.workspace
|
||||||
|
elif hasattr(self, "workspace") and self.workspace:
|
||||||
|
# Use storage class's workspace (medium priority)
|
||||||
|
final_workspace = self.workspace
|
||||||
|
self.db.workspace = final_workspace
|
||||||
|
else:
|
||||||
|
# Use None for compatibility (lowest priority)
|
||||||
|
final_workspace = None
|
||||||
|
self.db.workspace = final_workspace
|
||||||
|
|
||||||
|
# Dynamically generate graph name based on workspace
|
||||||
|
self.workspace = self.db.workspace
|
||||||
|
self.graph_name = self._get_workspace_graph_name()
|
||||||
|
|
||||||
|
# Log the graph initialization for debugging
|
||||||
|
logger.info(
|
||||||
|
f"PostgreSQL Graph initialized: workspace='{self.workspace}', graph_name='{self.graph_name}'"
|
||||||
|
)
|
||||||
|
|
||||||
# Execute each statement separately and ignore errors
|
# Execute each statement separately and ignore errors
|
||||||
queries = [
|
queries = [
|
||||||
|
|
@ -2833,7 +2943,10 @@ class PGGraphStorage(BaseGraphStorage):
|
||||||
$$) AS (result agtype)"""
|
$$) AS (result agtype)"""
|
||||||
|
|
||||||
await self._query(drop_query, readonly=False)
|
await self._query(drop_query, readonly=False)
|
||||||
return {"status": "success", "message": "graph data dropped"}
|
return {
|
||||||
|
"status": "success",
|
||||||
|
"message": f"workspace '{self.workspace}' graph data dropped",
|
||||||
|
}
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error dropping graph: {e}")
|
logger.error(f"Error dropping graph: {e}")
|
||||||
return {"status": "error", "message": str(e)}
|
return {"status": "error", "message": str(e)}
|
||||||
|
|
|
||||||
|
|
@ -50,6 +50,18 @@ def compute_mdhash_id_for_qdrant(
|
||||||
@final
|
@final
|
||||||
@dataclass
|
@dataclass
|
||||||
class QdrantVectorDBStorage(BaseVectorStorage):
|
class QdrantVectorDBStorage(BaseVectorStorage):
|
||||||
|
def __init__(
|
||||||
|
self, namespace, global_config, embedding_func, workspace=None, meta_fields=None
|
||||||
|
):
|
||||||
|
super().__init__(
|
||||||
|
namespace=namespace,
|
||||||
|
workspace=workspace or "",
|
||||||
|
global_config=global_config,
|
||||||
|
embedding_func=embedding_func,
|
||||||
|
meta_fields=meta_fields or set(),
|
||||||
|
)
|
||||||
|
self.__post_init__()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def create_collection_if_not_exist(
|
def create_collection_if_not_exist(
|
||||||
client: QdrantClient, collection_name: str, **kwargs
|
client: QdrantClient, collection_name: str, **kwargs
|
||||||
|
|
@ -59,6 +71,29 @@ class QdrantVectorDBStorage(BaseVectorStorage):
|
||||||
client.create_collection(collection_name, **kwargs)
|
client.create_collection(collection_name, **kwargs)
|
||||||
|
|
||||||
def __post_init__(self):
|
def __post_init__(self):
|
||||||
|
# Check for QDRANT_WORKSPACE environment variable first (higher priority)
|
||||||
|
# This allows administrators to force a specific workspace for all Qdrant storage instances
|
||||||
|
qdrant_workspace = os.environ.get("QDRANT_WORKSPACE")
|
||||||
|
if qdrant_workspace and qdrant_workspace.strip():
|
||||||
|
# Use environment variable value, overriding the passed workspace parameter
|
||||||
|
effective_workspace = qdrant_workspace.strip()
|
||||||
|
logger.info(
|
||||||
|
f"Using QDRANT_WORKSPACE environment variable: '{effective_workspace}' (overriding passed workspace: '{self.workspace}')"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Use the workspace parameter passed during initialization
|
||||||
|
effective_workspace = self.workspace
|
||||||
|
if effective_workspace:
|
||||||
|
logger.debug(
|
||||||
|
f"Using passed workspace parameter: '{effective_workspace}'"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Build namespace with workspace prefix for data isolation
|
||||||
|
if effective_workspace:
|
||||||
|
self.namespace = f"{effective_workspace}_{self.namespace}"
|
||||||
|
logger.debug(f"Final namespace with workspace prefix: '{self.namespace}'")
|
||||||
|
# When workspace is empty, keep the original namespace unchanged
|
||||||
|
|
||||||
kwargs = self.global_config.get("vector_db_storage_cls_kwargs", {})
|
kwargs = self.global_config.get("vector_db_storage_cls_kwargs", {})
|
||||||
cosine_threshold = kwargs.get("cosine_better_than_threshold")
|
cosine_threshold = kwargs.get("cosine_better_than_threshold")
|
||||||
if cosine_threshold is None:
|
if cosine_threshold is None:
|
||||||
|
|
|
||||||
|
|
@ -71,6 +71,29 @@ class RedisConnectionManager:
|
||||||
@dataclass
|
@dataclass
|
||||||
class RedisKVStorage(BaseKVStorage):
|
class RedisKVStorage(BaseKVStorage):
|
||||||
def __post_init__(self):
|
def __post_init__(self):
|
||||||
|
# Check for REDIS_WORKSPACE environment variable first (higher priority)
|
||||||
|
# This allows administrators to force a specific workspace for all Redis storage instances
|
||||||
|
redis_workspace = os.environ.get("REDIS_WORKSPACE")
|
||||||
|
if redis_workspace and redis_workspace.strip():
|
||||||
|
# Use environment variable value, overriding the passed workspace parameter
|
||||||
|
effective_workspace = redis_workspace.strip()
|
||||||
|
logger.info(
|
||||||
|
f"Using REDIS_WORKSPACE environment variable: '{effective_workspace}' (overriding passed workspace: '{self.workspace}')"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Use the workspace parameter passed during initialization
|
||||||
|
effective_workspace = self.workspace
|
||||||
|
if effective_workspace:
|
||||||
|
logger.debug(
|
||||||
|
f"Using passed workspace parameter: '{effective_workspace}'"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Build namespace with workspace prefix for data isolation
|
||||||
|
if effective_workspace:
|
||||||
|
self.namespace = f"{effective_workspace}_{self.namespace}"
|
||||||
|
logger.debug(f"Final namespace with workspace prefix: '{self.namespace}'")
|
||||||
|
# When workspace is empty, keep the original namespace unchanged
|
||||||
|
|
||||||
redis_url = os.environ.get(
|
redis_url = os.environ.get(
|
||||||
"REDIS_URI", config.get("redis", "uri", fallback="redis://localhost:6379")
|
"REDIS_URI", config.get("redis", "uri", fallback="redis://localhost:6379")
|
||||||
)
|
)
|
||||||
|
|
@ -461,6 +484,29 @@ class RedisDocStatusStorage(DocStatusStorage):
|
||||||
"""Redis implementation of document status storage"""
|
"""Redis implementation of document status storage"""
|
||||||
|
|
||||||
def __post_init__(self):
|
def __post_init__(self):
|
||||||
|
# Check for REDIS_WORKSPACE environment variable first (higher priority)
|
||||||
|
# This allows administrators to force a specific workspace for all Redis storage instances
|
||||||
|
redis_workspace = os.environ.get("REDIS_WORKSPACE")
|
||||||
|
if redis_workspace and redis_workspace.strip():
|
||||||
|
# Use environment variable value, overriding the passed workspace parameter
|
||||||
|
effective_workspace = redis_workspace.strip()
|
||||||
|
logger.info(
|
||||||
|
f"Using REDIS_WORKSPACE environment variable: '{effective_workspace}' (overriding passed workspace: '{self.workspace}')"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Use the workspace parameter passed during initialization
|
||||||
|
effective_workspace = self.workspace
|
||||||
|
if effective_workspace:
|
||||||
|
logger.debug(
|
||||||
|
f"Using passed workspace parameter: '{effective_workspace}'"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Build namespace with workspace prefix for data isolation
|
||||||
|
if effective_workspace:
|
||||||
|
self.namespace = f"{effective_workspace}_{self.namespace}"
|
||||||
|
logger.debug(f"Final namespace with workspace prefix: '{self.namespace}'")
|
||||||
|
# When workspace is empty, keep the original namespace unchanged
|
||||||
|
|
||||||
redis_url = os.environ.get(
|
redis_url = os.environ.get(
|
||||||
"REDIS_URI", config.get("redis", "uri", fallback="redis://localhost:6379")
|
"REDIS_URI", config.get("redis", "uri", fallback="redis://localhost:6379")
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -51,7 +51,7 @@ from .base import (
|
||||||
StoragesStatus,
|
StoragesStatus,
|
||||||
DeletionResult,
|
DeletionResult,
|
||||||
)
|
)
|
||||||
from .namespace import NameSpace, make_namespace
|
from .namespace import NameSpace
|
||||||
from .operate import (
|
from .operate import (
|
||||||
chunking_by_token_size,
|
chunking_by_token_size,
|
||||||
extract_entities,
|
extract_entities,
|
||||||
|
|
@ -97,9 +97,7 @@ class LightRAG:
|
||||||
# Directory
|
# Directory
|
||||||
# ---
|
# ---
|
||||||
|
|
||||||
working_dir: str = field(
|
working_dir: str = field(default="./rag_storage")
|
||||||
default=f"./lightrag_cache_{datetime.now().strftime('%Y-%m-%d-%H:%M:%S')}"
|
|
||||||
)
|
|
||||||
"""Directory where cache and temporary files are stored."""
|
"""Directory where cache and temporary files are stored."""
|
||||||
|
|
||||||
# Storage
|
# Storage
|
||||||
|
|
@ -117,6 +115,12 @@ class LightRAG:
|
||||||
doc_status_storage: str = field(default="JsonDocStatusStorage")
|
doc_status_storage: str = field(default="JsonDocStatusStorage")
|
||||||
"""Storage type for tracking document processing statuses."""
|
"""Storage type for tracking document processing statuses."""
|
||||||
|
|
||||||
|
# Workspace
|
||||||
|
# ---
|
||||||
|
|
||||||
|
workspace: str = field(default_factory=lambda: os.getenv("WORKSPACE", ""))
|
||||||
|
"""Workspace for data isolation. Defaults to empty string if WORKSPACE environment variable is not set."""
|
||||||
|
|
||||||
# Logging (Deprecated, use setup_logger in utils.py instead)
|
# Logging (Deprecated, use setup_logger in utils.py instead)
|
||||||
# ---
|
# ---
|
||||||
log_level: int | None = field(default=None)
|
log_level: int | None = field(default=None)
|
||||||
|
|
@ -242,10 +246,6 @@ class LightRAG:
|
||||||
vector_db_storage_cls_kwargs: dict[str, Any] = field(default_factory=dict)
|
vector_db_storage_cls_kwargs: dict[str, Any] = field(default_factory=dict)
|
||||||
"""Additional parameters for vector database storage."""
|
"""Additional parameters for vector database storage."""
|
||||||
|
|
||||||
# TODO:deprecated, remove in the future, use WORKSPACE instead
|
|
||||||
namespace_prefix: str = field(default="")
|
|
||||||
"""Prefix for namespacing stored data across different environments."""
|
|
||||||
|
|
||||||
enable_llm_cache: bool = field(default=True)
|
enable_llm_cache: bool = field(default=True)
|
||||||
"""Enables caching for LLM responses to avoid redundant computations."""
|
"""Enables caching for LLM responses to avoid redundant computations."""
|
||||||
|
|
||||||
|
|
@ -382,61 +382,53 @@ class LightRAG:
|
||||||
self.doc_status_storage_cls = self._get_storage_class(self.doc_status_storage)
|
self.doc_status_storage_cls = self._get_storage_class(self.doc_status_storage)
|
||||||
|
|
||||||
self.llm_response_cache: BaseKVStorage = self.key_string_value_json_storage_cls( # type: ignore
|
self.llm_response_cache: BaseKVStorage = self.key_string_value_json_storage_cls( # type: ignore
|
||||||
namespace=make_namespace(
|
namespace=NameSpace.KV_STORE_LLM_RESPONSE_CACHE,
|
||||||
self.namespace_prefix, NameSpace.KV_STORE_LLM_RESPONSE_CACHE
|
workspace=self.workspace,
|
||||||
),
|
global_config=global_config,
|
||||||
global_config=asdict(
|
|
||||||
self
|
|
||||||
), # Add global_config to ensure cache works properly
|
|
||||||
embedding_func=self.embedding_func,
|
embedding_func=self.embedding_func,
|
||||||
)
|
)
|
||||||
|
|
||||||
self.full_docs: BaseKVStorage = self.key_string_value_json_storage_cls( # type: ignore
|
self.full_docs: BaseKVStorage = self.key_string_value_json_storage_cls( # type: ignore
|
||||||
namespace=make_namespace(
|
namespace=NameSpace.KV_STORE_FULL_DOCS,
|
||||||
self.namespace_prefix, NameSpace.KV_STORE_FULL_DOCS
|
workspace=self.workspace,
|
||||||
),
|
|
||||||
embedding_func=self.embedding_func,
|
embedding_func=self.embedding_func,
|
||||||
)
|
)
|
||||||
|
|
||||||
self.text_chunks: BaseKVStorage = self.key_string_value_json_storage_cls( # type: ignore
|
self.text_chunks: BaseKVStorage = self.key_string_value_json_storage_cls( # type: ignore
|
||||||
namespace=make_namespace(
|
namespace=NameSpace.KV_STORE_TEXT_CHUNKS,
|
||||||
self.namespace_prefix, NameSpace.KV_STORE_TEXT_CHUNKS
|
workspace=self.workspace,
|
||||||
),
|
|
||||||
embedding_func=self.embedding_func,
|
embedding_func=self.embedding_func,
|
||||||
)
|
)
|
||||||
|
|
||||||
self.chunk_entity_relation_graph: BaseGraphStorage = self.graph_storage_cls( # type: ignore
|
self.chunk_entity_relation_graph: BaseGraphStorage = self.graph_storage_cls( # type: ignore
|
||||||
namespace=make_namespace(
|
namespace=NameSpace.GRAPH_STORE_CHUNK_ENTITY_RELATION,
|
||||||
self.namespace_prefix, NameSpace.GRAPH_STORE_CHUNK_ENTITY_RELATION
|
workspace=self.workspace,
|
||||||
),
|
|
||||||
embedding_func=self.embedding_func,
|
embedding_func=self.embedding_func,
|
||||||
)
|
)
|
||||||
|
|
||||||
self.entities_vdb: BaseVectorStorage = self.vector_db_storage_cls( # type: ignore
|
self.entities_vdb: BaseVectorStorage = self.vector_db_storage_cls( # type: ignore
|
||||||
namespace=make_namespace(
|
namespace=NameSpace.VECTOR_STORE_ENTITIES,
|
||||||
self.namespace_prefix, NameSpace.VECTOR_STORE_ENTITIES
|
workspace=self.workspace,
|
||||||
),
|
|
||||||
embedding_func=self.embedding_func,
|
embedding_func=self.embedding_func,
|
||||||
meta_fields={"entity_name", "source_id", "content", "file_path"},
|
meta_fields={"entity_name", "source_id", "content", "file_path"},
|
||||||
)
|
)
|
||||||
self.relationships_vdb: BaseVectorStorage = self.vector_db_storage_cls( # type: ignore
|
self.relationships_vdb: BaseVectorStorage = self.vector_db_storage_cls( # type: ignore
|
||||||
namespace=make_namespace(
|
namespace=NameSpace.VECTOR_STORE_RELATIONSHIPS,
|
||||||
self.namespace_prefix, NameSpace.VECTOR_STORE_RELATIONSHIPS
|
workspace=self.workspace,
|
||||||
),
|
|
||||||
embedding_func=self.embedding_func,
|
embedding_func=self.embedding_func,
|
||||||
meta_fields={"src_id", "tgt_id", "source_id", "content", "file_path"},
|
meta_fields={"src_id", "tgt_id", "source_id", "content", "file_path"},
|
||||||
)
|
)
|
||||||
self.chunks_vdb: BaseVectorStorage = self.vector_db_storage_cls( # type: ignore
|
self.chunks_vdb: BaseVectorStorage = self.vector_db_storage_cls( # type: ignore
|
||||||
namespace=make_namespace(
|
namespace=NameSpace.VECTOR_STORE_CHUNKS,
|
||||||
self.namespace_prefix, NameSpace.VECTOR_STORE_CHUNKS
|
workspace=self.workspace,
|
||||||
),
|
|
||||||
embedding_func=self.embedding_func,
|
embedding_func=self.embedding_func,
|
||||||
meta_fields={"full_doc_id", "content", "file_path"},
|
meta_fields={"full_doc_id", "content", "file_path"},
|
||||||
)
|
)
|
||||||
|
|
||||||
# Initialize document status storage
|
# Initialize document status storage
|
||||||
self.doc_status: DocStatusStorage = self.doc_status_storage_cls(
|
self.doc_status: DocStatusStorage = self.doc_status_storage_cls(
|
||||||
namespace=make_namespace(self.namespace_prefix, NameSpace.DOC_STATUS),
|
namespace=NameSpace.DOC_STATUS,
|
||||||
|
workspace=self.workspace,
|
||||||
global_config=global_config,
|
global_config=global_config,
|
||||||
embedding_func=None,
|
embedding_func=None,
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -17,10 +17,6 @@ class NameSpace:
|
||||||
DOC_STATUS = "doc_status"
|
DOC_STATUS = "doc_status"
|
||||||
|
|
||||||
|
|
||||||
def make_namespace(prefix: str, base_namespace: str):
|
|
||||||
return prefix + base_namespace
|
|
||||||
|
|
||||||
|
|
||||||
def is_namespace(namespace: str, base_namespace: str | Iterable[str]):
|
def is_namespace(namespace: str, base_namespace: str | Iterable[str]):
|
||||||
if isinstance(base_namespace, str):
|
if isinstance(base_namespace, str):
|
||||||
return namespace.endswith(base_namespace)
|
return namespace.endswith(base_namespace)
|
||||||
|
|
|
||||||
|
|
@ -40,6 +40,8 @@ export type LightragStatus = {
|
||||||
doc_status_storage: string
|
doc_status_storage: string
|
||||||
graph_storage: string
|
graph_storage: string
|
||||||
vector_storage: string
|
vector_storage: string
|
||||||
|
workspace?: string
|
||||||
|
max_graph_nodes?: string
|
||||||
}
|
}
|
||||||
update_status?: Record<string, any>
|
update_status?: Record<string, any>
|
||||||
core_version?: string
|
core_version?: string
|
||||||
|
|
|
||||||
|
|
@ -56,6 +56,10 @@ const StatusCard = ({ status }: { status: LightragStatus | null }) => {
|
||||||
<span>{status.configuration.graph_storage}</span>
|
<span>{status.configuration.graph_storage}</span>
|
||||||
<span>{t('graphPanel.statusCard.vectorStorage')}:</span>
|
<span>{t('graphPanel.statusCard.vectorStorage')}:</span>
|
||||||
<span>{status.configuration.vector_storage}</span>
|
<span>{status.configuration.vector_storage}</span>
|
||||||
|
<span>{t('graphPanel.statusCard.workspace')}:</span>
|
||||||
|
<span>{status.configuration.workspace || '-'}</span>
|
||||||
|
<span>{t('graphPanel.statusCard.maxGraphNodes')}:</span>
|
||||||
|
<span>{status.configuration.max_graph_nodes || '-'}</span>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
|
||||||
|
|
@ -263,7 +263,9 @@
|
||||||
"kvStorage": "تخزين المفتاح-القيمة",
|
"kvStorage": "تخزين المفتاح-القيمة",
|
||||||
"docStatusStorage": "تخزين حالة المستند",
|
"docStatusStorage": "تخزين حالة المستند",
|
||||||
"graphStorage": "تخزين الرسم البياني",
|
"graphStorage": "تخزين الرسم البياني",
|
||||||
"vectorStorage": "تخزين المتجهات"
|
"vectorStorage": "تخزين المتجهات",
|
||||||
|
"workspace": "مساحة العمل",
|
||||||
|
"maxGraphNodes": "الحد الأقصى لعقد الرسم البياني"
|
||||||
},
|
},
|
||||||
"propertiesView": {
|
"propertiesView": {
|
||||||
"editProperty": "تعديل {{property}}",
|
"editProperty": "تعديل {{property}}",
|
||||||
|
|
|
||||||
|
|
@ -263,7 +263,9 @@
|
||||||
"kvStorage": "KV Storage",
|
"kvStorage": "KV Storage",
|
||||||
"docStatusStorage": "Doc Status Storage",
|
"docStatusStorage": "Doc Status Storage",
|
||||||
"graphStorage": "Graph Storage",
|
"graphStorage": "Graph Storage",
|
||||||
"vectorStorage": "Vector Storage"
|
"vectorStorage": "Vector Storage",
|
||||||
|
"workspace": "Workspace",
|
||||||
|
"maxGraphNodes": "Max Graph Nodes"
|
||||||
},
|
},
|
||||||
"propertiesView": {
|
"propertiesView": {
|
||||||
"editProperty": "Edit {{property}}",
|
"editProperty": "Edit {{property}}",
|
||||||
|
|
|
||||||
|
|
@ -263,7 +263,9 @@
|
||||||
"kvStorage": "Stockage clé-valeur",
|
"kvStorage": "Stockage clé-valeur",
|
||||||
"docStatusStorage": "Stockage de l'état des documents",
|
"docStatusStorage": "Stockage de l'état des documents",
|
||||||
"graphStorage": "Stockage du graphe",
|
"graphStorage": "Stockage du graphe",
|
||||||
"vectorStorage": "Stockage vectoriel"
|
"vectorStorage": "Stockage vectoriel",
|
||||||
|
"workspace": "Espace de travail",
|
||||||
|
"maxGraphNodes": "Nombre maximum de nœuds du graphe"
|
||||||
},
|
},
|
||||||
"propertiesView": {
|
"propertiesView": {
|
||||||
"editProperty": "Modifier {{property}}",
|
"editProperty": "Modifier {{property}}",
|
||||||
|
|
|
||||||
|
|
@ -263,7 +263,9 @@
|
||||||
"kvStorage": "KV存储",
|
"kvStorage": "KV存储",
|
||||||
"docStatusStorage": "文档状态存储",
|
"docStatusStorage": "文档状态存储",
|
||||||
"graphStorage": "图存储",
|
"graphStorage": "图存储",
|
||||||
"vectorStorage": "向量存储"
|
"vectorStorage": "向量存储",
|
||||||
|
"workspace": "工作空间",
|
||||||
|
"maxGraphNodes": "最大图节点数"
|
||||||
},
|
},
|
||||||
"propertiesView": {
|
"propertiesView": {
|
||||||
"editProperty": "编辑{{property}}",
|
"editProperty": "编辑{{property}}",
|
||||||
|
|
|
||||||
|
|
@ -263,7 +263,9 @@
|
||||||
"kvStorage": "KV 儲存",
|
"kvStorage": "KV 儲存",
|
||||||
"docStatusStorage": "文件狀態儲存",
|
"docStatusStorage": "文件狀態儲存",
|
||||||
"graphStorage": "圖形儲存",
|
"graphStorage": "圖形儲存",
|
||||||
"vectorStorage": "向量儲存"
|
"vectorStorage": "向量儲存",
|
||||||
|
"workspace": "工作空間",
|
||||||
|
"maxGraphNodes": "最大圖形節點數"
|
||||||
},
|
},
|
||||||
"propertiesView": {
|
"propertiesView": {
|
||||||
"editProperty": "編輯{{property}}",
|
"editProperty": "編輯{{property}}",
|
||||||
|
|
|
||||||
|
|
@ -28,9 +28,10 @@ def run_queries_and_save_to_json(
|
||||||
):
|
):
|
||||||
loop = always_get_an_event_loop()
|
loop = always_get_an_event_loop()
|
||||||
|
|
||||||
with open(output_file, "a", encoding="utf-8") as result_file, open(
|
with (
|
||||||
error_file, "a", encoding="utf-8"
|
open(output_file, "a", encoding="utf-8") as result_file,
|
||||||
) as err_file:
|
open(error_file, "a", encoding="utf-8") as err_file,
|
||||||
|
):
|
||||||
result_file.write("[\n")
|
result_file.write("[\n")
|
||||||
first_entry = True
|
first_entry = True
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -59,9 +59,10 @@ def run_queries_and_save_to_json(
|
||||||
):
|
):
|
||||||
loop = always_get_an_event_loop()
|
loop = always_get_an_event_loop()
|
||||||
|
|
||||||
with open(output_file, "a", encoding="utf-8") as result_file, open(
|
with (
|
||||||
error_file, "a", encoding="utf-8"
|
open(output_file, "a", encoding="utf-8") as result_file,
|
||||||
) as err_file:
|
open(error_file, "a", encoding="utf-8") as err_file,
|
||||||
|
):
|
||||||
result_file.write("[\n")
|
result_file.write("[\n")
|
||||||
first_entry = True
|
first_entry = True
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue