Add workspace-aware MongoDB indexing and Atlas Search support
• Add workspace attribute to storage classes
• Use workspace-specific index names
• Implement Atlas Search with fallbacks
• Add entity search and popular labels
• Improve index migration strategy
parent 223397a247
commit 6f85bd6b19
1 changed file with 444 additions and 50 deletions
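Note on the naming scheme running through this commit: each storage class derives a workspace-scoped collection name and workspace-scoped index names from its base namespace. A minimal standalone sketch of the two rules (the workspace and namespace values are hypothetical; the real classes resolve the workspace from their configuration):

def final_namespace(workspace: str, namespace: str) -> str:
    # A non-empty workspace prefixes the collection name
    return f"{workspace}_{namespace}" if workspace else namespace

def index_name(workspace: str, base: str) -> str:
    # "_" acts as the "no workspace" sentinel, so no prefix is added
    prefix = f"{workspace}_" if workspace != "_" else ""
    return f"{prefix}{base}"

assert final_namespace("tenant_a", "doc_status") == "tenant_a_doc_status"
assert index_name("tenant_a", "status_updated_at") == "tenant_a_status_updated_at"
assert index_name("_", "status_updated_at") == "status_updated_at"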
@@ -1,4 +1,5 @@
 import os
+import re
 import time
 from dataclasses import dataclass, field
 import numpy as np
@@ -112,6 +113,7 @@ class MongoKVStorage(BaseKVStorage):
         # Keep original namespace unchanged for type detection logic
         if effective_workspace:
             self.final_namespace = f"{effective_workspace}_{self.namespace}"
+            self.workspace = effective_workspace
             logger.debug(
                 f"Final namespace with workspace prefix: '{self.final_namespace}'"
             )
@@ -335,6 +337,7 @@ class MongoDocStatusStorage(DocStatusStorage):
         # Keep original namespace unchanged for type detection logic
         if effective_workspace:
             self.final_namespace = f"{effective_workspace}_{self.namespace}"
+            self.workspace = effective_workspace
             logger.debug(
                 f"Final namespace with workspace prefix: '{self.final_namespace}'"
             )
@@ -474,6 +477,7 @@ class MongoDocStatusStorage(DocStatusStorage):
     async def create_and_migrate_indexes_if_not_exists(self):
         """Create indexes to optimize pagination queries and migrate file_path indexes for Chinese collation"""
         try:
+            # Get indexes for the current collection only
            indexes_cursor = await self._data.list_indexes()
            existing_indexes = await indexes_cursor.to_list(length=None)
            existing_index_names = {idx.get("name", "") for idx in existing_indexes}
@@ -481,80 +485,67 @@ class MongoDocStatusStorage(DocStatusStorage):
             # Define collation configuration for Chinese pinyin sorting
             collation_config = {"locale": "zh", "numericOrdering": True}

-            # 1. Define all indexes needed (including original pagination indexes and new collation indexes)
+            # Use workspace-specific index names to avoid cross-workspace conflicts
+            workspace_prefix = f"{self.workspace}_" if self.workspace != "_" else ""
+
+            # 1. Define all indexes needed with workspace-specific names
             all_indexes = [
                 # Original pagination indexes
                 {
-                    "name": "status_updated_at",
+                    "name": f"{workspace_prefix}status_updated_at",
                     "keys": [("status", 1), ("updated_at", -1)],
                 },
                 {
-                    "name": "status_created_at",
+                    "name": f"{workspace_prefix}status_created_at",
                     "keys": [("status", 1), ("created_at", -1)],
                 },
-                {"name": "updated_at", "keys": [("updated_at", -1)]},
-                {"name": "created_at", "keys": [("created_at", -1)]},
-                {"name": "id", "keys": [("_id", 1)]},
-                {"name": "track_id", "keys": [("track_id", 1)]},
-                # New file_path indexes with Chinese collation
+                {"name": f"{workspace_prefix}updated_at", "keys": [("updated_at", -1)]},
+                {"name": f"{workspace_prefix}created_at", "keys": [("created_at", -1)]},
+                {"name": f"{workspace_prefix}id", "keys": [("_id", 1)]},
+                {"name": f"{workspace_prefix}track_id", "keys": [("track_id", 1)]},
+                # New file_path indexes with Chinese collation and workspace-specific names
                 {
-                    "name": "file_path_zh_collation",
+                    "name": f"{workspace_prefix}file_path_zh_collation",
                     "keys": [("file_path", 1)],
                     "collation": collation_config,
                 },
                 {
-                    "name": "status_file_path_zh_collation",
+                    "name": f"{workspace_prefix}status_file_path_zh_collation",
                     "keys": [("status", 1), ("file_path", 1)],
                     "collation": collation_config,
                 },
             ]

-            # 2. Handle index migration: drop conflicting indexes with different names but same key patterns
-            for index_info in all_indexes:
-                target_keys = index_info["keys"]
-                target_name = index_info["name"]
-                target_collation = index_info.get("collation")
-
-                # Find existing indexes with the same key pattern but different names or collation
-                conflicting_indexes = []
-                for idx in existing_indexes:
-                    idx_name = idx.get("name", "")
-                    idx_keys = list(idx.get("key", {}).items())
-                    idx_collation = idx.get("collation")
-
-                    # Skip the _id_ index (MongoDB default)
-                    if idx_name == "_id_":
-                        continue
-
-                    # Check if keys match but name or collation differs
-                    if idx_keys == target_keys:
-                        if (
-                            idx_name != target_name
-                            or (target_collation and not idx_collation)
-                            or (not target_collation and idx_collation)
-                            or (
-                                target_collation
-                                and idx_collation
-                                and target_collation != idx_collation
-                            )
-                        ):
-                            conflicting_indexes.append(idx_name)
-
-                # Drop conflicting indexes
-                for conflicting_name in conflicting_indexes:
+            # 2. Handle legacy index cleanup: only drop old indexes that exist in THIS collection
+            legacy_index_names = [
+                "file_path_zh_collation",
+                "status_file_path_zh_collation",
+                "status_updated_at",
+                "status_created_at",
+                "updated_at",
+                "created_at",
+                "id",
+                "track_id",
+            ]
+
+            for legacy_name in legacy_index_names:
+                if (
+                    legacy_name in existing_index_names
+                    and legacy_name
+                    != f"{workspace_prefix}{legacy_name.replace(workspace_prefix, '')}"
+                ):
                     try:
-                        await self._data.drop_index(conflicting_name)
-                        logger.info(
-                            f"[{self.workspace}] Migrated: dropped conflicting index '{conflicting_name}' for collection {self._collection_name}"
+                        await self._data.drop_index(legacy_name)
+                        logger.debug(
+                            f"[{self.workspace}] Migrated: dropped legacy index '{legacy_name}' from collection {self._collection_name}"
                         )
-                        # Remove from existing_index_names to allow recreation
-                        existing_index_names.discard(conflicting_name)
+                        existing_index_names.discard(legacy_name)
                     except PyMongoError as drop_error:
                         logger.warning(
-                            f"[{self.workspace}] Failed to drop conflicting index '{conflicting_name}': {drop_error}"
+                            f"[{self.workspace}] Failed to drop legacy index '{legacy_name}' from collection {self._collection_name}: {drop_error}"
                         )

-            # 3. Create all needed indexes
+            # 3. Create all needed indexes with workspace-specific names
             for index_info in all_indexes:
                 index_name = index_info["name"]
                 if index_name not in existing_index_names:
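The migration above replaces key-pattern matching with an explicit list of legacy (unprefixed) index names: anything in that list that still exists without the workspace prefix is dropped, and step 3 recreates it under the prefixed name. For reference, the shape of one such recreation as a standalone sketch using a synchronous PyMongo client (URI, database, and workspace values are placeholders):

from pymongo import MongoClient

coll = MongoClient("mongodb://localhost:27017")["db"]["tenant_a_doc_status"]
workspace_prefix = "tenant_a_"
# Same collation the commit uses for Chinese pinyin ordering on file_path
collation_config = {"locale": "zh", "numericOrdering": True}

coll.create_index(
    [("file_path", 1)],
    name=f"{workspace_prefix}file_path_zh_collation",
    collation=collation_config,
)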
@@ -566,7 +557,7 @@ class MongoDocStatusStorage(DocStatusStorage):
                         await self._data.create_index(
                             index_info["keys"], **create_kwargs
                         )
-                        logger.info(
+                        logger.debug(
                             f"[{self.workspace}] Created index '{index_name}' for collection {self._collection_name}"
                         )
                     except PyMongoError as create_error:
@@ -734,6 +725,7 @@ class MongoGraphStorage(BaseGraphStorage):
         # Keep original namespace unchanged for type detection logic
         if effective_workspace:
             self.final_namespace = f"{effective_workspace}_{self.namespace}"
+            self.workspace = effective_workspace
             logger.debug(
                 f"Final namespace with workspace prefix: '{self.final_namespace}'"
             )
@@ -757,6 +749,10 @@ class MongoGraphStorage(BaseGraphStorage):
         self.edge_collection = await get_or_create_collection(
             self.db, self._edge_collection_name
         )
+
+        # Create Atlas Search index for better search performance if possible
+        await self.create_search_index_if_not_exists()
+
         logger.debug(
             f"[{self.workspace}] Use MongoDB as KG {self._collection_name}"
         )
@@ -1612,6 +1608,403 @@ class MongoGraphStorage(BaseGraphStorage):
             edges.append(edge_dict)
         return edges

+    async def get_popular_labels(self, limit: int = 300) -> list[str]:
+        """Get popular labels by node degree (most connected entities)
+
+        Args:
+            limit: Maximum number of labels to return
+
+        Returns:
+            List of labels sorted by degree (highest first)
+        """
+        try:
+            # Use aggregation pipeline to count edges per node and sort by degree
+            pipeline = [
+                # Count outbound edges
+                {"$group": {"_id": "$source_node_id", "out_degree": {"$sum": 1}}},
+                # Union with inbound edges count
+                {
+                    "$unionWith": {
+                        "coll": self._edge_collection_name,
+                        "pipeline": [
+                            {
+                                "$group": {
+                                    "_id": "$target_node_id",
+                                    "in_degree": {"$sum": 1},
+                                }
+                            }
+                        ],
+                    }
+                },
+                # Group by node_id and sum degrees
+                {
+                    "$group": {
+                        "_id": "$_id",
+                        "total_degree": {
+                            "$sum": {
+                                "$add": [
+                                    {"$ifNull": ["$out_degree", 0]},
+                                    {"$ifNull": ["$in_degree", 0]},
+                                ]
+                            }
+                        },
+                    }
+                },
+                # Sort by degree descending, then by label ascending
+                {"$sort": {"total_degree": -1, "_id": 1}},
+                # Limit results
+                {"$limit": limit},
+                # Project only the label
+                {"$project": {"_id": 1}},
+            ]
+
+            cursor = await self.edge_collection.aggregate(pipeline, allowDiskUse=True)
+            labels = []
+            async for doc in cursor:
+                if doc.get("_id"):
+                    labels.append(doc["_id"])
+
+            logger.debug(
+                f"[{self.workspace}] Retrieved {len(labels)} popular labels (limit: {limit})"
+            )
+            return labels
+        except Exception as e:
+            logger.error(f"[{self.workspace}] Error getting popular labels: {str(e)}")
+            return []
+
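get_popular_labels derives node degree purely from the edge collection: one $group counts each node's outbound edges, $unionWith re-reads the same collection grouped by target to count inbound edges, and a second $group sums the two. A pure-Python check of the same arithmetic on a hypothetical edge list:

from collections import Counter

edges = [
    {"source_node_id": "A", "target_node_id": "B"},
    {"source_node_id": "A", "target_node_id": "C"},
    {"source_node_id": "B", "target_node_id": "A"},
]

degree = Counter()
for e in edges:
    degree[e["source_node_id"]] += 1  # out-degree contribution
    degree[e["target_node_id"]] += 1  # in-degree contribution

# Degree descending, then label ascending - mirrors {"$sort": {"total_degree": -1, "_id": 1}}
labels = [n for n, _ in sorted(degree.items(), key=lambda kv: (-kv[1], kv[0]))]
assert labels == ["A", "B", "C"]  # degrees: A=3, B=2, C=1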
+    async def _try_atlas_text_search(self, query_strip: str, limit: int) -> list[str]:
+        """Try Atlas Search using simple text search."""
+        try:
+            pipeline = [
+                {
+                    "$search": {
+                        "index": "entity_id_search_idx",
+                        "text": {"query": query_strip, "path": "_id"},
+                    }
+                },
+                {"$project": {"_id": 1, "score": {"$meta": "searchScore"}}},
+                {"$limit": limit},
+            ]
+            cursor = await self.collection.aggregate(pipeline)
+            labels = [doc["_id"] async for doc in cursor if doc.get("_id")]
+            if labels:
+                logger.debug(
+                    f"[{self.workspace}] Atlas text search returned {len(labels)} results"
+                )
+                return labels
+            return []
+        except PyMongoError as e:
+            logger.debug(f"[{self.workspace}] Atlas text search failed: {e}")
+            return []
+
+    async def _try_atlas_autocomplete_search(
+        self, query_strip: str, limit: int
+    ) -> list[str]:
+        """Try Atlas Search using autocomplete for prefix matching."""
+        try:
+            pipeline = [
+                {
+                    "$search": {
+                        "index": "entity_id_search_idx",
+                        "autocomplete": {
+                            "query": query_strip,
+                            "path": "_id",
+                            "fuzzy": {"maxEdits": 1, "prefixLength": 1},
+                        },
+                    }
+                },
+                {"$project": {"_id": 1, "score": {"$meta": "searchScore"}}},
+                {"$limit": limit},
+            ]
+            cursor = await self.collection.aggregate(pipeline)
+            labels = [doc["_id"] async for doc in cursor if doc.get("_id")]
+            if labels:
+                logger.debug(
+                    f"[{self.workspace}] Atlas autocomplete search returned {len(labels)} results"
+                )
+                return labels
+            return []
+        except PyMongoError as e:
+            logger.debug(f"[{self.workspace}] Atlas autocomplete search failed: {e}")
+            return []
+
+    async def _try_atlas_compound_search(
+        self, query_strip: str, limit: int
+    ) -> list[str]:
+        """Try Atlas Search using compound query for comprehensive matching."""
+        try:
+            pipeline = [
+                {
+                    "$search": {
+                        "index": "entity_id_search_idx",
+                        "compound": {
+                            "should": [
+                                {
+                                    "text": {
+                                        "query": query_strip,
+                                        "path": "_id",
+                                        "score": {"boost": {"value": 10}},
+                                    }
+                                },
+                                {
+                                    "autocomplete": {
+                                        "query": query_strip,
+                                        "path": "_id",
+                                        "score": {"boost": {"value": 5}},
+                                        "fuzzy": {"maxEdits": 1, "prefixLength": 1},
+                                    }
+                                },
+                                {
+                                    "wildcard": {
+                                        "query": f"*{query_strip}*",
+                                        "path": "_id",
+                                        "score": {"boost": {"value": 2}},
+                                    }
+                                },
+                            ],
+                            "minimumShouldMatch": 1,
+                        },
+                    }
+                },
+                {"$project": {"_id": 1, "score": {"$meta": "searchScore"}}},
+                {"$sort": {"score": {"$meta": "searchScore"}}},
+                {"$limit": limit},
+            ]
+            cursor = await self.collection.aggregate(pipeline)
+            labels = [doc["_id"] async for doc in cursor if doc.get("_id")]
+            if labels:
+                logger.debug(
+                    f"[{self.workspace}] Atlas compound search returned {len(labels)} results"
+                )
+                return labels
+            return []
+        except PyMongoError as e:
+            logger.debug(f"[{self.workspace}] Atlas compound search failed: {e}")
+            return []
+
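The three helpers submit the same pipeline shape and differ only in the $search operator: plain text match, edge-n-gram autocomplete with one-edit fuzziness, and a compound should-clause that runs all strategies at once with descending boosts (exact text 10, autocomplete 5, substring wildcard 2; at least one clause must match). A condensed view of the compound payload, with q as a placeholder query:

q = "alice"  # placeholder
compound = {
    "compound": {
        "should": [
            {"text": {"query": q, "path": "_id", "score": {"boost": {"value": 10}}}},
            {
                "autocomplete": {
                    "query": q,
                    "path": "_id",
                    "score": {"boost": {"value": 5}},
                    "fuzzy": {"maxEdits": 1, "prefixLength": 1},
                }
            },
            {"wildcard": {"query": f"*{q}*", "path": "_id", "score": {"boost": {"value": 2}}}},
        ],
        "minimumShouldMatch": 1,
    }
}
# Wrapped as {"$search": {"index": "entity_id_search_idx", **compound}} and
# followed by $project / $sort / $limit, exactly as in the method above.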
+    async def _fallback_regex_search(self, query_strip: str, limit: int) -> list[str]:
+        """Fallback to regex-based search when Atlas Search fails."""
+        try:
+            logger.debug(
+                f"[{self.workspace}] Using regex fallback search for: '{query_strip}'"
+            )
+
+            escaped_query = re.escape(query_strip)
+            regex_condition = {"_id": {"$regex": escaped_query, "$options": "i"}}
+            cursor = self.collection.find(regex_condition, {"_id": 1}).limit(limit * 2)
+            docs = await cursor.to_list(length=limit * 2)
+
+            # Extract labels
+            labels = []
+            for doc in docs:
+                doc_id = doc.get("_id")
+                if doc_id:
+                    labels.append(doc_id)
+
+            # Sort results to prioritize exact matches and starts-with matches
+            def sort_key(label):
+                label_lower = label.lower()
+                query_lower_strip = query_strip.lower()
+
+                if label_lower == query_lower_strip:
+                    return (0, label_lower)  # Exact match - highest priority
+                elif label_lower.startswith(query_lower_strip):
+                    return (1, label_lower)  # Starts with - medium priority
+                else:
+                    return (2, label_lower)  # Contains - lowest priority
+
+            labels.sort(key=sort_key)
+            labels = labels[:limit]  # Apply final limit after sorting
+
+            logger.debug(
+                f"[{self.workspace}] Regex fallback search returned {len(labels)} results (limit: {limit})"
+            )
+            return labels
+
+        except Exception as e:
+            logger.error(f"[{self.workspace}] Regex fallback search failed: {e}")
+            import traceback
+
+            logger.error(f"[{self.workspace}] Traceback: {traceback.format_exc()}")
+            return []
+
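The regex fallback over-fetches (limit * 2) so that ranking can happen client-side: exact matches first, then prefix matches, then substring matches, alphabetical within each tier. A standalone check of that ordering with made-up labels:

query = "bank"
labels = ["World Bank", "bank", "Banking sector", "bank of england"]

def sort_key(label: str):
    l = label.lower()
    if l == query:
        return (0, l)  # exact match - highest priority
    elif l.startswith(query):
        return (1, l)  # prefix match - medium priority
    return (2, l)      # substring match - lowest priority

labels.sort(key=sort_key)
assert labels == ["bank", "bank of england", "Banking sector", "World Bank"]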
+    async def search_labels(self, query: str, limit: int = 50) -> list[str]:
+        """
+        Search labels with progressive fallback strategy:
+        1. Atlas text search (simple and fast)
+        2. Atlas autocomplete search (prefix matching with fuzzy)
+        3. Atlas compound search (comprehensive matching)
+        4. Regex fallback (when Atlas Search is unavailable)
+        """
+        query_strip = query.strip()
+        if not query_strip:
+            return []
+
+        # First check if we have any nodes at all
+        try:
+            node_count = await self.collection.count_documents({})
+            if node_count == 0:
+                logger.debug(
+                    f"[{self.workspace}] No nodes found in collection {self._collection_name}"
+                )
+                return []
+        except PyMongoError as e:
+            logger.error(f"[{self.workspace}] Error counting nodes: {e}")
+            return []
+
+        # Progressive search strategy
+        search_methods = [
+            ("text", self._try_atlas_text_search),
+            ("autocomplete", self._try_atlas_autocomplete_search),
+            ("compound", self._try_atlas_compound_search),
+        ]
+
+        # Try Atlas Search methods in order
+        for method_name, search_method in search_methods:
+            try:
+                labels = await search_method(query_strip, limit)
+                if labels:
+                    logger.debug(
+                        f"[{self.workspace}] Search successful using {method_name} method: {len(labels)} results"
+                    )
+                    return labels
+                else:
+                    logger.debug(
+                        f"[{self.workspace}] {method_name} search returned no results, trying next method"
+                    )
+            except Exception as e:
+                logger.debug(
+                    f"[{self.workspace}] {method_name} search failed: {e}, trying next method"
+                )
+                continue
+
+        # If all Atlas Search methods fail, use regex fallback
+        logger.info(
+            f"[{self.workspace}] All Atlas Search methods failed, using regex fallback search for: '{query_strip}'"
+        )
+        return await self._fallback_regex_search(query_strip, limit)
+
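A hypothetical call site, assuming graph is an initialized MongoGraphStorage instance (constructor arguments elided):

async def demo(graph):
    # Empty queries short-circuit to [] before any database round-trip
    assert await graph.search_labels("   ") == []
    # On Atlas, one of the three $search strategies answers; on a plain
    # MongoDB deployment every $search stage raises, each attempt logs a
    # debug line, and the regex fallback produces the result instead.
    return await graph.search_labels("Alice", limit=20)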
+    async def _check_if_index_needs_rebuild(
+        self, indexes: list, index_name: str
+    ) -> bool:
+        """Check if the existing index needs to be rebuilt due to configuration issues."""
+        for index in indexes:
+            if index["name"] == index_name:
+                # Check if the index has the old problematic configuration
+                definition = index.get("latestDefinition", {})
+                mappings = definition.get("mappings", {})
+                fields = mappings.get("fields", {})
+                id_field = fields.get("_id", {})
+
+                # If it's the old single-type autocomplete configuration, rebuild
+                if (
+                    isinstance(id_field, dict)
+                    and id_field.get("type") == "autocomplete"
+                ):
+                    logger.info(
+                        f"[{self.workspace}] Found old index configuration for '{index_name}', will rebuild"
+                    )
+                    return True
+
+                # If it's not a list (multi-type configuration), rebuild
+                if not isinstance(id_field, list):
+                    logger.info(
+                        f"[{self.workspace}] Index '{index_name}' needs upgrade to multi-type configuration"
+                    )
+                    return True
+
+                logger.info(
+                    f"[{self.workspace}] Index '{index_name}' has correct configuration"
+                )
+                return False
+        return True  # Index doesn't exist, needs creation
+
+    async def _safely_drop_old_index(self, index_name: str):
+        """Safely drop the old search index."""
+        try:
+            await self.collection.drop_search_index(index_name)
+            logger.info(
+                f"[{self.workspace}] Successfully dropped old search index '{index_name}'"
+            )
+        except PyMongoError as e:
+            logger.warning(
+                f"[{self.workspace}] Could not drop old index '{index_name}': {e}"
+            )
+
+    async def _create_improved_search_index(self, index_name: str):
+        """Create an improved search index with multiple field types."""
+        search_index_model = SearchIndexModel(
+            definition={
+                "mappings": {
+                    "dynamic": False,
+                    "fields": {
+                        "_id": [
+                            {
+                                "type": "string",
+                            },
+                            {
+                                "type": "token",
+                            },
+                            {
+                                "type": "autocomplete",
+                                "maxGrams": 15,
+                                "minGrams": 2,
+                            },
+                        ]
+                    },
+                },
+                "analyzer": "lucene.standard",  # Index-level analyzer for text processing
+            },
+            name=index_name,
+            type="search",
+        )
+
+        await self.collection.create_search_index(search_index_model)
+        logger.info(
+            f"[{self.workspace}] Created improved Atlas Search index '{index_name}' for collection {self._collection_name}. "
+        )
+        logger.info(
+            f"[{self.workspace}] Index will be built asynchronously, using regex fallback until ready."
+        )
+
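The new index maps _id under three Atlas field types at once, one per search strategy; this is also why _check_if_index_needs_rebuild treats a dict-valued _id mapping (the old single-type shape) as needing a rebuild and only accepts a list. The mapping, annotated:

entity_id_mapping = {
    "_id": [
        {"type": "string"},  # analyzed text -> serves the "text" operator
        {"type": "token"},   # unanalyzed term -> serves "wildcard"/exact matching
        {
            "type": "autocomplete",  # edge n-grams of length 2..15 -> prefix search
            "minGrams": 2,
            "maxGrams": 15,
        },
    ]
}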
+    async def create_search_index_if_not_exists(self):
+        """Creates an improved Atlas Search index for entity search, rebuilding if necessary."""
+        index_name = "entity_id_search_idx"
+
+        try:
+            # Check if we're using MongoDB Atlas (has search index capabilities)
+            indexes_cursor = await self.collection.list_search_indexes()
+            indexes = await indexes_cursor.to_list(length=None)
+
+            # Check if we need to rebuild the index
+            needs_rebuild = await self._check_if_index_needs_rebuild(
+                indexes, index_name
+            )
+
+            if needs_rebuild:
+                # Check if index exists and drop it
+                index_exists = any(idx["name"] == index_name for idx in indexes)
+                if index_exists:
+                    await self._safely_drop_old_index(index_name)
+
+                # Create the improved search index (async, no waiting)
+                await self._create_improved_search_index(index_name)
+            else:
+                logger.info(
+                    f"[{self.workspace}] Atlas Search index '{index_name}' already exists with correct configuration"
+                )
+
+        except PyMongoError as e:
+            # This is expected if not using MongoDB Atlas or if search indexes are not supported
+            logger.info(
+                f"[{self.workspace}] Could not create Atlas Search index for {self._collection_name}: {e}. "
+                "This is normal if not using MongoDB Atlas - search will use regex fallback."
+            )
+        except Exception as e:
+            logger.warning(
+                f"[{self.workspace}] Unexpected error creating Atlas Search index for {self._collection_name}: {e}"
+            )
+
     async def drop(self) -> dict[str, str]:
         """Drop the storage by removing all documents in the collection.

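Atlas support is detected simply by calling list_search_indexes(): on a non-Atlas deployment it raises a PyMongoError, which the method treats as the expected "no search index" case. A synchronous sketch of the same probe (URI, database, and collection names are placeholders):

from pymongo import MongoClient
from pymongo.errors import PyMongoError

coll = MongoClient("mongodb://localhost:27017")["db"]["tenant_a_chunk_entity_relation"]
try:
    existing = list(coll.list_search_indexes())  # Atlas only
except PyMongoError:
    existing = None  # plain MongoDB -> rely on the regex fallback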
@@ -1685,6 +2078,7 @@ class MongoVectorDBStorage(BaseVectorStorage):
         # Keep original namespace unchanged for type detection logic
         if effective_workspace:
             self.final_namespace = f"{effective_workspace}_{self.namespace}"
+            self.workspace = effective_workspace
             logger.debug(
                 f"Final namespace with workspace prefix: '{self.final_namespace}'"
             )