Rewrite get_knowledge_graph with label * by degree

2025-06-28 17:10:39 +08:00 · 2025-06-28 17:10:39 +08:00 · 5739f52d29
commit 5739f52d29
parent d0f4eee404
1 changed files with 80 additions and 32 deletions
--- a/lightrag/kg/mongo_impl.py
+++ b/lightrag/kg/mongo_impl.py
@ -743,10 +743,85 @@ class MongoGraphStorage(BaseGraphStorage):
                    "source_node_id",
                    "target_node_id",
                    "relationship",
                    "source_ids",
                ]
            },
        )
    async def get_knowledge_graph_all_by_degree(
        self, max_depth: int = 3, max_nodes: int = MAX_GRAPH_NODES
    ) -> KnowledgeGraph:
        """
        It's possible that the node with one or multiple relationships is retrieved,
        while its neighbor is not.  Then this node might seem like disconnected in UI.
        """
        total_node_count = await self.collection.count_documents({})
        result = KnowledgeGraph()
        seen_edges = set()
        result.is_truncated = total_node_count > max_nodes
        if result.is_truncated:
            # Get all node_ids ranked by degree if max_nodes exceeds total node count
            pipeline = [
                {"$project": {"source_node_id": 1, "_id": 0}},
                {"$group": {"_id": "$source_node_id", "degree": {"$sum": 1}}},
                {
                    "$unionWith": {
                        "coll": self._edge_collection_name,
                        "pipeline": [
                            {"$project": {"target_node_id": 1, "_id": 0}},
                            {
                                "$group": {
                                    "_id": "$target_node_id",
                                    "degree": {"$sum": 1},
                                }
                            },
                        ],
                    }
                },
                {"$group": {"_id": "$_id", "degree": {"$sum": "$degree"}}},
                {"$sort": {"degree": -1}},
                {"$limit": max_nodes},
            ]
            cursor = await self.edge_collection.aggregate(pipeline, allowDiskUse=True)
            node_ids = []
            async for doc in cursor:
                node_id = str(doc["_id"])
                node_ids.append(node_id)
            cursor = self.collection.find({"_id": {"$in": node_ids}}, {"source_ids": 0})
            async for doc in cursor:
                result.nodes.append(self._construct_graph_node(doc["_id"], doc))
            # As node count reaches the limit, only need to fetch the edges that directly connect to these nodes
            edge_cursor = self.edge_collection.find(
                {
                    "$and": [
                        {"source_node_id": {"$in": node_ids}},
                        {"target_node_id": {"$in": node_ids}},
                    ]
                }
            )
        else:
            # All nodes and edges are needed
            cursor = self.collection.find({}, {"source_ids": 0})
            async for doc in cursor:
                node_id = str(doc["_id"])
                result.nodes.append(self._construct_graph_node(doc["_id"], doc))
            edge_cursor = self.edge_collection.find({})
        async for edge in edge_cursor:
            edge_id = f"{edge['source_node_id']}-{edge['target_node_id']}"
            if edge_id not in seen_edges:
                seen_edges.add(edge_id)
                result.edges.append(self._construct_graph_edge(edge_id, edge))
        return result
    async def get_knowledge_graph(
        self,
        node_label: str,
@ -778,31 +853,9 @@ class MongoGraphStorage(BaseGraphStorage):
        try:
            # Optimize pipeline to avoid memory issues with large datasets
            if label == "*":
-                # For getting all nodes, use a simpler pipeline to avoid memory issues
+                return await self.get_knowledge_graph_all_by_degree(
-                pipeline = [
+                    max_depth, max_nodes
-                    {"$limit": max_nodes + 1},  # Limit early to reduce memory usage
+                )
                    {
                        "$project": project_doc
                    },  # Load without internal fields or unneeded ones shown in WebUI
                    {
                        "$graphLookup": {
                            "from": self._edge_collection_name,
                            "startWith": "$_id",
                            "connectFromField": "target_node_id",
                            "connectToField": "source_node_id",
                            "maxDepth": max_depth,
                            "depthField": "depth",
                            "as": "connected_edges",
                        },
                    },
                    # {"$addFields": {"edge_count": {"$size": "$connected_edges"}}},
                    # {"$sort": {"edge_count": -1}},
                    # {"$limit": max_nodes},
                ]
                # Check if we need to set truncation flag
                all_node_count = await self.collection.count_documents({})
                result.is_truncated = all_node_count > max_nodes
            else:
                # Verify if starting node exists
                start_node = await self.collection.find_one({"_id": label})
@ -827,7 +880,7 @@ class MongoGraphStorage(BaseGraphStorage):
                    },
                    {
                        "$unionWith": {
-                            "coll": "chunk_entity_relation",
+                            "coll": self._collection_name,
                            "pipeline": [
                                {"$match": {"_id": label}},
                                {"$project": project_doc},
@ -929,13 +982,8 @@ class MongoGraphStorage(BaseGraphStorage):
                try:
                    simple_cursor = self.collection.find({}).limit(max_nodes)
                    async for doc in simple_cursor:
                        node_id = str(doc["_id"])
                        result.nodes.append(
-                            KnowledgeGraphNode(
+                            self._construct_graph_node(str(doc["_id"]), doc)
                                id=node_id,
                                labels=[node_id],
                                properties={k: v for k, v in doc.items() if k != "_id"},
                            )
                        )
                    result.is_truncated = True
                    logger.info(