Release semaphore before merge stage

2025-07-09 09:24:44 +08:00 · 2025-07-09 09:24:44 +08:00 · cb3bfc0e5b
commit cb3bfc0e5b
parent 56d43de58a
2 changed files with 58 additions and 59 deletions
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@ -1069,27 +1069,27 @@ class LightRAG:
                                }
                            )
-                        # Semphore is NOT released here, because the merge_nodes_and_edges function is highly concurrent
+                        # Semaphore is released here
-                        # and more importantly, it is the bottleneck and needs to be resource controlled in massively
+                        # Concurrency is controlled by graph db lock for individual entities and relationships
                        # parallel insertions
-                        if file_extraction_stage_ok:
+
-                            try:
+                    if file_extraction_stage_ok:
-                                # Get chunk_results from entity_relation_task
+                        try:
-                                chunk_results = await entity_relation_task
+                            # Get chunk_results from entity_relation_task
-                                await merge_nodes_and_edges(
+                            chunk_results = await entity_relation_task
-                                    chunk_results=chunk_results,  # result collected from entity_relation_task
+                            await merge_nodes_and_edges(
-                                    knowledge_graph_inst=self.chunk_entity_relation_graph,
+                                chunk_results=chunk_results,  # result collected from entity_relation_task
-                                    entity_vdb=self.entities_vdb,
+                                knowledge_graph_inst=self.chunk_entity_relation_graph,
-                                    relationships_vdb=self.relationships_vdb,
+                                entity_vdb=self.entities_vdb,
-                                    global_config=asdict(self),
+                                relationships_vdb=self.relationships_vdb,
-                                    pipeline_status=pipeline_status,
+                                global_config=asdict(self),
-                                    pipeline_status_lock=pipeline_status_lock,
+                                pipeline_status=pipeline_status,
-                                    llm_response_cache=self.llm_response_cache,
+                                pipeline_status_lock=pipeline_status_lock,
-                                    current_file_number=current_file_number,
+                                llm_response_cache=self.llm_response_cache,
-                                    total_files=total_files,
+                                current_file_number=current_file_number,
-                                    file_path=file_path,
+                                total_files=total_files,
-                                )
+                                file_path=file_path,
                            )
                            await self.doc_status.upsert(
                                {
@ -1111,46 +1111,46 @@ class LightRAG:
                                }
                            )
-                                # Call _insert_done after processing each file
+                            # Call _insert_done after processing each file
-                                await self._insert_done()
+                            await self._insert_done()
-                                async with pipeline_status_lock:
+                            async with pipeline_status_lock:
-                                    log_message = f"Completed processing file {current_file_number}/{total_files}: {file_path}"
+                                log_message = f"Completed processing file {current_file_number}/{total_files}: {file_path}"
-                                    logger.info(log_message)
+                                logger.info(log_message)
-                                    pipeline_status["latest_message"] = log_message
+                                pipeline_status["latest_message"] = log_message
-                                    pipeline_status["history_messages"].append(log_message)
+                                pipeline_status["history_messages"].append(log_message)
-                            except Exception as e:
+                        except Exception as e:
-                                # Log error and update pipeline status
+                            # Log error and update pipeline status
-                                logger.error(traceback.format_exc())
+                            logger.error(traceback.format_exc())
-                                error_msg = f"Merging stage failed in document {current_file_number}/{total_files}: {file_path}"
+                            error_msg = f"Merging stage failed in document {current_file_number}/{total_files}: {file_path}"
-                                logger.error(error_msg)
+                            logger.error(error_msg)
-                                async with pipeline_status_lock:
+                            async with pipeline_status_lock:
-                                    pipeline_status["latest_message"] = error_msg
+                                pipeline_status["latest_message"] = error_msg
-                                    pipeline_status["history_messages"].append(
+                                pipeline_status["history_messages"].append(
-                                        traceback.format_exc()
+                                    traceback.format_exc()
                                    )
                                    pipeline_status["history_messages"].append(error_msg)
                                # Persistent llm cache
                                if self.llm_response_cache:
                                    await self.llm_response_cache.index_done_callback()
                                # Update document status to failed
                                await self.doc_status.upsert(
                                    {
                                        doc_id: {
                                            "status": DocStatus.FAILED,
                                            "error": str(e),
                                            "content": status_doc.content,
                                            "content_summary": status_doc.content_summary,
                                            "content_length": status_doc.content_length,
                                            "created_at": status_doc.created_at,
                                            "updated_at": datetime.now().isoformat(),
                                            "file_path": file_path,
                                        }
                                    }
                                )
                                pipeline_status["history_messages"].append(error_msg)
                            # Persistent llm cache
                            if self.llm_response_cache:
                                await self.llm_response_cache.index_done_callback()
                            # Update document status to failed
                            await self.doc_status.upsert(
                                {
                                    doc_id: {
                                        "status": DocStatus.FAILED,
                                        "error": str(e),
                                        "content": status_doc.content,
                                        "content_summary": status_doc.content_summary,
                                        "content_length": status_doc.content_length,
                                        "created_at": status_doc.created_at,
                                        "updated_at": datetime.now().isoformat(),
                                        "file_path": file_path,
                                    }
                                }
                            )
                # Create processing tasks for all documents
                doc_tasks = []
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@ -36,6 +36,7 @@ from .base import (
 )
 from .prompt import PROMPTS
 from .constants import GRAPH_FIELD_SEP
 from .kg.shared_storage import get_graph_db_lock_keyed
 import time
 from dotenv import load_dotenv
@ -1121,8 +1122,6 @@ async def merge_nodes_and_edges(
        pipeline_status_lock: Lock for pipeline status
        llm_response_cache: LLM response cache
    """
    # Get lock manager from shared storage
    from .kg.shared_storage import get_graph_db_lock_keyed
    # Collect all nodes and edges from all chunks