import json import asyncio import numpy as np from typing import Protocol, Optional, List, Dict, Any, Union from dataclasses import dataclass from abc import ABC, abstractmethod from functools import partial try: from fuzzywuzzy import process FUZZYWUZZY_AVAILABLE = True except ImportError: process = None FUZZYWUZZY_AVAILABLE = False try: from scipy.cluster.hierarchy import linkage, fcluster from sklearn.cluster import KMeans CLUSTERING_AVAILABLE = True except ImportError: linkage = fcluster = KMeans = None CLUSTERING_AVAILABLE = False try: from json_repair import repair_json JSON_REPAIR_AVAILABLE = True except ImportError: repair_json = json.loads # Fallback to standard json.loads JSON_REPAIR_AVAILABLE = False from .utils import logger from collections import defaultdict from collections import deque from .prompt import PROMPTS # ======================= Data Structures ======================= @dataclass class EntityInfo: """Standard entity information structure""" entity_id: str entity_type: str description: Optional[str] = None # ======================= Service Interfaces ======================= class DeduplicationService(Protocol): """Protocol for deduplication service that provides access to RAG functionality""" @property def rag_instance(self): """Get the RAG instance""" ... async def process_with_llm( self, prompt: str, system_prompt: str = "", **kwargs ) -> Optional[str]: """Process text using RAG's LLM function""" ... async def merge_entities( self, source_entities: List[str], target_entity: str ) -> None: """Merge entities using RAG's merge function""" ... async def get_embeddings(self, texts: List[str]) -> Any: """Get embeddings using RAG's embedding function""" ... class LightRAGDeduplicationService: """Concrete implementation of DeduplicationService for LightRAG""" def __init__(self, rag_instance): self._rag = rag_instance @property def rag_instance(self): """Get the RAG instance""" return self._rag async def process_with_llm( self, prompt: str, system_prompt: str = "", **kwargs ) -> Optional[str]: """Process text using RAG's LLM function""" use_model_func = partial(self._rag.llm_model_func, _priority=5) return await use_model_func(prompt, system_prompt=system_prompt, **kwargs) async def merge_entities( self, source_entities: List[str], target_entity: str ) -> None: """Merge entities using RAG's merge function""" return await self._rag.amerge_entities( source_entities=source_entities, target_entity=target_entity ) async def get_embeddings(self, texts: List[str]) -> Any: """Get embeddings using RAG's embedding function""" if not self._rag.embedding_func: raise ValueError("RAG instance does not have embedding function configured") return await self._rag.embedding_func(texts) # ======================= Configuration System ======================= @dataclass class BaseDeduplicationConfig: """Base configuration for all deduplication strategies""" strategy_name: str @classmethod def from_dict(cls, data: Dict[str, Any]) -> "BaseDeduplicationConfig": """Create config instance from dictionary""" return cls(**{k: v for k, v in data.items() if k in cls.__dataclass_fields__}) @dataclass class LLMBasedConfig(BaseDeduplicationConfig): """Configuration specific to LLM-based deduplication strategy""" strategy_name: str = "llm_based" target_batch_size: int = 20 # Reduce batch size to ease ollama burden max_batch_size: Optional[int] = None min_batch_size: Optional[int] = None similarity_threshold: float = 0.75 # Relax embedding threshold system_prompt: Optional[str] = None strictness_level: str = "strict" def __post_init__(self): if self.max_batch_size is None: self.max_batch_size = int(self.target_batch_size * 1.25) if self.min_batch_size is None: self.min_batch_size = int(self.target_batch_size * 0.75) # Configuration factory class ConfigFactory: """Factory for creating strategy-specific configurations""" _config_classes = { "llm_based": LLMBasedConfig, } @classmethod def register_config(cls, strategy_name: str, config_class: type): """Register a new configuration class""" cls._config_classes[strategy_name] = config_class @classmethod def create_config( cls, strategy_name: str, config_data: Dict[str, Any] ) -> BaseDeduplicationConfig: """Create strategy-specific configuration""" if strategy_name not in cls._config_classes: available_configs = list(cls._config_classes.keys()) raise ValueError( f"Unknown configuration for strategy '{strategy_name}'. " f"Available configurations: {available_configs}" ) config_class = cls._config_classes[strategy_name] return config_class(**config_data) # ======================= Clustering Processor ======================= class SemanticClusterBatcher: """Semantic clustering batch processor using RAG's embedding function""" def __init__( self, config: BaseDeduplicationConfig, deduplication_service: DeduplicationService, ): self.config = config self.deduplication_service = deduplication_service def _validate_input(self, nodes: List[str]): """Validate input nodes""" if not nodes: raise ValueError("Input node list cannot be empty") async def _get_embeddings(self, nodes: List[str]) -> Any: """Get embeddings using RAG's embedding function with intelligent batch processing""" try: # For ollama, limit batch size to prevent decode errors max_embedding_batch_size = 1 # Conservative batch size for Ollama stability if len(nodes) <= max_embedding_batch_size: # Small batch, process directly embeddings = await self.deduplication_service.get_embeddings(nodes) return embeddings else: # Large batch, split into smaller chunks logger.info( f"Splitting {len(nodes)} nodes into smaller batches for embedding" ) all_embeddings = [] for i in range(0, len(nodes), max_embedding_batch_size): batch = nodes[i : i + max_embedding_batch_size] logger.debug( f"Processing embedding batch {i//max_embedding_batch_size + 1}/{(len(nodes) + max_embedding_batch_size - 1)//max_embedding_batch_size}" ) try: batch_embeddings = ( await self.deduplication_service.get_embeddings(batch) ) all_embeddings.append(batch_embeddings) # Add delay between batches to prevent overwhelming Ollama if i + max_embedding_batch_size < len(nodes): await asyncio.sleep(0.2) except Exception as batch_error: logger.error( f"Failed to process embedding batch: {batch_error}" ) # Try individual processing as fallback logger.info( "Attempting individual text embedding for failed batch" ) individual_embeddings = [] for node in batch: try: single_embedding = ( await self.deduplication_service.get_embeddings( [node] ) ) individual_embeddings.append(single_embedding[0]) await asyncio.sleep(0.1) except Exception as single_error: logger.error( f"Failed to embed individual node '{node}': {single_error}" ) # Create zero embedding as fallback if individual_embeddings: zero_embedding = np.zeros_like( individual_embeddings[0] ) else: # Default dimension, adjust if needed for your embedding model zero_embedding = np.zeros(1024) individual_embeddings.append(zero_embedding) if individual_embeddings: all_embeddings.append(np.array(individual_embeddings)) if not all_embeddings: raise RuntimeError("Failed to generate any embeddings") # Concatenate all embeddings return np.vstack(all_embeddings) except Exception as e: logger.error(f"Failed to get embeddings from RAG: {e}") raise def _hierarchical_clustering(self, embeddings, target_size: int): """Hierarchical clustering algorithm""" if not CLUSTERING_AVAILABLE: raise ImportError("scipy and scikit-learn are required for clustering") try: Z = linkage(embeddings, method="ward", metric="euclidean") target_clusters = max(1, len(embeddings) // target_size) return fcluster(Z, t=target_clusters, criterion="maxclust") except Exception as e: logger.error(f"Hierarchical clustering failed: {str(e)}") raise def _split_large_clusters( self, clusters: List[List[str]], embeddings, original_nodes: List[str] ): """Split oversized clusters using KMeans""" if not hasattr(self.config, "max_batch_size") or not hasattr( self.config, "min_batch_size" ): # For strategies without batch size limits, return as-is return clusters final_clusters = [] for cluster in clusters: original_count = len(cluster) if original_count <= self.config.max_batch_size: final_clusters.append(cluster) continue try: indices = [i for i, n in enumerate(original_nodes) if n in cluster] sub_embeddings = embeddings[indices] assert ( len(indices) == original_count ), "Indices don't match cluster elements" n_sub = max(2, original_count // self.config.min_batch_size + 1) kmeans = KMeans(n_clusters=n_sub, n_init=10, random_state=42) sub_labels = kmeans.fit_predict(sub_embeddings) sub_clusters = defaultdict(list) for node, label in zip(cluster, sub_labels): sub_clusters[label].append(node) split_total = sum(len(v) for v in sub_clusters.values()) if split_total != original_count: raise ValueError( f"Element loss: original {original_count} after split {split_total}" ) final_clusters.extend(sub_clusters.values()) logger.info( f"Split cluster of size {original_count} into {len(sub_clusters)} sub-clusters" ) except Exception as e: logger.warning( f"Sub-clustering failed, keeping original cluster: {str(e)}" ) final_clusters.append(cluster) return final_clusters def _optimize_batches(self, clusters: List[List[str]]) -> List[List[str]]: """Optimize batch grouping using greedy algorithm""" if not hasattr(self.config, "target_batch_size"): # For strategies without batch optimization, return as-is return clusters batches = [] current_batch = [] cluster_queue = deque(sorted(clusters, key=len, reverse=True)) while cluster_queue: cluster = cluster_queue.popleft() if len(current_batch) + len(cluster) <= self.config.target_batch_size: current_batch.extend(cluster) continue remaining_space = self.config.target_batch_size - len(current_batch) if ( remaining_space >= self.config.min_batch_size and len(cluster) > remaining_space ): current_batch.extend(cluster[:remaining_space]) cluster_queue.appendleft(cluster[remaining_space:]) else: if current_batch: batches.append(current_batch) current_batch = list(cluster) if current_batch: batches.append(current_batch) # Validate element count consistency input_count = sum(len(c) for c in clusters) output_count = sum(len(b) for b in batches) if input_count != output_count: error_msg = ( f"Critical error: element count changed ({input_count}→{output_count})" ) logger.error(error_msg) raise ValueError(error_msg) return batches async def cluster_and_batch(self, nodes: List[str]) -> List[List[str]]: """Main processing pipeline for clustering and batching using RAG's embedding function""" self._validate_input(nodes) logger.info("Generating semantic embeddings using RAG's embedding function...") embeddings = await self._get_embeddings(nodes) logger.info("Performing hierarchical clustering...") target_size = getattr(self.config, "target_batch_size", 30) cluster_ids = self._hierarchical_clustering(embeddings, target_size) cluster_dict = defaultdict(list) for node, cid in zip(nodes, cluster_ids): cluster_dict[cid].append(node) initial_clusters = list(cluster_dict.values()) logger.info("Optimizing cluster sizes...") optimized_clusters = self._split_large_clusters( initial_clusters, embeddings, nodes ) logger.info("Creating final batches...") batches = self._optimize_batches(optimized_clusters) return batches # ======================= Base Strategy Class ======================= class BaseDeduplicationStrategy(ABC): """Base class for deduplication strategies""" def __init__(self, service: DeduplicationService, config: BaseDeduplicationConfig): self.service = service self.config = config self._check_dependencies() @abstractmethod def _check_dependencies(self) -> None: """Check strategy-specific dependencies""" pass @abstractmethod async def classify_nodes_by_similarity( self, node_data: List[Dict[str, Any]] ) -> List[List[str]]: """Classify nodes by similarity and return batches""" pass @abstractmethod async def clean_nodes(self, nodes_batches: List[List[str]]) -> None: """Clean nodes by removing duplicates""" pass def _normalize_node_data(self, node_data: List[Dict[str, Any]]) -> List[EntityInfo]: """Normalize node data to standard EntityInfo structure""" normalized = [] for node in node_data: if isinstance(node, dict): # Check different possible field names entity_id = node.get("entity_id") or node.get("entity_name") entity_type = node.get("entity_type") if entity_type and entity_id: entity_info = EntityInfo( entity_id=entity_id, entity_type=entity_type, description=node.get("description"), ) normalized.append(entity_info) else: logger.warning( f"Node missing required fields (entity_id/entity_name and entity_type): {node}" ) else: logger.warning( f"Invalid node format (expected dict, got {type(node)}): {node}" ) return normalized # ======================= Strategy Factory ======================= class DeduplicationStrategyFactory: """Factory for creating deduplication strategies""" _strategies = {} @classmethod def register_strategy(cls, strategy_name: str, strategy_class: type): """Register a new deduplication strategy""" cls._strategies[strategy_name] = strategy_class @classmethod def create_strategy( cls, strategy_name: str, service: DeduplicationService, config: Union[BaseDeduplicationConfig, Dict[str, Any]], ) -> BaseDeduplicationStrategy: """Create a deduplication strategy instance""" if strategy_name not in cls._strategies: available_strategies = list(cls._strategies.keys()) raise ValueError( f"Unknown deduplication strategy '{strategy_name}'. " f"Available strategies: {available_strategies}" ) # Convert dict config to proper config object if needed if isinstance(config, dict): config = ConfigFactory.create_config(strategy_name, config) strategy_class = cls._strategies[strategy_name] return strategy_class(service, config) @classmethod def get_available_strategies(cls) -> List[str]: """Get list of available strategy names""" return list(cls._strategies.keys()) # ======================= LLM-based Cleaning Strategy ======================= class LLMBasedCleaning(BaseDeduplicationStrategy): """LLM-based node cleaning strategy""" def __init__(self, service: DeduplicationService, config: LLMBasedConfig): super().__init__(service, config) def _check_dependencies(self) -> None: """Check LLM-based strategy specific dependencies""" missing_deps = [] if not FUZZYWUZZY_AVAILABLE: missing_deps.append("fuzzywuzzy") if not CLUSTERING_AVAILABLE: missing_deps.append("scipy and scikit-learn") if not JSON_REPAIR_AVAILABLE: missing_deps.append("json-repair") if missing_deps: raise ImportError( f"Missing dependencies for LLM-based deduplication: {', '.join(missing_deps)}. " f"Install with: pip install lightrag-hku[deduplication]" ) async def classify_nodes_by_similarity( self, node_data: List[Dict[str, Any]] ) -> List[List[str]]: """Classify nodes by similarity and return batches""" logger.info( f"Classifying nodes by similarity with batch size: {self.config.target_batch_size}" ) # Normalize input data entities = self._normalize_node_data(node_data) # Group by entity type classified_data = defaultdict(list) for entity in entities: classified_data[entity.entity_type].append(entity.entity_id) # Process node batches nodes_batches = [] short_batches = [] # Use improved SemanticClusterBatcher, pass in deduplication_service batcher = SemanticClusterBatcher(self.config, self.service) # logger.info(f"classified_data: {classified_data}") for entity_type, items in classified_data.items(): if len(items) <= self.config.max_batch_size: if len(items) >= self.config.min_batch_size: nodes_batches.append(items) else: short_batches.append(items) else: # Use semantic clustering for large groups split_batches = await batcher.cluster_and_batch(items) nodes_batches.extend(split_batches) # Handle small batches if short_batches: combined_short = [item for sublist in short_batches for item in sublist] if len(combined_short) >= self.config.min_batch_size: if len(combined_short) <= self.config.max_batch_size: nodes_batches.append(combined_short) else: # Apply clustering to combined short batches split_batches = await batcher.cluster_and_batch(combined_short) nodes_batches.extend(split_batches) logger.info( f"Processed {len(short_batches)} small batches into {len(combined_short)} items" ) logger.info(f"Created {len(nodes_batches)} batches for processing") # logger.info(f"nodes_batches: {nodes_batches}") return nodes_batches async def clean_nodes(self, nodes_batches: List[List[str]]) -> None: """Main method for cleaning nodes with improved error handling""" failed_batches = [] for i, batch in enumerate(nodes_batches): logger.info("\n" + "-" * 100) logger.info( f"CLEANING BATCH [{i + 1}/{len(nodes_batches)}] - Size: {len(batch)}" ) try: success = await self._process_single_batch(batch) if not success: failed_batches.append((i, batch)) except Exception as e: logger.error(f"Failed to process batch {i + 1}: {str(e)}") failed_batches.append((i, batch)) if failed_batches: logger.warning(f"Failed to process {len(failed_batches)} batches") logger.info( f"NODE CLEANING COMPLETE - Total: {len(nodes_batches)}, " f"Success: {len(nodes_batches) - len(failed_batches)}, " f"Failed: {len(failed_batches)}" ) async def _get_entity_descriptions(self, entity_names: List[str]) -> Dict[str, str]: """Get entity description information""" descriptions = {} try: # Get entity information from knowledge graph for entity_name in entity_names: entity_data = await self.service.rag_instance.chunk_entity_relation_graph.get_node( entity_name ) if entity_data: # Check if entity_data is a dictionary if isinstance(entity_data, dict): descriptions[entity_name] = entity_data.get( "description", "No description" ) elif isinstance(entity_data, list) and len(entity_data) > 0: # If it's a list, take the first element first_item = ( entity_data[0] if isinstance(entity_data[0], dict) else {} ) descriptions[entity_name] = first_item.get( "description", "No description" ) else: descriptions[entity_name] = "No description" else: descriptions[entity_name] = "No description" except Exception as e: logger.warning(f"Failed to get entity descriptions: {e}") # If failed to get, use default description for entity_name in entity_names: descriptions[entity_name] = "No description" return descriptions async def _process_single_batch( self, batch: List[str], max_retries: int = 2 ) -> bool: """Process a single batch with proper error handling and retry mechanism""" for attempt in range(max_retries + 1): try: if attempt > 0: logger.info(f"Retrying attempt {attempt}...") # First analysis uses only entity names, without descriptions # Prepare system prompt based on strictness level if self.config.system_prompt: system_prompt = self.config.system_prompt else: # Choose prompt based on strictness level strictness_prompts = { "strict": PROMPTS["goal_clean_strict"], "medium": PROMPTS["goal_clean_medium"], "loose": PROMPTS["goal_clean_loose"], } system_prompt = strictness_prompts.get( self.config.strictness_level, PROMPTS[ "goal_clean_medium" ], # Default to medium if invalid level ) system_prompt = ( str(system_prompt) + "\n" + PROMPTS["goal_clean_examples"] ) # Add specific analysis instruction for name-only analysis analysis_instruction = PROMPTS["name_only_analysis_instruction"] full_system_prompt = system_prompt + analysis_instruction # Call LLM with entity names only (first analysis) response = await self.service.process_with_llm( str(batch), system_prompt=full_system_prompt ) # Add delay for ollama to recover context await asyncio.sleep(0.5) if not response or response.strip().lower() in [ "null", "", "[]", "none", ]: logger.info("No cleaning needed for this batch") return True # Parse response with improved error handling try: repaired = repair_json(response) data = json.loads(repaired) # Handle both dict and list formats if isinstance(data, dict): merge_operations = data.get("merge", []) elif isinstance(data, list): # If LLM returns a list directly, treat it as merge operations merge_operations = data else: logger.error( f"Unexpected data format: {type(data)}, data: {data}" ) if attempt < max_retries: continue return False except Exception as e: logger.error(f"Failed to parse JSON response: {e}") logger.debug(f"Raw response: {response}") if attempt < max_retries: continue return False # Process merge operations if not merge_operations: logger.info("No merge operations found") return True logger.info(f"Found {len(merge_operations)} merge suggestions") for i, op in enumerate(merge_operations, 1): if not isinstance(op, dict): logger.warning(f"Invalid merge operation format: {op}") return await self._execute_merge_operations(merge_operations, batch) except Exception as e: logger.error( f"Error processing batch (attempt {attempt + 1}): {str(e)}" ) if attempt < max_retries: continue return False return False async def _verify_merge_with_descriptions( self, entities_to_merge: List[str], summary: str ) -> List[Dict]: """ Perform secondary verification using entity descriptions to return refined merge suggestions """ try: logger.info(f"Verifying merge: {entities_to_merge} → {summary}") # Get entity descriptions for all entities to be merged entity_descriptions = await self._get_entity_descriptions(entities_to_merge) # Format entities with descriptions for LLM analysis entities_text_list = [] for i, entity_name in enumerate(entities_to_merge, 1): description = entity_descriptions.get(entity_name, "No description") entities_text_list.append(f"{i}. {entity_name}") entities_text_list.append(f" Description: {description}") entities_with_descriptions = "\n".join(entities_text_list) # Build verification prompt verification_prompt = PROMPTS["secondary_merge_verification"].replace( "{entities_with_descriptions}", entities_with_descriptions ) # Add examples to the system prompt full_system_prompt = PROMPTS["secondary_verification_examples"] # Call LLM for verification response = await self.service.process_with_llm( verification_prompt, system_prompt=full_system_prompt ) # Add delay for ollama to recover context await asyncio.sleep(0.3) if not response: logger.warning("No response from LLM for merge verification") return [] # Parse the verification response with improved error handling try: # Clean the response first cleaned_response = response.strip() # Try to extract JSON from the response if it's wrapped in markdown if "```json" in cleaned_response: start = cleaned_response.find("```json") + 7 end = cleaned_response.find("```", start) if end != -1: cleaned_response = cleaned_response[start:end].strip() elif "```" in cleaned_response: start = cleaned_response.find("```") + 3 end = cleaned_response.rfind("```") if end != -1 and end > start: cleaned_response = cleaned_response[start:end].strip() # Remove any leading/trailing whitespace or newlines cleaned_response = cleaned_response.strip() # Try json-repair first if JSON_REPAIR_AVAILABLE: try: repaired = repair_json(cleaned_response) verification_result = json.loads(repaired) except Exception as repair_error: logger.warning( f"JSON repair failed, trying direct parse: {repair_error}" ) verification_result = json.loads(cleaned_response) else: verification_result = json.loads(cleaned_response) merge_operations = verification_result.get("merge", []) return merge_operations except Exception as e: logger.error(f"Failed to parse verification response: {e}") logger.debug(f"Raw verification response: {response}") return [] except Exception as e: logger.error(f"Error during merge verification: {e}") return [] async def _execute_merge_operations( self, merge_operations: List[Dict], batch: List[str] ) -> bool: """Execute merge operations with secondary verification and atomic transaction support""" successful_merges = [] try: for i, op in enumerate(merge_operations, 1): # Validate operation format if not isinstance(op, dict): continue nodes_to_merge = op.get("keywords", []) summarized_node = op.get("summary", "") if not nodes_to_merge or not summarized_node: continue if len(nodes_to_merge) <= 1: continue # Validate that summary is one of the keywords (fuzzywuzzy validation) summary_match = process.extractOne(summarized_node, nodes_to_merge) if not summary_match or summary_match[1] < ( self.config.similarity_threshold * 100 ): logger.info( f"Skipping illegal merge operation {i}: summary '{summarized_node}' not in keywords list {nodes_to_merge}, best match: {summary_match}" ) continue # Use the matched keyword as the actual summary to ensure exact matching actual_summary = summary_match[0] logger.debug( f"Summary validation passed: '{summarized_node}' → '{actual_summary}'" ) # Find matching nodes with configurable threshold found_nodes = [] for node in nodes_to_merge: if not node: # Skip empty nodes continue match = process.extractOne(node, batch) if match and match[1] >= (self.config.similarity_threshold * 100): found_nodes.append(match[0]) if len(found_nodes) < 2: logger.info( f"Insufficient matched nodes for merge: found_nodes: {found_nodes}, summarized_node: {actual_summary}" ) continue # Update summarized_node to use the validated actual summary summarized_node = actual_summary try: verified_merge_operations = ( await self._verify_merge_with_descriptions( found_nodes, summarized_node ) ) if not verified_merge_operations: logger.info( f" -> Skipping merge due to failed verification: {found_nodes} → {summarized_node}" ) continue # Execute verified merge operations for j, verified_op in enumerate(verified_merge_operations, 1): if not isinstance(verified_op, dict): continue verified_nodes = verified_op.get("keywords", []) verified_summary = verified_op.get("summary", "") if len(verified_nodes) <= 1 or not verified_summary: logger.warning( f"Skipping invalid verification result {j}: nodes={verified_nodes}, summary={verified_summary}" ) continue # Validate that verified summary is one of the verified keywords (fuzzywuzzy validation) verified_summary_match = process.extractOne( verified_summary, verified_nodes ) if not verified_summary_match or verified_summary_match[1] < ( self.config.similarity_threshold * 100 ): logger.warning( f"Skipping verification result {j}: summary '{verified_summary}' not in keywords list {verified_nodes}, best match: {verified_summary_match}" ) continue # Use the matched keyword as the actual verified summary actual_verified_summary = verified_summary_match[0] logger.debug( f"Verified summary validation passed: '{verified_summary}' → '{actual_verified_summary}'" ) # Find matching nodes from the verified list verified_found_nodes = [] for node in verified_nodes: if not node: # Skip empty nodes continue match = process.extractOne(node, batch) if match and match[1] >= ( self.config.similarity_threshold * 100 ): verified_found_nodes.append(match[0]) if len(verified_found_nodes) >= 2: await self.service.merge_entities( verified_found_nodes, actual_verified_summary ) # Update batch atomically for node in verified_found_nodes: if node in batch: batch.remove(node) if actual_verified_summary not in batch: batch.append(actual_verified_summary) successful_merges.append( (verified_found_nodes, actual_verified_summary) ) logger.info( f" √ Successfully merged: {actual_verified_summary} ← {verified_found_nodes}\n" ) except Exception as e: logger.error( f" X Failed to merge entities {found_nodes} → {summarized_node}: {e}" ) # Continue with next operation instead of failing the whole batch continue logger.info( f"Merge operations completed: {len(successful_merges)} successful" ) return True except Exception as e: logger.error(f"Critical error during merge operations: {str(e)}") return False # Register LLM-based strategy DeduplicationStrategyFactory.register_strategy("llm_based", LLMBasedCleaning)