Refactor extraction result processing to reduce code duplication

• Extract shared processing logic • Add delimiter pattern fixes • Improve bracket standardization
2025-09-02 01:22:29 +08:00 · 2025-09-02 01:22:29 +08:00 · 3f8a9abe7e
commit 3f8a9abe7e
parent 3cdc98f366
1 changed files with 101 additions and 128 deletions
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@ -793,6 +793,87 @@ async def _get_cached_extraction_results(
    return sorted_cached_results
 async def _process_extraction_result(
    result: str,
    chunk_key: str,
    file_path: str = "unknown_source",
    tuple_delimiter: str = "<|>",
    record_delimiter: str = "##",
    completion_delimiter: str = "<|COMPLETE|>",
 ) -> tuple[dict, dict]:
    """Parse one extraction result (initial or gleaning) into entities and relationships.

    The text is split into records on ``record_delimiter`` and
    ``completion_delimiter``. Each record has one outer bracket removed from
    each end (ASCII or full-width), is repaired for a few known malformed
    spellings of the default tuple delimiter, and is then split into fields on
    ``tuple_delimiter``. The fields are offered to the entity handler first;
    only if that returns ``None`` are they offered to the relationship handler.

    Args:
        result (str): The extraction result to process
        chunk_key (str): The chunk key for source tracking
        file_path (str): The file path for citation
        tuple_delimiter (str): Delimiter for tuple fields
        record_delimiter (str): Delimiter for records
        completion_delimiter (str): Delimiter for completion

    Returns:
        tuple: (nodes_dict, edges_dict). ``nodes_dict`` maps an entity name to
        the list of entity records carrying that name; ``edges_dict`` maps a
        ``(src_id, tgt_id)`` pair to the list of relationship records for it.
        Both are plain ``dict`` objects.
    """
    maybe_nodes = defaultdict(list)
    maybe_edges = defaultdict(list)

    # Standardize Chinese brackets around record_delimiter to English brackets,
    # so ")##(" and "）##（" are treated alike before splitting.
    bracket_pattern = f"[）)](\\s*{re.escape(record_delimiter)}\\s*)[（(]"
    result = re.sub(bracket_pattern, ")\\1(", result)

    records = split_string_by_multi_markers(
        result,
        [record_delimiter, completion_delimiter],
    )

    for record in records:
        # Remove outer brackets (support English and Chinese brackets)
        record = record.strip()
        if record.startswith(("(", "（")):
            record = record[1:]
        if record.endswith((")", "）")):
            record = record[:-1]
        record = record.strip()

        # str.strip() never returns None, so an `is None` test can never fire.
        # Skip records that are empty once the brackets are removed (e.g. "()").
        if not record:
            continue

        if tuple_delimiter == "<|>":
            # Repairs below only apply to the default delimiter.
            # fix entity<| with entity<|>
            record = re.sub(r"^entity<\|(?!>)", r"entity<|>", record)
            # fix relationship<| with relationship<|>
            record = re.sub(r"^relationship<\|(?!>)", r"relationship<|>", record)
            # Normalise malformed spellings of "<|>". The order is significant:
            # "<<|>>" must be handled before its prefixes/suffixes "<|>>" and "<<|>".
            for malformed in ("<||>", "< | >", "<<|>>", "<|>>", "<<|>"):
                record = record.replace(malformed, "<|>")

        record_attributes = split_string_by_multi_markers(record, [tuple_delimiter])

        # Try to parse as entity
        entity_data = await _handle_single_entity_extraction(
            record_attributes, chunk_key, file_path
        )
        if entity_data is not None:
            maybe_nodes[entity_data["entity_name"]].append(entity_data)
            continue

        # Try to parse as relationship
        relationship_data = await _handle_single_relationship_extraction(
            record_attributes, chunk_key, file_path
        )
        if relationship_data is not None:
            maybe_edges[
                (relationship_data["src_id"], relationship_data["tgt_id"])
            ].append(relationship_data)

    return dict(maybe_nodes), dict(maybe_edges)
 async def _parse_extraction_result(
    text_chunks_storage: BaseKVStorage, extraction_result: str, chunk_id: str
 ) -> tuple[dict, dict]:
@ -814,69 +895,16 @@ async def _parse_extraction_result(
        if chunk_data
        else "unknown_source"
    )
-    context_base = dict(
+
    # Call the shared processing function
    return await _process_extraction_result(
        extraction_result,
        chunk_id,
        file_path,
        tuple_delimiter=PROMPTS["DEFAULT_TUPLE_DELIMITER"],
        record_delimiter=PROMPTS["DEFAULT_RECORD_DELIMITER"],
        completion_delimiter=PROMPTS["DEFAULT_COMPLETION_DELIMITER"],
    )
    maybe_nodes = defaultdict(list)
    maybe_edges = defaultdict(list)
    # Standardize Chinese brackets around record_delimiter to English brackets
    record_delimiter = context_base["record_delimiter"]
    bracket_pattern = f"[）)](\\s*{re.escape(record_delimiter)}\\s*)[（(]"
    extraction_result = re.sub(bracket_pattern, ")\\1(", extraction_result)
    # Parse the extraction result using the same logic as in extract_entities
    records = split_string_by_multi_markers(
        extraction_result,
        [context_base["record_delimiter"], context_base["completion_delimiter"]],
    )
    for record in records:
        # Remove outer brackets
        record = record.strip()
        if record.startswith("(") or record.startswith("（"):
            record = record[1:]
        if record.endswith(")") or record.endswith("）"):
            record = record[:-1]
        record = record.strip()
        if record is None:
            continue
        if context_base["tuple_delimiter"] == "<|>":
            # fix entity<| with entity<|>
            record = re.sub(r"^entity<\|(?!>)", r"entity<|>", record)
            # fix relationship<| with relationship<|>
            record = re.sub(r"^relationship<\|(?!>)", r"relationship<|>", record)
            # fix <||> with <|>
            record = record.replace("<||>", "<|>")
            # fix  < | > with <|>
            record = record.replace("< | >", "<|>")
        record_attributes = split_string_by_multi_markers(
            record, [context_base["tuple_delimiter"]]
        )
        # Try to parse as entity
        entity_data = await _handle_single_entity_extraction(
            record_attributes, chunk_id, file_path
        )
        if entity_data is not None:
            maybe_nodes[entity_data["entity_name"]].append(entity_data)
            continue
        # Try to parse as relationship
        relationship_data = await _handle_single_relationship_extraction(
            record_attributes, chunk_id, file_path
        )
        if relationship_data is not None:
            maybe_edges[
                (relationship_data["src_id"], relationship_data["tgt_id"])
            ].append(relationship_data)
    return dict(maybe_nodes), dict(maybe_edges)
 async def _rebuild_single_entity(
@ -1738,73 +1766,6 @@ async def extract_entities(
    processed_chunks = 0
    total_chunks = len(ordered_chunks)
    async def _process_extraction_result(
        result: str, chunk_key: str, file_path: str = "unknown_source"
    ):
        """Parse one extraction result (initial or gleaning) into entities and relationships.

        The record, completion and tuple delimiters are read from the enclosing
        ``context_base`` mapping. Each record has one outer bracket removed from
        each end (ASCII or full-width) and is split into fields, which are
        offered to the entity handler first and, only if that returns ``None``,
        to the relationship handler.

        Args:
            result (str): The extraction result to process
            chunk_key (str): The chunk key for source tracking
            file_path (str): The file path for citation

        Returns:
            tuple: (nodes_dict, edges_dict) containing the extracted entities and
            relationships. Both are the ``defaultdict(list)`` instances built
            here: entities keyed by entity name, relationships keyed by the
            ``(src_id, tgt_id)`` pair.
        """
        maybe_nodes = defaultdict(list)
        maybe_edges = defaultdict(list)

        tuple_delimiter = context_base["tuple_delimiter"]
        record_delimiter = context_base["record_delimiter"]
        completion_delimiter = context_base["completion_delimiter"]

        # Standardize Chinese brackets around record_delimiter to English brackets
        bracket_pattern = f"[）)](\\s*{re.escape(record_delimiter)}\\s*)[（(]"
        result = re.sub(bracket_pattern, ")\\1(", result)

        records = split_string_by_multi_markers(
            result,
            [record_delimiter, completion_delimiter],
        )

        for record in records:
            # Remove outer brackets (support English and Chinese brackets)
            record = record.strip()
            if record.startswith(("(", "（")):
                record = record[1:]
            if record.endswith((")", "）")):
                record = record[:-1]
            record = record.strip()

            # str.strip() never returns None, so an `is None` test can never
            # fire. Skip records that are empty once the brackets are removed.
            if not record:
                continue

            if tuple_delimiter == "<|>":
                # Repairs below only apply to the default delimiter.
                # fix entity<| with entity<|>
                record = re.sub(r"^entity<\|(?!>)", r"entity<|>", record)
                # fix relationship<| with relationship<|>
                record = re.sub(r"^relationship<\|(?!>)", r"relationship<|>", record)
                # fix <||> and < | > with <|>
                for malformed in ("<||>", "< | >"):
                    record = record.replace(malformed, "<|>")

            record_attributes = split_string_by_multi_markers(
                record, [tuple_delimiter]
            )

            # Try to parse as entity
            if_entities = await _handle_single_entity_extraction(
                record_attributes, chunk_key, file_path
            )
            if if_entities is not None:
                maybe_nodes[if_entities["entity_name"]].append(if_entities)
                continue

            # Try to parse as relationship
            if_relation = await _handle_single_relationship_extraction(
                record_attributes, chunk_key, file_path
            )
            if if_relation is not None:
                maybe_edges[(if_relation["src_id"], if_relation["tgt_id"])].append(
                    if_relation
                )

        return maybe_nodes, maybe_edges
    async def _process_single_content(chunk_key_dp: tuple[str, TextChunkSchema]):
        """Process a single chunk
        Args:
@ -1842,7 +1803,12 @@ async def extract_entities(
        # Process initial extraction with file path
        maybe_nodes, maybe_edges = await _process_extraction_result(
-            final_result, chunk_key, file_path
+            final_result,
            chunk_key,
            file_path,
            tuple_delimiter=context_base["tuple_delimiter"],
            record_delimiter=context_base["record_delimiter"],
            completion_delimiter=context_base["completion_delimiter"],
        )
        # Process additional gleaning results
@ -1861,7 +1827,12 @@ async def extract_entities(
            # Process gleaning result separately with file path
            glean_nodes, glean_edges = await _process_extraction_result(
-                glean_result, chunk_key, file_path
+                glean_result,
                chunk_key,
                file_path,
                tuple_delimiter=context_base["tuple_delimiter"],
                record_delimiter=context_base["record_delimiter"],
                completion_delimiter=context_base["completion_delimiter"],
            )
            # Merge results - only add entities and edges with new names
@ -1869,11 +1840,13 @@ async def extract_entities(
                if (
                    entity_name not in maybe_nodes
                ):  # Only accept entities with new name in gleaning stage
                    maybe_nodes[entity_name] = []  # Explicitly create the list
                    maybe_nodes[entity_name].extend(entities)
            for edge_key, edges in glean_edges.items():
                if (
                    edge_key not in maybe_edges
                ):  # Only accept edges with new name in gleaning stage
                    maybe_edges[edge_key] = []  # Explicitly create the list
                    maybe_edges[edge_key].extend(edges)
            if now_glean_index == entity_extract_max_gleaning - 1: