Refactor tuple delimiter corruption fix into reusable utility function
- Extract regex fixes to utils module - Add case-insensitive delimiter handling
This commit is contained in:
parent
b9f80263b8
commit
40688def20
3 changed files with 78 additions and 51 deletions
|
|
@ -3,7 +3,6 @@ from functools import partial
|
||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import json
|
import json
|
||||||
import re
|
|
||||||
import json_repair
|
import json_repair
|
||||||
from typing import Any, AsyncIterator
|
from typing import Any, AsyncIterator
|
||||||
from collections import Counter, defaultdict
|
from collections import Counter, defaultdict
|
||||||
|
|
@ -30,6 +29,7 @@ from .utils import (
|
||||||
build_file_path,
|
build_file_path,
|
||||||
safe_vdb_operation_with_exception,
|
safe_vdb_operation_with_exception,
|
||||||
create_prefixed_exception,
|
create_prefixed_exception,
|
||||||
|
fix_tuple_delimiter_corruption,
|
||||||
)
|
)
|
||||||
from .base import (
|
from .base import (
|
||||||
BaseGraphStorage,
|
BaseGraphStorage,
|
||||||
|
|
@ -875,55 +875,12 @@ async def _process_extraction_result(
|
||||||
if record is None:
|
if record is None:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Fix various forms of tuple_delimiter corruption from the LLM output.
|
# Fix various forms of tuple_delimiter corruption from the LLM output using the dedicated function
|
||||||
# It handles missing or replaced characters around the core delimiter.
|
delimiter_core = tuple_delimiter[2:-2] # Extract "SEP" from "<|SEP|>"
|
||||||
# 1. There might be extra characters inserted between the bracket and pipeline.
|
record = fix_tuple_delimiter_corruption(record, delimiter_core, tuple_delimiter)
|
||||||
# 2. `|` may be missing or replaced by another character.
|
# Repeat the fix with the lower-cased delimiter core (e.g. "sep") so lower-case variants are normalized too
|
||||||
# 3. Missing opening `<` or closing `>`
|
delimiter_core = delimiter_core.lower()
|
||||||
# Example transformations:
|
record = fix_tuple_delimiter_corruption(record, delimiter_core, tuple_delimiter)
|
||||||
# <X|SEP|> -> <|SEP|>, <|SEP|Y> -> <|SEP|>, <X|SEP|Y> -> <|SEP|> ((one extra characters outside pipes)
|
|
||||||
# <SEP>, <SEP|>, <|SEP> -> <|SEP|> (missing one or both pipes)
|
|
||||||
# <XSEP|> -> <|SEP|>, <|SEPX> -> <|SEP|> (where one | is replace by other charater)
|
|
||||||
# |SEP|> -> <|SEP|>, <|SEP| -> <|SEP|> (where one | is missing)
|
|
||||||
|
|
||||||
escaped_delimiter_core = re.escape(
|
|
||||||
tuple_delimiter[2:-2]
|
|
||||||
) # Extract "SEP" from "<|SEP|>"
|
|
||||||
|
|
||||||
# Fix: <X|SEP|> -> <|SEP|>, <|SEP|Y> -> <|SEP|>, <X|SEP|Y> -> <|SEP|> (one extra characters outside pipes)
|
|
||||||
record = re.sub(
|
|
||||||
rf"<.?\|{escaped_delimiter_core}\|.?>",
|
|
||||||
tuple_delimiter,
|
|
||||||
record,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Fix: <SEP>, <SEP|>, <|SEP> -> <|SEP|> (missing one or both pipes)
|
|
||||||
record = re.sub(
|
|
||||||
rf"<\|?{escaped_delimiter_core}\|?>",
|
|
||||||
tuple_delimiter,
|
|
||||||
record,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Fix: <XSEP|> -> <|SEP|>, <|SEPX> -> <|SEP|> (one pipe is replaced by other character)
|
|
||||||
record = re.sub(
|
|
||||||
rf"<[^|]{escaped_delimiter_core}\|>|<\|{escaped_delimiter_core}[^|]>",
|
|
||||||
tuple_delimiter,
|
|
||||||
record,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Fix: |SEP|> -> <|SEP|> (missing opening <)
|
|
||||||
record = re.sub(
|
|
||||||
rf"(?<!<)\|{escaped_delimiter_core}\|>",
|
|
||||||
tuple_delimiter,
|
|
||||||
record,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Fix: <|SEP| -> <|SEP|> (missing closing >)
|
|
||||||
record = re.sub(
|
|
||||||
rf"<\|{escaped_delimiter_core}\|(?!>)",
|
|
||||||
tuple_delimiter,
|
|
||||||
record,
|
|
||||||
)
|
|
||||||
|
|
||||||
record_attributes = split_string_by_multi_markers(record, [tuple_delimiter])
|
record_attributes = split_string_by_multi_markers(record, [tuple_delimiter])
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -4,7 +4,7 @@ from typing import Any
|
||||||
|
|
||||||
PROMPTS: dict[str, Any] = {}
|
PROMPTS: dict[str, Any] = {}
|
||||||
|
|
||||||
# Delimiter must be bracketed in "<|...|>"
|
# All delimiters must be formatted as "<|UPPER_CASE_STRING|>"
|
||||||
PROMPTS["DEFAULT_TUPLE_DELIMITER"] = "<|SEP|>"
|
PROMPTS["DEFAULT_TUPLE_DELIMITER"] = "<|SEP|>"
|
||||||
PROMPTS["DEFAULT_COMPLETION_DELIMITER"] = "<|COMPLETE|>"
|
PROMPTS["DEFAULT_COMPLETION_DELIMITER"] = "<|COMPLETE|>"
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -2544,6 +2544,76 @@ def get_pinyin_sort_key(text: str) -> str:
|
||||||
return text.lower()
|
return text.lower()
|
||||||
|
|
||||||
|
|
||||||
|
def fix_tuple_delimiter_corruption(
    record: str, delimiter_core: str, tuple_delimiter: str
) -> str:
    """Repair corrupted tuple delimiters in LLM output.

    The expected delimiter has the shape ``<|CORE|>``. This function rewrites
    near-miss variants (extra, missing, or substituted characters around the
    core) to the exact ``tuple_delimiter`` string. The fixes are applied in a
    fixed order, each over the result of the previous one.

    Matching is case-sensitive with respect to ``delimiter_core``; callers
    that want to catch another casing must call the function again with that
    casing.

    Args:
        record: The text record to fix.
        delimiter_core: The core delimiter (e.g., "SEP" from "<|SEP|>").
        tuple_delimiter: The complete tuple delimiter (e.g., "<|SEP|>").
            It is inserted literally, so it may safely contain backslashes.

    Returns:
        The record with corrupted delimiters replaced by ``tuple_delimiter``.
        If any argument is empty, ``record`` is returned unchanged.

    Examples:
        >>> fix_tuple_delimiter_corruption("entity<X|SEP|>name", "SEP", "<|SEP|>")
        'entity<|SEP|>name'
        >>> fix_tuple_delimiter_corruption("entity<SEP>name", "SEP", "<|SEP|>")
        'entity<|SEP|>name'
        >>> fix_tuple_delimiter_corruption("entity|SEP|>name", "SEP", "<|SEP|>")
        'entity<|SEP|>name'
    """
    if not record or not delimiter_core or not tuple_delimiter:
        return record

    # Escape the delimiter core so it is matched literally inside the patterns.
    core = re.escape(delimiter_core)

    # Order matters: each pattern runs on the output of the previous one.
    patterns = (
        # <X|SEP|> , <|SEP|Y> , <X|SEP|Y>  (one extra character outside the pipes)
        rf"<.?\|{core}\|.?>",
        # <SEP> , <SEP|> , <|SEP>  (one or both pipes missing)
        rf"<\|?{core}\|?>",
        # <XSEP|> , <|SEPX>  (one pipe replaced by another character)
        rf"<[^|]{core}\|>|<\|{core}[^|]>",
        # |SEP|>  (missing opening <)
        rf"(?<!<)\|{core}\|>",
        # <|SEP|  (missing closing >)
        rf"<\|{core}\|(?!>)",
    )

    for pattern in patterns:
        # A callable replacement inserts tuple_delimiter verbatim. A plain
        # string would be parsed as a template, so a backslash in the
        # delimiter would raise re.error or be read as a group reference.
        record = re.sub(pattern, lambda _match: tuple_delimiter, record)

    return record
|
||||||
|
|
||||||
|
|
||||||
def create_prefixed_exception(original_exception: Exception, prefix: str) -> Exception:
|
def create_prefixed_exception(original_exception: Exception, prefix: str) -> Exception:
|
||||||
"""
|
"""
|
||||||
Safely create a prefixed exception that adapts to all error types.
|
Safely create a prefixed exception that adapts to all error types.
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue