Fix tuple delimiter regex patterns and add debug logging

- Add debug logs for malformed records
- Fix regex for consecutive delimiters
- Handle missing closing brackets
This commit is contained in:
yangdx 2025-09-14 17:29:27 +08:00
parent 3792f86de3
commit 4dafec8884
2 changed files with 9 additions and 6 deletions

View file

@ -325,6 +325,7 @@ async def _handle_single_entity_extraction(
logger.warning(
f"{chunk_key}: LLM output format error; found {len(record_attributes)}/4 feilds on ENTITY `{record_attributes[1]}` @ `{record_attributes[2] if len(record_attributes) > 2 else 'N/A'}`"
)
logger.debug(record_attributes)
return None
try:
@ -398,6 +399,7 @@ async def _handle_single_relationship_extraction(
logger.warning(
f"{chunk_key}: LLM output format error; found {len(record_attributes)}/5 fields on REALTION `{record_attributes[1]}`~`{record_attributes[2] if len(record_attributes) >2 else 'N/A'}`"
)
logger.debug(record_attributes)
return None
try:

View file

@ -2583,9 +2583,9 @@ def fix_tuple_delimiter_corruption(
# Escape the delimiter core for regex use
escaped_delimiter_core = re.escape(delimiter_core)
# Fix: <|#||#|> -> <|#|>, <|#|||#|> -> <|#|>
# Fix: <|##|> -> <|#|>, <|#||#|> -> <|#|>, <|#|||#|> -> <|#|>
record = re.sub(
rf"<\|{escaped_delimiter_core}\|+{escaped_delimiter_core}\|>",
rf"<\|{escaped_delimiter_core}\|*?{escaped_delimiter_core}\|>",
tuple_delimiter,
record,
)
@ -2604,9 +2604,9 @@ def fix_tuple_delimiter_corruption(
record,
)
# Fix: <X|#|> -> <|#|>, <|#|Y> -> <|#|>, <X|#|Y> -> <|#|>, <||#||> -> <|#|> (one extra character outside pipes)
# Fix: <X|#|> -> <|#|>, <|#|Y> -> <|#|>, <X|#|Y> -> <|#|>, <||#||> -> <|#|>, <||#> -> <|#|> (one extra character outside pipes)
record = re.sub(
rf"<.?\|{escaped_delimiter_core}\|.?>",
rf"<.?\|{escaped_delimiter_core}\|*?>",
tuple_delimiter,
record,
)
@ -2625,9 +2625,10 @@ def fix_tuple_delimiter_corruption(
record,
)
# Fix: <|#| -> <|#|> (missing closing >)
# Fix: <|#| -> <|#|>, <|#|| -> <|#|> (missing closing >)
#
record = re.sub(
rf"<\|{escaped_delimiter_core}\|(?!>)",
rf"<\|{escaped_delimiter_core}\|+(?!>)",
tuple_delimiter,
record,
)