Fix tuple delimiter regex patterns and add debug logging

- Add debug logs for malformed records
- Fix regex for consecutive delimiters
- Handle missing closing brackets
This commit is contained in:
yangdx 2025-09-14 17:29:27 +08:00
parent 4de1473875
commit 87bb8a023b
2 changed files with 9 additions and 6 deletions

View file

@ -326,6 +326,7 @@ async def _handle_single_entity_extraction(
logger.warning( logger.warning(
f"{chunk_key}: LLM output format error; found {len(record_attributes)}/4 feilds on ENTITY `{record_attributes[1]}` @ `{record_attributes[2] if len(record_attributes) > 2 else 'N/A'}`" f"{chunk_key}: LLM output format error; found {len(record_attributes)}/4 feilds on ENTITY `{record_attributes[1]}` @ `{record_attributes[2] if len(record_attributes) > 2 else 'N/A'}`"
) )
logger.debug(record_attributes)
return None return None
try: try:
@ -399,6 +400,7 @@ async def _handle_single_relationship_extraction(
logger.warning( logger.warning(
f"{chunk_key}: LLM output format error; found {len(record_attributes)}/5 fields on REALTION `{record_attributes[1]}`~`{record_attributes[2] if len(record_attributes) >2 else 'N/A'}`" f"{chunk_key}: LLM output format error; found {len(record_attributes)}/5 fields on REALTION `{record_attributes[1]}`~`{record_attributes[2] if len(record_attributes) >2 else 'N/A'}`"
) )
logger.debug(record_attributes)
return None return None
try: try:

View file

@ -2583,9 +2583,9 @@ def fix_tuple_delimiter_corruption(
# Escape the delimiter core for regex use # Escape the delimiter core for regex use
escaped_delimiter_core = re.escape(delimiter_core) escaped_delimiter_core = re.escape(delimiter_core)
# Fix: <|#||#|> -> <|#|>, <|#|||#|> -> <|#|> # Fix: <|##|> -> <|#|>, <|#||#|> -> <|#|>, <|#|||#|> -> <|#|>
record = re.sub( record = re.sub(
rf"<\|{escaped_delimiter_core}\|+{escaped_delimiter_core}\|>", rf"<\|{escaped_delimiter_core}\|*?{escaped_delimiter_core}\|>",
tuple_delimiter, tuple_delimiter,
record, record,
) )
@ -2604,9 +2604,9 @@ def fix_tuple_delimiter_corruption(
record, record,
) )
# Fix: <X|#|> -> <|#|>, <|#|Y> -> <|#|>, <X|#|Y> -> <|#|>, <||#||> -> <|#|> (one extra character outside pipes) # Fix: <X|#|> -> <|#|>, <|#|Y> -> <|#|>, <X|#|Y> -> <|#|>, <||#||> -> <|#|>, <||#> -> <|#|> (one extra character outside pipes)
record = re.sub( record = re.sub(
rf"<.?\|{escaped_delimiter_core}\|.?>", rf"<.?\|{escaped_delimiter_core}\|*?>",
tuple_delimiter, tuple_delimiter,
record, record,
) )
@ -2625,9 +2625,10 @@ def fix_tuple_delimiter_corruption(
record, record,
) )
# Fix: <|#| -> <|#|> (missing closing >) # Fix: <|#| -> <|#|>, <|#|| -> <|#|> (missing closing >)
#
record = re.sub( record = re.sub(
rf"<\|{escaped_delimiter_core}\|(?!>)", rf"<\|{escaped_delimiter_core}\|+(?!>)",
tuple_delimiter, tuple_delimiter,
record, record,
) )