Fix tuple delimiter regex patterns and add debug logging
- Add debug logs for malformed records
- Fix regex for consecutive delimiters
- Handle missing closing brackets
This commit is contained in:
parent
4de1473875
commit
87bb8a023b
2 changed files with 9 additions and 6 deletions
|
|
@ -326,6 +326,7 @@ async def _handle_single_entity_extraction(
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"{chunk_key}: LLM output format error; found {len(record_attributes)}/4 feilds on ENTITY `{record_attributes[1]}` @ `{record_attributes[2] if len(record_attributes) > 2 else 'N/A'}`"
|
f"{chunk_key}: LLM output format error; found {len(record_attributes)}/4 feilds on ENTITY `{record_attributes[1]}` @ `{record_attributes[2] if len(record_attributes) > 2 else 'N/A'}`"
|
||||||
)
|
)
|
||||||
|
logger.debug(record_attributes)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|
@ -399,6 +400,7 @@ async def _handle_single_relationship_extraction(
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"{chunk_key}: LLM output format error; found {len(record_attributes)}/5 fields on REALTION `{record_attributes[1]}`~`{record_attributes[2] if len(record_attributes) >2 else 'N/A'}`"
|
f"{chunk_key}: LLM output format error; found {len(record_attributes)}/5 fields on REALTION `{record_attributes[1]}`~`{record_attributes[2] if len(record_attributes) >2 else 'N/A'}`"
|
||||||
)
|
)
|
||||||
|
logger.debug(record_attributes)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|
|
||||||
|
|
@ -2583,9 +2583,9 @@ def fix_tuple_delimiter_corruption(
|
||||||
# Escape the delimiter core for regex use
|
# Escape the delimiter core for regex use
|
||||||
escaped_delimiter_core = re.escape(delimiter_core)
|
escaped_delimiter_core = re.escape(delimiter_core)
|
||||||
|
|
||||||
# Fix: <|#||#|> -> <|#|>, <|#|||#|> -> <|#|>
|
# Fix: <|##|> -> <|#|>, <|#||#|> -> <|#|>, <|#|||#|> -> <|#|>
|
||||||
record = re.sub(
|
record = re.sub(
|
||||||
rf"<\|{escaped_delimiter_core}\|+{escaped_delimiter_core}\|>",
|
rf"<\|{escaped_delimiter_core}\|*?{escaped_delimiter_core}\|>",
|
||||||
tuple_delimiter,
|
tuple_delimiter,
|
||||||
record,
|
record,
|
||||||
)
|
)
|
||||||
|
|
@ -2604,9 +2604,9 @@ def fix_tuple_delimiter_corruption(
|
||||||
record,
|
record,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Fix: <X|#|> -> <|#|>, <|#|Y> -> <|#|>, <X|#|Y> -> <|#|>, <||#||> -> <|#|> (one extra character outside pipes)
|
# Fix: <X|#|> -> <|#|>, <|#|Y> -> <|#|>, <X|#|Y> -> <|#|>, <||#||> -> <|#|>, <||#> -> <|#|> (one extra character outside pipes)
|
||||||
record = re.sub(
|
record = re.sub(
|
||||||
rf"<.?\|{escaped_delimiter_core}\|.?>",
|
rf"<.?\|{escaped_delimiter_core}\|*?>",
|
||||||
tuple_delimiter,
|
tuple_delimiter,
|
||||||
record,
|
record,
|
||||||
)
|
)
|
||||||
|
|
@ -2625,9 +2625,10 @@ def fix_tuple_delimiter_corruption(
|
||||||
record,
|
record,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Fix: <|#| -> <|#|> (missing closing >)
|
# Fix: <|#| -> <|#|>, <|#|| -> <|#|> (missing closing >)
|
||||||
|
#
|
||||||
record = re.sub(
|
record = re.sub(
|
||||||
rf"<\|{escaped_delimiter_core}\|(?!>)",
|
rf"<\|{escaped_delimiter_core}\|+(?!>)",
|
||||||
tuple_delimiter,
|
tuple_delimiter,
|
||||||
record,
|
record,
|
||||||
)
|
)
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue