Improve extraction parsing with better bracket handling and delimiter fixes
• Standardize Chinese/English brackets
• Fix incomplete tuple delimiters
• Remove duplicate delimiter fix code
• Support mixed bracket formats
• Enhance record parsing robustness
This commit is contained in:
parent
8bbf307aeb
commit
3cdc98f366
1 changed file with 47 additions and 16 deletions
|
|
@@ -822,23 +822,39 @@ async def _parse_extraction_result(
|
||||||
maybe_nodes = defaultdict(list)
|
maybe_nodes = defaultdict(list)
|
||||||
maybe_edges = defaultdict(list)
|
maybe_edges = defaultdict(list)
|
||||||
|
|
||||||
# Preventive fix: when tuple_delimiter is <|>, fix LLM output instability issues
|
# Standardize Chinese brackets around record_delimiter to English brackets
|
||||||
if context_base["tuple_delimiter"] == "<|>":
|
record_delimiter = context_base["record_delimiter"]
|
||||||
# 1. Convert <||> to <|>
|
bracket_pattern = f"[))](\\s*{re.escape(record_delimiter)}\\s*)[((]"
|
||||||
extraction_result = extraction_result.replace("<||>", "<|>")
|
extraction_result = re.sub(bracket_pattern, ")\\1(", extraction_result)
|
||||||
# 2. Convert < | > to <|>
|
|
||||||
extraction_result = extraction_result.replace("< | >", "<|>")
|
|
||||||
|
|
||||||
# Parse the extraction result using the same logic as in extract_entities
|
# Parse the extraction result using the same logic as in extract_entities
|
||||||
records = split_string_by_multi_markers(
|
records = split_string_by_multi_markers(
|
||||||
extraction_result,
|
extraction_result,
|
||||||
[context_base["record_delimiter"], context_base["completion_delimiter"]],
|
[context_base["record_delimiter"], context_base["completion_delimiter"]],
|
||||||
)
|
)
|
||||||
|
|
||||||
for record in records:
|
for record in records:
|
||||||
record = re.search(r"\((.*)\)", record, re.DOTALL)
|
# Remove outer brackets
|
||||||
|
record = record.strip()
|
||||||
|
if record.startswith("(") or record.startswith("("):
|
||||||
|
record = record[1:]
|
||||||
|
if record.endswith(")") or record.endswith(")"):
|
||||||
|
record = record[:-1]
|
||||||
|
|
||||||
|
record = record.strip()
|
||||||
if record is None:
|
if record is None:
|
||||||
continue
|
continue
|
||||||
record = record.group(1)
|
|
||||||
|
if context_base["tuple_delimiter"] == "<|>":
|
||||||
|
# fix entity<| with entity<|>
|
||||||
|
record = re.sub(r"^entity<\|(?!>)", r"entity<|>", record)
|
||||||
|
# fix relationship<| with relationship<|>
|
||||||
|
record = re.sub(r"^relationship<\|(?!>)", r"relationship<|>", record)
|
||||||
|
# fix <||> with <|>
|
||||||
|
record = record.replace("<||>", "<|>")
|
||||||
|
# fix < | > with <|>
|
||||||
|
record = record.replace("< | >", "<|>")
|
||||||
|
|
||||||
record_attributes = split_string_by_multi_markers(
|
record_attributes = split_string_by_multi_markers(
|
||||||
record, [context_base["tuple_delimiter"]]
|
record, [context_base["tuple_delimiter"]]
|
||||||
)
|
)
|
||||||
|
|
@@ -1736,12 +1752,10 @@ async def extract_entities(
|
||||||
maybe_nodes = defaultdict(list)
|
maybe_nodes = defaultdict(list)
|
||||||
maybe_edges = defaultdict(list)
|
maybe_edges = defaultdict(list)
|
||||||
|
|
||||||
# Preventive fix: when tuple_delimiter is <|>, fix LLM output instability issues
|
# Standardize Chinese brackets around record_delimiter to English brackets
|
||||||
if context_base["tuple_delimiter"] == "<|>":
|
record_delimiter = context_base["record_delimiter"]
|
||||||
# 1. Convert <||> to <|>
|
bracket_pattern = f"[))](\\s*{re.escape(record_delimiter)}\\s*)[((]"
|
||||||
result = result.replace("<||>", "<|>")
|
result = re.sub(bracket_pattern, ")\\1(", result)
|
||||||
# 2. Convert < | > to <|>
|
|
||||||
result = result.replace("< | >", "<|>")
|
|
||||||
|
|
||||||
records = split_string_by_multi_markers(
|
records = split_string_by_multi_markers(
|
||||||
result,
|
result,
|
||||||
|
|
@@ -1749,10 +1763,27 @@ async def extract_entities(
|
||||||
)
|
)
|
||||||
|
|
||||||
for record in records:
|
for record in records:
|
||||||
record = re.search(r"\((.*)\)", record, re.DOTALL)
|
# Remove outer brackets (support English and Chinese brackets)
|
||||||
|
record = record.strip()
|
||||||
|
if record.startswith("(") or record.startswith("("):
|
||||||
|
record = record[1:]
|
||||||
|
if record.endswith(")") or record.endswith(")"):
|
||||||
|
record = record[:-1]
|
||||||
|
|
||||||
|
record = record.strip()
|
||||||
if record is None:
|
if record is None:
|
||||||
continue
|
continue
|
||||||
record = record.group(1)
|
|
||||||
|
if context_base["tuple_delimiter"] == "<|>":
|
||||||
|
# fix entity<| with entity<|>
|
||||||
|
record = re.sub(r"^entity<\|(?!>)", r"entity<|>", record)
|
||||||
|
# fix relationship<| with relationship<|>
|
||||||
|
record = re.sub(r"^relationship<\|(?!>)", r"relationship<|>", record)
|
||||||
|
# fix <||> with <|>
|
||||||
|
record = record.replace("<||>", "<|>")
|
||||||
|
# fix < | > with <|>
|
||||||
|
record = record.replace("< | >", "<|>")
|
||||||
|
|
||||||
record_attributes = split_string_by_multi_markers(
|
record_attributes = split_string_by_multi_markers(
|
||||||
record, [context_base["tuple_delimiter"]]
|
record, [context_base["tuple_delimiter"]]
|
||||||
)
|
)
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue