Change tuple delimiter from <|SEP|> to <|S|> across codebase

• Update prompt instruction clarity
• Correct utility function examples
• Update regex pattern comments
This commit is contained in:
yangdx 2025-09-12 08:57:46 +08:00
parent b96f1484ec
commit 1892ed23cc
3 changed files with 18 additions and 18 deletions

View file

@ -847,7 +847,7 @@ async def _process_extraction_result(
chunk_key: str,
timestamp: int,
file_path: str = "unknown_source",
tuple_delimiter: str = "<|SEP|>",
tuple_delimiter: str = "<|S|>",
completion_delimiter: str = "<|COMPLETE|>",
) -> tuple[dict, dict]:
"""Process a single extraction result (either initial or gleaning)

View file

@ -15,8 +15,8 @@ You are a Knowledge Graph Specialist responsible for extracting entities and rel
---Instructions---
1. Entity Extraction: Identify clearly defined and meaningful entities in the input text, and extract the following information:
- entity_name: Name of the entity, ensure entity names are consistent throughout the extraction.
- entity_type: Categorize the entity using the following entity types: {entity_types}; if none of the provided types are suitable, classify it as `Other`.
- entity_name: Name of the entity; ensure entity names are consistent throughout the extraction.
- entity_type: Categorize the entity using the following entity types: {entity_types}; if none of the provided entity types are suitable, classify it as `Other`.
- entity_description: Provide a concise yet comprehensive description of the entity's attributes and activities based on the information present in the input text.
2. Relationship Extraction: Identify direct, clearly stated and meaningful relationships between extracted entities, and extract the following information:
- source_entity: name of the source entity.

View file

@ -2571,19 +2571,19 @@ def fix_tuple_delimiter_corruption(
Args:
record: The text record to fix
delimiter_core: The core delimiter (e.g., "SEP" from "<|SEP|>")
tuple_delimiter: The complete tuple delimiter (e.g., "<|SEP|>")
delimiter_core: The core delimiter (e.g., "S" from "<|S|>")
tuple_delimiter: The complete tuple delimiter (e.g., "<|S|>")
Returns:
The corrected record with proper tuple_delimiter format
Examples:
>>> fix_tuple_delimiter_corruption("entity<X|SEP|>name", "SEP", "<|SEP|>")
"entity<|SEP|>name"
>>> fix_tuple_delimiter_corruption("entity<SEP>name", "SEP", "<|SEP|>")
"entity<|SEP|>name"
>>> fix_tuple_delimiter_corruption("entity|SEP|>name", "SEP", "<|SEP|>")
"entity<|SEP|>name"
>>> fix_tuple_delimiter_corruption("entity<X|S|>name", "S", "<|S|>")
"entity<|S|>name"
>>> fix_tuple_delimiter_corruption("entity<S>name", "S", "<|S|>")
"entity<|S|>name"
>>> fix_tuple_delimiter_corruption("entity|S|>name", "S", "<|S|>")
"entity<|S|>name"
"""
if not record or not delimiter_core or not tuple_delimiter:
return record
@ -2591,49 +2591,49 @@ def fix_tuple_delimiter_corruption(
# Escape the delimiter core for regex use
escaped_delimiter_core = re.escape(delimiter_core)
# Fix: <|SEP||SEP|> -> <|SEP|>, <|SEP|||SEP|> -> <|SEP|>
# Fix: <|S||S|> -> <|S|>, <|S|||S|> -> <|S|>
record = re.sub(
rf"<\|{escaped_delimiter_core}\|+{escaped_delimiter_core}\|>",
tuple_delimiter,
record,
)
# Fix: <X|SEP|> -> <|SEP|>, <|SEP|Y> -> <|SEP|>, <X|SEP|Y> -> <|SEP|> (one extra characters outside pipes)
# Fix: <X|S|> -> <|S|>, <|S|Y> -> <|S|>, <X|S|Y> -> <|S|> (at most one extra character on each side, outside the pipes)
record = re.sub(
rf"<.?\|{escaped_delimiter_core}\|.?>",
tuple_delimiter,
record,
)
# Fix: <SEP>, <SEP|>, <|SEP> -> <|SEP|> (missing one or both pipes)
# Fix: <S>, <S|>, <|S> -> <|S|> (missing one or both pipes)
record = re.sub(
rf"<\|?{escaped_delimiter_core}\|?>",
tuple_delimiter,
record,
)
# Fix: <XSEP|> -> <|SEP|>, <|SEPX> -> <|SEP|> (one pipe is replaced by other character)
# Fix: <XS|> -> <|S|>, <|SX> -> <|S|> (one pipe is replaced by other character)
record = re.sub(
rf"<[^|]{escaped_delimiter_core}\|>|<\|{escaped_delimiter_core}[^|]>",
tuple_delimiter,
record,
)
# Fix: <|SEP| -> <|SEP|> (missing closing >)
# Fix: <|S| -> <|S|> (missing closing >)
record = re.sub(
rf"<\|{escaped_delimiter_core}\|(?!>)",
tuple_delimiter,
record,
)
# Fix: |SEP|> -> <|SEP|> (missing opening <)
# Fix: |S|> -> <|S|> (missing opening <)
record = re.sub(
rf"(?<!<)\|{escaped_delimiter_core}\|>",
tuple_delimiter,
record,
)
# Fix: <|SEP|>| -> <|SEP|> ( this is a fix of: <|SEP|| -> <|SEP|> )
# Fix: <|S|>| -> <|S|> ( this is a fix for: <|S|| -> <|S|> )
record = re.sub(
rf"<\|{escaped_delimiter_core}\|>\|",
tuple_delimiter,