Change tuple delimiter from <|SEP|> to <|S|> across codebase
• Update prompt instruction clarity • Correct utility function examples • Update regex pattern comments
This commit is contained in:
parent
b96f1484ec
commit
1892ed23cc
3 changed files with 18 additions and 18 deletions
|
|
@ -847,7 +847,7 @@ async def _process_extraction_result(
|
|||
chunk_key: str,
|
||||
timestamp: int,
|
||||
file_path: str = "unknown_source",
|
||||
tuple_delimiter: str = "<|SEP|>",
|
||||
tuple_delimiter: str = "<|S|>",
|
||||
completion_delimiter: str = "<|COMPLETE|>",
|
||||
) -> tuple[dict, dict]:
|
||||
"""Process a single extraction result (either initial or gleaning)
|
||||
|
|
|
|||
|
|
@ -15,8 +15,8 @@ You are a Knowledge Graph Specialist responsible for extracting entities and rel
|
|||
|
||||
---Instructions---
|
||||
1. Entity Extraction: Identify clearly defined and meaningful entities in the input text, and extract the following information:
|
||||
- entity_name: Name of the entity, ensure entity names are consistent throughout the extraction.
|
||||
- entity_type: Categorize the entity using the following entity types: {entity_types}; if none of the provided types are suitable, classify it as `Other`.
|
||||
- entity_name: Name of the entity; ensure entity names are consistent throughout the extraction.
|
||||
- entity_type: Categorize the entity using the following entity types: {entity_types}; if none of the provided entity types are suitable, classify it as `Other`.
|
||||
- entity_description: Provide a concise yet comprehensive description of the entity's attributes and activities based on the information present in the input text.
|
||||
2. Relationship Extraction: Identify direct, clearly stated and meaningful relationships between extracted entities, and extract the following information:
|
||||
- source_entity: name of the source entity.
|
||||
|
|
|
|||
|
|
@ -2571,19 +2571,19 @@ def fix_tuple_delimiter_corruption(
|
|||
|
||||
Args:
|
||||
record: The text record to fix
|
||||
delimiter_core: The core delimiter (e.g., "SEP" from "<|SEP|>")
|
||||
tuple_delimiter: The complete tuple delimiter (e.g., "<|SEP|>")
|
||||
delimiter_core: The core delimiter (e.g., "S" from "<|S|>")
|
||||
tuple_delimiter: The complete tuple delimiter (e.g., "<|S|>")
|
||||
|
||||
Returns:
|
||||
The corrected record with proper tuple_delimiter format
|
||||
|
||||
Examples:
|
||||
>>> fix_tuple_delimiter_corruption("entity<X|SEP|>name", "SEP", "<|SEP|>")
|
||||
"entity<|SEP|>name"
|
||||
>>> fix_tuple_delimiter_corruption("entity<SEP>name", "SEP", "<|SEP|>")
|
||||
"entity<|SEP|>name"
|
||||
>>> fix_tuple_delimiter_corruption("entity|SEP|>name", "SEP", "<|SEP|>")
|
||||
"entity<|SEP|>name"
|
||||
>>> fix_tuple_delimiter_corruption("entity<X|S|>name", "S", "<|S|>")
|
||||
"entity<|S|>name"
|
||||
>>> fix_tuple_delimiter_corruption("entity<S>name", "S", "<|S|>")
|
||||
"entity<|S|>name"
|
||||
>>> fix_tuple_delimiter_corruption("entity|S|>name", "S", "<|S|>")
|
||||
"entity<|S|>name"
|
||||
"""
|
||||
if not record or not delimiter_core or not tuple_delimiter:
|
||||
return record
|
||||
|
|
@ -2591,49 +2591,49 @@ def fix_tuple_delimiter_corruption(
|
|||
# Escape the delimiter core for regex use
|
||||
escaped_delimiter_core = re.escape(delimiter_core)
|
||||
|
||||
# Fix: <|SEP||SEP|> -> <|SEP|>, <|SEP|||SEP|> -> <|SEP|>
|
||||
# Fix: <|S||S|> -> <|S|>, <|S|||S|> -> <|S|>
|
||||
record = re.sub(
|
||||
rf"<\|{escaped_delimiter_core}\|+{escaped_delimiter_core}\|>",
|
||||
tuple_delimiter,
|
||||
record,
|
||||
)
|
||||
|
||||
# Fix: <X|SEP|> -> <|SEP|>, <|SEP|Y> -> <|SEP|>, <X|SEP|Y> -> <|SEP|> (one extra characters outside pipes)
|
||||
# Fix: <X|S|> -> <|S|>, <|S|Y> -> <|S|>, <X|S|Y> -> <|S|> (one extra character outside the pipes)
|
||||
record = re.sub(
|
||||
rf"<.?\|{escaped_delimiter_core}\|.?>",
|
||||
tuple_delimiter,
|
||||
record,
|
||||
)
|
||||
|
||||
# Fix: <SEP>, <SEP|>, <|SEP> -> <|SEP|> (missing one or both pipes)
|
||||
# Fix: <S>, <S|>, <|S> -> <|S|> (missing one or both pipes)
|
||||
record = re.sub(
|
||||
rf"<\|?{escaped_delimiter_core}\|?>",
|
||||
tuple_delimiter,
|
||||
record,
|
||||
)
|
||||
|
||||
# Fix: <XSEP|> -> <|SEP|>, <|SEPX> -> <|SEP|> (one pipe is replaced by other character)
|
||||
# Fix: <XS|> -> <|S|>, <|SX> -> <|S|> (one pipe is replaced by another character)
|
||||
record = re.sub(
|
||||
rf"<[^|]{escaped_delimiter_core}\|>|<\|{escaped_delimiter_core}[^|]>",
|
||||
tuple_delimiter,
|
||||
record,
|
||||
)
|
||||
|
||||
# Fix: <|SEP| -> <|SEP|> (missing closing >)
|
||||
# Fix: <|S| -> <|S|> (missing closing >)
|
||||
record = re.sub(
|
||||
rf"<\|{escaped_delimiter_core}\|(?!>)",
|
||||
tuple_delimiter,
|
||||
record,
|
||||
)
|
||||
|
||||
# Fix: |SEP|> -> <|SEP|> (missing opening <)
|
||||
# Fix: |S|> -> <|S|> (missing opening <)
|
||||
record = re.sub(
|
||||
rf"(?<!<)\|{escaped_delimiter_core}\|>",
|
||||
tuple_delimiter,
|
||||
record,
|
||||
)
|
||||
|
||||
# Fix: <|SEP|>| -> <|SEP|> ( this is a fix of: <|SEP|| -> <|SEP|> )
|
||||
# Fix: <|S|>| -> <|S|> ( this is a fix for: <|S|| -> <|S|> )
|
||||
record = re.sub(
|
||||
rf"<\|{escaped_delimiter_core}\|>\|",
|
||||
tuple_delimiter,
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue