Change tuple delimiter from <|SEP|> to <|S|> across codebase
• Update prompt instruction clarity • Correct utility function examples • Update regex pattern comments
This commit is contained in:
parent
b96f1484ec
commit
1892ed23cc
3 changed files with 18 additions and 18 deletions
|
|
@ -847,7 +847,7 @@ async def _process_extraction_result(
|
||||||
chunk_key: str,
|
chunk_key: str,
|
||||||
timestamp: int,
|
timestamp: int,
|
||||||
file_path: str = "unknown_source",
|
file_path: str = "unknown_source",
|
||||||
tuple_delimiter: str = "<|SEP|>",
|
tuple_delimiter: str = "<|S|>",
|
||||||
completion_delimiter: str = "<|COMPLETE|>",
|
completion_delimiter: str = "<|COMPLETE|>",
|
||||||
) -> tuple[dict, dict]:
|
) -> tuple[dict, dict]:
|
||||||
"""Process a single extraction result (either initial or gleaning)
|
"""Process a single extraction result (either initial or gleaning)
|
||||||
|
|
|
||||||
|
|
@ -15,8 +15,8 @@ You are a Knowledge Graph Specialist responsible for extracting entities and rel
|
||||||
|
|
||||||
---Instructions---
|
---Instructions---
|
||||||
1. Entity Extraction: Identify clearly defined and meaningful entities in the input text, and extract the following information:
|
1. Entity Extraction: Identify clearly defined and meaningful entities in the input text, and extract the following information:
|
||||||
- entity_name: Name of the entity, ensure entity names are consistent throughout the extraction.
|
- entity_name: Name of the entity; ensure entity names are consistent throughout the extraction.
|
||||||
- entity_type: Categorize the entity using the following entity types: {entity_types}; if none of the provided types are suitable, classify it as `Other`.
|
- entity_type: Categorize the entity using the following entity types: {entity_types}; if none of the provided entity types are suitable, classify it as `Other`.
|
||||||
- entity_description: Provide a concise yet comprehensive description of the entity's attributes and activities based on the information present in the input text.
|
- entity_description: Provide a concise yet comprehensive description of the entity's attributes and activities based on the information present in the input text.
|
||||||
2. Relationship Extraction: Identify direct, clearly stated and meaningful relationships between extracted entities, and extract the following information:
|
2. Relationship Extraction: Identify direct, clearly stated and meaningful relationships between extracted entities, and extract the following information:
|
||||||
- source_entity: name of the source entity.
|
- source_entity: name of the source entity.
|
||||||
|
|
|
||||||
|
|
@ -2571,19 +2571,19 @@ def fix_tuple_delimiter_corruption(
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
record: The text record to fix
|
record: The text record to fix
|
||||||
delimiter_core: The core delimiter (e.g., "SEP" from "<|SEP|>")
|
delimiter_core: The core delimiter (e.g., "S" from "<|S|>")
|
||||||
tuple_delimiter: The complete tuple delimiter (e.g., "<|SEP|>")
|
tuple_delimiter: The complete tuple delimiter (e.g., "<|S|>")
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
The corrected record with proper tuple_delimiter format
|
The corrected record with proper tuple_delimiter format
|
||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
>>> fix_tuple_delimiter_corruption("entity<X|SEP|>name", "SEP", "<|SEP|>")
|
>>> fix_tuple_delimiter_corruption("entity<X|S|>name", "S", "<|S|>")
|
||||||
"entity<|SEP|>name"
|
"entity<|S|>name"
|
||||||
>>> fix_tuple_delimiter_corruption("entity<SEP>name", "SEP", "<|SEP|>")
|
>>> fix_tuple_delimiter_corruption("entity<S>name", "S", "<|S|>")
|
||||||
"entity<|SEP|>name"
|
"entity<|S|>name"
|
||||||
>>> fix_tuple_delimiter_corruption("entity|SEP|>name", "SEP", "<|SEP|>")
|
>>> fix_tuple_delimiter_corruption("entity|S|>name", "S", "<|S|>")
|
||||||
"entity<|SEP|>name"
|
"entity<|S|>name"
|
||||||
"""
|
"""
|
||||||
if not record or not delimiter_core or not tuple_delimiter:
|
if not record or not delimiter_core or not tuple_delimiter:
|
||||||
return record
|
return record
|
||||||
|
|
@ -2591,49 +2591,49 @@ def fix_tuple_delimiter_corruption(
|
||||||
# Escape the delimiter core for regex use
|
# Escape the delimiter core for regex use
|
||||||
escaped_delimiter_core = re.escape(delimiter_core)
|
escaped_delimiter_core = re.escape(delimiter_core)
|
||||||
|
|
||||||
# Fix: <|SEP||SEP|> -> <|SEP|>, <|SEP|||SEP|> -> <|SEP|>
|
# Fix: <|S||S|> -> <|S|>, <|S|||S|> -> <|S|>
|
||||||
record = re.sub(
|
record = re.sub(
|
||||||
rf"<\|{escaped_delimiter_core}\|+{escaped_delimiter_core}\|>",
|
rf"<\|{escaped_delimiter_core}\|+{escaped_delimiter_core}\|>",
|
||||||
tuple_delimiter,
|
tuple_delimiter,
|
||||||
record,
|
record,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Fix: <X|SEP|> -> <|SEP|>, <|SEP|Y> -> <|SEP|>, <X|SEP|Y> -> <|SEP|> (one extra characters outside pipes)
|
# Fix: <X|S|> -> <|S|>, <|S|Y> -> <|S|>, <X|S|Y> -> <|S|> (at most one extra character on either side, outside the pipes)
|
||||||
record = re.sub(
|
record = re.sub(
|
||||||
rf"<.?\|{escaped_delimiter_core}\|.?>",
|
rf"<.?\|{escaped_delimiter_core}\|.?>",
|
||||||
tuple_delimiter,
|
tuple_delimiter,
|
||||||
record,
|
record,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Fix: <SEP>, <SEP|>, <|SEP> -> <|SEP|> (missing one or both pipes)
|
# Fix: <S>, <S|>, <|S> -> <|S|> (missing one or both pipes)
|
||||||
record = re.sub(
|
record = re.sub(
|
||||||
rf"<\|?{escaped_delimiter_core}\|?>",
|
rf"<\|?{escaped_delimiter_core}\|?>",
|
||||||
tuple_delimiter,
|
tuple_delimiter,
|
||||||
record,
|
record,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Fix: <XSEP|> -> <|SEP|>, <|SEPX> -> <|SEP|> (one pipe is replaced by other character)
|
# Fix: <XS|> -> <|S|>, <|SX> -> <|S|> (one pipe is replaced by another character)
|
||||||
record = re.sub(
|
record = re.sub(
|
||||||
rf"<[^|]{escaped_delimiter_core}\|>|<\|{escaped_delimiter_core}[^|]>",
|
rf"<[^|]{escaped_delimiter_core}\|>|<\|{escaped_delimiter_core}[^|]>",
|
||||||
tuple_delimiter,
|
tuple_delimiter,
|
||||||
record,
|
record,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Fix: <|SEP| -> <|SEP|> (missing closing >)
|
# Fix: <|S| -> <|S|> (missing closing >)
|
||||||
record = re.sub(
|
record = re.sub(
|
||||||
rf"<\|{escaped_delimiter_core}\|(?!>)",
|
rf"<\|{escaped_delimiter_core}\|(?!>)",
|
||||||
tuple_delimiter,
|
tuple_delimiter,
|
||||||
record,
|
record,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Fix: |SEP|> -> <|SEP|> (missing opening <)
|
# Fix: |S|> -> <|S|> (missing opening <)
|
||||||
record = re.sub(
|
record = re.sub(
|
||||||
rf"(?<!<)\|{escaped_delimiter_core}\|>",
|
rf"(?<!<)\|{escaped_delimiter_core}\|>",
|
||||||
tuple_delimiter,
|
tuple_delimiter,
|
||||||
record,
|
record,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Fix: <|SEP|>| -> <|SEP|> ( this is a fix of: <|SEP|| -> <|SEP|> )
|
# Fix: <|S|>| -> <|S|> ( this is a fix for: <|S|| -> <|S|> )
|
||||||
record = re.sub(
|
record = re.sub(
|
||||||
rf"<\|{escaped_delimiter_core}\|>\|",
|
rf"<\|{escaped_delimiter_core}\|>\|",
|
||||||
tuple_delimiter,
|
tuple_delimiter,
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue