Change tuple delimiter from <|SEP|> to <|S|> across codebase

• Update prompt instruction clarity • Correct utility function examples • Update regex pattern comments
2025-09-12 08:57:46 +08:00 · 2025-09-12 08:57:46 +08:00 · 1892ed23cc
commit 1892ed23cc
parent b96f1484ec
3 changed files with 18 additions and 18 deletions
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@ -847,7 +847,7 @@ async def _process_extraction_result(
    chunk_key: str,
    timestamp: int,
    file_path: str = "unknown_source",
-    tuple_delimiter: str = "<|SEP|>",
+    tuple_delimiter: str = "<|S|>",
    completion_delimiter: str = "<|COMPLETE|>",
 ) -> tuple[dict, dict]:
    """Process a single extraction result (either initial or gleaning)
--- a/lightrag/prompt.py
+++ b/lightrag/prompt.py
@ -15,8 +15,8 @@ You are a Knowledge Graph Specialist responsible for extracting entities and rel

 ---Instructions---
 1. Entity Extraction: Identify clearly defined and meaningful entities in the input text, and extract the following information:
-  - entity_name: Name of the entity, ensure entity names are consistent throughout the extraction.
-  - entity_type: Categorize the entity using the following entity types: {entity_types}; if none of the provided types are suitable, classify it as `Other`.
+  - entity_name: Name of the entity; ensure entity names are consistent throughout the extraction.
+  - entity_type: Categorize the entity using the following entity types: {entity_types}; if none of the provided entity types are suitable, classify it as `Other`.
  - entity_description: Provide a concise yet comprehensive description of the entity's attributes and activities based on the information present in the input text.
 2. Relationship Extraction: Identify direct, clearly stated and meaningful relationships between extracted entities, and extract the following information:
  - source_entity: name of the source entity.
--- a/lightrag/utils.py
+++ b/lightrag/utils.py
@ -2571,19 +2571,19 @@ def fix_tuple_delimiter_corruption(

    Args:
        record: The text record to fix
-        delimiter_core: The core delimiter (e.g., "SEP" from "<|SEP|>")
-        tuple_delimiter: The complete tuple delimiter (e.g., "<|SEP|>")
+        delimiter_core: The core delimiter (e.g., "S" from "<|S|>")
+        tuple_delimiter: The complete tuple delimiter (e.g., "<|S|>")

    Returns:
        The corrected record with proper tuple_delimiter format

    Examples:
-        >>> fix_tuple_delimiter_corruption("entity<X|SEP|>name", "SEP", "<|SEP|>")
-        "entity<|SEP|>name"
-        >>> fix_tuple_delimiter_corruption("entity<SEP>name", "SEP", "<|SEP|>")
-        "entity<|SEP|>name"
-        >>> fix_tuple_delimiter_corruption("entity|SEP|>name", "SEP", "<|SEP|>")
-        "entity<|SEP|>name"
+        >>> fix_tuple_delimiter_corruption("entity<X|S|>name", "S", "<|S|>")
+        "entity<|S|>name"
+        >>> fix_tuple_delimiter_corruption("entity<S>name", "S", "<|S|>")
+        "entity<|S|>name"
+        >>> fix_tuple_delimiter_corruption("entity|S|>name", "S", "<|S|>")
+        "entity<|S|>name"
    """
    if not record or not delimiter_core or not tuple_delimiter:
        return record
@ -2591,49 +2591,49 @@ def fix_tuple_delimiter_corruption(
    # Escape the delimiter core for regex use
    escaped_delimiter_core = re.escape(delimiter_core)

-    # Fix: <|SEP||SEP|> -> <|SEP|>, <|SEP|||SEP|> -> <|SEP|>
+    # Fix: <|S||S|> -> <|S|>, <|S|||S|> -> <|S|>
    record = re.sub(
        rf"<\|{escaped_delimiter_core}\|+{escaped_delimiter_core}\|>",
        tuple_delimiter,
        record,
    )

-    # Fix: <X|SEP|> -> <|SEP|>, <|SEP|Y> -> <|SEP|>, <X|SEP|Y> -> <|SEP|>  (one extra characters outside pipes)
+    # Fix: <X|S|> -> <|S|>, <|S|Y> -> <|S|>, <X|S|Y> -> <|S|>  (one extra character outside pipes)
    record = re.sub(
        rf"<.?\|{escaped_delimiter_core}\|.?>",
        tuple_delimiter,
        record,
    )

-    # Fix: <SEP>, <SEP|>, <|SEP> -> <|SEP|> (missing one or both pipes)
+    # Fix: <S>, <S|>, <|S> -> <|S|> (missing one or both pipes)
    record = re.sub(
        rf"<\|?{escaped_delimiter_core}\|?>",
        tuple_delimiter,
        record,
    )

-    # Fix: <XSEP|> -> <|SEP|>, <|SEPX> -> <|SEP|> (one pipe is replaced by other character)
+    # Fix: <XS|> -> <|S|>, <|SX> -> <|S|> (one pipe is replaced by another character)
    record = re.sub(
        rf"<[^|]{escaped_delimiter_core}\|>|<\|{escaped_delimiter_core}[^|]>",
        tuple_delimiter,
        record,
    )

-    # Fix: <|SEP| -> <|SEP|> (missing closing >)
+    # Fix: <|S| -> <|S|> (missing closing >)
    record = re.sub(
        rf"<\|{escaped_delimiter_core}\|(?!>)",
        tuple_delimiter,
        record,
    )

-    # Fix: |SEP|> -> <|SEP|> (missing opening <)
+    # Fix: |S|> -> <|S|> (missing opening <)
    record = re.sub(
        rf"(?<!<)\|{escaped_delimiter_core}\|>",
        tuple_delimiter,
        record,
    )

-    # Fix: <|SEP|>| -> <|SEP|>  ( this is a fix of: <|SEP|| -> <|SEP|> )
+    # Fix: <|S|>| -> <|S|>  ( this is a fix for: <|S|| -> <|S|> )
    record = re.sub(
        rf"<\|{escaped_delimiter_core}\|>\|",
        tuple_delimiter,