Quick fix to limit source_id ballooning while inserting nodes
(cherry picked from commit 54f0a7d1ca)
parent 429cd6a66f
commit b9fc6f19dd

3 changed files with 390 additions and 1308 deletions
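Context for the fix: entity and relation records carry their provenance in a source_id field, which the helpers in this diff treat as a flat collection of chunk IDs (joined with GRAPH_FIELD_SEP when stored). An entity that appears in nearly every chunk therefore accumulates an unbounded list of IDs, which eventually breaks vector-DB payload limits. A minimal sketch of the failure mode, assuming a "<SEP>"-style separator value (the real value lives in lightrag.constants):

    # Hypothetical illustration of unbounded source_id growth
    GRAPH_FIELD_SEP = "<SEP>"  # assumed separator value for this sketch
    source_id = ""
    for i in range(10_000):  # a popular entity seen in every chunk
        chunk_id = f"chunk-{i}"
        source_id = GRAPH_FIELD_SEP.join(filter(None, [source_id, chunk_id]))
    print(len(source_id.split(GRAPH_FIELD_SEP)))  # 10000 IDs in a single field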
@@ -13,6 +13,7 @@ DEFAULT_MAX_GRAPH_NODES = 1000
 # Default values for extraction settings
 DEFAULT_SUMMARY_LANGUAGE = "English"  # Default language for document processing
 DEFAULT_MAX_GLEANING = 1
+DEFAULT_MAX_CHUNK_IDS_PER_ENTITY = 500  # Applies to Both Graph + Vector DBs

 DEFAULT_MAX_SOURCE_IDS_PER_ENTITY = 3
 DEFAULT_MAX_SOURCE_IDS_PER_RELATION = 3
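The new constant is a hard ceiling rather than a rolling window; the helper that enforces it (truncate_entity_source_id, added further down in this commit) simply slices the set of chunk IDs. A minimal sketch of the guard, using a hypothetical cap_chunk_ids stand-in and the value from the hunk above:

    DEFAULT_MAX_CHUNK_IDS_PER_ENTITY = 500  # value from the hunk above

    def cap_chunk_ids(chunk_ids: set) -> set:
        # Sets are unordered, so the surviving 500 IDs are an arbitrary subset.
        return set(list(chunk_ids)[:DEFAULT_MAX_CHUNK_IDS_PER_ENTITY])

    assert len(cap_chunk_ids({f"chunk-{i}" for i in range(2000)})) == 500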
lightrag/operate.py (1513 changed lines)
File diff suppressed because it is too large.
@@ -15,17 +15,7 @@ from dataclasses import dataclass
 from datetime import datetime
 from functools import wraps
 from hashlib import md5
-from typing import (
-    Any,
-    Protocol,
-    Callable,
-    TYPE_CHECKING,
-    List,
-    Optional,
-    Iterable,
-    Sequence,
-    Collection,
-)
+from typing import Any, Protocol, Callable, TYPE_CHECKING, List, Optional
 import numpy as np
 from dotenv import load_dotenv

@@ -36,9 +26,7 @@ from lightrag.constants import (
     GRAPH_FIELD_SEP,
     DEFAULT_MAX_TOTAL_TOKENS,
     DEFAULT_MAX_FILE_PATH_LENGTH,
-    DEFAULT_SOURCE_IDS_LIMIT_METHOD,
-    VALID_SOURCE_IDS_LIMIT_METHODS,
-    SOURCE_IDS_LIMIT_METHOD_FIFO,
+    DEFAULT_MAX_CHUNK_IDS_PER_ENTITY,
 )

 # Initialize logger with basic configuration
@@ -2477,157 +2465,19 @@ async def process_chunks_unified(

     return final_chunks


-def normalize_source_ids_limit_method(method: str | None) -> str:
-    """Normalize the source ID limiting strategy and fall back to default when invalid."""
-
-    if not method:
-        return DEFAULT_SOURCE_IDS_LIMIT_METHOD
-
-    normalized = method.upper()
-    if normalized not in VALID_SOURCE_IDS_LIMIT_METHODS:
-        logger.warning(
-            "Unknown SOURCE_IDS_LIMIT_METHOD '%s', falling back to %s",
-            method,
-            DEFAULT_SOURCE_IDS_LIMIT_METHOD,
-        )
-        return DEFAULT_SOURCE_IDS_LIMIT_METHOD
-
-    return normalized
-
-
-def merge_source_ids(
-    existing_ids: Iterable[str] | None, new_ids: Iterable[str] | None
-) -> list[str]:
-    """Merge two iterables of source IDs while preserving order and removing duplicates."""
-
-    merged: list[str] = []
-    seen: set[str] = set()
-
-    for sequence in (existing_ids, new_ids):
-        if not sequence:
-            continue
-        for source_id in sequence:
-            if not source_id:
-                continue
-            if source_id not in seen:
-                seen.add(source_id)
-                merged.append(source_id)
-
-    return merged
-
-
-def apply_source_ids_limit(
-    source_ids: Sequence[str],
-    limit: int,
-    method: str,
-    *,
-    identifier: str | None = None,
-) -> list[str]:
-    """Apply a limit strategy to a sequence of source IDs."""
-
-    if limit <= 0:
-        return []
-
-    source_ids_list = list(source_ids)
-    if len(source_ids_list) <= limit:
-        return source_ids_list
-
-    normalized_method = normalize_source_ids_limit_method(method)
-
-    if normalized_method == SOURCE_IDS_LIMIT_METHOD_FIFO:
-        truncated = source_ids_list[-limit:]
-    else:  # IGNORE_NEW
-        truncated = source_ids_list[:limit]
-
-    if identifier and len(truncated) < len(source_ids_list):
-        logger.debug(
-            "Source_id truncated: %s | %s keeping %s of %s entries",
-            identifier,
-            normalized_method,
-            len(truncated),
-            len(source_ids_list),
-        )
-
-    return truncated
-
-
-def compute_incremental_chunk_ids(
-    existing_full_chunk_ids: list[str],
-    old_chunk_ids: list[str],
-    new_chunk_ids: list[str],
-) -> list[str]:
-    """
-    Compute incrementally updated chunk IDs based on changes.
-
-    This function applies delta changes (additions and removals) to an existing
-    list of chunk IDs while maintaining order and ensuring deduplication.
-    Delta additions from new_chunk_ids are placed at the end.
-
-    Args:
-        existing_full_chunk_ids: Complete list of existing chunk IDs from storage
-        old_chunk_ids: Previous chunk IDs from source_id (chunks being replaced)
-        new_chunk_ids: New chunk IDs from updated source_id (chunks being added)
-
-    Returns:
-        Updated list of chunk IDs with deduplication
-
-    Example:
-        >>> existing = ['chunk-1', 'chunk-2', 'chunk-3']
-        >>> old = ['chunk-1', 'chunk-2']
-        >>> new = ['chunk-2', 'chunk-4']
-        >>> compute_incremental_chunk_ids(existing, old, new)
-        ['chunk-3', 'chunk-2', 'chunk-4']
-    """
-    # Calculate changes
-    chunks_to_remove = set(old_chunk_ids) - set(new_chunk_ids)
-    chunks_to_add = set(new_chunk_ids) - set(old_chunk_ids)
-
-    # Apply changes to full chunk_ids
-    # Step 1: Remove chunks that are no longer needed
-    updated_chunk_ids = [
-        cid for cid in existing_full_chunk_ids if cid not in chunks_to_remove
-    ]
-
-    # Step 2: Add new chunks (preserving order from new_chunk_ids)
-    # Note: 'cid not in updated_chunk_ids' check ensures deduplication
-    for cid in new_chunk_ids:
-        if cid in chunks_to_add and cid not in updated_chunk_ids:
-            updated_chunk_ids.append(cid)
-
-    return updated_chunk_ids
-
-
-def subtract_source_ids(
-    source_ids: Iterable[str],
-    ids_to_remove: Collection[str],
-) -> list[str]:
-    """Remove a collection of IDs from an ordered iterable while preserving order."""
-
-    removal_set = set(ids_to_remove)
-    if not removal_set:
-        return [source_id for source_id in source_ids if source_id]
-
-    return [
-        source_id
-        for source_id in source_ids
-        if source_id and source_id not in removal_set
-    ]
-
-
-def make_relation_chunk_key(src: str, tgt: str) -> str:
-    """Create a deterministic storage key for relation chunk tracking."""
-
-    return GRAPH_FIELD_SEP.join(sorted((src, tgt)))
-
-
-def parse_relation_chunk_key(key: str) -> tuple[str, str]:
-    """Parse a relation chunk storage key back into its entity pair."""
-
-    parts = key.split(GRAPH_FIELD_SEP)
-    if len(parts) != 2:
-        raise ValueError(f"Invalid relation chunk key: {key}")
-    return parts[0], parts[1]
+def truncate_entity_source_id(chunk_ids: set, entity_name: str) -> set:
+    """Limit chunk_ids, for entities that appear a HUGE no of times (To not break VDB hard upper limits)"""
+    already_len: int = len(chunk_ids)
+
+    if already_len >= DEFAULT_MAX_CHUNK_IDS_PER_ENTITY:
+        logger.warning(
+            f"Chunk Ids already exceeds {DEFAULT_MAX_CHUNK_IDS_PER_ENTITY} for {entity_name}, "
+            f"current size: {already_len} entries."
+        )
+
+    truncated_chunk_ids = set(list(chunk_ids)[0:DEFAULT_MAX_CHUNK_IDS_PER_ENTITY])
+
+    return truncated_chunk_ids


 def build_file_path(already_file_paths, data_list, target):
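The deleted helpers let callers pick an eviction strategy: with FIFO, apply_source_ids_limit kept the newest IDs (source_ids_list[-limit:]), while IGNORE_NEW kept the oldest. The quick fix drops that choice: it converts an unordered set to a list and slices, so the surviving IDs are an arbitrary subset. A self-contained sketch of the difference, with the logger and constant stubbed in:

    import logging

    logger = logging.getLogger(__name__)
    DEFAULT_MAX_CHUNK_IDS_PER_ENTITY = 500  # stub of the lightrag.constants value

    def truncate_entity_source_id(chunk_ids: set, entity_name: str) -> set:
        # Mirrors the added helper above: warn past the cap, then slice.
        already_len = len(chunk_ids)
        if already_len >= DEFAULT_MAX_CHUNK_IDS_PER_ENTITY:
            logger.warning(
                f"Chunk Ids already exceeds {DEFAULT_MAX_CHUNK_IDS_PER_ENTITY} for {entity_name}, "
                f"current size: {already_len} entries."
            )
        return set(list(chunk_ids)[:DEFAULT_MAX_CHUNK_IDS_PER_ENTITY])

    ids = {f"chunk-{i}" for i in range(1200)}
    assert len(truncate_entity_source_id(ids, "PopularEntity")) == 500
    # Old FIFO behavior, for contrast: keep the most recently appended IDs.
    ordered = [f"chunk-{i}" for i in range(1200)]
    assert ordered[-500:] == [f"chunk-{i}" for i in range(700, 1200)]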
@@ -2776,9 +2626,9 @@ def fix_tuple_delimiter_corruption(
         record,
     )

-    # Fix: <X|#|> -> <|#|>, <|#|Y> -> <|#|>, <X|#|Y> -> <|#|>, <||#||> -> <|#|> (one extra characters outside pipes)
+    # Fix: <X|#|> -> <|#|>, <|#|Y> -> <|#|>, <X|#|Y> -> <|#|>, <||#||> -> <|#|>, <||#> -> <|#|> (one extra characters outside pipes)
     record = re.sub(
-        rf"<.?\|{escaped_delimiter_core}\|.?>",
+        rf"<.?\|{escaped_delimiter_core}\|*?>",
         tuple_delimiter,
         record,
     )

@@ -2798,6 +2648,7 @@ def fix_tuple_delimiter_corruption(
     )

     # Fix: <|#| -> <|#|>, <|#|| -> <|#|> (missing closing >)
+
     record = re.sub(
         rf"<\|{escaped_delimiter_core}\|+(?!>)",
         tuple_delimiter,

@@ -2811,13 +2662,6 @@ def fix_tuple_delimiter_corruption(
         record,
     )

-    # Fix: <||#> -> <|#|> (double pipe at start, missing pipe at end)
-    record = re.sub(
-        rf"<\|+{escaped_delimiter_core}>",
-        tuple_delimiter,
-        record,
-    )
-
     # Fix: <|| -> <|#|>
     record = re.sub(
         r"<\|\|(?!>)",
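The widened pattern folds the <||#> case into the generic one-extra-character rule, which is why the dedicated substitution could be deleted in the last hunk. A quick check of forms the new pattern clearly normalizes, assuming a single-character delimiter core of "#":

    import re

    escaped_delimiter_core = re.escape("#")  # assumed delimiter core
    tuple_delimiter = "<|#|>"
    pattern = rf"<.?\|{escaped_delimiter_core}\|*?>"

    for corrupted in ("<X|#|>", "<||#||>", "<||#>"):
        fixed = re.sub(pattern, tuple_delimiter, corrupted)
        assert fixed == tuple_delimiter, (corrupted, fixed)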