# ragflow/sdk/python/ragflow_sdk/modules/chunk.py

#
#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

from typing import Any, NotRequired, Optional, TYPE_CHECKING, TypedDict
from .base import Base

if TYPE_CHECKING:
    from ..ragflow import RAGFlow

# Names exported by ``from ... import *``: a one-element tuple.
__all__ = ("Chunk",)

class UpdateMessage(TypedDict):
    """Typed dictionary used as the ``update_message`` argument of ``Chunk.update``.

    Every key is declared ``NotRequired``, so any subset of the keys
    (including none of them) is a valid ``UpdateMessage``.
    """
    # Optional ``content`` entry; a string when present.
    content: NotRequired[str]
    # Optional ``important_keywords`` entry; a list of strings when present.
    important_keywords: NotRequired[list[str]]
    # Optional ``available`` entry; a boolean when present.
    available: NotRequired[bool]

class ChunkUpdateError(Exception):
    """Error raised by ``Chunk.update`` when the response code is not ``0``.

    Attributes:
        code: The ``code`` value given to the constructor, or ``None``.
        message: The ``message`` value given to the constructor, or ``None``.
            It is also the single argument forwarded to ``Exception``, so
            ``str(error)`` renders it.
        details: The ``details`` value given to the constructor, or ``None``.
    """

    __slots__ = (
        'code',
        'message',
        'details',
    )

    code: Optional[int]
    message: Optional[str]
    details: Optional[str]

    def __init__(self, code: Optional[int]=None, message: Optional[str]=None, details: Optional[str]=None):
        """Store the three fields and initialise ``Exception`` with ``message``."""
        self.code = code
        self.message = message
        self.details = details
        super().__init__(message)

    def __reduce__(self) -> tuple[Any, ...]:
        """Describe how to rebuild this error for ``copy`` and ``pickle``.

        The default ``BaseException.__reduce__`` replays ``self.args``, which
        holds only ``message``. Because ``code`` is the first constructor
        parameter, a copy would receive the message as its ``code`` and lose
        ``message`` and ``details``. Replaying all three fields keeps copies
        faithful.

        Returns:
            A reduce tuple ``(class, constructor_args[, instance_dict])``.
        """
        ctor_args = (self.code, self.message, self.details)
        # Slot values are not stored in ``__dict__``; anything that is stored
        # there is forwarded as state so it survives the round trip too.
        extra_state = self.__dict__
        if extra_state:
            return (type(self), ctor_args, extra_state)
        return (type(self), ctor_args)

class Chunk(Base):
    """A chunk object populated from a response dictionary.

    Every attribute listed in ``__slots__`` is given a default value in
    ``__init__`` before the response dictionary is handed to ``Base``.
    """

    __slots__ = (
        'id',
        'content',
        'important_keywords',
        'questions',
        'create_time',
        'create_timestamp',
        'dataset_id',
        'document_name',
        'document_id',
        'available',
        'similarity',
        'vector_similarity',
        'term_similarity',
        'positions',
        'doc_type',
    )

    id: str
    content: str
    important_keywords: list[str]
    questions: list[str]
    create_time: str
    create_timestamp: float
    dataset_id: Optional[str]
    document_name: str
    document_id: str
    available: bool
    similarity: float
    vector_similarity: float
    term_similarity: float
    positions: list[str]
    doc_type: str

    def __init__(self, rag: "RAGFlow", res_dict: dict[str, Any]) -> None:
        """Set default field values, then initialise ``Base`` from ``res_dict``.

        Args:
            rag: Client object forwarded unchanged to ``Base.__init__``.
            res_dict: Source dictionary. Only keys for which
                ``hasattr(self, key)`` is true at this point are forwarded
                to ``Base.__init__``; the dictionary itself is not modified.
        """
        self.id = ""
        self.content = ""
        self.important_keywords = []
        self.questions = []
        self.create_time = ""
        self.create_timestamp = 0.0
        self.dataset_id = None
        self.document_name = ""
        self.document_id = ""
        self.available = True
        # Additional fields for retrieval results
        self.similarity = 0.0
        self.vector_similarity = 0.0
        self.term_similarity = 0.0
        self.positions = []
        self.doc_type = ""
        # Build a filtered copy instead of popping keys from ``res_dict``:
        # the caller's dictionary must not lose entries as a side effect of
        # constructing a Chunk. Key order and the ``hasattr`` rule are kept.
        recognised = {
            key: value for key, value in res_dict.items() if hasattr(self, key)
        }
        super().__init__(rag, recognised)

    def update(self, update_message: UpdateMessage) -> None:
        """Send ``update_message`` for this chunk with ``self.put``.

        The request path is built from ``dataset_id``, ``document_id`` and
        ``id``. The response is decoded with ``.json()``.

        Args:
            update_message: Payload passed as the second argument of
                ``self.put``.

        Raises:
            ChunkUpdateError: If the decoded response's ``"code"`` is not
                ``0``. The error carries the response's ``"code"``,
                ``"message"`` and ``"details"`` values (``None`` when absent).
        """
        endpoint = f"/datasets/{self.dataset_id}/documents/{self.document_id}/chunks/{self.id}"
        payload = self.put(endpoint, update_message).json()
        if payload.get("code") == 0:
            return
        raise ChunkUpdateError(
            code=payload.get("code"),
            message=payload.get("message"),
            details=payload.get("details"),
        )