ragflow/api/db/services/metadata_service.py
hsparks.codes d104f59e29 feat: Implement hierarchical retrieval architecture (#11610)
This PR implements the complete three-tier hierarchical retrieval architecture
as specified in issue #11610, enabling production-grade RAG capabilities.

## Tier 1: Knowledge Base Routing
- Auto-route queries to relevant knowledge bases
- Per-KB retrieval parameters (KBRetrievalParams dataclass)
- Rule-based routing with keyword overlap scoring
- LLM-based routing with fallback to rule-based
- Configurable routing methods: auto, rule_based, llm_based, all

## Tier 2: Document Filtering
- Document-level metadata filtering within selected KBs
- Configurable metadata fields for filtering
- LLM-generated filter conditions
- Metadata similarity matching (fuzzy matching)
- Enhanced metadata generation for documents

## Tier 3: Chunk Refinement
- Parent-child chunking with summary mapping
- Custom prompts for keyword extraction
- LLM-based question generation for chunks
- Integration with existing retrieval pipeline

## Metadata Management (Batch CRUD)
- MetadataService with batch operations:
  - batch_get_metadata
  - batch_update_metadata
  - batch_delete_metadata_fields
  - batch_set_metadata_field
  - get_metadata_schema
  - search_by_metadata
  - get_metadata_statistics
  - copy_metadata
- REST API endpoints in metadata_app.py

## Integration
- HierarchicalConfig dataclass for configuration
- Integrated into Dealer class (search.py)
- Wired into agent retrieval tool
- Non-breaking: disabled by default

## Tests
- 48 unit tests covering all components
- Tests for config, routing, filtering, and metadata operations
2025-12-09 07:32:00 +01:00

398 lines
13 KiB
Python

#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
Metadata Management Service for Hierarchical Retrieval.
Provides batch CRUD operations for document metadata to support:
- Efficient metadata filtering in Tier 2 of hierarchical retrieval
- Bulk metadata updates across multiple documents
- Metadata schema management per knowledge base
"""
import logging
from typing import List, Dict, Any, Optional, Tuple
from peewee import fn
from api.db.db_models import DB, Document
from api.db.services.document_service import DocumentService
class MetadataService:
    """
    Service for managing document metadata in batch operations.

    Supports the hierarchical retrieval architecture (Tier 2: document
    filtering) by providing efficient metadata CRUD across many documents
    at once. All public methods run inside a DB connection context.
    """

    @classmethod
    @DB.connection_context()
    def batch_get_metadata(
        cls,
        doc_ids: List[str],
        fields: Optional[List[str]] = None
    ) -> Dict[str, Dict[str, Any]]:
        """
        Get metadata for multiple documents in a single query.

        Args:
            doc_ids: List of document IDs.
            fields: Optional list of specific metadata fields to retrieve;
                when given, each document's metadata is filtered to those keys.

        Returns:
            Dict mapping doc_id to {"doc_id", "doc_name", "metadata"}.
            IDs that do not exist are simply absent from the result.
        """
        if not doc_ids:
            return {}
        rows = Document.select(
            Document.id,
            Document.meta_fields,
            Document.name
        ).where(Document.id.in_(doc_ids))
        result: Dict[str, Dict[str, Any]] = {}
        for doc in rows:
            meta = doc.meta_fields or {}
            if fields:
                # Restrict to the explicitly requested fields only.
                meta = {k: v for k, v in meta.items() if k in fields}
            result[doc.id] = {
                "doc_id": doc.id,
                "doc_name": doc.name,
                "metadata": meta,
            }
        return result

    @classmethod
    @DB.connection_context()
    def batch_update_metadata(
        cls,
        updates: List[Dict[str, Any]],
        merge: bool = True
    ) -> Tuple[int, List[str]]:
        """
        Update metadata for multiple documents in batch.

        Args:
            updates: List of dicts with 'doc_id' and 'metadata' keys.
                Entries without a 'doc_id' are skipped silently.
            merge: If True, merge with existing metadata; if False, replace.

        Returns:
            Tuple of (success_count, list of failed doc_ids). Non-existent
            documents are reported as failures.
        """
        if not updates:
            return 0, []
        success_count = 0
        failed_ids: List[str] = []
        for update in updates:
            doc_id = update.get("doc_id")
            new_metadata = update.get("metadata", {})
            if not doc_id:
                continue
            try:
                doc = Document.get_or_none(Document.id == doc_id)
                if doc is None:
                    # Bug fix: previously a missing document was still
                    # "updated" and counted as a success.
                    failed_ids.append(doc_id)
                    continue
                if merge:
                    # Copy before merging so the fetched model's dict is
                    # not mutated as a side effect.
                    merged = dict(doc.meta_fields or {})
                    merged.update(new_metadata)
                    new_metadata = merged
                DocumentService.update_meta_fields(doc_id, new_metadata)
                success_count += 1
            except Exception as e:
                logging.error(f"Failed to update metadata for doc {doc_id}: {e}")
                failed_ids.append(doc_id)
        logging.info(f"Batch metadata update: {success_count} succeeded, {len(failed_ids)} failed")
        return success_count, failed_ids

    @classmethod
    @DB.connection_context()
    def batch_delete_metadata_fields(
        cls,
        doc_ids: List[str],
        fields: List[str]
    ) -> Tuple[int, List[str]]:
        """
        Delete specific metadata fields from multiple documents.

        Documents that do not contain any of the given fields are left
        untouched and still counted as successes.

        Args:
            doc_ids: List of document IDs.
            fields: List of metadata field names to delete.

        Returns:
            Tuple of (success_count, list of failed doc_ids).
        """
        if not doc_ids or not fields:
            return 0, []
        success_count = 0
        failed_ids: List[str] = []
        rows = Document.select(
            Document.id,
            Document.meta_fields
        ).where(Document.id.in_(doc_ids))
        for doc in rows:
            try:
                # Work on a copy so the model instance is not mutated
                # before the write succeeds.
                meta = dict(doc.meta_fields or {})
                modified = False
                for field in fields:
                    if field in meta:
                        del meta[field]
                        modified = True
                if modified:
                    DocumentService.update_meta_fields(doc.id, meta)
                success_count += 1
            except Exception as e:
                logging.error(f"Failed to delete metadata fields for doc {doc.id}: {e}")
                failed_ids.append(doc.id)
        return success_count, failed_ids

    @classmethod
    @DB.connection_context()
    def batch_set_metadata_field(
        cls,
        doc_ids: List[str],
        field_name: str,
        field_value: Any
    ) -> Tuple[int, List[str]]:
        """
        Set a specific metadata field to the same value for multiple documents.

        Useful for bulk categorization or tagging. Delegates to
        batch_update_metadata with merge semantics, so other fields
        are preserved.

        Args:
            doc_ids: List of document IDs.
            field_name: Name of the metadata field.
            field_value: Value to set.

        Returns:
            Tuple of (success_count, list of failed doc_ids).
        """
        if not doc_ids or not field_name:
            return 0, []
        updates = [
            {"doc_id": doc_id, "metadata": {field_name: field_value}}
            for doc_id in doc_ids
        ]
        return cls.batch_update_metadata(updates, merge=True)

    @classmethod
    @DB.connection_context()
    def get_metadata_schema(cls, kb_id: str) -> Dict[str, Dict[str, Any]]:
        """
        Get the metadata schema for a knowledge base.

        Analyzes all documents in the KB to determine available metadata
        fields, the Python type name of the first value seen per field,
        up to 10 sample values (stringified, truncated to 100 chars),
        and how many documents carry each field.

        Args:
            kb_id: Knowledge base ID.

        Returns:
            Dict mapping field names to {"type", "sample_values", "count"}.
        """
        schema: Dict[str, Dict[str, Any]] = {}
        rows = Document.select(
            Document.meta_fields
        ).where(Document.kb_id == kb_id)
        for doc in rows:
            meta = doc.meta_fields or {}
            for field_name, field_value in meta.items():
                entry = schema.setdefault(field_name, {
                    "type": type(field_value).__name__,
                    "sample_values": set(),
                    "count": 0,
                })
                entry["count"] += 1
                # Collect sample values (limit to 10 distinct per field).
                if len(entry["sample_values"]) < 10:
                    try:
                        entry["sample_values"].add(str(field_value)[:100])
                    except Exception:
                        # Unstringifiable values are skipped silently.
                        pass
        # Convert sets to lists for JSON serialization.
        for entry in schema.values():
            entry["sample_values"] = list(entry["sample_values"])
        return schema

    @staticmethod
    def _matches_condition(doc_value: Any, condition: Any) -> bool:
        """
        Evaluate one metadata filter condition against a document value.

        `condition` is either a plain value (string equality) or a
        single-operator dict, e.g. {"contains": "foo"}. Unknown or empty
        operator dicts match nothing, and numeric comparisons on
        non-numeric values return False instead of raising.
        """
        if not isinstance(condition, dict):
            # Simple equality on string form.
            return str(doc_value) == str(condition)
        if not condition:
            # Bug fix: an empty condition dict used to raise IndexError.
            return False
        op, val = next(iter(condition.items()))
        if op == "equals":
            return str(doc_value) == str(val)
        if op == "contains":
            # Bug fix: None used to stringify to "None" and could
            # falsely match substrings like "none".
            return doc_value is not None and str(val).lower() in str(doc_value).lower()
        if op == "starts_with":
            return doc_value is not None and str(doc_value).lower().startswith(str(val).lower())
        if op == "in":
            return doc_value in val
        if op in ("gt", "lt"):
            if doc_value is None:
                return False
            try:
                lhs, rhs = float(doc_value), float(val)
            except (TypeError, ValueError):
                # Bug fix: non-numeric values used to raise uncaught.
                return False
            return lhs > rhs if op == "gt" else lhs < rhs
        # Bug fix: an unknown operator used to leave the previous match
        # result in place, silently matching everything.
        return False

    @classmethod
    @DB.connection_context()
    def search_by_metadata(
        cls,
        kb_id: str,
        filters: Dict[str, Any],
        limit: int = 100
    ) -> List[Dict[str, Any]]:
        """
        Search documents by metadata filters.

        All filters must match (AND semantics). Matching is done in
        Python over the KB's documents, not pushed to the database.

        Args:
            kb_id: Knowledge base ID.
            filters: Dict of field_name -> value or {operator: value}.
                Supported operators: equals, contains, starts_with, in,
                gt, lt. Unknown operators never match.
            limit: Maximum number of results.

        Returns:
            List of matching documents with their metadata.
        """
        rows = Document.select(
            Document.id,
            Document.name,
            Document.meta_fields
        ).where(Document.kb_id == kb_id)
        results: List[Dict[str, Any]] = []
        for doc in rows:
            meta = doc.meta_fields or {}
            if all(
                cls._matches_condition(meta.get(field_name), condition)
                for field_name, condition in filters.items()
            ):
                results.append({
                    "doc_id": doc.id,
                    "doc_name": doc.name,
                    "metadata": meta,
                })
                if len(results) >= limit:
                    break
        return results

    @classmethod
    @DB.connection_context()
    def get_metadata_statistics(cls, kb_id: str) -> Dict[str, Any]:
        """
        Get statistics about metadata usage in a knowledge base.

        Args:
            kb_id: Knowledge base ID.

        Returns:
            Dict with total_documents, documents_with_metadata,
            metadata_coverage (0..1), field_usage (field -> doc count),
            and unique_fields.
        """
        # Guard against a None scalar on empty/odd backends.
        total_docs = Document.select(fn.COUNT(Document.id)).where(
            Document.kb_id == kb_id
        ).scalar() or 0
        docs_with_metadata = 0
        field_usage: Dict[str, int] = {}
        rows = Document.select(Document.meta_fields).where(Document.kb_id == kb_id)
        for doc in rows:
            meta = doc.meta_fields or {}
            if meta:
                docs_with_metadata += 1
                for field_name in meta:
                    field_usage[field_name] = field_usage.get(field_name, 0) + 1
        return {
            "total_documents": total_docs,
            "documents_with_metadata": docs_with_metadata,
            "metadata_coverage": docs_with_metadata / total_docs if total_docs > 0 else 0,
            "field_usage": field_usage,
            "unique_fields": len(field_usage),
        }

    @classmethod
    @DB.connection_context()
    def copy_metadata(
        cls,
        source_doc_id: str,
        target_doc_ids: List[str],
        fields: Optional[List[str]] = None
    ) -> Tuple[int, List[str]]:
        """
        Copy metadata from one document to multiple target documents.

        Args:
            source_doc_id: Source document ID.
            target_doc_ids: List of target document IDs.
            fields: Optional list of specific fields to copy (all if None).

        Returns:
            Tuple of (success_count, list of failed doc_ids). If the
            source document does not exist, every target is reported
            failed; if the source has no (matching) metadata, (0, [])
            is returned without touching any target.
        """
        source_doc = Document.get_or_none(Document.id == source_doc_id)
        if not source_doc:
            return 0, target_doc_ids
        source_meta = source_doc.meta_fields or {}
        if fields:
            source_meta = {k: v for k, v in source_meta.items() if k in fields}
        if not source_meta:
            return 0, []
        # Shallow-copy per target so later per-doc merges cannot alias
        # the same dict instance.
        updates = [
            {"doc_id": doc_id, "metadata": source_meta.copy()}
            for doc_id in target_doc_ids
        ]
        return cls.batch_update_metadata(updates, merge=True)