Merge 8518a5cf29 into a2e080c2d3

2025-12-15 19:28:35 +08:00 · 2025-12-15 19:28:35 +08:00 · 197c1ec221
commit 197c1ec221
parent a2e080c2d3 8518a5cf29
3 changed files with 367 additions and 4 deletions
--- a/rag/svr/task_executor.py
+++ b/rag/svr/task_executor.py
@ -30,7 +30,7 @@ from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db.services.pipeline_operation_log_service import PipelineOperationLogService
 from common.connection_utils import timeout
 from rag.utils.base64_image import image2id
-from rag.utils.raptor_utils import should_skip_raptor, get_skip_reason
+from rag.utils.raptor_utils import should_skip_raptor, get_skip_reason, should_skip_raptor_for_chunks
 from common.log_utils import init_root_logger
 from common.config_utils import show_configs
 from graphrag.general.index import run_graphrag_for_kb
@ -742,6 +742,15 @@ async def run_raptor_for_kb(row, kb_parser_config, chat_mdl, embd_mdl, vector_si
                                                 fields=["content_with_weight", vctr_nm],
                                                 sort_by_position=True):
                chunks.append((d["content_with_weight"], np.array(d[vctr_nm])))
+            
+            # Check if chunks contain HTML tables (content-based detection)
+            skip_for_tables, skip_reason = should_skip_raptor_for_chunks(chunks, raptor_config)
+            if skip_for_tables:
+                logging.info(f"Skipping Raptor for document {doc_id}: {skip_reason}")
+                if callback:
+                    callback(prog=(x+1.)/len(doc_ids), msg=f"Raptor skipped: {skip_reason}")
+                continue
+            
            await generate(chunks, doc_id)
            callback(prog=(x+1.)/len(doc_ids))
    else:
@ -752,6 +761,14 @@ async def run_raptor_for_kb(row, kb_parser_config, chat_mdl, embd_mdl, vector_si
                                                 sort_by_position=True):
                chunks.append((d["content_with_weight"], np.array(d[vctr_nm])))

+        # Check if chunks contain HTML tables (content-based detection)
+        skip_for_tables, skip_reason = should_skip_raptor_for_chunks(chunks, raptor_config)
+        if skip_for_tables:
+            logging.info(f"Skipping Raptor for KB scope: {skip_reason}")
+            if callback:
+                callback(prog=1.0, msg=f"Raptor skipped: {skip_reason}")
+            return res, tk_count
+
        await generate(chunks, fake_doc_id)

    return res, tk_count
--- a/rag/utils/raptor_utils.py
+++ b/rag/utils/raptor_utils.py
@ -19,7 +19,10 @@ Utility functions for Raptor processing decisions.
 """

 import logging
-from typing import Optional
+import re
+from typing import Optional, List, Tuple
+
+import numpy as np


 # File extensions for structured data types
@ -27,6 +30,13 @@ EXCEL_EXTENSIONS = {".xls", ".xlsx", ".xlsm", ".xlsb"}
 CSV_EXTENSIONS = {".csv", ".tsv"}
 STRUCTURED_EXTENSIONS = EXCEL_EXTENSIONS | CSV_EXTENSIONS

+# Regex patterns for detecting HTML tables; \b keeps tags like <tablet> from matching
+HTML_TABLE_PATTERN = re.compile(r'<table\b[^>]*>.*?</table>', re.IGNORECASE | re.DOTALL)
+HTML_TABLE_START_PATTERN = re.compile(r'<table\b[^>]*>', re.IGNORECASE)
+
+# Threshold for "table-heavy" content, as a fraction (0.0-1.0) of chunks with tables
+TABLE_CONTENT_THRESHOLD = 0.3  # If 30%+ of chunks contain tables, skip Raptor
+

 def is_structured_file_type(file_type: Optional[str]) -> bool:
    """
@ -120,7 +130,8 @@ def should_skip_raptor(
 def get_skip_reason(
    file_type: Optional[str] = None,
    parser_id: str = "",
-    parser_config: Optional[dict] = None
+    parser_config: Optional[dict] = None,
+    has_table_content: bool = False
 ) -> str:
    """
    Get a human-readable reason why Raptor was skipped.
@ -129,6 +140,7 @@ def get_skip_reason(
        file_type: File extension
        parser_id: Parser ID being used
        parser_config: Parser configuration dict
+        has_table_content: Whether content contains HTML tables
        
    Returns:
        Reason string, or empty string if Raptor should not be skipped
@ -142,4 +154,96 @@ def get_skip_reason(
        if is_tabular_pdf(parser_id, parser_config):
            return f"Tabular PDF (parser={parser_id}) - Raptor auto-disabled"
    
+    if has_table_content:
+        return "Content contains HTML tables - Raptor auto-disabled"
+    
    return ""
+
+
+def contains_html_table(content: str) -> bool:
+    """
+    Report whether the given text includes an opening HTML ``<table>`` tag.
+
+    Args:
+        content: Text to inspect; empty (falsy) content never matches
+
+    Returns:
+        True when an opening table tag is found, otherwise False
+    """
+    if content:
+        return HTML_TABLE_START_PATTERN.search(content) is not None
+    return False
+
+
+def analyze_chunks_for_tables(
+    chunks: List[Tuple[str, np.ndarray]],
+    threshold: float = TABLE_CONTENT_THRESHOLD
+) -> Tuple[bool, float]:
+    """
+    Measure the share of chunks that carry HTML table markup.
+
+    Each chunk's text is tested with ``contains_html_table``; the vector half
+    of the tuple is ignored. The resulting share is compared to ``threshold``.
+
+    Args:
+        chunks: List of (content, vector) tuples
+        threshold: Share of table chunks, as a 0.0-1.0 fraction, at or above
+            which the content is treated as table-heavy
+
+    Returns:
+        Tuple of (should_skip, table_percentage); (False, 0.0) when chunks is empty
+    """
+    if not chunks:
+        return False, 0.0
+
+    # One point per chunk holding an opening <table> tag, however many it holds.
+    table_count = sum(1 for text, _vector in chunks if contains_html_table(text))
+    table_percentage = table_count / len(chunks)
+
+    # Inclusive comparison: landing exactly on the threshold counts as table-heavy.
+    is_table_heavy = table_percentage >= threshold
+
+    if is_table_heavy:
+        logging.info(
+            f"Detected table-heavy content: {table_count}/{len(chunks)} chunks "
+            f"({table_percentage:.1%}) contain HTML tables"
+        )
+
+    return is_table_heavy, table_percentage
+
+
+def should_skip_raptor_for_chunks(
+    chunks: List[Tuple[str, np.ndarray]],
+    raptor_config: Optional[dict] = None,
+    threshold: float = TABLE_CONTENT_THRESHOLD
+) -> Tuple[bool, str]:
+    """
+    Check if Raptor should be skipped based on chunk content analysis.
+
+    This is a content-based check that runs after chunks are loaded,
+    detecting HTML tables that were extracted during parsing.
+
+    Args:
+        chunks: List of (content, vector) tuples
+        raptor_config: Raptor configuration dict; setting
+            "auto_disable_for_structured_data" to False turns this check off
+        threshold: Fraction (0.0-1.0) of table chunks that triggers the skip
+
+    Returns:
+        Tuple of (should_skip, reason); reason is "" when Raptor should run
+    """
+    raptor_config = raptor_config or {}
+
+    # Only an explicit False opts out; a missing key keeps auto-disable on.
+    if raptor_config.get("auto_disable_for_structured_data", True) is False:
+        return False, ""
+
+    should_skip, table_pct = analyze_chunks_for_tables(chunks, threshold)
+    if not should_skip:
+        return False, ""
+
+    # table_pct is the share of chunks with a table, not of the text itself.
+    return True, (
+        f"{table_pct:.0%} of chunks contain HTML tables "
+        f"(threshold: {threshold:.0%}) - Raptor auto-disabled"
+    )
--- a/test/unit_test/utils/test_raptor_utils.py
+++ b/test/unit_test/utils/test_raptor_utils.py
@ -19,14 +19,19 @@ Unit tests for Raptor utility functions.
 """

 import pytest
+import numpy as np
 from rag.utils.raptor_utils import (
    is_structured_file_type,
    is_tabular_pdf,
    should_skip_raptor,
    get_skip_reason,
+    contains_html_table,
+    analyze_chunks_for_tables,
+    should_skip_raptor_for_chunks,
    EXCEL_EXTENSIONS,
    CSV_EXTENSIONS,
-    STRUCTURED_EXTENSIONS
+    STRUCTURED_EXTENSIONS,
+    TABLE_CONTENT_THRESHOLD
 )


@ -283,5 +288,242 @@ class TestIntegrationScenarios:
        assert should_skip_raptor(file_type, raptor_config=raptor_config) is False


+class TestContainsHtmlTable:
+    """Checks for contains_html_table on table and non-table text"""
+
+    def test_detect_simple_table(self):
+        """A bare <table> element is recognised"""
+        html = "<table><tr><td>Cell 1</td><td>Cell 2</td></tr></table>"
+        assert contains_html_table(html) is True
+
+    def test_detect_table_with_attributes(self):
+        """Attributes on the opening tag do not prevent detection"""
+        html = '<table class="data-table" border="1"><tr><td>Data</td></tr></table>'
+        assert contains_html_table(html) is True
+
+    def test_detect_table_case_insensitive(self):
+        """Tag-name casing is irrelevant"""
+        for html in ("<TABLE><TR><TD>X</TD></TR></TABLE>", "<Table><tr><td>X</td></tr></Table>"):
+            assert contains_html_table(html) is True
+
+    def test_no_table_in_plain_text(self):
+        """Prose without markup is not flagged"""
+        text = "This is just plain text without any tables."
+        assert contains_html_table(text) is False
+
+    def test_no_table_in_empty_content(self):
+        """An empty string is not flagged"""
+        assert contains_html_table("") is False
+        # None is deliberately not exercised: it falls outside the str annotation
+
+    def test_table_word_not_detected(self):
+        """The bare word 'table' is not mistaken for markup"""
+        text = "Please see the table below for more information."
+        assert contains_html_table(text) is False
+
+    def test_mixed_content_with_table(self):
+        """A table embedded in surrounding prose is still found"""
+        content = """
+        This is some introductory text.
+        <table>
+            <caption>Financial Data</caption>
+            <tr><th>Year</th><th>Revenue</th></tr>
+            <tr><td>2024</td><td>$1M</td></tr>
+        </table>
+        More text after the table.
+        """
+        assert contains_html_table(content) is True
+
+
+class TestAnalyzeChunksForTables:
+    """Checks the table ratio and skip flag from analyze_chunks_for_tables"""
+
+    def _make_chunk(self, content: str):
+        """Pair the text with a zero vector, mimicking a stored chunk"""
+        vector = np.zeros(768)
+        return content, vector
+
+    def test_all_table_chunks(self):
+        """Every chunk is a table: skip with a ratio of exactly 1.0"""
+        chunks = [
+            self._make_chunk(f"<table><tr><td>{n}</td></tr></table>")
+            for n in (1, 2, 3)
+        ]
+        should_skip, pct = analyze_chunks_for_tables(chunks)
+
+        assert should_skip is True
+        assert pct == 1.0
+
+    def test_no_table_chunks(self):
+        """No chunk has a table: do not skip, ratio is exactly 0.0"""
+        chunks = [
+            self._make_chunk(f"Plain text content {n}")
+            for n in (1, 2, 3)
+        ]
+        should_skip, pct = analyze_chunks_for_tables(chunks)
+
+        assert should_skip is False
+        assert pct == 0.0
+
+    def test_mixed_chunks_below_threshold(self):
+        """One table chunk in five stays under the default threshold"""
+        # 1 out of 5 = 20%, below 30% threshold
+        table_chunk = self._make_chunk("<table><tr><td>Table</td></tr></table>")
+        text_chunks = [
+            self._make_chunk(f"Plain text {n}")
+            for n in (1, 2, 3, 4)
+        ]
+
+        should_skip, pct = analyze_chunks_for_tables([table_chunk, *text_chunks])
+
+        assert should_skip is False
+        assert pct == 0.2
+
+    def test_mixed_chunks_above_threshold(self):
+        """Two table chunks in five exceed the default threshold"""
+        # 2 out of 5 = 40%, above 30% threshold
+        table_chunks = [
+            self._make_chunk(f"<table><tr><td>Table {n}</td></tr></table>")
+            for n in (1, 2)
+        ]
+        text_chunks = [self._make_chunk(f"Plain text {n}") for n in (1, 2, 3)]
+
+        should_skip, pct = analyze_chunks_for_tables(table_chunks + text_chunks)
+
+        assert should_skip is True
+        assert pct == 0.4
+
+    def test_empty_chunks(self):
+        """An empty list yields (False, 0.0)"""
+        should_skip, pct = analyze_chunks_for_tables([])
+        assert should_skip is False
+        assert pct == 0.0
+
+    def test_custom_threshold(self):
+        """The threshold argument overrides the module default"""
+        # 1 out of 5 = 20%
+        chunks = [self._make_chunk("<table><tr><td>Table</td></tr></table>")]
+        chunks.extend(
+            self._make_chunk(f"Plain text {n}")
+            for n in (1, 2, 3, 4)
+        )
+
+        # With 15% threshold, should skip
+        skip_at_15, _ = analyze_chunks_for_tables(chunks, threshold=0.15)
+        assert skip_at_15 is True
+
+        # With 25% threshold, should not skip
+        skip_at_25, _ = analyze_chunks_for_tables(chunks, threshold=0.25)
+        assert skip_at_25 is False
+
+    def test_default_threshold_value(self):
+        """The module default threshold is 30%"""
+        assert TABLE_CONTENT_THRESHOLD == 0.3
+
+
+class TestShouldSkipRaptorForChunks:
+    """Checks the (should_skip, reason) pair from should_skip_raptor_for_chunks"""
+
+    def _make_chunk(self, content: str):
+        """Pair the text with a zero vector, mimicking a stored chunk"""
+        return content, np.zeros(768)
+
+    def test_skip_for_table_heavy_content(self):
+        """Two table chunks out of three trigger the skip"""
+        chunks = [
+            self._make_chunk(f"<table><tr><td>{n}</td></tr></table>")
+            for n in (1, 2)
+        ]
+        chunks.append(self._make_chunk("Plain text"))
+        should_skip, reason = should_skip_raptor_for_chunks(chunks)
+        assert should_skip is True
+        assert "HTML tables" in reason
+
+    def test_no_skip_for_text_content(self):
+        """Pure prose never triggers the skip and yields no reason"""
+        chunks = [
+            self._make_chunk(f"Plain text content {n}")
+            for n in (1, 2, 3)
+        ]
+        should_skip, reason = should_skip_raptor_for_chunks(chunks)
+
+        assert should_skip is False
+        assert reason == ""
+
+    def test_override_with_config(self):
+        """An explicit False for the auto-disable flag keeps Raptor on"""
+        chunks = [
+            self._make_chunk(f"<table><tr><td>{n}</td></tr></table>")
+            for n in (1, 2)
+        ]
+        opt_out = {"auto_disable_for_structured_data": False}
+        should_skip, reason = should_skip_raptor_for_chunks(chunks, opt_out)
+        assert should_skip is False
+        assert reason == ""
+
+    def test_empty_chunks(self):
+        """No chunks means nothing to skip"""
+        should_skip, reason = should_skip_raptor_for_chunks([])
+        assert should_skip is False
+        assert reason == ""
+
+
+class TestPDFWithHtmlTables:
+    """Scenarios modelled on PDFs whose tables were parsed into HTML (ahmadshakil's issue)"""
+
+    def _make_chunk(self, content: str):
+        """Pair the text with a zero vector, mimicking a stored chunk"""
+        return content, np.zeros(768)
+
+    def test_pdf_with_extracted_tables(self):
+        """Two HTML-table chunks among five PDF chunks trigger the skip"""
+        # Simulating chunks from a PDF like Fbr_IncomeTaxOrdinance_2001
+        contents = [
+            "Section 1: Introduction to Tax Law",
+            '<table><caption>Table Location: Section 2</caption><tr><th>Tax Rate</th><th>Income Range</th></tr><tr><td>10%</td><td>0-500,000</td></tr></table>',
+            "Section 3: Deductions and Exemptions",
+            '<table><tr><th>Deduction Type</th><th>Maximum Amount</th></tr><tr><td>Medical</td><td>100,000</td></tr></table>',
+            "Section 4: Filing Requirements",
+        ]
+        chunks = [self._make_chunk(text) for text in contents]
+        # 2 out of 5 = 40%, above 30% threshold
+        should_skip, reason = should_skip_raptor_for_chunks(chunks)
+        assert should_skip is True
+        assert "HTML tables" in reason
+
+    def test_pdf_with_few_tables(self):
+        """A single table chunk among ten does not trigger the skip"""
+        chapters = [
+            "Chapter 1: Overview of the legal framework...",
+            "Chapter 2: Detailed analysis of provisions...",
+            "Chapter 3: Case studies and examples...",
+            "Chapter 4: Implementation guidelines...",
+            "Chapter 5: Compliance requirements...",
+            "Chapter 6: Penalties and enforcement...",
+            "Chapter 7: Appeals process...",
+            "Chapter 8: Recent amendments...",
+            "Chapter 9: Future outlook...",
+        ]
+        chunks = [self._make_chunk(text) for text in chapters]
+        chunks.append(self._make_chunk('<table><tr><td>Summary Table</td></tr></table>'))  # Only 1 table
+        # 1 out of 10 = 10%, below 30% threshold
+        should_skip, reason = should_skip_raptor_for_chunks(chunks)
+        assert should_skip is False
+
+    def test_financial_pdf_with_many_tables(self):
+        """Four table chunks out of five trigger the skip"""
+        contents = [
+            '<table><caption>Balance Sheet</caption><tr><td>Assets</td><td>$1M</td></tr></table>',
+            '<table><caption>Income Statement</caption><tr><td>Revenue</td><td>$500K</td></tr></table>',
+            '<table><caption>Cash Flow</caption><tr><td>Operating</td><td>$200K</td></tr></table>',
+            "Notes to financial statements...",
+            '<table><caption>Tax Schedule</caption><tr><td>Tax</td><td>$50K</td></tr></table>',
+        ]
+        chunks = [self._make_chunk(text) for text in contents]
+        # 4 out of 5 = 80%, well above threshold
+        should_skip, reason = should_skip_raptor_for_chunks(chunks)
+        assert should_skip is True
+
+
 if __name__ == "__main__":
    pytest.main([__file__, "-v"])