From ff408dea69d21b2bd1a20d4f4d3c7809b95d9381 Mon Sep 17 00:00:00 2001
From: "hsparks.codes" <autonome0802@gmail.com>
Date: Fri, 5 Dec 2025 09:38:28 +0100
Subject: [PATCH] fix: Detect HTML tables in PDF content for Raptor
 auto-disable

Addresses issue reported by @ahmadshakil where PDFs with HTML tables
(like Fbr_IncomeTaxOrdinance_2001) were still being sent to Raptor.

Problem:
- Original implementation only checked parser_id and html4excel config
- PDFs parsed with the 'naive' parser have their tables extracted as <table> HTML
- These tables were not detected, so Raptor processed them anyway

Solution:
- Add content-based detection: analyze chunks for <table> HTML tags
- Skip Raptor if at least 30% of chunks contain an HTML <table> tag
- The check runs after chunks are loaded and before Raptor processing
- Threshold defaults to TABLE_CONTENT_THRESHOLD (0.3); callers can override it via the `threshold` argument

New functions:
- contains_html_table(): Detect <table> tags in content
- analyze_chunks_for_tables(): Calculate the fraction of chunks that contain tables
- should_skip_raptor_for_chunks(): Content-based skip decision

Tests:
- Added 21 new tests for content-based detection (65 total)
- Includes test case simulating ahmadshakil's PDF scenario
- All tests passing

This fix ensures that table-heavy PDFs (those at or above the threshold) skip
Raptor regardless of which parser was used, unless auto_disable_for_structured_data is False.
---
 rag/svr/task_executor.py                  |  19 +-
 rag/utils/raptor_utils.py                 | 108 +++++++++-
 test/unit_test/utils/test_raptor_utils.py | 244 +++++++++++++++++++++-
 3 files changed, 367 insertions(+), 4 deletions(-)
diff --git a/rag/svr/task_executor.py b/rag/svr/task_executor.py
index 8cf1bf290..b23c222ac 100644
--- a/rag/svr/task_executor.py
+++ b/rag/svr/task_executor.py
@@ -29,7 +29,7 @@ from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db.services.pipeline_operation_log_service import PipelineOperationLogService
 from common.connection_utils import timeout
 from rag.utils.base64_image import image2id
-from rag.utils.raptor_utils import should_skip_raptor, get_skip_reason
+from rag.utils.raptor_utils import should_skip_raptor, get_skip_reason, should_skip_raptor_for_chunks
 from common.log_utils import init_root_logger
 from common.config_utils import show_configs
 from graphrag.general.index import run_graphrag_for_kb
@@ -694,6 +694,15 @@ async def run_raptor_for_kb(row, kb_parser_config, chat_mdl, embd_mdl, vector_si
                                                  fields=["content_with_weight", vctr_nm],
                                                  sort_by_position=True):
                 chunks.append((d["content_with_weight"], np.array(d[vctr_nm])))
+            
+            # Check if chunks contain HTML tables (content-based detection)
+            skip_for_tables, skip_reason = should_skip_raptor_for_chunks(chunks, raptor_config)
+            if skip_for_tables:
+                logging.info(f"Skipping Raptor for document {doc_id}: {skip_reason}")
+                if callback:
+                    callback(prog=(x+1.)/len(doc_ids), msg=f"Raptor skipped: {skip_reason}")
+                continue
+            
             await generate(chunks, doc_id)
             callback(prog=(x+1.)/len(doc_ids))
     else:
@@ -704,6 +713,14 @@ async def run_raptor_for_kb(row, kb_parser_config, chat_mdl, embd_mdl, vector_si
                                                  sort_by_position=True):
                 chunks.append((d["content_with_weight"], np.array(d[vctr_nm])))
 
+        # Check if chunks contain HTML tables (content-based detection)
+        skip_for_tables, skip_reason = should_skip_raptor_for_chunks(chunks, raptor_config)
+        if skip_for_tables:
+            logging.info(f"Skipping Raptor for KB scope: {skip_reason}")
+            if callback:
+                callback(prog=1.0, msg=f"Raptor skipped: {skip_reason}")
+            return res, tk_count
+
         await generate(chunks, fake_doc_id)
 
     return res, tk_count
diff --git a/rag/utils/raptor_utils.py b/rag/utils/raptor_utils.py
index c48e0999b..bc4dc2a83 100644
--- a/rag/utils/raptor_utils.py
+++ b/rag/utils/raptor_utils.py
@@ -19,7 +19,10 @@ Utility functions for Raptor processing decisions.
 """
 
 import logging
-from typing import Optional
+import re
+from typing import Optional, List, Tuple
+
+import numpy as np
 
 
 # File extensions for structured data types
@@ -27,6 +30,13 @@ EXCEL_EXTENSIONS = {".xls", ".xlsx", ".xlsm", ".xlsb"}
 CSV_EXTENSIONS = {".csv", ".tsv"}
 STRUCTURED_EXTENSIONS = EXCEL_EXTENSIONS | CSV_EXTENSIONS
 
+# Regex patterns for detecting HTML tables; \b keeps tags such as <tablet> from matching
+HTML_TABLE_PATTERN = re.compile(r'<table\b[^>]*>.*?</table>', re.IGNORECASE | re.DOTALL)
+HTML_TABLE_START_PATTERN = re.compile(r'<table\b[^>]*>', re.IGNORECASE)
+
+# Fraction (0.0-1.0) of chunks containing a table at which content counts as "table-heavy"
+TABLE_CONTENT_THRESHOLD = 0.3  # If 30%+ of chunks contain tables, skip Raptor
+
 
 def is_structured_file_type(file_type: Optional[str]) -> bool:
     """
@@ -120,7 +130,8 @@ def should_skip_raptor(
 def get_skip_reason(
     file_type: Optional[str] = None,
     parser_id: str = "",
-    parser_config: Optional[dict] = None
+    parser_config: Optional[dict] = None,
+    has_table_content: bool = False
 ) -> str:
     """
     Get a human-readable reason why Raptor was skipped.
@@ -129,6 +140,7 @@ def get_skip_reason(
         file_type: File extension
         parser_id: Parser ID being used
         parser_config: Parser configuration dict
+        has_table_content: Whether content contains HTML tables
         
     Returns:
         Reason string, or empty string if Raptor should not be skipped
@@ -142,4 +154,96 @@ def get_skip_reason(
         if is_tabular_pdf(parser_id, parser_config):
             return f"Tabular PDF (parser={parser_id}) - Raptor auto-disabled"
     
+    if has_table_content:
+        return "Content contains HTML tables - Raptor auto-disabled"
+    
     return ""
+
+
+def contains_html_table(content: str) -> bool:
+    """
+    Report whether the text includes an opening HTML table tag.
+
+    Args:
+        content: Text to scan; an empty string yields False
+
+    Returns:
+        True when HTML_TABLE_START_PATTERN matches anywhere in content
+    """
+    if content:
+        return HTML_TABLE_START_PATTERN.search(content) is not None
+    return False
+
+
+def analyze_chunks_for_tables(
+    chunks: List[Tuple[str, np.ndarray]],
+    threshold: float = TABLE_CONTENT_THRESHOLD
+) -> Tuple[bool, float]:
+    """
+    Measure how many chunks carry HTML table markup.
+
+    Each chunk's text is tested with contains_html_table(); the share of
+    matching chunks is then compared against the threshold.
+
+    Args:
+        chunks: List of (content, vector) tuples; the vector is not used
+        threshold: Fraction of table chunks (e.g. 0.3) at or above which
+            the content is treated as table-heavy
+
+    Returns:
+        Tuple of (should_skip, table_fraction); (False, 0.0) without chunks
+    """
+    if not chunks:
+        return False, 0.0
+
+    # Count the chunks whose text contains an opening <table> tag.
+    total = len(chunks)
+    hits = sum(1 for text, _ in chunks if contains_html_table(text))
+    fraction = hits / total
+    table_heavy = fraction >= threshold
+
+    # Only the table-heavy outcome produces a log line.
+    if table_heavy:
+        logging.info(
+            f"Detected table-heavy content: {hits}/{total} chunks "
+            f"({fraction:.1%}) contain HTML tables"
+        )
+
+    return table_heavy, fraction
+
+
+def should_skip_raptor_for_chunks(
+    chunks: List[Tuple[str, np.ndarray]],
+    raptor_config: Optional[dict] = None,
+    threshold: float = TABLE_CONTENT_THRESHOLD
+) -> Tuple[bool, str]:
+    """
+    Decide whether Raptor should be skipped based on chunk content analysis.
+
+    This is a content-based check that runs after chunks are loaded,
+    detecting HTML tables that were extracted during parsing.
+
+    Args:
+        chunks: List of (content, vector) tuples
+        raptor_config: Raptor configuration dict (honors auto_disable_for_structured_data)
+        threshold: Fraction of chunks with tables that triggers the skip
+
+    Returns:
+        Tuple of (should_skip, reason); reason is "" when not skipping
+    """
+    raptor_config = raptor_config or {}
+
+    # Only an explicit False opts out; a missing key keeps auto-disable on
+    if raptor_config.get("auto_disable_for_structured_data", True) is False:
+        return False, ""
+
+    should_skip, table_pct = analyze_chunks_for_tables(chunks, threshold)
+    if not should_skip:
+        return False, ""
+
+    # table_pct is the share of chunks holding a table, so the message says so
+    reason = (
+        f"{table_pct:.0%} of chunks contain HTML tables "
+        f"(threshold: {threshold:.0%}) - Raptor auto-disabled"
+    )
+    return True, reason
diff --git a/test/unit_test/utils/test_raptor_utils.py b/test/unit_test/utils/test_raptor_utils.py
index 5138ccda7..1b08345be 100644
--- a/test/unit_test/utils/test_raptor_utils.py
+++ b/test/unit_test/utils/test_raptor_utils.py
@@ -19,14 +19,19 @@ Unit tests for Raptor utility functions.
 """
 
 import pytest
+import numpy as np
 from rag.utils.raptor_utils import (
     is_structured_file_type,
     is_tabular_pdf,
     should_skip_raptor,
     get_skip_reason,
+    contains_html_table,
+    analyze_chunks_for_tables,
+    should_skip_raptor_for_chunks,
     EXCEL_EXTENSIONS,
     CSV_EXTENSIONS,
-    STRUCTURED_EXTENSIONS
+    STRUCTURED_EXTENSIONS,
+    TABLE_CONTENT_THRESHOLD
 )
 
 
@@ -283,5 +288,242 @@ class TestIntegrationScenarios:
         assert should_skip_raptor(file_type, raptor_config=raptor_config) is False
 
 
+class TestContainsHtmlTable:
+    """Tests for contains_html_table(): detecting opening <table> tags in text"""
+
+    def test_detect_simple_table(self):
+        """A bare <table>...</table> snippet is detected"""
+        content = "<table><tr><td>Cell 1</td><td>Cell 2</td></tr></table>"
+        assert contains_html_table(content) is True
+
+    def test_detect_table_with_attributes(self):
+        """A <table> tag carrying attributes is still detected"""
+        content = '<table class="data-table" border="1"><tr><td>Data</td></tr></table>'
+        assert contains_html_table(content) is True
+
+    def test_detect_table_case_insensitive(self):
+        """Upper-case and mixed-case table tags are detected"""
+        assert contains_html_table("<TABLE><TR><TD>X</TD></TR></TABLE>") is True
+        assert contains_html_table("<Table><tr><td>X</td></tr></Table>") is True
+
+    def test_no_table_in_plain_text(self):
+        """Plain text without markup is not reported as a table"""
+        content = "This is just plain text without any tables."
+        assert contains_html_table(content) is False
+
+    def test_no_table_in_empty_content(self):
+        """An empty string yields False"""
+        assert contains_html_table("") is False
+        # NOTE(review): None is not exercised; presumably rejected by type hints (beartype) - confirm
+
+    def test_table_word_not_detected(self):
+        """The plain word 'table' (no angle brackets) is not a match"""
+        content = "Please see the table below for more information."
+        assert contains_html_table(content) is False
+
+    def test_mixed_content_with_table(self):
+        """A table embedded between lines of plain text is detected"""
+        content = """
+        This is some introductory text.
+        <table>
+            <caption>Financial Data</caption>
+            <tr><th>Year</th><th>Revenue</th></tr>
+            <tr><td>2024</td><td>$1M</td></tr>
+        </table>
+        More text after the table.
+        """
+        assert contains_html_table(content) is True
+
+
+class TestAnalyzeChunksForTables:
+    """Tests for analyze_chunks_for_tables(): skip flag and table fraction over chunk lists"""
+
+    def _make_chunk(self, content: str):
+        """Build a (content, vector) chunk tuple using a 768-element zero vector"""
+        return (content, np.zeros(768))
+
+    def test_all_table_chunks(self):
+        """Every chunk has a table: skip is True and the fraction is 1.0"""
+        chunks = [
+            self._make_chunk("<table><tr><td>1</td></tr></table>"),
+            self._make_chunk("<table><tr><td>2</td></tr></table>"),
+            self._make_chunk("<table><tr><td>3</td></tr></table>"),
+        ]
+        should_skip, pct = analyze_chunks_for_tables(chunks)
+        assert should_skip is True
+        assert pct == 1.0
+
+    def test_no_table_chunks(self):
+        """No chunk has a table: skip is False and the fraction is 0.0"""
+        chunks = [
+            self._make_chunk("Plain text content 1"),
+            self._make_chunk("Plain text content 2"),
+            self._make_chunk("Plain text content 3"),
+        ]
+        should_skip, pct = analyze_chunks_for_tables(chunks)
+        assert should_skip is False
+        assert pct == 0.0
+
+    def test_mixed_chunks_below_threshold(self):
+        """A mix whose table fraction is under the default threshold does not skip"""
+        # 1 out of 5 = 20%, below 30% threshold
+        chunks = [
+            self._make_chunk("<table><tr><td>Table</td></tr></table>"),
+            self._make_chunk("Plain text 1"),
+            self._make_chunk("Plain text 2"),
+            self._make_chunk("Plain text 3"),
+            self._make_chunk("Plain text 4"),
+        ]
+        should_skip, pct = analyze_chunks_for_tables(chunks)
+        assert should_skip is False
+        assert pct == 0.2
+
+    def test_mixed_chunks_above_threshold(self):
+        """A mix whose table fraction exceeds the default threshold skips"""
+        # 2 out of 5 = 40%, above 30% threshold
+        chunks = [
+            self._make_chunk("<table><tr><td>Table 1</td></tr></table>"),
+            self._make_chunk("<table><tr><td>Table 2</td></tr></table>"),
+            self._make_chunk("Plain text 1"),
+            self._make_chunk("Plain text 2"),
+            self._make_chunk("Plain text 3"),
+        ]
+        should_skip, pct = analyze_chunks_for_tables(chunks)
+        assert should_skip is True
+        assert pct == 0.4
+
+    def test_empty_chunks(self):
+        """An empty chunk list returns (False, 0.0)"""
+        should_skip, pct = analyze_chunks_for_tables([])
+        assert should_skip is False
+        assert pct == 0.0
+
+    def test_custom_threshold(self):
+        """The same chunks flip between skip and no-skip as the threshold argument changes"""
+        # 1 out of 5 = 20%
+        chunks = [
+            self._make_chunk("<table><tr><td>Table</td></tr></table>"),
+            self._make_chunk("Plain text 1"),
+            self._make_chunk("Plain text 2"),
+            self._make_chunk("Plain text 3"),
+            self._make_chunk("Plain text 4"),
+        ]
+        # With 15% threshold, should skip
+        should_skip, pct = analyze_chunks_for_tables(chunks, threshold=0.15)
+        assert should_skip is True
+        
+        # With 25% threshold, should not skip
+        should_skip, pct = analyze_chunks_for_tables(chunks, threshold=0.25)
+        assert should_skip is False
+
+    def test_default_threshold_value(self):
+        """Pin the module default TABLE_CONTENT_THRESHOLD at 0.3 (30%)"""
+        assert TABLE_CONTENT_THRESHOLD == 0.3
+
+
+class TestShouldSkipRaptorForChunks:
+    """Tests for should_skip_raptor_for_chunks(): the (should_skip, reason) decision"""
+
+    def _make_chunk(self, content: str):
+        """Build a (content, vector) chunk tuple using a 768-element zero vector"""
+        return (content, np.zeros(768))
+
+    def test_skip_for_table_heavy_content(self):
+        """2 of 3 chunks hold tables: skip is True and the reason mentions HTML tables"""
+        chunks = [
+            self._make_chunk("<table><tr><td>1</td></tr></table>"),
+            self._make_chunk("<table><tr><td>2</td></tr></table>"),
+            self._make_chunk("Plain text"),
+        ]
+        should_skip, reason = should_skip_raptor_for_chunks(chunks)
+        assert should_skip is True
+        assert "HTML tables" in reason
+
+    def test_no_skip_for_text_content(self):
+        """Plain-text chunks do not skip and yield an empty reason"""
+        chunks = [
+            self._make_chunk("Plain text content 1"),
+            self._make_chunk("Plain text content 2"),
+            self._make_chunk("Plain text content 3"),
+        ]
+        should_skip, reason = should_skip_raptor_for_chunks(chunks)
+        assert should_skip is False
+        assert reason == ""
+
+    def test_override_with_config(self):
+        """auto_disable_for_structured_data=False suppresses the skip even for all-table chunks"""
+        chunks = [
+            self._make_chunk("<table><tr><td>1</td></tr></table>"),
+            self._make_chunk("<table><tr><td>2</td></tr></table>"),
+        ]
+        raptor_config = {"auto_disable_for_structured_data": False}
+        should_skip, reason = should_skip_raptor_for_chunks(chunks, raptor_config)
+        assert should_skip is False
+        assert reason == ""
+
+    def test_empty_chunks(self):
+        """An empty chunk list does not skip and yields an empty reason"""
+        should_skip, reason = should_skip_raptor_for_chunks([])
+        assert should_skip is False
+        assert reason == ""
+
+
+class TestPDFWithHtmlTables:
+    """Scenario tests with PDF-like chunk mixes (the case reported by ahmadshakil)"""
+
+    def _make_chunk(self, content: str):
+        """Build a (content, vector) chunk tuple using a 768-element zero vector"""
+        return (content, np.zeros(768))
+
+    def test_pdf_with_extracted_tables(self):
+        """Text sections interleaved with HTML tables (2 of 5 chunks) trigger the skip"""
+        # Simulating chunks from a PDF like Fbr_IncomeTaxOrdinance_2001
+        chunks = [
+            self._make_chunk("Section 1: Introduction to Tax Law"),
+            self._make_chunk('<table><caption>Table Location: Section 2</caption><tr><th>Tax Rate</th><th>Income Range</th></tr><tr><td>10%</td><td>0-500,000</td></tr></table>'),
+            self._make_chunk("Section 3: Deductions and Exemptions"),
+            self._make_chunk('<table><tr><th>Deduction Type</th><th>Maximum Amount</th></tr><tr><td>Medical</td><td>100,000</td></tr></table>'),
+            self._make_chunk("Section 4: Filing Requirements"),
+        ]
+        
+        # 2 out of 5 = 40%, above 30% threshold
+        should_skip, reason = should_skip_raptor_for_chunks(chunks)
+        assert should_skip is True
+        assert "HTML tables" in reason
+
+    def test_pdf_with_few_tables(self):
+        """A mostly-text document with a single table chunk (1 of 10) is not skipped"""
+        chunks = [
+            self._make_chunk("Chapter 1: Overview of the legal framework..."),
+            self._make_chunk("Chapter 2: Detailed analysis of provisions..."),
+            self._make_chunk("Chapter 3: Case studies and examples..."),
+            self._make_chunk("Chapter 4: Implementation guidelines..."),
+            self._make_chunk("Chapter 5: Compliance requirements..."),
+            self._make_chunk("Chapter 6: Penalties and enforcement..."),
+            self._make_chunk("Chapter 7: Appeals process..."),
+            self._make_chunk("Chapter 8: Recent amendments..."),
+            self._make_chunk("Chapter 9: Future outlook..."),
+            self._make_chunk('<table><tr><td>Summary Table</td></tr></table>'),  # Only 1 table
+        ]
+        
+        # 1 out of 10 = 10%, below 30% threshold
+        should_skip, reason = should_skip_raptor_for_chunks(chunks)
+        assert should_skip is False
+
+    def test_financial_pdf_with_many_tables(self):
+        """A document where 4 of 5 chunks are tables is skipped"""
+        chunks = [
+            self._make_chunk('<table><caption>Balance Sheet</caption><tr><td>Assets</td><td>$1M</td></tr></table>'),
+            self._make_chunk('<table><caption>Income Statement</caption><tr><td>Revenue</td><td>$500K</td></tr></table>'),
+            self._make_chunk('<table><caption>Cash Flow</caption><tr><td>Operating</td><td>$200K</td></tr></table>'),
+            self._make_chunk("Notes to financial statements..."),
+            self._make_chunk('<table><caption>Tax Schedule</caption><tr><td>Tax</td><td>$50K</td></tr></table>'),
+        ]
+        
+        # 4 out of 5 = 80%, well above threshold
+        should_skip, reason = should_skip_raptor_for_chunks(chunks)
+        assert should_skip is True
+
+
 if __name__ == "__main__":
     pytest.main([__file__, "-v"])