From ff408dea69d21b2bd1a20d4f4d3c7809b95d9381 Mon Sep 17 00:00:00 2001 From: "hsparks.codes" Date: Fri, 5 Dec 2025 09:38:28 +0100 Subject: [PATCH] fix: Detect HTML tables in PDF content for Raptor auto-disable Addresses issue reported by @ahmadshakil where PDFs with HTML tables (like Fbr_IncomeTaxOrdinance_2001) were still being sent to Raptor. Problem: - Original implementation only checked parser_id and html4excel config - PDFs parsed with 'naive' parser extract tables as HTML - These tables were not detected, so Raptor processed them anyway Solution: - Add content-based detection: analyze chunks for
<table> HTML tags - Skip Raptor if 30%+ of chunks contain HTML tables - Check happens after chunks are loaded, before Raptor processing - Configurable threshold via TABLE_CONTENT_THRESHOLD New functions: - contains_html_table(): Detect <table>
tags in content - analyze_chunks_for_tables(): Calculate table percentage in chunks - should_skip_raptor_for_chunks(): Content-based skip decision Tests: - Added 21 new tests for content-based detection (65 total) - Includes test case simulating ahmadshakil's PDF scenario - All tests passing This fix ensures PDFs with extracted tables are properly skipped, regardless of which parser was used. --- rag/svr/task_executor.py | 19 +- rag/utils/raptor_utils.py | 108 +++++++++- test/unit_test/utils/test_raptor_utils.py | 244 +++++++++++++++++++++- 3 files changed, 367 insertions(+), 4 deletions(-) diff --git a/rag/svr/task_executor.py b/rag/svr/task_executor.py index 8cf1bf290..b23c222ac 100644 --- a/rag/svr/task_executor.py +++ b/rag/svr/task_executor.py @@ -29,7 +29,7 @@ from api.db.services.knowledgebase_service import KnowledgebaseService from api.db.services.pipeline_operation_log_service import PipelineOperationLogService from common.connection_utils import timeout from rag.utils.base64_image import image2id -from rag.utils.raptor_utils import should_skip_raptor, get_skip_reason +from rag.utils.raptor_utils import should_skip_raptor, get_skip_reason, should_skip_raptor_for_chunks from common.log_utils import init_root_logger from common.config_utils import show_configs from graphrag.general.index import run_graphrag_for_kb @@ -694,6 +694,15 @@ async def run_raptor_for_kb(row, kb_parser_config, chat_mdl, embd_mdl, vector_si fields=["content_with_weight", vctr_nm], sort_by_position=True): chunks.append((d["content_with_weight"], np.array(d[vctr_nm]))) + + # Check if chunks contain HTML tables (content-based detection) + skip_for_tables, skip_reason = should_skip_raptor_for_chunks(chunks, raptor_config) + if skip_for_tables: + logging.info(f"Skipping Raptor for document {doc_id}: {skip_reason}") + if callback: + callback(prog=(x+1.)/len(doc_ids), msg=f"Raptor skipped: {skip_reason}") + continue + await generate(chunks, doc_id) callback(prog=(x+1.)/len(doc_ids)) else: 
@@ -704,6 +713,14 @@ async def run_raptor_for_kb(row, kb_parser_config, chat_mdl, embd_mdl, vector_si sort_by_position=True): chunks.append((d["content_with_weight"], np.array(d[vctr_nm]))) + # Check if chunks contain HTML tables (content-based detection) + skip_for_tables, skip_reason = should_skip_raptor_for_chunks(chunks, raptor_config) + if skip_for_tables: + logging.info(f"Skipping Raptor for KB scope: {skip_reason}") + if callback: + callback(prog=1.0, msg=f"Raptor skipped: {skip_reason}") + return res, tk_count + await generate(chunks, fake_doc_id) return res, tk_count diff --git a/rag/utils/raptor_utils.py b/rag/utils/raptor_utils.py index c48e0999b..bc4dc2a83 100644 --- a/rag/utils/raptor_utils.py +++ b/rag/utils/raptor_utils.py @@ -19,7 +19,10 @@ Utility functions for Raptor processing decisions. """ import logging -from typing import Optional +import re +from typing import Optional, List, Tuple + +import numpy as np # File extensions for structured data types @@ -27,6 +30,13 @@ EXCEL_EXTENSIONS = {".xls", ".xlsx", ".xlsm", ".xlsb"} CSV_EXTENSIONS = {".csv", ".tsv"} STRUCTURED_EXTENSIONS = EXCEL_EXTENSIONS | CSV_EXTENSIONS +# Regex patterns for detecting HTML tables in content +HTML_TABLE_PATTERN = re.compile(r']*>.*?
', re.IGNORECASE | re.DOTALL) +HTML_TABLE_START_PATTERN = re.compile(r']*>', re.IGNORECASE) + +# Threshold for considering content as "table-heavy" (percentage of chunks with tables) +TABLE_CONTENT_THRESHOLD = 0.3 # If 30%+ of chunks contain tables, skip Raptor + def is_structured_file_type(file_type: Optional[str]) -> bool: """ @@ -120,7 +130,8 @@ def should_skip_raptor( def get_skip_reason( file_type: Optional[str] = None, parser_id: str = "", - parser_config: Optional[dict] = None + parser_config: Optional[dict] = None, + has_table_content: bool = False ) -> str: """ Get a human-readable reason why Raptor was skipped. @@ -129,6 +140,7 @@ def get_skip_reason( file_type: File extension parser_id: Parser ID being used parser_config: Parser configuration dict + has_table_content: Whether content contains HTML tables Returns: Reason string, or empty string if Raptor should not be skipped @@ -142,4 +154,96 @@ def get_skip_reason( if is_tabular_pdf(parser_id, parser_config): return f"Tabular PDF (parser={parser_id}) - Raptor auto-disabled" + if has_table_content: + return "Content contains HTML tables - Raptor auto-disabled" + return "" + + +def contains_html_table(content: str) -> bool: + """ + Check if content contains HTML table markup. + + Args: + content: Text content to check + + Returns: + True if content contains HTML table tags + """ + if not content: + return False + return bool(HTML_TABLE_START_PATTERN.search(content)) + + +def analyze_chunks_for_tables( + chunks: List[Tuple[str, np.ndarray]], + threshold: float = TABLE_CONTENT_THRESHOLD +) -> Tuple[bool, float]: + """ + Analyze chunks to determine if they contain significant table content. + + This function checks the actual content of chunks for HTML table markup, + which is generated when PDFs with tables are parsed. 
+ + Args: + chunks: List of (content, vector) tuples + threshold: Percentage threshold for considering content as table-heavy + + Returns: + Tuple of (should_skip, table_percentage) + """ + if not chunks: + return False, 0.0 + + table_count = 0 + for content, _ in chunks: + if contains_html_table(content): + table_count += 1 + + table_percentage = table_count / len(chunks) + should_skip = table_percentage >= threshold + + if should_skip: + logging.info( + f"Detected table-heavy content: {table_count}/{len(chunks)} chunks " + f"({table_percentage:.1%}) contain HTML tables" + ) + + return should_skip, table_percentage + + +def should_skip_raptor_for_chunks( + chunks: List[Tuple[str, np.ndarray]], + raptor_config: Optional[dict] = None, + threshold: float = TABLE_CONTENT_THRESHOLD +) -> Tuple[bool, str]: + """ + Check if Raptor should be skipped based on chunk content analysis. + + This is a content-based check that runs after chunks are loaded, + detecting HTML tables that were extracted during parsing. + + Args: + chunks: List of (content, vector) tuples + raptor_config: Raptor configuration dict + threshold: Percentage threshold for table content + + Returns: + Tuple of (should_skip, reason) + """ + raptor_config = raptor_config or {} + + # Check if auto-disable is explicitly disabled + if raptor_config.get("auto_disable_for_structured_data", True) is False: + return False, "" + + should_skip, table_pct = analyze_chunks_for_tables(chunks, threshold) + + if should_skip: + reason = ( + f"Content contains {table_pct:.0%} HTML tables " + f"(threshold: {threshold:.0%}) - Raptor auto-disabled" + ) + return True, reason + + return False, "" diff --git a/test/unit_test/utils/test_raptor_utils.py b/test/unit_test/utils/test_raptor_utils.py index 5138ccda7..1b08345be 100644 --- a/test/unit_test/utils/test_raptor_utils.py +++ b/test/unit_test/utils/test_raptor_utils.py @@ -19,14 +19,19 @@ Unit tests for Raptor utility functions. 
""" import pytest +import numpy as np from rag.utils.raptor_utils import ( is_structured_file_type, is_tabular_pdf, should_skip_raptor, get_skip_reason, + contains_html_table, + analyze_chunks_for_tables, + should_skip_raptor_for_chunks, EXCEL_EXTENSIONS, CSV_EXTENSIONS, - STRUCTURED_EXTENSIONS + STRUCTURED_EXTENSIONS, + TABLE_CONTENT_THRESHOLD ) @@ -283,5 +288,242 @@ class TestIntegrationScenarios: assert should_skip_raptor(file_type, raptor_config=raptor_config) is False +class TestContainsHtmlTable: + """Test HTML table detection in content""" + + def test_detect_simple_table(self): + """Test detection of simple HTML table""" + content = "
Cell 1Cell 2
" + assert contains_html_table(content) is True + + def test_detect_table_with_attributes(self): + """Test detection of table with attributes""" + content = '
Data
' + assert contains_html_table(content) is True + + def test_detect_table_case_insensitive(self): + """Test case insensitive detection""" + assert contains_html_table("
X
") is True + assert contains_html_table("
X
") is True + + def test_no_table_in_plain_text(self): + """Test that plain text is not detected as table""" + content = "This is just plain text without any tables." + assert contains_html_table(content) is False + + def test_no_table_in_empty_content(self): + """Test empty content handling""" + assert contains_html_table("") is False + # Note: None is rejected by type hints (beartype), which is correct behavior + + def test_table_word_not_detected(self): + """Test that the word 'table' alone is not detected""" + content = "Please see the table below for more information." + assert contains_html_table(content) is False + + def test_mixed_content_with_table(self): + """Test content with text and table""" + content = """ + This is some introductory text. + + + + +
Financial Data
YearRevenue
2024$1M
+ More text after the table. + """ + assert contains_html_table(content) is True + + +class TestAnalyzeChunksForTables: + """Test chunk analysis for table content""" + + def _make_chunk(self, content: str): + """Helper to create a chunk tuple""" + return (content, np.zeros(768)) + + def test_all_table_chunks(self): + """Test when all chunks contain tables""" + chunks = [ + self._make_chunk("
1
"), + self._make_chunk("
2
"), + self._make_chunk("
3
"), + ] + should_skip, pct = analyze_chunks_for_tables(chunks) + assert should_skip is True + assert pct == 1.0 + + def test_no_table_chunks(self): + """Test when no chunks contain tables""" + chunks = [ + self._make_chunk("Plain text content 1"), + self._make_chunk("Plain text content 2"), + self._make_chunk("Plain text content 3"), + ] + should_skip, pct = analyze_chunks_for_tables(chunks) + assert should_skip is False + assert pct == 0.0 + + def test_mixed_chunks_below_threshold(self): + """Test mixed chunks below threshold""" + # 1 out of 5 = 20%, below 30% threshold + chunks = [ + self._make_chunk("
Table
"), + self._make_chunk("Plain text 1"), + self._make_chunk("Plain text 2"), + self._make_chunk("Plain text 3"), + self._make_chunk("Plain text 4"), + ] + should_skip, pct = analyze_chunks_for_tables(chunks) + assert should_skip is False + assert pct == 0.2 + + def test_mixed_chunks_above_threshold(self): + """Test mixed chunks above threshold""" + # 2 out of 5 = 40%, above 30% threshold + chunks = [ + self._make_chunk("
Table 1
"), + self._make_chunk("
Table 2
"), + self._make_chunk("Plain text 1"), + self._make_chunk("Plain text 2"), + self._make_chunk("Plain text 3"), + ] + should_skip, pct = analyze_chunks_for_tables(chunks) + assert should_skip is True + assert pct == 0.4 + + def test_empty_chunks(self): + """Test empty chunk list""" + should_skip, pct = analyze_chunks_for_tables([]) + assert should_skip is False + assert pct == 0.0 + + def test_custom_threshold(self): + """Test with custom threshold""" + # 1 out of 5 = 20% + chunks = [ + self._make_chunk("
Table
"), + self._make_chunk("Plain text 1"), + self._make_chunk("Plain text 2"), + self._make_chunk("Plain text 3"), + self._make_chunk("Plain text 4"), + ] + # With 15% threshold, should skip + should_skip, pct = analyze_chunks_for_tables(chunks, threshold=0.15) + assert should_skip is True + + # With 25% threshold, should not skip + should_skip, pct = analyze_chunks_for_tables(chunks, threshold=0.25) + assert should_skip is False + + def test_default_threshold_value(self): + """Test that default threshold is 30%""" + assert TABLE_CONTENT_THRESHOLD == 0.3 + + +class TestShouldSkipRaptorForChunks: + """Test content-based Raptor skip decision""" + + def _make_chunk(self, content: str): + """Helper to create a chunk tuple""" + return (content, np.zeros(768)) + + def test_skip_for_table_heavy_content(self): + """Test skipping for table-heavy content""" + chunks = [ + self._make_chunk("
1
"), + self._make_chunk("
2
"), + self._make_chunk("Plain text"), + ] + should_skip, reason = should_skip_raptor_for_chunks(chunks) + assert should_skip is True + assert "HTML tables" in reason + + def test_no_skip_for_text_content(self): + """Test not skipping for text content""" + chunks = [ + self._make_chunk("Plain text content 1"), + self._make_chunk("Plain text content 2"), + self._make_chunk("Plain text content 3"), + ] + should_skip, reason = should_skip_raptor_for_chunks(chunks) + assert should_skip is False + assert reason == "" + + def test_override_with_config(self): + """Test that auto-disable can be overridden""" + chunks = [ + self._make_chunk("
1
"), + self._make_chunk("
2
"), + ] + raptor_config = {"auto_disable_for_structured_data": False} + should_skip, reason = should_skip_raptor_for_chunks(chunks, raptor_config) + assert should_skip is False + assert reason == "" + + def test_empty_chunks(self): + """Test with empty chunks""" + should_skip, reason = should_skip_raptor_for_chunks([]) + assert should_skip is False + assert reason == "" + + +class TestPDFWithHtmlTables: + """Test real-world PDF with HTML tables scenario (ahmadshakil's issue)""" + + def _make_chunk(self, content: str): + """Helper to create a chunk tuple""" + return (content, np.zeros(768)) + + def test_pdf_with_extracted_tables(self): + """Test PDF that has tables extracted as HTML during parsing""" + # Simulating chunks from a PDF like Fbr_IncomeTaxOrdinance_2001 + chunks = [ + self._make_chunk("Section 1: Introduction to Tax Law"), + self._make_chunk('
Table Location: Section 2
Tax RateIncome Range
10%0-500,000
'), + self._make_chunk("Section 3: Deductions and Exemptions"), + self._make_chunk('
Deduction TypeMaximum Amount
Medical100,000
'), + self._make_chunk("Section 4: Filing Requirements"), + ] + + # 2 out of 5 = 40%, above 30% threshold + should_skip, reason = should_skip_raptor_for_chunks(chunks) + assert should_skip is True + assert "HTML tables" in reason + + def test_pdf_with_few_tables(self): + """Test PDF with only occasional tables""" + chunks = [ + self._make_chunk("Chapter 1: Overview of the legal framework..."), + self._make_chunk("Chapter 2: Detailed analysis of provisions..."), + self._make_chunk("Chapter 3: Case studies and examples..."), + self._make_chunk("Chapter 4: Implementation guidelines..."), + self._make_chunk("Chapter 5: Compliance requirements..."), + self._make_chunk("Chapter 6: Penalties and enforcement..."), + self._make_chunk("Chapter 7: Appeals process..."), + self._make_chunk("Chapter 8: Recent amendments..."), + self._make_chunk("Chapter 9: Future outlook..."), + self._make_chunk('
Summary Table
'), # Only 1 table + ] + + # 1 out of 10 = 10%, below 30% threshold + should_skip, reason = should_skip_raptor_for_chunks(chunks) + assert should_skip is False + + def test_financial_pdf_with_many_tables(self): + """Test financial PDF with many tables (should skip)""" + chunks = [ + self._make_chunk('
Balance Sheet
Assets$1M
'), + self._make_chunk('
Income Statement
Revenue$500K
'), + self._make_chunk('
Cash Flow
Operating$200K
'), + self._make_chunk("Notes to financial statements..."), + self._make_chunk('
Tax Schedule
Tax$50K
'), + ] + + # 4 out of 5 = 80%, well above threshold + should_skip, reason = should_skip_raptor_for_chunks(chunks) + assert should_skip is True + + if __name__ == "__main__": pytest.main([__file__, "-v"])