diff --git a/rag/svr/task_executor.py b/rag/svr/task_executor.py
index 1a0c51600..d848b6d24 100644
--- a/rag/svr/task_executor.py
+++ b/rag/svr/task_executor.py
@@ -30,7 +30,7 @@ from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.pipeline_operation_log_service import PipelineOperationLogService
from common.connection_utils import timeout
from rag.utils.base64_image import image2id
-from rag.utils.raptor_utils import should_skip_raptor, get_skip_reason
+from rag.utils.raptor_utils import should_skip_raptor, get_skip_reason, should_skip_raptor_for_chunks
from common.log_utils import init_root_logger
from common.config_utils import show_configs
from graphrag.general.index import run_graphrag_for_kb
@@ -742,6 +742,15 @@ async def run_raptor_for_kb(row, kb_parser_config, chat_mdl, embd_mdl, vector_si
fields=["content_with_weight", vctr_nm],
sort_by_position=True):
chunks.append((d["content_with_weight"], np.array(d[vctr_nm])))
+
+ # Check if chunks contain HTML tables (content-based detection)
+ skip_for_tables, skip_reason = should_skip_raptor_for_chunks(chunks, raptor_config)
+ if skip_for_tables:
+ logging.info(f"Skipping Raptor for document {doc_id}: {skip_reason}")
+ if callback:
+ callback(prog=(x+1.)/len(doc_ids), msg=f"Raptor skipped: {skip_reason}")
+ continue
+
await generate(chunks, doc_id)
callback(prog=(x+1.)/len(doc_ids))
else:
@@ -752,6 +761,14 @@ async def run_raptor_for_kb(row, kb_parser_config, chat_mdl, embd_mdl, vector_si
sort_by_position=True):
chunks.append((d["content_with_weight"], np.array(d[vctr_nm])))
+ # Check if chunks contain HTML tables (content-based detection)
+ skip_for_tables, skip_reason = should_skip_raptor_for_chunks(chunks, raptor_config)
+ if skip_for_tables:
+ logging.info(f"Skipping Raptor for KB scope: {skip_reason}")
+ if callback:
+ callback(prog=1.0, msg=f"Raptor skipped: {skip_reason}")
+ return res, tk_count
+
await generate(chunks, fake_doc_id)
return res, tk_count
diff --git a/rag/utils/raptor_utils.py b/rag/utils/raptor_utils.py
index c48e0999b..bc4dc2a83 100644
--- a/rag/utils/raptor_utils.py
+++ b/rag/utils/raptor_utils.py
@@ -19,7 +19,10 @@ Utility functions for Raptor processing decisions.
"""
import logging
-from typing import Optional
+import re
+from typing import Optional, List, Tuple
+
+import numpy as np
# File extensions for structured data types
@@ -27,6 +30,13 @@ EXCEL_EXTENSIONS = {".xls", ".xlsx", ".xlsm", ".xlsb"}
CSV_EXTENSIONS = {".csv", ".tsv"}
STRUCTURED_EXTENSIONS = EXCEL_EXTENSIONS | CSV_EXTENSIONS
+# Regex patterns for detecting HTML tables in content.
+# \b keeps tags that merely start with "table" (e.g. <tablet>) from matching.
+HTML_TABLE_PATTERN = re.compile(r'<table\b[^>]*>.*?</table>', re.IGNORECASE | re.DOTALL)
+HTML_TABLE_START_PATTERN = re.compile(r'<table\b[^>]*>', re.IGNORECASE)
+
+# Threshold for considering content as "table-heavy" (fraction of chunks with tables)
+TABLE_CONTENT_THRESHOLD = 0.3  # If 30%+ of chunks contain tables, skip Raptor
+
def is_structured_file_type(file_type: Optional[str]) -> bool:
"""
@@ -120,7 +130,8 @@ def should_skip_raptor(
def get_skip_reason(
file_type: Optional[str] = None,
parser_id: str = "",
- parser_config: Optional[dict] = None
+ parser_config: Optional[dict] = None,
+ has_table_content: bool = False
) -> str:
"""
Get a human-readable reason why Raptor was skipped.
@@ -129,6 +140,7 @@ def get_skip_reason(
file_type: File extension
parser_id: Parser ID being used
parser_config: Parser configuration dict
+ has_table_content: Whether content contains HTML tables
Returns:
Reason string, or empty string if Raptor should not be skipped
@@ -142,4 +154,96 @@ def get_skip_reason(
if is_tabular_pdf(parser_id, parser_config):
return f"Tabular PDF (parser={parser_id}) - Raptor auto-disabled"
+ if has_table_content:
+ return "Content contains HTML tables - Raptor auto-disabled"
+
return ""
+
+
+def contains_html_table(content: str) -> bool:
+    """
+    Check if content contains HTML table markup.
+
+    Matching is delegated to HTML_TABLE_START_PATTERN, so a single match
+    anywhere in the text is enough; a closing tag is not looked for.
+
+    Args:
+        content: Text content to check
+
+    Returns:
+        True if the pattern matches somewhere in content, False otherwise
+        (including when content is empty)
+    """
+    # Falsy content (e.g. "") cannot hold a table; skip the regex entirely.
+    if not content:
+        return False
+    return bool(HTML_TABLE_START_PATTERN.search(content))
+
+
+def analyze_chunks_for_tables(
+    chunks: List[Tuple[str, np.ndarray]],
+    threshold: float = TABLE_CONTENT_THRESHOLD
+) -> Tuple[bool, float]:
+    """
+    Analyze chunks to determine if they contain significant table content.
+
+    This function checks the actual content of chunks for HTML table markup,
+    which is generated when PDFs with tables are parsed.
+
+    Args:
+        chunks: List of (content, vector) tuples
+        threshold: Fraction (0.0-1.0, not 0-100) of chunks that must contain
+            a table for the content to be considered table-heavy
+
+    Returns:
+        Tuple of (should_skip, table_fraction), where table_fraction is the
+        share of chunks containing a table, expressed as 0.0-1.0
+    """
+    if not chunks:
+        # Nothing to analyze; returning early also avoids dividing by zero.
+        return False, 0.0
+
+    # Only the text half of each (content, vector) tuple is inspected.
+    table_count = sum(1 for content, _ in chunks if contains_html_table(content))
+
+    table_percentage = table_count / len(chunks)
+    # Inclusive comparison: content exactly at the threshold is skipped too.
+    should_skip = table_percentage >= threshold
+
+    if should_skip:
+        logging.info(
+            f"Detected table-heavy content: {table_count}/{len(chunks)} chunks "
+            f"({table_percentage:.1%}) contain HTML tables"
+        )
+
+    return should_skip, table_percentage
+
+
+def should_skip_raptor_for_chunks(
+    chunks: List[Tuple[str, np.ndarray]],
+    raptor_config: Optional[dict] = None,
+    threshold: float = TABLE_CONTENT_THRESHOLD
+) -> Tuple[bool, str]:
+    """
+    Check if Raptor should be skipped based on chunk content analysis.
+
+    This is a content-based check that runs after chunks are loaded,
+    detecting HTML tables that were extracted during parsing.
+
+    Args:
+        chunks: List of (content, vector) tuples
+        raptor_config: Raptor configuration dict; setting
+            "auto_disable_for_structured_data" to False turns this check off
+        threshold: Fraction (0.0-1.0) of chunks that must contain a table
+
+    Returns:
+        Tuple of (should_skip, reason); reason is "" when not skipping
+    """
+    raptor_config = raptor_config or {}
+
+    # Only an explicit False opts out; a missing key keeps auto-disable on.
+    if raptor_config.get("auto_disable_for_structured_data", True) is False:
+        return False, ""
+
+    should_skip, table_pct = analyze_chunks_for_tables(chunks, threshold)
+    if not should_skip:
+        return False, ""
+
+    # table_pct is the share of chunks containing a table, so word it that way.
+    reason = (
+        f"{table_pct:.0%} of chunks contain HTML tables "
+        f"(threshold: {threshold:.0%}) - Raptor auto-disabled"
+    )
+    return True, reason
diff --git a/test/unit_test/utils/test_raptor_utils.py b/test/unit_test/utils/test_raptor_utils.py
index 5138ccda7..1b08345be 100644
--- a/test/unit_test/utils/test_raptor_utils.py
+++ b/test/unit_test/utils/test_raptor_utils.py
@@ -19,14 +19,19 @@ Unit tests for Raptor utility functions.
"""
import pytest
+import numpy as np
from rag.utils.raptor_utils import (
is_structured_file_type,
is_tabular_pdf,
should_skip_raptor,
get_skip_reason,
+ contains_html_table,
+ analyze_chunks_for_tables,
+ should_skip_raptor_for_chunks,
EXCEL_EXTENSIONS,
CSV_EXTENSIONS,
- STRUCTURED_EXTENSIONS
+ STRUCTURED_EXTENSIONS,
+ TABLE_CONTENT_THRESHOLD
)
@@ -283,5 +288,242 @@ class TestIntegrationScenarios:
assert should_skip_raptor(file_type, raptor_config=raptor_config) is False
+class TestContainsHtmlTable:
+    """Test HTML table detection in content"""
+
+    def test_detect_simple_table(self):
+        """Test detection of simple HTML table"""
+        content = "<table><tr><td>Cell</td></tr></table>"
+        assert contains_html_table(content) is True
+
+    def test_detect_table_with_attributes(self):
+        """Test detection of table with attributes"""
+        content = '<table class="data" border="1"><tr><td>Cell</td></tr></table>'
+        assert contains_html_table(content) is True
+
+    def test_detect_table_case_insensitive(self):
+        """Test case insensitive detection"""
+        assert contains_html_table("<TABLE><TR><TD>Cell</TD></TR></TABLE>") is True
+        assert contains_html_table("<Table><tr><td>Cell</td></tr></Table>") is True
+
+    def test_no_table_in_plain_text(self):
+        """Test that plain text is not detected as table"""
+        content = "This is just plain text without any tables."
+        assert contains_html_table(content) is False
+
+    def test_no_table_in_empty_content(self):
+        """Test empty content handling"""
+        assert contains_html_table("") is False
+        # Note: None is rejected by type hints (beartype), which is correct behavior
+
+    def test_table_word_not_detected(self):
+        """Test that the word 'table' alone is not detected"""
+        content = "Please see the table below for more information."
+        assert contains_html_table(content) is False
+
+    def test_mixed_content_with_table(self):
+        """Test content with text and table"""
+        content = """
+        This is some introductory text.
+        <table>
+        <caption>Financial Data</caption>
+        <tr><th>Year</th><th>Revenue</th></tr>
+        <tr><td>2024</td><td>$1M</td></tr>
+        </table>
+        More text after the table.
+        """
+        assert contains_html_table(content) is True
+
+
+class TestAnalyzeChunksForTables:
+    """Test chunk analysis for table content"""
+
+    def _make_chunk(self, content: str):
+        """Helper to create a chunk tuple"""
+        return (content, np.zeros(768))
+
+    def test_all_table_chunks(self):
+        """Test when all chunks contain tables"""
+        chunks = [
+            self._make_chunk("<table><tr><td>Data 1</td></tr></table>"),
+            self._make_chunk("<table><tr><td>Data 2</td></tr></table>"),
+            self._make_chunk("<table><tr><td>Data 3</td></tr></table>"),
+        ]
+        should_skip, pct = analyze_chunks_for_tables(chunks)
+        assert should_skip is True
+        assert pct == 1.0
+
+    def test_no_table_chunks(self):
+        """Test when no chunks contain tables"""
+        chunks = [
+            self._make_chunk("Plain text content 1"),
+            self._make_chunk("Plain text content 2"),
+            self._make_chunk("Plain text content 3"),
+        ]
+        should_skip, pct = analyze_chunks_for_tables(chunks)
+        assert should_skip is False
+        assert pct == 0.0
+
+    def test_mixed_chunks_below_threshold(self):
+        """Test mixed chunks below threshold"""
+        # 1 out of 5 = 20%, below 30% threshold
+        chunks = [
+            self._make_chunk("<table><tr><td>Data</td></tr></table>"),
+            self._make_chunk("Plain text 1"),
+            self._make_chunk("Plain text 2"),
+            self._make_chunk("Plain text 3"),
+            self._make_chunk("Plain text 4"),
+        ]
+        should_skip, pct = analyze_chunks_for_tables(chunks)
+        assert should_skip is False
+        assert pct == 0.2
+
+    def test_mixed_chunks_above_threshold(self):
+        """Test mixed chunks above threshold"""
+        # 2 out of 5 = 40%, above 30% threshold
+        chunks = [
+            self._make_chunk("<table><tr><td>Data 1</td></tr></table>"),
+            self._make_chunk("<table><tr><td>Data 2</td></tr></table>"),
+            self._make_chunk("Plain text 1"),
+            self._make_chunk("Plain text 2"),
+            self._make_chunk("Plain text 3"),
+        ]
+        should_skip, pct = analyze_chunks_for_tables(chunks)
+        assert should_skip is True
+        assert pct == 0.4
+
+    def test_empty_chunks(self):
+        """Test empty chunk list"""
+        should_skip, pct = analyze_chunks_for_tables([])
+        assert should_skip is False
+        assert pct == 0.0
+
+    def test_custom_threshold(self):
+        """Test with custom threshold"""
+        # 1 out of 5 = 20%
+        chunks = [
+            self._make_chunk("<table><tr><td>Data</td></tr></table>"),
+            self._make_chunk("Plain text 1"),
+            self._make_chunk("Plain text 2"),
+            self._make_chunk("Plain text 3"),
+            self._make_chunk("Plain text 4"),
+        ]
+        # With 15% threshold, should skip
+        should_skip, pct = analyze_chunks_for_tables(chunks, threshold=0.15)
+        assert should_skip is True
+
+        # With 25% threshold, should not skip
+        should_skip, pct = analyze_chunks_for_tables(chunks, threshold=0.25)
+        assert should_skip is False
+
+    def test_default_threshold_value(self):
+        """Test that default threshold is 30%"""
+        assert TABLE_CONTENT_THRESHOLD == 0.3
+
+
+class TestShouldSkipRaptorForChunks:
+    """Test content-based Raptor skip decision"""
+
+    def _make_chunk(self, content: str):
+        """Helper to create a chunk tuple"""
+        return (content, np.zeros(768))
+
+    def test_skip_for_table_heavy_content(self):
+        """Test skipping for table-heavy content"""
+        # 2 out of 3 chunks contain a table, above the default threshold
+        chunks = [
+            self._make_chunk("<table><tr><td>Data 1</td></tr></table>"),
+            self._make_chunk("<table><tr><td>Data 2</td></tr></table>"),
+            self._make_chunk("Plain text"),
+        ]
+        should_skip, reason = should_skip_raptor_for_chunks(chunks)
+        assert should_skip is True
+        assert "HTML tables" in reason
+
+    def test_no_skip_for_text_content(self):
+        """Test not skipping for text content"""
+        chunks = [
+            self._make_chunk("Plain text content 1"),
+            self._make_chunk("Plain text content 2"),
+            self._make_chunk("Plain text content 3"),
+        ]
+        should_skip, reason = should_skip_raptor_for_chunks(chunks)
+        assert should_skip is False
+        assert reason == ""
+
+    def test_override_with_config(self):
+        """Test that auto-disable can be overridden"""
+        chunks = [
+            self._make_chunk("<table><tr><td>Data 1</td></tr></table>"),
+            self._make_chunk("<table><tr><td>Data 2</td></tr></table>"),
+        ]
+        raptor_config = {"auto_disable_for_structured_data": False}
+        should_skip, reason = should_skip_raptor_for_chunks(chunks, raptor_config)
+        assert should_skip is False
+        assert reason == ""
+
+    def test_empty_chunks(self):
+        """Test with empty chunks"""
+        should_skip, reason = should_skip_raptor_for_chunks([])
+        assert should_skip is False
+        assert reason == ""
+
+
+class TestPDFWithHtmlTables:
+    """Test real-world PDF with HTML tables scenario (ahmadshakil's issue)"""
+
+    def _make_chunk(self, content: str):
+        """Helper to create a chunk tuple"""
+        return (content, np.zeros(768))
+
+    def test_pdf_with_extracted_tables(self):
+        """Test PDF that has tables extracted as HTML during parsing"""
+        # Simulating chunks from a PDF like Fbr_IncomeTaxOrdinance_2001
+        chunks = [
+            self._make_chunk("Section 1: Introduction to Tax Law"),
+            self._make_chunk('<table><caption>Table Location: Section 2</caption><tr><th>Tax Rate</th><th>Income Range</th></tr><tr><td>10%</td><td>0-500,000</td></tr></table>'),
+            self._make_chunk("Section 3: Deductions and Exemptions"),
+            self._make_chunk('<table><tr><th>Deduction Type</th><th>Maximum Amount</th></tr><tr><td>Medical</td><td>100,000</td></tr></table>'),
+            self._make_chunk("Section 4: Filing Requirements"),
+        ]
+
+        # 2 out of 5 = 40%, above 30% threshold
+        should_skip, reason = should_skip_raptor_for_chunks(chunks)
+        assert should_skip is True
+        assert "HTML tables" in reason
+
+    def test_pdf_with_few_tables(self):
+        """Test PDF with only occasional tables"""
+        chunks = [
+            self._make_chunk("Chapter 1: Overview of the legal framework..."),
+            self._make_chunk("Chapter 2: Detailed analysis of provisions..."),
+            self._make_chunk("Chapter 3: Case studies and examples..."),
+            self._make_chunk("Chapter 4: Implementation guidelines..."),
+            self._make_chunk("Chapter 5: Compliance requirements..."),
+            self._make_chunk("Chapter 6: Penalties and enforcement..."),
+            self._make_chunk("Chapter 7: Appeals process..."),
+            self._make_chunk("Chapter 8: Recent amendments..."),
+            self._make_chunk("Chapter 9: Future outlook..."),
+            self._make_chunk('<table><tr><td>Summary Table</td></tr></table>'),  # Only 1 table
+        ]
+
+        # 1 out of 10 = 10%, below 30% threshold
+        should_skip, reason = should_skip_raptor_for_chunks(chunks)
+        assert should_skip is False
+
+    def test_financial_pdf_with_many_tables(self):
+        """Test financial PDF with many tables (should skip)"""
+        chunks = [
+            self._make_chunk('<table><tr><td>Balance Sheet</td></tr></table>'),
+            self._make_chunk('<table><caption>Income Statement</caption><tr><th>Revenue</th><th>$500K</th></tr></table>'),
+            self._make_chunk('<table><tr><td>Cash Flow</td></tr></table>'),
+            self._make_chunk("Notes to financial statements..."),
+            self._make_chunk('<table><tr><td>Ratios</td></tr></table>'),
+        ]
+
+        # 4 out of 5 = 80%, well above threshold
+        should_skip, reason = should_skip_raptor_for_chunks(chunks)
+        assert should_skip is True
+
+
if __name__ == "__main__":
pytest.main([__file__, "-v"])