This commit is contained in:
hsparks-codes 2025-12-15 19:28:35 +08:00 committed by GitHub
commit 197c1ec221
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 367 additions and 4 deletions

View file

@ -30,7 +30,7 @@ from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.pipeline_operation_log_service import PipelineOperationLogService
from common.connection_utils import timeout
from rag.utils.base64_image import image2id
from rag.utils.raptor_utils import should_skip_raptor, get_skip_reason
from rag.utils.raptor_utils import should_skip_raptor, get_skip_reason, should_skip_raptor_for_chunks
from common.log_utils import init_root_logger
from common.config_utils import show_configs
from graphrag.general.index import run_graphrag_for_kb
@ -742,6 +742,15 @@ async def run_raptor_for_kb(row, kb_parser_config, chat_mdl, embd_mdl, vector_si
fields=["content_with_weight", vctr_nm],
sort_by_position=True):
chunks.append((d["content_with_weight"], np.array(d[vctr_nm])))
# Check if chunks contain HTML tables (content-based detection)
skip_for_tables, skip_reason = should_skip_raptor_for_chunks(chunks, raptor_config)
if skip_for_tables:
logging.info(f"Skipping Raptor for document {doc_id}: {skip_reason}")
if callback:
callback(prog=(x+1.)/len(doc_ids), msg=f"Raptor skipped: {skip_reason}")
continue
await generate(chunks, doc_id)
callback(prog=(x+1.)/len(doc_ids))
else:
@ -752,6 +761,14 @@ async def run_raptor_for_kb(row, kb_parser_config, chat_mdl, embd_mdl, vector_si
sort_by_position=True):
chunks.append((d["content_with_weight"], np.array(d[vctr_nm])))
# Check if chunks contain HTML tables (content-based detection)
skip_for_tables, skip_reason = should_skip_raptor_for_chunks(chunks, raptor_config)
if skip_for_tables:
logging.info(f"Skipping Raptor for KB scope: {skip_reason}")
if callback:
callback(prog=1.0, msg=f"Raptor skipped: {skip_reason}")
return res, tk_count
await generate(chunks, fake_doc_id)
return res, tk_count

View file

@ -19,7 +19,10 @@ Utility functions for Raptor processing decisions.
"""
import logging
from typing import Optional
import re
from typing import Optional, List, Tuple
import numpy as np
# File extensions for structured data types
@ -27,6 +30,13 @@ EXCEL_EXTENSIONS = {".xls", ".xlsx", ".xlsm", ".xlsb"}
CSV_EXTENSIONS = {".csv", ".tsv"}
STRUCTURED_EXTENSIONS = EXCEL_EXTENSIONS | CSV_EXTENSIONS
# Regex patterns for detecting HTML tables in content.
# The lookahead right after the tag name requires whitespace, "/" or ">" so
# that look-alike tags (e.g. <tablet>, <tablespoon>) are not mistaken for a
# real <table> element. Matching is case-insensitive.
HTML_TABLE_PATTERN = re.compile(
    r'<table(?=[\s/>])[^>]*>.*?</table>', re.IGNORECASE | re.DOTALL
)
HTML_TABLE_START_PATTERN = re.compile(r'<table(?=[\s/>])[^>]*>', re.IGNORECASE)

# Threshold for considering content as "table-heavy", expressed as the
# fraction (0.0-1.0) of chunks that contain at least one table.
TABLE_CONTENT_THRESHOLD = 0.3  # If 30%+ of chunks contain tables, skip Raptor
def is_structured_file_type(file_type: Optional[str]) -> bool:
"""
@ -120,7 +130,8 @@ def should_skip_raptor(
def get_skip_reason(
file_type: Optional[str] = None,
parser_id: str = "",
parser_config: Optional[dict] = None
parser_config: Optional[dict] = None,
has_table_content: bool = False
) -> str:
"""
Get a human-readable reason why Raptor was skipped.
@ -129,6 +140,7 @@ def get_skip_reason(
file_type: File extension
parser_id: Parser ID being used
parser_config: Parser configuration dict
has_table_content: Whether content contains HTML tables
Returns:
Reason string, or empty string if Raptor should not be skipped
@ -142,4 +154,96 @@ def get_skip_reason(
if is_tabular_pdf(parser_id, parser_config):
return f"Tabular PDF (parser={parser_id}) - Raptor auto-disabled"
if has_table_content:
return "Content contains HTML tables - Raptor auto-disabled"
return ""
def contains_html_table(content: str) -> bool:
    """
    Report whether a piece of text includes an opening HTML table tag.

    Args:
        content: Text to inspect

    Returns:
        True when HTML_TABLE_START_PATTERN matches somewhere in the text;
        False for empty content or when nothing matches
    """
    if content:
        return HTML_TABLE_START_PATTERN.search(content) is not None
    # Empty/falsy content can never hold a table tag.
    return False
def analyze_chunks_for_tables(
    chunks: List[Tuple[str, np.ndarray]],
    threshold: float = TABLE_CONTENT_THRESHOLD
) -> Tuple[bool, float]:
    """
    Measure how much of a chunk list carries HTML table markup.

    Each chunk's text is tested with contains_html_table; the share of
    matching chunks is then compared against the threshold.

    Args:
        chunks: List of (content, vector) tuples; only the content is inspected
        threshold: Fraction of table-bearing chunks at or above which the
            content counts as table-heavy

    Returns:
        Tuple of (should_skip, table_percentage); (False, 0.0) when there
        are no chunks
    """
    if not chunks:
        return False, 0.0

    with_tables = sum(1 for text, _vector in chunks if contains_html_table(text))
    ratio = with_tables / len(chunks)
    table_heavy = ratio >= threshold

    # Only the "skip" outcome is logged.
    if table_heavy:
        logging.info(
            f"Detected table-heavy content: {with_tables}/{len(chunks)} chunks "
            f"({ratio:.1%}) contain HTML tables"
        )

    return table_heavy, ratio
def should_skip_raptor_for_chunks(
    chunks: List[Tuple[str, np.ndarray]],
    raptor_config: Optional[dict] = None,
    threshold: float = TABLE_CONTENT_THRESHOLD
) -> Tuple[bool, str]:
    """
    Decide from already-loaded chunk content whether Raptor should be skipped.

    The decision is delegated to analyze_chunks_for_tables unless the
    configuration sets "auto_disable_for_structured_data" to False, in
    which case the content is not examined at all.

    Args:
        chunks: List of (content, vector) tuples
        raptor_config: Raptor configuration dict (None is treated as empty)
        threshold: Fraction of table-bearing chunks that triggers the skip

    Returns:
        Tuple of (should_skip, reason); reason is "" when not skipping
    """
    config = raptor_config or {}

    # Only an explicit False opts out; a missing key keeps auto-disable on.
    auto_disable = config.get("auto_disable_for_structured_data", True)
    if auto_disable is not False:
        table_heavy, table_ratio = analyze_chunks_for_tables(chunks, threshold)
        if table_heavy:
            return True, (
                f"Content contains {table_ratio:.0%} HTML tables "
                f"(threshold: {threshold:.0%}) - Raptor auto-disabled"
            )

    return False, ""

View file

@ -19,14 +19,19 @@ Unit tests for Raptor utility functions.
"""
import pytest
import numpy as np
from rag.utils.raptor_utils import (
is_structured_file_type,
is_tabular_pdf,
should_skip_raptor,
get_skip_reason,
contains_html_table,
analyze_chunks_for_tables,
should_skip_raptor_for_chunks,
EXCEL_EXTENSIONS,
CSV_EXTENSIONS,
STRUCTURED_EXTENSIONS
STRUCTURED_EXTENSIONS,
TABLE_CONTENT_THRESHOLD
)
@ -283,5 +288,242 @@ class TestIntegrationScenarios:
assert should_skip_raptor(file_type, raptor_config=raptor_config) is False
class TestContainsHtmlTable:
    """Checks for contains_html_table on table and non-table text"""

    def test_detect_simple_table(self):
        """A bare <table> element is recognised"""
        html = "<table><tr><td>Cell 1</td><td>Cell 2</td></tr></table>"
        found = contains_html_table(html)
        assert found is True

    def test_detect_table_with_attributes(self):
        """Attributes on the opening tag do not prevent detection"""
        html = '<table class="data-table" border="1"><tr><td>Data</td></tr></table>'
        found = contains_html_table(html)
        assert found is True

    def test_detect_table_case_insensitive(self):
        """Upper-case and mixed-case tags are recognised"""
        upper_case = "<TABLE><TR><TD>X</TD></TR></TABLE>"
        mixed_case = "<Table><tr><td>X</td></tr></Table>"
        assert contains_html_table(upper_case) is True
        assert contains_html_table(mixed_case) is True

    def test_no_table_in_plain_text(self):
        """Ordinary prose yields no detection"""
        prose = "This is just plain text without any tables."
        found = contains_html_table(prose)
        assert found is False

    def test_no_table_in_empty_content(self):
        """The empty string yields no detection"""
        found = contains_html_table("")
        assert found is False
        # Note: None is rejected by type hints (beartype), which is correct behavior

    def test_table_word_not_detected(self):
        """The plain word 'table' is not table markup"""
        prose = "Please see the table below for more information."
        found = contains_html_table(prose)
        assert found is False

    def test_mixed_content_with_table(self):
        """A table embedded between paragraphs is recognised"""
        document = """
This is some introductory text.
<table>
<caption>Financial Data</caption>
<tr><th>Year</th><th>Revenue</th></tr>
<tr><td>2024</td><td>$1M</td></tr>
</table>
More text after the table.
"""
        found = contains_html_table(document)
        assert found is True
class TestAnalyzeChunksForTables:
"""Test chunk analysis for table content"""
def _make_chunk(self, content: str):
"""Helper to create a chunk tuple"""
return (content, np.zeros(768))
def test_all_table_chunks(self):
"""Test when all chunks contain tables"""
chunks = [
self._make_chunk("<table><tr><td>1</td></tr></table>"),
self._make_chunk("<table><tr><td>2</td></tr></table>"),
self._make_chunk("<table><tr><td>3</td></tr></table>"),
]
should_skip, pct = analyze_chunks_for_tables(chunks)
assert should_skip is True
assert pct == 1.0
def test_no_table_chunks(self):
"""Test when no chunks contain tables"""
chunks = [
self._make_chunk("Plain text content 1"),
self._make_chunk("Plain text content 2"),
self._make_chunk("Plain text content 3"),
]
should_skip, pct = analyze_chunks_for_tables(chunks)
assert should_skip is False
assert pct == 0.0
def test_mixed_chunks_below_threshold(self):
"""Test mixed chunks below threshold"""
# 1 out of 5 = 20%, below 30% threshold
chunks = [
self._make_chunk("<table><tr><td>Table</td></tr></table>"),
self._make_chunk("Plain text 1"),
self._make_chunk("Plain text 2"),
self._make_chunk("Plain text 3"),
self._make_chunk("Plain text 4"),
]
should_skip, pct = analyze_chunks_for_tables(chunks)
assert should_skip is False
assert pct == 0.2
def test_mixed_chunks_above_threshold(self):
"""Test mixed chunks above threshold"""
# 2 out of 5 = 40%, above 30% threshold
chunks = [
self._make_chunk("<table><tr><td>Table 1</td></tr></table>"),
self._make_chunk("<table><tr><td>Table 2</td></tr></table>"),
self._make_chunk("Plain text 1"),
self._make_chunk("Plain text 2"),
self._make_chunk("Plain text 3"),
]
should_skip, pct = analyze_chunks_for_tables(chunks)
assert should_skip is True
assert pct == 0.4
def test_empty_chunks(self):
"""Test empty chunk list"""
should_skip, pct = analyze_chunks_for_tables([])
assert should_skip is False
assert pct == 0.0
def test_custom_threshold(self):
"""Test with custom threshold"""
# 1 out of 5 = 20%
chunks = [
self._make_chunk("<table><tr><td>Table</td></tr></table>"),
self._make_chunk("Plain text 1"),
self._make_chunk("Plain text 2"),
self._make_chunk("Plain text 3"),
self._make_chunk("Plain text 4"),
]
# With 15% threshold, should skip
should_skip, pct = analyze_chunks_for_tables(chunks, threshold=0.15)
assert should_skip is True
# With 25% threshold, should not skip
should_skip, pct = analyze_chunks_for_tables(chunks, threshold=0.25)
assert should_skip is False
def test_default_threshold_value(self):
"""Test that default threshold is 30%"""
assert TABLE_CONTENT_THRESHOLD == 0.3
class TestShouldSkipRaptorForChunks:
"""Test content-based Raptor skip decision"""
def _make_chunk(self, content: str):
"""Helper to create a chunk tuple"""
return (content, np.zeros(768))
def test_skip_for_table_heavy_content(self):
"""Test skipping for table-heavy content"""
chunks = [
self._make_chunk("<table><tr><td>1</td></tr></table>"),
self._make_chunk("<table><tr><td>2</td></tr></table>"),
self._make_chunk("Plain text"),
]
should_skip, reason = should_skip_raptor_for_chunks(chunks)
assert should_skip is True
assert "HTML tables" in reason
def test_no_skip_for_text_content(self):
"""Test not skipping for text content"""
chunks = [
self._make_chunk("Plain text content 1"),
self._make_chunk("Plain text content 2"),
self._make_chunk("Plain text content 3"),
]
should_skip, reason = should_skip_raptor_for_chunks(chunks)
assert should_skip is False
assert reason == ""
def test_override_with_config(self):
"""Test that auto-disable can be overridden"""
chunks = [
self._make_chunk("<table><tr><td>1</td></tr></table>"),
self._make_chunk("<table><tr><td>2</td></tr></table>"),
]
raptor_config = {"auto_disable_for_structured_data": False}
should_skip, reason = should_skip_raptor_for_chunks(chunks, raptor_config)
assert should_skip is False
assert reason == ""
def test_empty_chunks(self):
"""Test with empty chunks"""
should_skip, reason = should_skip_raptor_for_chunks([])
assert should_skip is False
assert reason == ""
class TestPDFWithHtmlTables:
"""Test real-world PDF with HTML tables scenario (ahmadshakil's issue)"""
def _make_chunk(self, content: str):
"""Helper to create a chunk tuple"""
return (content, np.zeros(768))
def test_pdf_with_extracted_tables(self):
"""Test PDF that has tables extracted as HTML during parsing"""
# Simulating chunks from a PDF like Fbr_IncomeTaxOrdinance_2001
chunks = [
self._make_chunk("Section 1: Introduction to Tax Law"),
self._make_chunk('<table><caption>Table Location: Section 2</caption><tr><th>Tax Rate</th><th>Income Range</th></tr><tr><td>10%</td><td>0-500,000</td></tr></table>'),
self._make_chunk("Section 3: Deductions and Exemptions"),
self._make_chunk('<table><tr><th>Deduction Type</th><th>Maximum Amount</th></tr><tr><td>Medical</td><td>100,000</td></tr></table>'),
self._make_chunk("Section 4: Filing Requirements"),
]
# 2 out of 5 = 40%, above 30% threshold
should_skip, reason = should_skip_raptor_for_chunks(chunks)
assert should_skip is True
assert "HTML tables" in reason
def test_pdf_with_few_tables(self):
"""Test PDF with only occasional tables"""
chunks = [
self._make_chunk("Chapter 1: Overview of the legal framework..."),
self._make_chunk("Chapter 2: Detailed analysis of provisions..."),
self._make_chunk("Chapter 3: Case studies and examples..."),
self._make_chunk("Chapter 4: Implementation guidelines..."),
self._make_chunk("Chapter 5: Compliance requirements..."),
self._make_chunk("Chapter 6: Penalties and enforcement..."),
self._make_chunk("Chapter 7: Appeals process..."),
self._make_chunk("Chapter 8: Recent amendments..."),
self._make_chunk("Chapter 9: Future outlook..."),
self._make_chunk('<table><tr><td>Summary Table</td></tr></table>'), # Only 1 table
]
# 1 out of 10 = 10%, below 30% threshold
should_skip, reason = should_skip_raptor_for_chunks(chunks)
assert should_skip is False
def test_financial_pdf_with_many_tables(self):
"""Test financial PDF with many tables (should skip)"""
chunks = [
self._make_chunk('<table><caption>Balance Sheet</caption><tr><td>Assets</td><td>$1M</td></tr></table>'),
self._make_chunk('<table><caption>Income Statement</caption><tr><td>Revenue</td><td>$500K</td></tr></table>'),
self._make_chunk('<table><caption>Cash Flow</caption><tr><td>Operating</td><td>$200K</td></tr></table>'),
self._make_chunk("Notes to financial statements..."),
self._make_chunk('<table><caption>Tax Schedule</caption><tr><td>Tax</td><td>$50K</td></tr></table>'),
]
# 4 out of 5 = 80%, well above threshold
should_skip, reason = should_skip_raptor_for_chunks(chunks)
assert should_skip is True
# Allow running this test module directly as a script: hands this file to
# pytest with verbose ("-v") output.
if __name__ == "__main__":
    pytest.main([__file__, "-v"])