Merge 8518a5cf29 into a2e080c2d3
This commit is contained in:
commit
197c1ec221
3 changed files with 367 additions and 4 deletions
|
|
@ -30,7 +30,7 @@ from api.db.services.knowledgebase_service import KnowledgebaseService
|
|||
from api.db.services.pipeline_operation_log_service import PipelineOperationLogService
|
||||
from common.connection_utils import timeout
|
||||
from rag.utils.base64_image import image2id
|
||||
from rag.utils.raptor_utils import should_skip_raptor, get_skip_reason
|
||||
from rag.utils.raptor_utils import should_skip_raptor, get_skip_reason, should_skip_raptor_for_chunks
|
||||
from common.log_utils import init_root_logger
|
||||
from common.config_utils import show_configs
|
||||
from graphrag.general.index import run_graphrag_for_kb
|
||||
|
|
@ -742,6 +742,15 @@ async def run_raptor_for_kb(row, kb_parser_config, chat_mdl, embd_mdl, vector_si
|
|||
fields=["content_with_weight", vctr_nm],
|
||||
sort_by_position=True):
|
||||
chunks.append((d["content_with_weight"], np.array(d[vctr_nm])))
|
||||
|
||||
# Check if chunks contain HTML tables (content-based detection)
|
||||
skip_for_tables, skip_reason = should_skip_raptor_for_chunks(chunks, raptor_config)
|
||||
if skip_for_tables:
|
||||
logging.info(f"Skipping Raptor for document {doc_id}: {skip_reason}")
|
||||
if callback:
|
||||
callback(prog=(x+1.)/len(doc_ids), msg=f"Raptor skipped: {skip_reason}")
|
||||
continue
|
||||
|
||||
await generate(chunks, doc_id)
|
||||
callback(prog=(x+1.)/len(doc_ids))
|
||||
else:
|
||||
|
|
@ -752,6 +761,14 @@ async def run_raptor_for_kb(row, kb_parser_config, chat_mdl, embd_mdl, vector_si
|
|||
sort_by_position=True):
|
||||
chunks.append((d["content_with_weight"], np.array(d[vctr_nm])))
|
||||
|
||||
# Check if chunks contain HTML tables (content-based detection)
|
||||
skip_for_tables, skip_reason = should_skip_raptor_for_chunks(chunks, raptor_config)
|
||||
if skip_for_tables:
|
||||
logging.info(f"Skipping Raptor for KB scope: {skip_reason}")
|
||||
if callback:
|
||||
callback(prog=1.0, msg=f"Raptor skipped: {skip_reason}")
|
||||
return res, tk_count
|
||||
|
||||
await generate(chunks, fake_doc_id)
|
||||
|
||||
return res, tk_count
|
||||
|
|
|
|||
|
|
@ -19,7 +19,10 @@ Utility functions for Raptor processing decisions.
|
|||
"""
|
||||
|
||||
import logging
|
||||
from typing import Optional
|
||||
import re
|
||||
from typing import Optional, List, Tuple
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
# File extensions for structured data types
|
||||
|
|
@ -27,6 +30,13 @@ EXCEL_EXTENSIONS = {".xls", ".xlsx", ".xlsm", ".xlsb"}
|
|||
CSV_EXTENSIONS = {".csv", ".tsv"}
|
||||
STRUCTURED_EXTENSIONS = EXCEL_EXTENSIONS | CSV_EXTENSIONS
|
||||
|
||||
# Regex patterns for detecting HTML tables in content.
# The lookahead after "table" requires whitespace, "/" or ">" so that tags which
# merely start with the same letters (<tablet>, <table-row>, <tables>) are not
# mistaken for a real <table> element.
HTML_TABLE_PATTERN = re.compile(r'<table(?=[\s/>])[^>]*>.*?</table>', re.IGNORECASE | re.DOTALL)
HTML_TABLE_START_PATTERN = re.compile(r'<table(?=[\s/>])[^>]*>', re.IGNORECASE)

# Threshold for considering content as "table-heavy", expressed as the
# fraction (0.0-1.0) of chunks that contain a table.
TABLE_CONTENT_THRESHOLD = 0.3  # If 30%+ of chunks contain tables, skip Raptor
|
||||
|
||||
|
||||
def is_structured_file_type(file_type: Optional[str]) -> bool:
|
||||
"""
|
||||
|
|
@ -120,7 +130,8 @@ def should_skip_raptor(
|
|||
def get_skip_reason(
|
||||
file_type: Optional[str] = None,
|
||||
parser_id: str = "",
|
||||
parser_config: Optional[dict] = None
|
||||
parser_config: Optional[dict] = None,
|
||||
has_table_content: bool = False
|
||||
) -> str:
|
||||
"""
|
||||
Get a human-readable reason why Raptor was skipped.
|
||||
|
|
@ -129,6 +140,7 @@ def get_skip_reason(
|
|||
file_type: File extension
|
||||
parser_id: Parser ID being used
|
||||
parser_config: Parser configuration dict
|
||||
has_table_content: Whether content contains HTML tables
|
||||
|
||||
Returns:
|
||||
Reason string, or empty string if Raptor should not be skipped
|
||||
|
|
@ -142,4 +154,96 @@ def get_skip_reason(
|
|||
if is_tabular_pdf(parser_id, parser_config):
|
||||
return f"Tabular PDF (parser={parser_id}) - Raptor auto-disabled"
|
||||
|
||||
if has_table_content:
|
||||
return "Content contains HTML tables - Raptor auto-disabled"
|
||||
|
||||
return ""
|
||||
|
||||
|
||||
def contains_html_table(content: str) -> bool:
    """
    Report whether the given text includes an opening HTML table tag.

    Args:
        content: Text to inspect

    Returns:
        True when HTML_TABLE_START_PATTERN matches somewhere in the text;
        False for empty (falsy) content or when nothing matches
    """
    # Falsy content short-circuits before the regex is consulted.
    return bool(content) and HTML_TABLE_START_PATTERN.search(content) is not None
|
||||
|
||||
|
||||
def analyze_chunks_for_tables(
    chunks: List[Tuple[str, np.ndarray]],
    threshold: float = TABLE_CONTENT_THRESHOLD
) -> Tuple[bool, float]:
    """
    Measure how many chunks carry HTML table markup and compare to a threshold.

    Only the text part of each chunk is examined (via contains_html_table);
    the vector part is ignored.

    Args:
        chunks: List of (content, vector) tuples
        threshold: Fraction of table-bearing chunks at or above which the
            content counts as table-heavy

    Returns:
        Tuple of (should_skip, table_fraction); (False, 0.0) for an empty list
    """
    if not chunks:
        return False, 0.0

    total = len(chunks)
    with_tables = sum(1 for text, _vector in chunks if contains_html_table(text))
    ratio = with_tables / total

    # Reaching the threshold exactly already counts as table-heavy.
    is_table_heavy = ratio >= threshold
    if is_table_heavy:
        logging.info(
            f"Detected table-heavy content: {with_tables}/{total} chunks "
            f"({ratio:.1%}) contain HTML tables"
        )

    return is_table_heavy, ratio
|
||||
|
||||
|
||||
def should_skip_raptor_for_chunks(
    chunks: List[Tuple[str, np.ndarray]],
    raptor_config: Optional[dict] = None,
    threshold: float = TABLE_CONTENT_THRESHOLD
) -> Tuple[bool, str]:
    """
    Decide whether Raptor should be skipped based on chunk content analysis.

    This is a content-based check that runs on already-loaded chunks and
    delegates the table detection to analyze_chunks_for_tables.

    Args:
        chunks: List of (content, vector) tuples
        raptor_config: Raptor configuration dict; setting
            "auto_disable_for_structured_data" to False disables this check
        threshold: Fraction of table-bearing chunks at or above which Raptor
            is skipped

    Returns:
        Tuple of (should_skip, reason); reason is "" when Raptor should run
    """
    raptor_config = raptor_config or {}

    # Only an explicit False switches auto-disable off; a missing key keeps it on.
    if raptor_config.get("auto_disable_for_structured_data", True) is False:
        return False, ""

    should_skip, table_pct = analyze_chunks_for_tables(chunks, threshold)
    if not should_skip:
        return False, ""

    # table_pct is the share of chunks that contain a table (not the share of
    # the content that is tabular), so the reason is worded accordingly.
    reason = (
        f"{table_pct:.0%} of chunks contain HTML tables "
        f"(threshold: {threshold:.0%}) - Raptor auto-disabled"
    )
    return True, reason
|
||||
|
|
|
|||
|
|
@ -19,14 +19,19 @@ Unit tests for Raptor utility functions.
|
|||
"""
|
||||
|
||||
import pytest
|
||||
import numpy as np
|
||||
from rag.utils.raptor_utils import (
|
||||
is_structured_file_type,
|
||||
is_tabular_pdf,
|
||||
should_skip_raptor,
|
||||
get_skip_reason,
|
||||
contains_html_table,
|
||||
analyze_chunks_for_tables,
|
||||
should_skip_raptor_for_chunks,
|
||||
EXCEL_EXTENSIONS,
|
||||
CSV_EXTENSIONS,
|
||||
STRUCTURED_EXTENSIONS
|
||||
STRUCTURED_EXTENSIONS,
|
||||
TABLE_CONTENT_THRESHOLD
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -283,5 +288,242 @@ class TestIntegrationScenarios:
|
|||
assert should_skip_raptor(file_type, raptor_config=raptor_config) is False
|
||||
|
||||
|
||||
class TestContainsHtmlTable:
    """Exercise HTML table detection on raw text content"""

    def test_detect_simple_table(self):
        """A minimal table element is detected"""
        markup = "<table><tr><td>Cell 1</td><td>Cell 2</td></tr></table>"
        assert contains_html_table(markup) is True

    def test_detect_table_with_attributes(self):
        """Attributes on the opening tag do not prevent detection"""
        markup = '<table class="data-table" border="1"><tr><td>Data</td></tr></table>'
        assert contains_html_table(markup) is True

    def test_detect_table_case_insensitive(self):
        """Tag matching ignores letter case"""
        upper = "<TABLE><TR><TD>X</TD></TR></TABLE>"
        capitalised = "<Table><tr><td>X</td></tr></Table>"
        assert contains_html_table(upper) is True
        assert contains_html_table(capitalised) is True

    def test_no_table_in_plain_text(self):
        """Ordinary prose is not reported as a table"""
        prose = "This is just plain text without any tables."
        assert contains_html_table(prose) is False

    def test_no_table_in_empty_content(self):
        """An empty string yields False"""
        # Note: None is rejected by type hints (beartype), which is correct behavior
        assert contains_html_table("") is False

    def test_table_word_not_detected(self):
        """The bare word 'table' without markup is not a match"""
        prose = "Please see the table below for more information."
        assert contains_html_table(prose) is False

    def test_mixed_content_with_table(self):
        """A table embedded between paragraphs of text is detected"""
        document = """
        This is some introductory text.
        <table>
        <caption>Financial Data</caption>
        <tr><th>Year</th><th>Revenue</th></tr>
        <tr><td>2024</td><td>$1M</td></tr>
        </table>
        More text after the table.
        """
        assert contains_html_table(document) is True
|
||||
|
||||
|
||||
class TestAnalyzeChunksForTables:
    """Test chunk analysis for table content"""

    def _make_chunk(self, content: str):
        """Helper to create a (content, vector) chunk tuple with a zero vector"""
        return (content, np.zeros(768))

    def test_all_table_chunks(self):
        """Test when all chunks contain tables"""
        chunks = [
            self._make_chunk("<table><tr><td>1</td></tr></table>"),
            self._make_chunk("<table><tr><td>2</td></tr></table>"),
            self._make_chunk("<table><tr><td>3</td></tr></table>"),
        ]
        should_skip, pct = analyze_chunks_for_tables(chunks)
        assert should_skip is True
        assert pct == 1.0

    def test_no_table_chunks(self):
        """Test when no chunks contain tables"""
        chunks = [
            self._make_chunk("Plain text content 1"),
            self._make_chunk("Plain text content 2"),
            self._make_chunk("Plain text content 3"),
        ]
        should_skip, pct = analyze_chunks_for_tables(chunks)
        assert should_skip is False
        assert pct == 0.0

    def test_mixed_chunks_below_threshold(self):
        """Test mixed chunks below threshold"""
        # 1 out of 5 = 20%, below 30% threshold
        chunks = [
            self._make_chunk("<table><tr><td>Table</td></tr></table>"),
            self._make_chunk("Plain text 1"),
            self._make_chunk("Plain text 2"),
            self._make_chunk("Plain text 3"),
            self._make_chunk("Plain text 4"),
        ]
        should_skip, pct = analyze_chunks_for_tables(chunks)
        assert should_skip is False
        # The ratio is a computed float; compare with a tolerance rather than ==.
        assert pct == pytest.approx(0.2)

    def test_mixed_chunks_above_threshold(self):
        """Test mixed chunks above threshold"""
        # 2 out of 5 = 40%, above 30% threshold
        chunks = [
            self._make_chunk("<table><tr><td>Table 1</td></tr></table>"),
            self._make_chunk("<table><tr><td>Table 2</td></tr></table>"),
            self._make_chunk("Plain text 1"),
            self._make_chunk("Plain text 2"),
            self._make_chunk("Plain text 3"),
        ]
        should_skip, pct = analyze_chunks_for_tables(chunks)
        assert should_skip is True
        # The ratio is a computed float; compare with a tolerance rather than ==.
        assert pct == pytest.approx(0.4)

    def test_empty_chunks(self):
        """Test empty chunk list"""
        should_skip, pct = analyze_chunks_for_tables([])
        assert should_skip is False
        assert pct == 0.0

    def test_custom_threshold(self):
        """Test with custom threshold"""
        # 1 out of 5 = 20%
        chunks = [
            self._make_chunk("<table><tr><td>Table</td></tr></table>"),
            self._make_chunk("Plain text 1"),
            self._make_chunk("Plain text 2"),
            self._make_chunk("Plain text 3"),
            self._make_chunk("Plain text 4"),
        ]
        # With 15% threshold, should skip
        should_skip, pct = analyze_chunks_for_tables(chunks, threshold=0.15)
        assert should_skip is True

        # With 25% threshold, should not skip
        should_skip, pct = analyze_chunks_for_tables(chunks, threshold=0.25)
        assert should_skip is False

    def test_default_threshold_value(self):
        """Test that default threshold is 30%"""
        assert TABLE_CONTENT_THRESHOLD == 0.3
|
||||
|
||||
|
||||
class TestShouldSkipRaptorForChunks:
    """Checks for the content-based decision to skip Raptor"""

    def _make_chunk(self, content: str):
        """Build a (content, vector) chunk tuple with a zero vector"""
        return content, np.zeros(768)

    def test_skip_for_table_heavy_content(self):
        """Mostly-table content triggers a skip with an explanatory reason"""
        texts = (
            "<table><tr><td>1</td></tr></table>",
            "<table><tr><td>2</td></tr></table>",
            "Plain text",
        )
        chunks = [self._make_chunk(text) for text in texts]
        skip, why = should_skip_raptor_for_chunks(chunks)
        assert skip is True
        assert "HTML tables" in why

    def test_no_skip_for_text_content(self):
        """Pure text content is processed normally"""
        chunks = [self._make_chunk(f"Plain text content {i}") for i in (1, 2, 3)]
        skip, why = should_skip_raptor_for_chunks(chunks)
        assert skip is False
        assert why == ""

    def test_override_with_config(self):
        """The auto-disable behaviour can be switched off through the config"""
        texts = (
            "<table><tr><td>1</td></tr></table>",
            "<table><tr><td>2</td></tr></table>",
        )
        chunks = [self._make_chunk(text) for text in texts]
        config = {"auto_disable_for_structured_data": False}
        skip, why = should_skip_raptor_for_chunks(chunks, config)
        assert skip is False
        assert why == ""

    def test_empty_chunks(self):
        """An empty chunk list never triggers a skip"""
        skip, why = should_skip_raptor_for_chunks([])
        assert skip is False
        assert why == ""
|
||||
|
||||
|
||||
class TestPDFWithHtmlTables:
    """Scenario tests for PDFs whose tables were parsed into HTML (ahmadshakil's issue)"""

    def _make_chunk(self, content: str):
        """Build a (content, vector) chunk tuple with a zero vector"""
        return content, np.zeros(768)

    def test_pdf_with_extracted_tables(self):
        """A PDF whose tables were extracted as HTML during parsing is skipped"""
        # Simulating chunks from a PDF like Fbr_IncomeTaxOrdinance_2001
        texts = (
            "Section 1: Introduction to Tax Law",
            '<table><caption>Table Location: Section 2</caption><tr><th>Tax Rate</th><th>Income Range</th></tr><tr><td>10%</td><td>0-500,000</td></tr></table>',
            "Section 3: Deductions and Exemptions",
            '<table><tr><th>Deduction Type</th><th>Maximum Amount</th></tr><tr><td>Medical</td><td>100,000</td></tr></table>',
            "Section 4: Filing Requirements",
        )
        chunks = [self._make_chunk(text) for text in texts]

        # 2 out of 5 = 40%, above 30% threshold
        skip, why = should_skip_raptor_for_chunks(chunks)
        assert skip is True
        assert "HTML tables" in why

    def test_pdf_with_few_tables(self):
        """A mostly-text PDF with a single table is still processed"""
        texts = (
            "Chapter 1: Overview of the legal framework...",
            "Chapter 2: Detailed analysis of provisions...",
            "Chapter 3: Case studies and examples...",
            "Chapter 4: Implementation guidelines...",
            "Chapter 5: Compliance requirements...",
            "Chapter 6: Penalties and enforcement...",
            "Chapter 7: Appeals process...",
            "Chapter 8: Recent amendments...",
            "Chapter 9: Future outlook...",
            '<table><tr><td>Summary Table</td></tr></table>',  # Only 1 table
        )
        chunks = [self._make_chunk(text) for text in texts]

        # 1 out of 10 = 10%, below 30% threshold
        skip, why = should_skip_raptor_for_chunks(chunks)
        assert skip is False

    def test_financial_pdf_with_many_tables(self):
        """A financial PDF dominated by tables is skipped"""
        texts = (
            '<table><caption>Balance Sheet</caption><tr><td>Assets</td><td>$1M</td></tr></table>',
            '<table><caption>Income Statement</caption><tr><td>Revenue</td><td>$500K</td></tr></table>',
            '<table><caption>Cash Flow</caption><tr><td>Operating</td><td>$200K</td></tr></table>',
            "Notes to financial statements...",
            '<table><caption>Tax Schedule</caption><tr><td>Tax</td><td>$50K</td></tr></table>',
        )
        chunks = [self._make_chunk(text) for text in texts]

        # 4 out of 5 = 80%, well above threshold
        skip, why = should_skip_raptor_for_chunks(chunks)
        assert skip is True
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # pytest.main() returns the exit code instead of exiting the process;
    # propagate it so running this file directly exits non-zero on failures.
    raise SystemExit(pytest.main([__file__, "-v"]))
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue