diff --git a/src/main.py b/src/main.py index 5221d432..8f714be9 100644 --- a/src/main.py +++ b/src/main.py @@ -85,6 +85,9 @@ logger.info( cuda_version=torch.version.cuda, ) +# Files to exclude from startup ingestion +EXCLUDED_INGESTION_FILES = {"warmup_ocr.pdf"} + async def wait_for_opensearch(): """Wait for OpenSearch to be ready with retries""" @@ -312,11 +315,12 @@ async def ingest_default_documents_when_ready(services): ) return - # Collect files recursively + # Collect files recursively, excluding warmup files file_paths = [ os.path.join(root, fn) for root, _, files in os.walk(base_dir) for fn in files + if fn not in EXCLUDED_INGESTION_FILES ] if not file_paths: diff --git a/tests/integration/test_startup_ingest.py b/tests/integration/test_startup_ingest.py index b2243b33..44d1e8b2 100644 --- a/tests/integration/test_startup_ingest.py +++ b/tests/integration/test_startup_ingest.py @@ -5,6 +5,9 @@ from pathlib import Path import httpx import pytest +# Files to exclude from ingestion (should match src/main.py) +EXCLUDED_INGESTION_FILES = {"warmup_ocr.pdf"} + async def wait_for_ready(client: httpx.AsyncClient, timeout_s: float = 30.0): deadline = asyncio.get_event_loop().time() + timeout_s @@ -29,7 +32,7 @@ def count_files_in_documents() -> int: base_dir = Path(os.getcwd()) / "documents" if not base_dir.is_dir(): return 0 - return sum(1 for _ in base_dir.rglob("*") if _.is_file()) + return sum(1 for _ in base_dir.rglob("*") if _.is_file() and _.name not in EXCLUDED_INGESTION_FILES) @pytest.mark.parametrize("disable_langflow_ingest", [True, False])