fix: Make sure we exclude the warmup file from ingestion

This commit is contained in:
Eric Hare 2025-11-18 12:07:38 -08:00
parent 3216d866f7
commit cfe7f6b581
No known key found for this signature in database
GPG key ID: A73DF73724270AB7
2 changed files with 9 additions and 2 deletions

View file

@ -85,6 +85,9 @@ logger.info(
cuda_version=torch.version.cuda,
)
# Files to exclude from startup ingestion
EXCLUDED_INGESTION_FILES = {"warmup_ocr.pdf"}
async def wait_for_opensearch():
"""Wait for OpenSearch to be ready with retries"""
@ -312,11 +315,12 @@ async def ingest_default_documents_when_ready(services):
)
return
# Collect files recursively
# Collect files recursively, excluding warmup files
file_paths = [
os.path.join(root, fn)
for root, _, files in os.walk(base_dir)
for fn in files
if fn not in EXCLUDED_INGESTION_FILES
]
if not file_paths:

View file

@ -5,6 +5,9 @@ from pathlib import Path
import httpx
import pytest
# Files to exclude from ingestion (must stay in sync with EXCLUDED_INGESTION_FILES in src/main.py)
EXCLUDED_INGESTION_FILES = {"warmup_ocr.pdf"}
async def wait_for_ready(client: httpx.AsyncClient, timeout_s: float = 30.0):
deadline = asyncio.get_event_loop().time() + timeout_s
@ -29,7 +32,7 @@ def count_files_in_documents() -> int:
base_dir = Path(os.getcwd()) / "documents"
if not base_dir.is_dir():
return 0
return sum(1 for _ in base_dir.rglob("*") if _.is_file())
return sum(1 for _ in base_dir.rglob("*") if _.is_file() and _.name not in EXCLUDED_INGESTION_FILES)
@pytest.mark.parametrize("disable_langflow_ingest", [True, False])