fix: Make sure we exclude the warmup file from ingestion

This commit is contained in:
Eric Hare 2025-11-18 12:07:38 -08:00
parent 3216d866f7
commit cfe7f6b581
No known key found for this signature in database
GPG key ID: A73DF73724270AB7
2 changed files with 9 additions and 2 deletions

View file

@ -85,6 +85,9 @@ logger.info(
cuda_version=torch.version.cuda, cuda_version=torch.version.cuda,
) )
# Files to exclude from startup ingestion
EXCLUDED_INGESTION_FILES = {"warmup_ocr.pdf"}
async def wait_for_opensearch(): async def wait_for_opensearch():
"""Wait for OpenSearch to be ready with retries""" """Wait for OpenSearch to be ready with retries"""
@ -312,11 +315,12 @@ async def ingest_default_documents_when_ready(services):
) )
return return
# Collect files recursively # Collect files recursively, excluding warmup files
file_paths = [ file_paths = [
os.path.join(root, fn) os.path.join(root, fn)
for root, _, files in os.walk(base_dir) for root, _, files in os.walk(base_dir)
for fn in files for fn in files
if fn not in EXCLUDED_INGESTION_FILES
] ]
if not file_paths: if not file_paths:

View file

@ -5,6 +5,9 @@ from pathlib import Path
import httpx import httpx
import pytest import pytest
# Files to exclude from ingestion (should match src/main.py)
EXCLUDED_INGESTION_FILES = {"warmup_ocr.pdf"}
async def wait_for_ready(client: httpx.AsyncClient, timeout_s: float = 30.0): async def wait_for_ready(client: httpx.AsyncClient, timeout_s: float = 30.0):
deadline = asyncio.get_event_loop().time() + timeout_s deadline = asyncio.get_event_loop().time() + timeout_s
@ -29,7 +32,7 @@ def count_files_in_documents() -> int:
base_dir = Path(os.getcwd()) / "documents" base_dir = Path(os.getcwd()) / "documents"
if not base_dir.is_dir(): if not base_dir.is_dir():
return 0 return 0
return sum(1 for _ in base_dir.rglob("*") if _.is_file()) return sum(1 for _ in base_dir.rglob("*") if _.is_file() and _.name not in EXCLUDED_INGESTION_FILES)
@pytest.mark.parametrize("disable_langflow_ingest", [True, False]) @pytest.mark.parametrize("disable_langflow_ingest", [True, False])