From 3429af32c25ec926043ab67f0d0e07f8f7834e1a Mon Sep 17 00:00:00 2001
From: vasilije
Date: Sat, 19 Jul 2025 15:17:27 +0200
Subject: [PATCH] added fixes for nltk

---
 .../modules/pipelines/operations/pipeline.py | 32 ++++++++++++++++++
 cognee/tasks/ingestion/plugin_ingest_data.py | 35 ++++++++++++++++++++
 2 files changed, 67 insertions(+)

diff --git a/cognee/modules/pipelines/operations/pipeline.py b/cognee/modules/pipelines/operations/pipeline.py
index e58c15254..826f162f7 100644
--- a/cognee/modules/pipelines/operations/pipeline.py
+++ b/cognee/modules/pipelines/operations/pipeline.py
@@ -71,6 +71,38 @@ async def cognee_pipeline(
     if cognee_pipeline.first_run:
         from cognee.infrastructure.llm.utils import test_llm_connection, test_embedding_connection
 
+        # Ensure NLTK data is downloaded on first run
+        def ensure_nltk_data():
+            """Download the NLTK packages the system relies on (best effort).
+
+            nltk.download() reports failure by returning False rather than
+            raising, so each result is checked and failed packages are logged
+            instead of unconditionally reporting success.
+            """
+            packages = (
+                "punkt_tab",
+                "punkt",
+                "averaged_perceptron_tagger",
+                "averaged_perceptron_tagger_eng",
+                "maxent_ne_chunker",
+                "words",
+            )
+            try:
+                import nltk
+
+                failed = [name for name in packages if not nltk.download(name, quiet=True)]
+            except Exception as e:
+                # Missing nltk or an unexpected downloader error must not abort the pipeline.
+                logger.warning(f"Failed to initialize NLTK data: {e}")
+                return
+
+            if failed:
+                logger.warning(f"Failed to initialize NLTK data for packages: {failed}")
+            else:
+                logger.info("NLTK data initialized successfully")
+
+        ensure_nltk_data()
+
         # Test LLM and Embedding configuration once before running Cognee
         await test_llm_connection()
         await test_embedding_connection()
diff --git a/cognee/tasks/ingestion/plugin_ingest_data.py b/cognee/tasks/ingestion/plugin_ingest_data.py
index c2a225c78..994de6f5c 100644
--- a/cognee/tasks/ingestion/plugin_ingest_data.py
+++ b/cognee/tasks/ingestion/plugin_ingest_data.py
@@ -54,6 +54,41 @@ async def plugin_ingest_data(
     if not user:
         user = await get_default_user()
 
+    # Ensure NLTK data is downloaded (preserves automatic download behavior)
+    def ensure_nltk_data():
+        """Download the NLTK packages the system relies on (best effort).
+
+        nltk.download() reports failure by returning False rather than
+        raising, so each result is checked and failed packages are logged
+        instead of unconditionally reporting success.
+        """
+        packages = (
+            "punkt_tab",
+            "punkt",
+            "averaged_perceptron_tagger",
+            "averaged_perceptron_tagger_eng",
+            "maxent_ne_chunker",
+            "words",
+        )
+        try:
+            import nltk
+
+            failed = [name for name in packages if not nltk.download(name, quiet=True)]
+        except Exception as e:
+            # Missing nltk or an unexpected downloader error must not abort ingestion.
+            logger.warning(f"Failed to download NLTK data: {e}")
+            return
+
+        if failed:
+            logger.warning(f"Failed to download NLTK data for packages: {failed}")
+        else:
+            logger.info("NLTK data verified/downloaded successfully")
+
+    # Attempt the NLTK download once per session, whether or not it succeeded
+    if not hasattr(plugin_ingest_data, "_nltk_initialized"):
+        ensure_nltk_data()
+        plugin_ingest_data._nltk_initialized = True
+
     # Initialize S3 support (maintain existing behavior)
     s3_config = get_s3_config()
     fs = None