added fixes for nltk

This commit is contained in:
vasilije 2025-07-19 15:17:27 +02:00
parent 9110a2b59b
commit 3429af32c2
2 changed files with 41 additions and 0 deletions

View file

@ -71,6 +71,25 @@ async def cognee_pipeline(
if cognee_pipeline.first_run:
from cognee.infrastructure.llm.utils import test_llm_connection, test_embedding_connection
# Ensure NLTK data is downloaded on first run
def ensure_nltk_data():
    """Make a best-effort attempt to fetch the NLTK data used by the system.

    Every required package is attempted, in order, even if an earlier one
    fails. Nothing is raised to the caller: problems are reported through
    ``logger.warning`` only, so a missing network connection does not abort
    the pipeline.
    """
    required_packages = (
        "punkt_tab",
        "punkt",
        "averaged_perceptron_tagger",
        "averaged_perceptron_tagger_eng",
        "maxent_ne_chunker",
        "words",
    )
    try:
        import nltk

        # nltk.download() reports a failed download by returning False
        # rather than raising (raise_on_error defaults to False), so the
        # return values must be inspected to know whether the data is there.
        failed = [name for name in required_packages if not nltk.download(name, quiet=True)]
    except Exception as e:
        # Covers nltk being absent as well as unexpected downloader errors.
        logger.warning(f"Failed to initialize NLTK data: {e}")
        return

    if failed:
        logger.warning(f"Failed to initialize NLTK data: could not download {', '.join(failed)}")
    else:
        logger.info("NLTK data initialized successfully")
ensure_nltk_data()
# Test LLM and Embedding configuration once before running Cognee
await test_llm_connection()
await test_embedding_connection()

View file

@ -54,6 +54,28 @@ async def plugin_ingest_data(
if not user:
user = await get_default_user()
# Ensure NLTK data is downloaded (preserves automatic download behavior)
def ensure_nltk_data():
    """Make a best-effort attempt to fetch the NLTK data used by the system.

    Every required package is attempted, in order, even if an earlier one
    fails. Nothing is raised to the caller: problems are reported through
    ``logger.warning`` only, so ingestion can proceed when the download
    is not possible.
    """
    required_packages = (
        "punkt_tab",
        "punkt",
        "averaged_perceptron_tagger",
        "averaged_perceptron_tagger_eng",
        "maxent_ne_chunker",
        "words",
    )
    try:
        import nltk

        # nltk.download() reports a failed download by returning False
        # rather than raising (raise_on_error defaults to False), so the
        # return values must be inspected before claiming success.
        failed = [name for name in required_packages if not nltk.download(name, quiet=True)]
    except Exception as e:
        # Covers nltk being absent as well as unexpected downloader errors.
        logger.warning(f"Failed to download NLTK data: {e}")
        return

    if failed:
        logger.warning(f"Failed to download NLTK data: {', '.join(failed)}")
    else:
        logger.info("NLTK data verified/downloaded successfully")
# Download NLTK data once per session
if not hasattr(plugin_ingest_data, "_nltk_initialized"):
ensure_nltk_data()
plugin_ingest_data._nltk_initialized = True
# Initialize S3 support (maintain existing behavior)
s3_config = get_s3_config()
fs = None