diff --git a/cognee/infrastructure/data/utils/extract_keywords.py b/cognee/infrastructure/data/utils/extract_keywords.py
index 8085459c9..811999618 100644
--- a/cognee/infrastructure/data/utils/extract_keywords.py
+++ b/cognee/infrastructure/data/utils/extract_keywords.py
@@ -1,48 +1,46 @@
-from sklearn.feature_extraction.text import TfidfVectorizer
 from cognee.infrastructure.data.exceptions.exceptions import KeywordExtractionError
-from cognee.shared.utils import extract_pos_tags
 
 
-def extract_keywords(text: str) -> list[str]:
-    """
-    Extract keywords from the provided text string.
+# def extract_keywords(text: str) -> list[str]:
+#     """
+#     Extract keywords from the provided text string.
 
-    This function raises an KeyWordExtractionError if the input text is empty. It processes the
-    text to extract parts of speech, focusing on nouns, and uses TF-IDF to identify the most
-    relevant keywords based on their frequency. The function returns a list of up to 15
-    keywords, each having more than 3 characters.
+#     This function raises an KeyWordExtractionError if the input text is empty. It processes the
+#     text to extract parts of speech, focusing on nouns, and uses TF-IDF to identify the most
+#     relevant keywords based on their frequency. The function returns a list of up to 15
+#     keywords, each having more than 3 characters.
 
-    Parameters:
-    -----------
+#     Parameters:
+#     -----------
 
-    - text (str): The input text from which to extract keywords.
+#     - text (str): The input text from which to extract keywords.
 
-    Returns:
-    --------
+#     Returns:
+#     --------
 
-    - list[str]: A list of keywords extracted from the text, containing up to 15 nouns
-      with more than 3 characters.
-    """
-    if len(text) == 0:
-        raise KeywordExtractionError()
+#     - list[str]: A list of keywords extracted from the text, containing up to 15 nouns
+#       with more than 3 characters.
+#     """
+#     if len(text) == 0:
+#         raise KeywordExtractionError()
 
-    tags = extract_pos_tags(text)
-    nouns = [word for (word, tag) in tags if tag == "NN"]
+#     tags = extract_pos_tags(text)
+#     nouns = [word for (word, tag) in tags if tag == "NN"]
 
-    vectorizer = TfidfVectorizer()
-    tfidf = vectorizer.fit_transform(nouns)
+#     vectorizer = TfidfVectorizer()
+#     tfidf = vectorizer.fit_transform(nouns)
 
-    top_nouns = sorted(
-        vectorizer.vocabulary_, key=lambda x: tfidf[0, vectorizer.vocabulary_[x]], reverse=True
-    )
+#     top_nouns = sorted(
+#         vectorizer.vocabulary_, key=lambda x: tfidf[0, vectorizer.vocabulary_[x]], reverse=True
+#     )
 
-    keywords = []
+#     keywords = []
 
-    for word in top_nouns:
-        if len(word) > 3:
-            keywords.append(word)
-        if len(keywords) >= 15:
-            break
+#     for word in top_nouns:
+#         if len(word) > 3:
+#             keywords.append(word)
+#         if len(keywords) >= 15:
+#             break
 
-    return keywords
+#     return keywords
 
diff --git a/cognee/shared/utils.py b/cognee/shared/utils.py
index 22557d2ac..df141f2de 100644
--- a/cognee/shared/utils.py
+++ b/cognee/shared/utils.py
@@ -18,34 +18,41 @@ proxy_url = "https://test.prometh.ai"
 
 
 def get_entities(tagged_tokens):
-    import nltk
-
-    nltk.download("maxent_ne_chunker", quiet=True)
-
-    from nltk.chunk import ne_chunk
-
-    return ne_chunk(tagged_tokens)
+    try:
+        import nltk
+        nltk.download("maxent_ne_chunker", quiet=True)
+        from nltk.chunk import ne_chunk
+        return ne_chunk(tagged_tokens)
+    except ImportError:
+        raise ImportError(
+            "NLTK is required for entity extraction. Install with 'pip install cognee[nlp]' to use this feature."
+        )
 
 
 def extract_pos_tags(sentence):
     """Extract Part-of-Speech (POS) tags for words in a sentence."""
-    import nltk
+    try:
+        import nltk
 
-    # Ensure that the necessary NLTK resources are downloaded
-    nltk.download("words", quiet=True)
-    nltk.download("punkt", quiet=True)
-    nltk.download("averaged_perceptron_tagger", quiet=True)
+        # Ensure that the necessary NLTK resources are downloaded
+        nltk.download("words", quiet=True)
+        nltk.download("punkt", quiet=True)
+        nltk.download("averaged_perceptron_tagger", quiet=True)
 
-    from nltk.tag import pos_tag
-    from nltk.tokenize import word_tokenize
+        from nltk.tag import pos_tag
+        from nltk.tokenize import word_tokenize
 
-    # Tokenize the sentence into words
-    tokens = word_tokenize(sentence)
+        # Tokenize the sentence into words
+        tokens = word_tokenize(sentence)
 
-    # Tag each word with its corresponding POS tag
-    pos_tags = pos_tag(tokens)
+        # Tag each word with its corresponding POS tag
+        pos_tags = pos_tag(tokens)
 
-    return pos_tags
+        return pos_tags
+    except ImportError:
+        raise ImportError(
+            "NLTK is required for POS tagging. Install with 'pip install cognee[nlp]' to use this feature."
+        )
 
 
 def get_anonymous_id():
diff --git a/pyproject.toml b/pyproject.toml
index 4c165a325..e71393b21 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -25,7 +25,6 @@ dependencies = [
     "pydantic>=2.10.5,<3.0.0",
     "pydantic-settings>=2.2.1,<3",
     "typing_extensions>=4.12.2,<5.0.0",
-    "nltk>=3.9.1,<4.0.0",
     "numpy>=1.26.4, <=4.0.0",
     "sqlalchemy>=2.0.39,<3.0.0",
     "aiosqlite>=0.20.0,<1.0.0",
@@ -40,14 +39,11 @@ dependencies = [
     "jinja2>=3.1.3,<4",
     "lancedb>=0.24.0,<1.0.0",
     "alembic>=1.13.3,<2",
-    "pre-commit>=4.0.1,<5",
-    "scikit-learn>=1.6.1,<2",
     "limits>=4.4.1,<5",
     "fastapi>=0.115.7,<1.0.0",
     "python-multipart>=0.0.20,<1.0.0",
     "fastapi-users[sqlalchemy]>=14.0.1,<15.0.0",
     "structlog>=25.2.0,<26",
-    "pympler>=1.1,<2.0.0",
     "pylance>=0.22.0,<1.0.0",
     "kuzu (==0.11.0)",
     "python-magic-bin<0.5 ; platform_system == 'Windows'", # Only needed for Windows
@@ -135,6 +131,7 @@ dev = [
 debug = ["debugpy>=1.8.9,<2.0.0"]
 visualization = ["networkx>=3.4.2,<4", "matplotlib>=3.8.3,<4"]
 monitoring = ["sentry-sdk[fastapi]>=2.9.0,<3", "langfuse>=2.32.0,<3"]
+nlp = ["nltk>=3.9.1,<4.0.0"]
 
 [project.urls]
 Homepage = "https://www.cognee.ai"
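
Note on downstream usage: with NLTK moved from the core dependencies to the optional "nlp" extra, extract_pos_tags and get_entities can now raise ImportError at call time when NLTK is not installed. A minimal sketch of how a caller might guard for that; the guard itself is a hypothetical illustration and not part of this diff, while the function name and the install command are taken from the changes above:

    # Requires the optional extra for full functionality:
    #   pip install cognee[nlp]   (installs nltk>=3.9.1,<4.0.0)
    from cognee.shared.utils import extract_pos_tags

    try:
        # Returns a list of (word, POS tag) tuples when NLTK is available.
        tags = extract_pos_tags("Cognee turns documents into knowledge graphs.")
    except ImportError as error:
        # Raised by the new guard in cognee/shared/utils.py when the
        # 'nlp' extra is missing.
        print(f"POS tagging unavailable: {error}")
        tags = []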