removed extra things
parent 00c7dfae49
commit 76b8e16bcb
3 changed files with 58 additions and 56 deletions
@@ -1,48 +1,46 @@
-from sklearn.feature_extraction.text import TfidfVectorizer
-from cognee.infrastructure.data.exceptions.exceptions import KeywordExtractionError
-from cognee.shared.utils import extract_pos_tags


-def extract_keywords(text: str) -> list[str]:
-    """
-    Extract keywords from the provided text string.
+# def extract_keywords(text: str) -> list[str]:
+#     """
+#     Extract keywords from the provided text string.

-    This function raises a KeywordExtractionError if the input text is empty. It processes the
-    text to extract parts of speech, focusing on nouns, and uses TF-IDF to identify the most
-    relevant keywords based on their frequency. The function returns a list of up to 15
-    keywords, each having more than 3 characters.
+#     This function raises a KeywordExtractionError if the input text is empty. It processes the
+#     text to extract parts of speech, focusing on nouns, and uses TF-IDF to identify the most
+#     relevant keywords based on their frequency. The function returns a list of up to 15
+#     keywords, each having more than 3 characters.

-    Parameters:
-    -----------
+#     Parameters:
+#     -----------

-    - text (str): The input text from which to extract keywords.
+#     - text (str): The input text from which to extract keywords.

-    Returns:
-    --------
+#     Returns:
+#     --------

-    - list[str]: A list of keywords extracted from the text, containing up to 15 nouns
-      with more than 3 characters.
-    """
-    if len(text) == 0:
-        raise KeywordExtractionError()
+#     - list[str]: A list of keywords extracted from the text, containing up to 15 nouns
+#       with more than 3 characters.
+#     """
+#     if len(text) == 0:
+#         raise KeywordExtractionError()

-    tags = extract_pos_tags(text)
-    nouns = [word for (word, tag) in tags if tag == "NN"]
+#     tags = extract_pos_tags(text)
+#     nouns = [word for (word, tag) in tags if tag == "NN"]

-    vectorizer = TfidfVectorizer()
-    tfidf = vectorizer.fit_transform(nouns)
+#     vectorizer = TfidfVectorizer()
+#     tfidf = vectorizer.fit_transform(nouns)

-    top_nouns = sorted(
-        vectorizer.vocabulary_, key=lambda x: tfidf[0, vectorizer.vocabulary_[x]], reverse=True
-    )
+#     top_nouns = sorted(
+#         vectorizer.vocabulary_, key=lambda x: tfidf[0, vectorizer.vocabulary_[x]], reverse=True
+#     )

-    keywords = []
+#     keywords = []

-    for word in top_nouns:
-        if len(word) > 3:
-            keywords.append(word)
-            if len(keywords) >= 15:
-                break
+#     for word in top_nouns:
+#         if len(word) > 3:
+#             keywords.append(word)
+#             if len(keywords) >= 15:
+#                 break

-    return keywords
+#     return keywords
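For context on what is being retired: the removed function POS-tags the input, keeps singular nouns, and ranks them by TF-IDF. A minimal standalone sketch of that technique follows. The function name and the single-document fit are my assumptions; the original passed the noun list straight to fit_transform, which made each noun its own document and then read only row 0 of the matrix, so only the first noun's row was ever scored.

# Hedged sketch, not the commit's code: the same TF-IDF-over-nouns idea,
# with the nouns joined into one document so that row 0 covers every term.
# Requires nltk and scikit-learn, which this commit drops from core deps.
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer


def extract_keywords_sketch(text: str, limit: int = 15) -> list[str]:
    if not text:
        raise ValueError("Cannot extract keywords from empty text.")

    # Resources used by word_tokenize and pos_tag below.
    nltk.download("punkt", quiet=True)
    nltk.download("averaged_perceptron_tagger", quiet=True)

    # Keep singular nouns ("NN"), as the removed function did.
    tags = nltk.pos_tag(nltk.word_tokenize(text))
    nouns = [word for (word, tag) in tags if tag == "NN"]
    if not nouns:
        return []

    # Fit TF-IDF on a single document built from the nouns, then rank the
    # vocabulary by score; with one document this reduces to term frequency.
    vectorizer = TfidfVectorizer()
    tfidf = vectorizer.fit_transform([" ".join(nouns)])
    ranked = sorted(
        vectorizer.vocabulary_,
        key=lambda word: tfidf[0, vectorizer.vocabulary_[word]],
        reverse=True,
    )

    # As in the original: words longer than 3 characters, at most `limit`.
    return [word for word in ranked if len(word) > 3][:limit]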
@@ -18,34 +18,41 @@ proxy_url = "https://test.prometh.ai"


 def get_entities(tagged_tokens):
-    import nltk
-
-    nltk.download("maxent_ne_chunker", quiet=True)
-
-    from nltk.chunk import ne_chunk
-
-    return ne_chunk(tagged_tokens)
+    try:
+        import nltk
+        nltk.download("maxent_ne_chunker", quiet=True)
+        from nltk.chunk import ne_chunk
+        return ne_chunk(tagged_tokens)
+    except ImportError:
+        raise ImportError(
+            "NLTK is required for entity extraction. Install with 'pip install cognee[nlp]' to use this feature."
+        )


 def extract_pos_tags(sentence):
     """Extract Part-of-Speech (POS) tags for words in a sentence."""
-    import nltk
+    try:
+        import nltk

-    # Ensure that the necessary NLTK resources are downloaded
-    nltk.download("words", quiet=True)
-    nltk.download("punkt", quiet=True)
-    nltk.download("averaged_perceptron_tagger", quiet=True)
+        # Ensure that the necessary NLTK resources are downloaded
+        nltk.download("words", quiet=True)
+        nltk.download("punkt", quiet=True)
+        nltk.download("averaged_perceptron_tagger", quiet=True)

-    from nltk.tag import pos_tag
-    from nltk.tokenize import word_tokenize
+        from nltk.tag import pos_tag
+        from nltk.tokenize import word_tokenize

-    # Tokenize the sentence into words
-    tokens = word_tokenize(sentence)
+        # Tokenize the sentence into words
+        tokens = word_tokenize(sentence)

-    # Tag each word with its corresponding POS tag
-    pos_tags = pos_tag(tokens)
+        # Tag each word with its corresponding POS tag
+        pos_tags = pos_tag(tokens)

-    return pos_tags
+        return pos_tags
+    except ImportError:
+        raise ImportError(
+            "NLTK is required for POS tagging. Install with 'pip install cognee[nlp]' to use this feature."
+        )


 def get_anonymous_id():
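Both wrappers turn a missing import into an actionable error rather than a bare ModuleNotFoundError at import time. A sketch of how a downstream caller might degrade gracefully; the sample sentence and the print-based fallback are illustrative, not part of the commit:

# Hedged usage sketch for the now-optional NLTK path in cognee.shared.utils.
from cognee.shared.utils import extract_pos_tags

try:
    tags = extract_pos_tags("Cognee turns documents into knowledge graphs.")
    print(tags[:5])  # e.g. [('Cognee', 'NNP'), ('turns', 'VBZ'), ...]
except ImportError as error:
    # Raised by the new wrapper when nltk is absent; the message points
    # users at: pip install cognee[nlp]
    print(f"POS tagging unavailable: {error}")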
@@ -25,7 +25,6 @@ dependencies = [
     "pydantic>=2.10.5,<3.0.0",
     "pydantic-settings>=2.2.1,<3",
     "typing_extensions>=4.12.2,<5.0.0",
-    "nltk>=3.9.1,<4.0.0",
     "numpy>=1.26.4, <=4.0.0",
     "sqlalchemy>=2.0.39,<3.0.0",
     "aiosqlite>=0.20.0,<1.0.0",
@@ -40,14 +39,11 @@ dependencies = [
    "jinja2>=3.1.3,<4",
    "lancedb>=0.24.0,<1.0.0",
    "alembic>=1.13.3,<2",
    "pre-commit>=4.0.1,<5",
    "scikit-learn>=1.6.1,<2",
    "limits>=4.4.1,<5",
    "fastapi>=0.115.7,<1.0.0",
    "python-multipart>=0.0.20,<1.0.0",
    "fastapi-users[sqlalchemy]>=14.0.1,<15.0.0",
    "structlog>=25.2.0,<26",
    "pympler>=1.1,<2.0.0",
    "pylance>=0.22.0,<1.0.0",
    "kuzu (==0.11.0)",
    "python-magic-bin<0.5 ; platform_system == 'Windows'", # Only needed for Windows
@@ -135,6 +131,7 @@ dev = [
 debug = ["debugpy>=1.8.9,<2.0.0"]
 visualization = ["networkx>=3.4.2,<4", "matplotlib>=3.8.3,<4"]
 monitoring = ["sentry-sdk[fastapi]>=2.9.0,<3", "langfuse>=2.32.0,<3"]
+nlp = ["nltk>=3.9.1,<4.0.0"]

 [project.urls]
 Homepage = "https://www.cognee.ai"
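With nltk demoted to the new nlp extra, a base install no longer ships it; users opt in via pip install cognee[nlp]. Code that must run either way can gate NLP-only paths on module availability. A minimal sketch, where the NLTK_AVAILABLE flag is illustrative rather than part of this commit:

# Check for the optional dependency without importing it.
from importlib.util import find_spec

NLTK_AVAILABLE = find_spec("nltk") is not None

if NLTK_AVAILABLE:
    from cognee.shared.utils import extract_pos_tags, get_entities
else:
    extract_pos_tags = get_entities = None  # callers must check availability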