removed extra things

This commit is contained in:
vasilije 2025-08-27 19:59:38 +02:00
parent 00c7dfae49
commit 76b8e16bcb
3 changed files with 58 additions and 56 deletions

View file

@@ -1,48 +1,46 @@
from sklearn.feature_extraction.text import TfidfVectorizer
from cognee.infrastructure.data.exceptions.exceptions import KeywordExtractionError
from cognee.shared.utils import extract_pos_tags
def extract_keywords(text: str) -> list[str]:
"""
Extract keywords from the provided text string.
# def extract_keywords(text: str) -> list[str]:
# """
# Extract keywords from the provided text string.
This function raises a KeywordExtractionError if the input text is empty. It processes the
text to extract parts of speech, focusing on nouns, and uses TF-IDF to identify the most
relevant keywords based on their frequency. The function returns a list of up to 15
keywords, each having more than 3 characters.
# This function raises a KeywordExtractionError if the input text is empty. It processes the
# text to extract parts of speech, focusing on nouns, and uses TF-IDF to identify the most
# relevant keywords based on their frequency. The function returns a list of up to 15
# keywords, each having more than 3 characters.
Parameters:
-----------
# Parameters:
# -----------
- text (str): The input text from which to extract keywords.
# - text (str): The input text from which to extract keywords.
Returns:
--------
# Returns:
# --------
- list[str]: A list of keywords extracted from the text, containing up to 15 nouns
with more than 3 characters.
"""
if len(text) == 0:
raise KeywordExtractionError()
# - list[str]: A list of keywords extracted from the text, containing up to 15 nouns
# with more than 3 characters.
# """
# if len(text) == 0:
# raise KeywordExtractionError()
tags = extract_pos_tags(text)
nouns = [word for (word, tag) in tags if tag == "NN"]
# tags = extract_pos_tags(text)
# nouns = [word for (word, tag) in tags if tag == "NN"]
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(nouns)
# vectorizer = TfidfVectorizer()
# tfidf = vectorizer.fit_transform(nouns)
top_nouns = sorted(
vectorizer.vocabulary_, key=lambda x: tfidf[0, vectorizer.vocabulary_[x]], reverse=True
)
# top_nouns = sorted(
# vectorizer.vocabulary_, key=lambda x: tfidf[0, vectorizer.vocabulary_[x]], reverse=True
# )
keywords = []
# keywords = []
for word in top_nouns:
if len(word) > 3:
keywords.append(word)
if len(keywords) >= 15:
break
# for word in top_nouns:
# if len(word) > 3:
# keywords.append(word)
# if len(keywords) >= 15:
# break
return keywords
# return keywords

View file

@@ -18,34 +18,41 @@ proxy_url = "https://test.prometh.ai"
def get_entities(tagged_tokens):
import nltk
nltk.download("maxent_ne_chunker", quiet=True)
from nltk.chunk import ne_chunk
return ne_chunk(tagged_tokens)
try:
import nltk
nltk.download("maxent_ne_chunker", quiet=True)
from nltk.chunk import ne_chunk
return ne_chunk(tagged_tokens)
except ImportError:
raise ImportError(
"NLTK is required for entity extraction. Install with 'pip install cognee[nlp]' to use this feature."
)
def extract_pos_tags(sentence):
"""Extract Part-of-Speech (POS) tags for words in a sentence."""
import nltk
try:
import nltk
# Ensure that the necessary NLTK resources are downloaded
nltk.download("words", quiet=True)
nltk.download("punkt", quiet=True)
nltk.download("averaged_perceptron_tagger", quiet=True)
# Ensure that the necessary NLTK resources are downloaded
nltk.download("words", quiet=True)
nltk.download("punkt", quiet=True)
nltk.download("averaged_perceptron_tagger", quiet=True)
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
# Tokenize the sentence into words
tokens = word_tokenize(sentence)
# Tokenize the sentence into words
tokens = word_tokenize(sentence)
# Tag each word with its corresponding POS tag
pos_tags = pos_tag(tokens)
# Tag each word with its corresponding POS tag
pos_tags = pos_tag(tokens)
return pos_tags
return pos_tags
except ImportError:
raise ImportError(
"NLTK is required for POS tagging. Install with 'pip install cognee[nlp]' to use this feature."
)
def get_anonymous_id():

View file

@@ -25,7 +25,6 @@ dependencies = [
"pydantic>=2.10.5,<3.0.0",
"pydantic-settings>=2.2.1,<3",
"typing_extensions>=4.12.2,<5.0.0",
"nltk>=3.9.1,<4.0.0",
"numpy>=1.26.4, <=4.0.0",
"sqlalchemy>=2.0.39,<3.0.0",
"aiosqlite>=0.20.0,<1.0.0",
@@ -40,14 +39,11 @@ dependencies = [
"jinja2>=3.1.3,<4",
"lancedb>=0.24.0,<1.0.0",
"alembic>=1.13.3,<2",
"pre-commit>=4.0.1,<5",
"scikit-learn>=1.6.1,<2",
"limits>=4.4.1,<5",
"fastapi>=0.115.7,<1.0.0",
"python-multipart>=0.0.20,<1.0.0",
"fastapi-users[sqlalchemy]>=14.0.1,<15.0.0",
"structlog>=25.2.0,<26",
"pympler>=1.1,<2.0.0",
"pylance>=0.22.0,<1.0.0",
"kuzu (==0.11.0)",
"python-magic-bin<0.5 ; platform_system == 'Windows'", # Only needed for Windows
@@ -135,6 +131,7 @@ dev = [
debug = ["debugpy>=1.8.9,<2.0.0"]
visualization = ["networkx>=3.4.2,<4", "matplotlib>=3.8.3,<4"]
monitoring = ["sentry-sdk[fastapi]>=2.9.0,<3", "langfuse>=2.32.0,<3"]
nlp = ["nltk>=3.9.1,<4.0.0"]
[project.urls]
Homepage = "https://www.cognee.ai"