Removed unused dependencies and moved NLTK behind the optional 'nlp' extra

This commit is contained in:
vasilije 2025-08-27 19:59:38 +02:00
parent 00c7dfae49
commit 76b8e16bcb
3 changed files with 58 additions and 56 deletions

View file

@@ -1,48 +1,46 @@
from sklearn.feature_extraction.text import TfidfVectorizer
from cognee.infrastructure.data.exceptions.exceptions import KeywordExtractionError from cognee.infrastructure.data.exceptions.exceptions import KeywordExtractionError
from cognee.shared.utils import extract_pos_tags
def extract_keywords(text: str) -> list[str]:
    """
    Extract keywords from the provided text string.

    Raises a KeywordExtractionError if the input text is empty. The text is
    POS-tagged, nouns (tag "NN") are collected, and TF-IDF scores are used to
    rank them so the most relevant nouns come first. The function returns a
    list of up to 15 keywords, each having more than 3 characters.

    Parameters:
    -----------

        - text (str): The input text from which to extract keywords.

    Returns:
    --------

        - list[str]: A list of keywords extracted from the text, containing up
          to 15 nouns with more than 3 characters.

    Raises:
    -------

        - KeywordExtractionError: If `text` is empty.
    """
    if len(text) == 0:
        raise KeywordExtractionError()

    tags = extract_pos_tags(text)
    nouns = [word for (word, tag) in tags if tag == "NN"]

    # No nouns at all: TfidfVectorizer raises ValueError on an empty corpus,
    # so short-circuit with an empty keyword list instead of crashing.
    if not nouns:
        return []

    vectorizer = TfidfVectorizer()
    tfidf = vectorizer.fit_transform(nouns)

    # Each noun is vectorized as its own "document" (one row per noun), so a
    # term's relevance must be aggregated across ALL rows. The previous
    # `tfidf[0, ...]` looked only at the first noun's row, which left every
    # other term scored as zero and made the ranking essentially arbitrary.
    scores = tfidf.sum(axis=0)
    top_nouns = sorted(
        vectorizer.vocabulary_,
        key=lambda term: scores[0, vectorizer.vocabulary_[term]],
        reverse=True,
    )

    keywords = []
    for word in top_nouns:
        if len(word) > 3:
            keywords.append(word)
        if len(keywords) >= 15:
            break
    return keywords

View file

@@ -18,34 +18,41 @@ proxy_url = "https://test.prometh.ai"
def get_entities(tagged_tokens):
    """
    Build a named-entity chunk tree from POS-tagged tokens.

    Parameters:
    -----------

        - tagged_tokens: A sequence of (word, tag) pairs, e.g. the output of
          extract_pos_tags.

    Returns:
    --------

        - The nltk named-entity chunk structure produced by `ne_chunk`.

    Raises:
    -------

        - ImportError: If NLTK is not installed.
    """
    # Only the imports can raise ImportError, so keep the try block minimal;
    # chain the cause so the original missing-module detail is preserved.
    try:
        import nltk
        from nltk.chunk import ne_chunk
    except ImportError as error:
        raise ImportError(
            "NLTK is required for entity extraction. Install with 'pip install cognee[nlp]' to use this feature."
        ) from error

    # Fetch the NE chunker model on demand; quiet=True suppresses progress output.
    nltk.download("maxent_ne_chunker", quiet=True)

    return ne_chunk(tagged_tokens)
def extract_pos_tags(sentence):
    """
    Extract Part-of-Speech (POS) tags for words in a sentence.

    Parameters:
    -----------

        - sentence (str): The sentence to tokenize and tag.

    Returns:
    --------

        - list[tuple[str, str]]: A list of (word, tag) pairs for each token.

    Raises:
    -------

        - ImportError: If NLTK is not installed.
    """
    # Only the imports can raise ImportError, so keep the try block minimal;
    # chain the cause so the original missing-module detail is preserved.
    try:
        import nltk
        from nltk.tag import pos_tag
        from nltk.tokenize import word_tokenize
    except ImportError as error:
        raise ImportError(
            "NLTK is required for POS tagging. Install with 'pip install cognee[nlp]' to use this feature."
        ) from error

    # Ensure that the necessary NLTK resources are downloaded (no-op if cached).
    nltk.download("words", quiet=True)
    nltk.download("punkt", quiet=True)
    nltk.download("averaged_perceptron_tagger", quiet=True)

    # Tokenize the sentence into words, then tag each word with its POS tag.
    tokens = word_tokenize(sentence)
    return pos_tag(tokens)
def get_anonymous_id(): def get_anonymous_id():

View file

@@ -25,7 +25,6 @@ dependencies = [
"pydantic>=2.10.5,<3.0.0", "pydantic>=2.10.5,<3.0.0",
"pydantic-settings>=2.2.1,<3", "pydantic-settings>=2.2.1,<3",
"typing_extensions>=4.12.2,<5.0.0", "typing_extensions>=4.12.2,<5.0.0",
"nltk>=3.9.1,<4.0.0",
"numpy>=1.26.4, <=4.0.0", "numpy>=1.26.4, <=4.0.0",
"sqlalchemy>=2.0.39,<3.0.0", "sqlalchemy>=2.0.39,<3.0.0",
"aiosqlite>=0.20.0,<1.0.0", "aiosqlite>=0.20.0,<1.0.0",
@@ -40,14 +39,11 @@ dependencies = [
"jinja2>=3.1.3,<4", "jinja2>=3.1.3,<4",
"lancedb>=0.24.0,<1.0.0", "lancedb>=0.24.0,<1.0.0",
"alembic>=1.13.3,<2", "alembic>=1.13.3,<2",
"pre-commit>=4.0.1,<5",
"scikit-learn>=1.6.1,<2",
"limits>=4.4.1,<5", "limits>=4.4.1,<5",
"fastapi>=0.115.7,<1.0.0", "fastapi>=0.115.7,<1.0.0",
"python-multipart>=0.0.20,<1.0.0", "python-multipart>=0.0.20,<1.0.0",
"fastapi-users[sqlalchemy]>=14.0.1,<15.0.0", "fastapi-users[sqlalchemy]>=14.0.1,<15.0.0",
"structlog>=25.2.0,<26", "structlog>=25.2.0,<26",
"pympler>=1.1,<2.0.0",
"pylance>=0.22.0,<1.0.0", "pylance>=0.22.0,<1.0.0",
"kuzu (==0.11.0)", "kuzu (==0.11.0)",
"python-magic-bin<0.5 ; platform_system == 'Windows'", # Only needed for Windows "python-magic-bin<0.5 ; platform_system == 'Windows'", # Only needed for Windows
@@ -135,6 +131,7 @@ dev = [
debug = ["debugpy>=1.8.9,<2.0.0"] debug = ["debugpy>=1.8.9,<2.0.0"]
visualization = ["networkx>=3.4.2,<4", "matplotlib>=3.8.3,<4"] visualization = ["networkx>=3.4.2,<4", "matplotlib>=3.8.3,<4"]
monitoring = ["sentry-sdk[fastapi]>=2.9.0,<3", "langfuse>=2.32.0,<3"] monitoring = ["sentry-sdk[fastapi]>=2.9.0,<3", "langfuse>=2.32.0,<3"]
nlp = ["nltk>=3.9.1,<4.0.0"]
[project.urls] [project.urls]
Homepage = "https://www.cognee.ai" Homepage = "https://www.cognee.ai"