cognee/cognee/infrastructure/data/utils/extract_keywords.py
2025-08-13 14:42:57 +02:00

48 lines
1.4 KiB
Python

from sklearn.feature_extraction.text import TfidfVectorizer
from cognee.infrastructure.data.exceptions.exceptions import KeywordExtractionError
from cognee.shared.utils import extract_pos_tags
def extract_keywords(text: str) -> list[str]:
    """
    Extract keywords from the provided text string.

    The text is part-of-speech tagged and only words tagged "NN" are kept. Those
    nouns are then scored with TF-IDF as a single document, so a term's score
    grows with how often it occurs among the nouns. The highest-scoring terms
    longer than 3 characters are returned, up to a maximum of 15.

    Parameters:
    -----------
    - text (str): The input text from which to extract keywords.

    Returns:
    --------
    - list[str]: Up to 15 keywords with more than 3 characters, ordered from the
      highest to the lowest score. Terms are reported as tokenized by the
      vectorizer. The list is empty when the text yields no usable nouns.

    Raises:
    -------
    - KeywordExtractionError: If the input text is empty.
    """
    if not text:
        raise KeywordExtractionError()

    tags = extract_pos_tags(text)
    nouns = [word for (word, tag) in tags if tag == "NN"]
    if not nouns:
        # Nothing to rank; fitting the vectorizer on no input would raise.
        return []

    vectorizer = TfidfVectorizer()
    try:
        # Fit every noun as ONE document so that row 0 of the matrix carries a
        # score for each term. Fitting one document per noun and then reading
        # only row 0 would score nothing but the first noun.
        tfidf = vectorizer.fit_transform([" ".join(nouns)])
    except ValueError:
        # The vectorizer rejects input from which it cannot build a vocabulary
        # (e.g. every noun is dropped by its tokenizer).
        return []

    # Densify the single row once instead of indexing the sparse matrix for
    # every comparison made by the sort.
    scores = tfidf.toarray()[0]
    vocabulary = vectorizer.vocabulary_
    ranked_terms = sorted(
        vocabulary, key=lambda term: scores[vocabulary[term]], reverse=True
    )

    # Keep only terms longer than 3 characters, capped at 15 results.
    return [term for term in ranked_terms if len(term) > 3][:15]