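"""Keyword extraction with KeyBERT over NLTK-preprocessed text.

`extract_topics_keybert` lazily imports KeyBERT and yields the top
keyphrases for each input text after `preprocess_text` has lowercased,
POS-filtered, de-stopworded, and lemmatized it.
"""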

import re

import nltk
from nltk.tag import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


def extract_topics_keybert(texts: list[str]):
    """Yield the top keyphrases for each text in `texts`."""
    # Imported lazily so the rest of the module works without keybert installed.
    from keybert import KeyBERT

    kw_model = KeyBERT()

    for text in texts:
        topics = kw_model.extract_keywords(
            preprocess_text(text),
            keyphrase_ngram_range=(1, 2),
            top_n=3,
            # Enable MMR to trade some relevance for diversity among keyphrases:
            # use_mmr=True,
            # diversity=0.9,
        )
        # extract_keywords returns (keyphrase, score) pairs; keep only the phrases.
        yield [topic[0] for topic in topics]


def preprocess_text(text: str):
    """Lowercase, POS-filter, de-stopword, and lemmatize `text`."""
    try:
        # Used for stopword removal.
        stopwords.ensure_loaded()
    except LookupError:
        nltk.download("stopwords", quiet=True)
        stopwords.ensure_loaded()

    try:
        # Used in WordNetLemmatizer.
        wordnet.ensure_loaded()
    except LookupError:
        nltk.download("wordnet", quiet=True)
        wordnet.ensure_loaded()

    try:
        # Used in word_tokenize.
        nltk.data.find("tokenizers/punkt")
    except LookupError:
        nltk.download("punkt", quiet=True)

    try:
        # Used in pos_tag.
        nltk.data.find("taggers/averaged_perceptron_tagger")
    except LookupError:
        nltk.download("averaged_perceptron_tagger", quiet=True)

    text = text.lower()

    # Remove punctuation, keeping word characters, whitespace, and hyphens.
    text = re.sub(r"[^\w\s-]", "", text)

    # Tokenize the text.
    tokens = word_tokenize(text)

    # Keep only nouns and adjectives; note the text is already lowercased,
    # so the proper-noun tag (NNP) is rarely assigned at this point.
    tagged_tokens = pos_tag(tokens)
    tokens = [word for word, tag in tagged_tokens if tag in ("NNP", "NN", "JJ")]

    # Remove stop words.
    stop_words = set(stopwords.words("english"))
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatize the tokens (defaults to the noun form).
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Join the tokens back into a single string.
    return " ".join(tokens)


# def clean_text(text: str):
#     text = re.sub(r"[ \t]{2,}|[\n\r]", " ", text.lower())
#     # text = re.sub(r"[`\"'.,;!?…]", "", text).strip()
#     return text


# def remove_stop_words(text: str):
#     try:
#         stopwords.ensure_loaded()
#     except LookupError:
#         nltk.download("stopwords")
#         stopwords.ensure_loaded()

#     stop_words = set(stopwords.words("english"))
#     text = text.split()
#     text = [word for word in text if word not in stop_words]
#     return " ".join(text)
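

if __name__ == "__main__":
    # Minimal usage sketch. The sample sentences are illustrative only;
    # running this requires the keybert package to be installed, and NLTK
    # data is downloaded on first use.
    sample_texts = [
        "The new graphics card delivers excellent performance for deep learning workloads.",
        "The farmers market offers fresh organic vegetables every weekend morning.",
    ]
    for topics in extract_topics_keybert(sample_texts):
        print(topics)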