fix: remove keybert import (#120)

This commit is contained in:
Boris 2024-07-20 17:08:03 +02:00 committed by GitHub
parent 6ef4bbe862
commit 86c7aa23a8
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 0 additions and 86 deletions

View file

@ -1 +0,0 @@
from .extract_topics import extract_topics_keybert

View file

@ -1,85 +0,0 @@
import re
import nltk
from nltk.tag import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
def extract_topics_keybert(texts: list[str]):
    """Yield up to three keyphrases for each input text, extracted with KeyBERT.

    Args:
        texts: Raw input documents.

    Yields:
        For each document, a list of keyphrase strings (1-2 words each).
    """
    # Imported lazily so the heavy keybert dependency is only paid when
    # this generator is actually consumed.
    from keybert import KeyBERT

    model = KeyBERT()
    for document in texts:
        cleaned = preprocess_text(document)
        keywords = model.extract_keywords(
            cleaned,
            keyphrase_ngram_range=(1, 2),
            top_n=3,
            # use_mmr = True,
            # diversity = 0.9,
        )
        # extract_keywords returns (phrase, score) pairs; keep the phrases.
        yield [phrase for phrase, _score in keywords]
def _ensure_nltk_data(check, resource: str) -> None:
    """Run *check*; if it raises LookupError, download *resource* and retry."""
    try:
        check()
    except LookupError:
        nltk.download(resource, quiet = True)
        check()


def preprocess_text(text: str) -> str:
    """Normalize *text* for keyword extraction.

    Lowercases, strips punctuation (hyphens and word characters are kept),
    keeps only nouns and adjectives, removes English stop words, and
    lemmatizes what remains.

    Args:
        text: Raw input document.

    Returns:
        A single space-joined string of the surviving lemmatized tokens.
    """
    # Used for stopwords removal.
    _ensure_nltk_data(stopwords.ensure_loaded, "stopwords")
    # Used in WordNetLemmatizer.
    _ensure_nltk_data(wordnet.ensure_loaded, "wordnet")
    # Used in word_tokenize.
    _ensure_nltk_data(lambda: nltk.data.find("tokenizers/punkt"), "punkt")
    # Used in pos_tag — the original code never ensured this resource, so a
    # fresh environment raised LookupError at the pos_tag call below.
    _ensure_nltk_data(
        lambda: nltk.data.find("taggers/averaged_perceptron_tagger"),
        "averaged_perceptron_tagger",
    )

    text = text.lower()
    # Remove punctuation (keeps \w, whitespace, and hyphens)
    text = re.sub(r"[^\w\s-]", "", text)
    # Tokenize the text
    tokens = word_tokenize(text)
    # Keep proper nouns, common nouns, and adjectives only
    tagged_tokens = pos_tag(tokens)
    tokens = [word for word, tag in tagged_tokens if tag in ("NNP", "NN", "JJ")]
    # Remove stop words
    stop_words = set(stopwords.words("english"))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize the text
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Join tokens back to a single string
    return " ".join(tokens)
# def clean_text(text: str):
# text = re.sub(r"[ \t]{2,}|[\n\r]", " ", text.lower())
# # text = re.sub(r"[`\"'.,;!?…]", "", text).strip()
# return text
# def remove_stop_words(text: str):
# try:
# stopwords.ensure_loaded()
# except LookupError:
# download("stopwords")
# stopwords.ensure_loaded()
# stop_words = set(stopwords.words("english"))
# text = text.split()
# text = [word for word in text if not word in stop_words]
# return " ".join(text)