fix: remove keybert import (#120)
parent 6ef4bbe862
commit 86c7aa23a8
2 changed files with 0 additions and 86 deletions
@@ -1 +0,0 @@
-from .extract_topics import extract_topics_keybert

@@ -1,85 +0,0 @@
-import re
-import nltk
-from nltk.tag import pos_tag
-from nltk.corpus import stopwords, wordnet
-from nltk.tokenize import word_tokenize
-from nltk.stem import WordNetLemmatizer
-
-def extract_topics_keybert(texts: list[str]):
-    from keybert import KeyBERT
-
-    kw_model = KeyBERT()
-
-    for text in texts:
-        topics = kw_model.extract_keywords(
-            preprocess_text(text),
-            keyphrase_ngram_range = (1, 2),
-            top_n = 3,
-            # use_mmr = True,
-            # diversity = 0.9,
-        )
-        yield [topic[0] for topic in topics]
-
-def preprocess_text(text: str):
-    try:
-        # Used for stopwords removal.
-        stopwords.ensure_loaded()
-    except LookupError:
-        nltk.download("stopwords", quiet = True)
-        stopwords.ensure_loaded()
-
-    try:
-        # Used in WordNetLemmatizer.
-        wordnet.ensure_loaded()
-    except LookupError:
-        nltk.download("wordnet", quiet = True)
-        wordnet.ensure_loaded()
-
-    try:
-        # Used in word_tokenize.
-        nltk.data.find("tokenizers/punkt")
-    except LookupError:
-        nltk.download("punkt", quiet = True)
-
-    text = text.lower()
-
-    # Remove punctuation
-    text = re.sub(r"[^\w\s-]", "", text)
-
-    # Tokenize the text
-    tokens = word_tokenize(text)
-
-    tagged_tokens = pos_tag(tokens)
-    tokens = [word for word, tag in tagged_tokens if tag in ["NNP", "NN", "JJ"]]
-
-    # Remove stop words
-    stop_words = set(stopwords.words("english"))
-    tokens = [word for word in tokens if word not in stop_words]
-
-    # Lemmatize the text
-    lemmatizer = WordNetLemmatizer()
-    tokens = [lemmatizer.lemmatize(word) for word in tokens]
-
-    # Join tokens back to a single string
-    processed_text = " ".join(tokens)
-
-    return processed_text
-
-
-# def clean_text(text: str):
-#     text = re.sub(r"[ \t]{2,}|[\n\r]", " ", text.lower())
-#     # text = re.sub(r"[`\"'.,;!?…]", "", text).strip()
-#     return text
-
-# def remove_stop_words(text: str):
-#     try:
-#         stopwords.ensure_loaded()
-#     except LookupError:
-#         download("stopwords")
-#         stopwords.ensure_loaded()
-
-#     stop_words = set(stopwords.words("english"))
-#     text = text.split()
-#     text = [word for word in text if not word in stop_words]
-#     return " ".join(text)
-