fix: remove keybert import (#120)
parent 6ef4bbe862
commit 86c7aa23a8
2 changed files with 0 additions and 86 deletions
@@ -1 +0,0 @@
-from .extract_topics import extract_topics_keybert

@@ -1,85 +0,0 @@
-import re
-import nltk
-from nltk.tag import pos_tag
-from nltk.corpus import stopwords, wordnet
-from nltk.tokenize import word_tokenize
-from nltk.stem import WordNetLemmatizer
-
-def extract_topics_keybert(texts: list[str]):
-    from keybert import KeyBERT
-
-    kw_model = KeyBERT()
-
-    for text in texts:
-        topics = kw_model.extract_keywords(
-            preprocess_text(text),
-            keyphrase_ngram_range = (1, 2),
-            top_n = 3,
-            # use_mmr = True,
-            # diversity = 0.9,
-        )
-        yield [topic[0] for topic in topics]
-
-def preprocess_text(text: str):
-    try:
-        # Used for stopwords removal.
-        stopwords.ensure_loaded()
-    except LookupError:
-        nltk.download("stopwords", quiet = True)
-        stopwords.ensure_loaded()
-
-    try:
-        # Used in WordNetLemmatizer.
-        wordnet.ensure_loaded()
-    except LookupError:
-        nltk.download("wordnet", quiet = True)
-        wordnet.ensure_loaded()
-
-    try:
-        # Used in word_tokenize.
-        nltk.data.find("tokenizers/punkt")
-    except LookupError:
-        nltk.download("punkt", quiet = True)
-
-    text = text.lower()
-
-    # Remove punctuation
-    text = re.sub(r"[^\w\s-]", "", text)
-
-    # Tokenize the text
-    tokens = word_tokenize(text)
-
-    tagged_tokens = pos_tag(tokens)
-    tokens = [word for word, tag in tagged_tokens if tag in ["NNP", "NN", "JJ"]]
-
-    # Remove stop words
-    stop_words = set(stopwords.words("english"))
-    tokens = [word for word in tokens if word not in stop_words]
-
-    # Lemmatize the text
-    lemmatizer = WordNetLemmatizer()
-    tokens = [lemmatizer.lemmatize(word) for word in tokens]
-
-    # Join tokens back to a single string
-    processed_text = " ".join(tokens)
-
-    return processed_text
-
-
-# def clean_text(text: str):
-#     text = re.sub(r"[ \t]{2,}|[\n\r]", " ", text.lower())
-#     # text = re.sub(r"[`\"'.,;!?…]", "", text).strip()
-#     return text
-
-# def remove_stop_words(text: str):
-#     try:
-#         stopwords.ensure_loaded()
-#     except LookupError:
-#         download("stopwords")
-#         stopwords.ensure_loaded()
-
-#     stop_words = set(stopwords.words("english"))
-#     text = text.split()
-#     text = [word for word in text if not word in stop_words]
-#     return " ".join(text)
-