removed extra things
parent 00c7dfae49
commit 76b8e16bcb
3 changed files with 58 additions and 56 deletions
@@ -1,48 +1,46 @@
-from sklearn.feature_extraction.text import TfidfVectorizer
-from cognee.infrastructure.data.exceptions.exceptions import KeywordExtractionError
-from cognee.shared.utils import extract_pos_tags


-def extract_keywords(text: str) -> list[str]:
-    """
-    Extract keywords from the provided text string.
+# def extract_keywords(text: str) -> list[str]:
+#     """
+#     Extract keywords from the provided text string.

-    This function raises a KeywordExtractionError if the input text is empty. It processes the
-    text to extract parts of speech, focusing on nouns, and uses TF-IDF to identify the most
-    relevant keywords based on their frequency. The function returns a list of up to 15
-    keywords, each having more than 3 characters.
+#     This function raises a KeywordExtractionError if the input text is empty. It processes the
+#     text to extract parts of speech, focusing on nouns, and uses TF-IDF to identify the most
+#     relevant keywords based on their frequency. The function returns a list of up to 15
+#     keywords, each having more than 3 characters.

-    Parameters:
-    -----------
+#     Parameters:
+#     -----------

-    - text (str): The input text from which to extract keywords.
+#     - text (str): The input text from which to extract keywords.

-    Returns:
-    --------
+#     Returns:
+#     --------

-    - list[str]: A list of keywords extracted from the text, containing up to 15 nouns
-      with more than 3 characters.
-    """
-    if len(text) == 0:
-        raise KeywordExtractionError()
+#     - list[str]: A list of keywords extracted from the text, containing up to 15 nouns
+#       with more than 3 characters.
+#     """
+#     if len(text) == 0:
+#         raise KeywordExtractionError()

-    tags = extract_pos_tags(text)
-    nouns = [word for (word, tag) in tags if tag == "NN"]
+#     tags = extract_pos_tags(text)
+#     nouns = [word for (word, tag) in tags if tag == "NN"]

-    vectorizer = TfidfVectorizer()
-    tfidf = vectorizer.fit_transform(nouns)
+#     vectorizer = TfidfVectorizer()
+#     tfidf = vectorizer.fit_transform(nouns)

-    top_nouns = sorted(
-        vectorizer.vocabulary_, key=lambda x: tfidf[0, vectorizer.vocabulary_[x]], reverse=True
-    )
+#     top_nouns = sorted(
+#         vectorizer.vocabulary_, key=lambda x: tfidf[0, vectorizer.vocabulary_[x]], reverse=True
+#     )

-    keywords = []
+#     keywords = []

-    for word in top_nouns:
-        if len(word) > 3:
-            keywords.append(word)
-            if len(keywords) >= 15:
-                break
+#     for word in top_nouns:
+#         if len(word) > 3:
+#             keywords.append(word)
+#             if len(keywords) >= 15:
+#                 break

-    return keywords
+#     return keywords
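For context on what is being retired: the removed function POS-tags the input, keeps singular nouns, and ranks them by TF-IDF. A minimal standalone sketch of that technique follows. The function name and the single-document fit are my assumptions; the original passed the noun list straight to fit_transform, which made each noun its own document and then read only row 0 of the matrix, so only the first noun's row was ever scored.

# Hedged sketch, not the commit's code: the same TF-IDF-over-nouns idea,
# with the nouns joined into one document so that row 0 covers every term.
# Requires nltk and scikit-learn, which this commit drops from core deps.
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer


def extract_keywords_sketch(text: str, limit: int = 15) -> list[str]:
    if not text:
        raise ValueError("Cannot extract keywords from empty text.")

    # Resources used by word_tokenize and pos_tag below.
    nltk.download("punkt", quiet=True)
    nltk.download("averaged_perceptron_tagger", quiet=True)

    # Keep singular nouns ("NN"), as the removed function did.
    tags = nltk.pos_tag(nltk.word_tokenize(text))
    nouns = [word for (word, tag) in tags if tag == "NN"]
    if not nouns:
        return []

    # Fit TF-IDF on a single document built from the nouns, then rank the
    # vocabulary by score; with one document this reduces to term frequency.
    vectorizer = TfidfVectorizer()
    tfidf = vectorizer.fit_transform([" ".join(nouns)])
    ranked = sorted(
        vectorizer.vocabulary_,
        key=lambda word: tfidf[0, vectorizer.vocabulary_[word]],
        reverse=True,
    )

    # As in the original: words longer than 3 characters, at most `limit`.
    return [word for word in ranked if len(word) > 3][:limit]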
@@ -18,34 +18,41 @@ proxy_url = "https://test.prometh.ai"


 def get_entities(tagged_tokens):
-    import nltk
-
-    nltk.download("maxent_ne_chunker", quiet=True)
-
-    from nltk.chunk import ne_chunk
-
-    return ne_chunk(tagged_tokens)
+    try:
+        import nltk
+        nltk.download("maxent_ne_chunker", quiet=True)
+        from nltk.chunk import ne_chunk
+        return ne_chunk(tagged_tokens)
+    except ImportError:
+        raise ImportError(
+            "NLTK is required for entity extraction. Install with 'pip install cognee[nlp]' to use this feature."
+        )


 def extract_pos_tags(sentence):
     """Extract Part-of-Speech (POS) tags for words in a sentence."""
-    import nltk
+    try:
+        import nltk

-    # Ensure that the necessary NLTK resources are downloaded
-    nltk.download("words", quiet=True)
-    nltk.download("punkt", quiet=True)
-    nltk.download("averaged_perceptron_tagger", quiet=True)
+        # Ensure that the necessary NLTK resources are downloaded
+        nltk.download("words", quiet=True)
+        nltk.download("punkt", quiet=True)
+        nltk.download("averaged_perceptron_tagger", quiet=True)

-    from nltk.tag import pos_tag
-    from nltk.tokenize import word_tokenize
+        from nltk.tag import pos_tag
+        from nltk.tokenize import word_tokenize

-    # Tokenize the sentence into words
-    tokens = word_tokenize(sentence)
+        # Tokenize the sentence into words
+        tokens = word_tokenize(sentence)

-    # Tag each word with its corresponding POS tag
-    pos_tags = pos_tag(tokens)
+        # Tag each word with its corresponding POS tag
+        pos_tags = pos_tag(tokens)

-    return pos_tags
+        return pos_tags
+    except ImportError:
+        raise ImportError(
+            "NLTK is required for POS tagging. Install with 'pip install cognee[nlp]' to use this feature."
+        )


 def get_anonymous_id():
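Both wrappers turn a missing import into an actionable error rather than a bare ModuleNotFoundError at import time. A sketch of how a downstream caller might degrade gracefully; the sample sentence and the print-based fallback are illustrative, not part of the commit:

# Hedged usage sketch for the now-optional NLTK path in cognee.shared.utils.
from cognee.shared.utils import extract_pos_tags

try:
    tags = extract_pos_tags("Cognee turns documents into knowledge graphs.")
    print(tags[:5])  # e.g. [('Cognee', 'NNP'), ('turns', 'VBZ'), ...]
except ImportError as error:
    # Raised by the new wrapper when nltk is absent; the message points
    # users at: pip install cognee[nlp]
    print(f"POS tagging unavailable: {error}")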
@@ -25,7 +25,6 @@ dependencies = [
     "pydantic>=2.10.5,<3.0.0",
     "pydantic-settings>=2.2.1,<3",
     "typing_extensions>=4.12.2,<5.0.0",
-    "nltk>=3.9.1,<4.0.0",
     "numpy>=1.26.4, <=4.0.0",
     "sqlalchemy>=2.0.39,<3.0.0",
     "aiosqlite>=0.20.0,<1.0.0",
@@ -40,14 +39,11 @@ dependencies = [
    "jinja2>=3.1.3,<4",
    "lancedb>=0.24.0,<1.0.0",
    "alembic>=1.13.3,<2",
    "pre-commit>=4.0.1,<5",
    "scikit-learn>=1.6.1,<2",
    "limits>=4.4.1,<5",
    "fastapi>=0.115.7,<1.0.0",
    "python-multipart>=0.0.20,<1.0.0",
    "fastapi-users[sqlalchemy]>=14.0.1,<15.0.0",
    "structlog>=25.2.0,<26",
    "pympler>=1.1,<2.0.0",
    "pylance>=0.22.0,<1.0.0",
    "kuzu (==0.11.0)",
    "python-magic-bin<0.5 ; platform_system == 'Windows'", # Only needed for Windows
@@ -135,6 +131,7 @@ dev = [
 debug = ["debugpy>=1.8.9,<2.0.0"]
 visualization = ["networkx>=3.4.2,<4", "matplotlib>=3.8.3,<4"]
 monitoring = ["sentry-sdk[fastapi]>=2.9.0,<3", "langfuse>=2.32.0,<3"]
+nlp = ["nltk>=3.9.1,<4.0.0"]

 [project.urls]
 Homepage = "https://www.cognee.ai"
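With nltk demoted to the new nlp extra, a base install no longer ships it; users opt in via pip install cognee[nlp]. Code that must run either way can gate NLP-only paths on module availability. A minimal sketch, where the NLTK_AVAILABLE flag is illustrative rather than part of this commit:

# Check for the optional dependency without importing it.
from importlib.util import find_spec

NLTK_AVAILABLE = find_spec("nltk") is not None

if NLTK_AVAILABLE:
    from cognee.shared.utils import extract_pos_tags, get_entities
else:
    extract_pos_tags = get_entities = None  # callers must check availability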