removed extra things
This commit is contained in:
parent
00c7dfae49
commit
76b8e16bcb
3 changed files with 58 additions and 56 deletions
|
|
@ -1,48 +1,46 @@
|
||||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
||||||
|
|
||||||
from cognee.infrastructure.data.exceptions.exceptions import KeywordExtractionError
|
from cognee.infrastructure.data.exceptions.exceptions import KeywordExtractionError
|
||||||
from cognee.shared.utils import extract_pos_tags
|
|
||||||
|
|
||||||
|
|
||||||
def extract_keywords(text: str) -> list[str]:
|
# def extract_keywords(text: str) -> list[str]:
|
||||||
"""
|
# """
|
||||||
Extract keywords from the provided text string.
|
# Extract keywords from the provided text string.
|
||||||
|
|
||||||
This function raises a KeywordExtractionError if the input text is empty. It processes the
|
# This function raises a KeywordExtractionError if the input text is empty. It processes the
|
||||||
text to extract parts of speech, focusing on nouns, and uses TF-IDF to identify the most
|
# text to extract parts of speech, focusing on nouns, and uses TF-IDF to identify the most
|
||||||
relevant keywords based on their frequency. The function returns a list of up to 15
|
# relevant keywords based on their frequency. The function returns a list of up to 15
|
||||||
keywords, each having more than 3 characters.
|
# keywords, each having more than 3 characters.
|
||||||
|
|
||||||
Parameters:
|
# Parameters:
|
||||||
-----------
|
# -----------
|
||||||
|
|
||||||
- text (str): The input text from which to extract keywords.
|
# - text (str): The input text from which to extract keywords.
|
||||||
|
|
||||||
Returns:
|
# Returns:
|
||||||
--------
|
# --------
|
||||||
|
|
||||||
- list[str]: A list of keywords extracted from the text, containing up to 15 nouns
|
# - list[str]: A list of keywords extracted from the text, containing up to 15 nouns
|
||||||
with more than 3 characters.
|
# with more than 3 characters.
|
||||||
"""
|
# """
|
||||||
if len(text) == 0:
|
# if len(text) == 0:
|
||||||
raise KeywordExtractionError()
|
# raise KeywordExtractionError()
|
||||||
|
|
||||||
tags = extract_pos_tags(text)
|
# tags = extract_pos_tags(text)
|
||||||
nouns = [word for (word, tag) in tags if tag == "NN"]
|
# nouns = [word for (word, tag) in tags if tag == "NN"]
|
||||||
|
|
||||||
vectorizer = TfidfVectorizer()
|
# vectorizer = TfidfVectorizer()
|
||||||
tfidf = vectorizer.fit_transform(nouns)
|
# tfidf = vectorizer.fit_transform(nouns)
|
||||||
|
|
||||||
top_nouns = sorted(
|
# top_nouns = sorted(
|
||||||
vectorizer.vocabulary_, key=lambda x: tfidf[0, vectorizer.vocabulary_[x]], reverse=True
|
# vectorizer.vocabulary_, key=lambda x: tfidf[0, vectorizer.vocabulary_[x]], reverse=True
|
||||||
)
|
# )
|
||||||
|
|
||||||
keywords = []
|
# keywords = []
|
||||||
|
|
||||||
for word in top_nouns:
|
# for word in top_nouns:
|
||||||
if len(word) > 3:
|
# if len(word) > 3:
|
||||||
keywords.append(word)
|
# keywords.append(word)
|
||||||
if len(keywords) >= 15:
|
# if len(keywords) >= 15:
|
||||||
break
|
# break
|
||||||
|
|
||||||
return keywords
|
# return keywords
|
||||||
|
|
|
||||||
|
|
@ -18,34 +18,41 @@ proxy_url = "https://test.prometh.ai"
|
||||||
|
|
||||||
|
|
||||||
def get_entities(tagged_tokens):
|
def get_entities(tagged_tokens):
|
||||||
import nltk
|
try:
|
||||||
|
import nltk
|
||||||
nltk.download("maxent_ne_chunker", quiet=True)
|
nltk.download("maxent_ne_chunker", quiet=True)
|
||||||
|
from nltk.chunk import ne_chunk
|
||||||
from nltk.chunk import ne_chunk
|
return ne_chunk(tagged_tokens)
|
||||||
|
except ImportError:
|
||||||
return ne_chunk(tagged_tokens)
|
raise ImportError(
|
||||||
|
"NLTK is required for entity extraction. Install with 'pip install cognee[nlp]' to use this feature."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def extract_pos_tags(sentence):
|
def extract_pos_tags(sentence):
|
||||||
"""Extract Part-of-Speech (POS) tags for words in a sentence."""
|
"""Extract Part-of-Speech (POS) tags for words in a sentence."""
|
||||||
import nltk
|
try:
|
||||||
|
import nltk
|
||||||
|
|
||||||
# Ensure that the necessary NLTK resources are downloaded
|
# Ensure that the necessary NLTK resources are downloaded
|
||||||
nltk.download("words", quiet=True)
|
nltk.download("words", quiet=True)
|
||||||
nltk.download("punkt", quiet=True)
|
nltk.download("punkt", quiet=True)
|
||||||
nltk.download("averaged_perceptron_tagger", quiet=True)
|
nltk.download("averaged_perceptron_tagger", quiet=True)
|
||||||
|
|
||||||
from nltk.tag import pos_tag
|
from nltk.tag import pos_tag
|
||||||
from nltk.tokenize import word_tokenize
|
from nltk.tokenize import word_tokenize
|
||||||
|
|
||||||
# Tokenize the sentence into words
|
# Tokenize the sentence into words
|
||||||
tokens = word_tokenize(sentence)
|
tokens = word_tokenize(sentence)
|
||||||
|
|
||||||
# Tag each word with its corresponding POS tag
|
# Tag each word with its corresponding POS tag
|
||||||
pos_tags = pos_tag(tokens)
|
pos_tags = pos_tag(tokens)
|
||||||
|
|
||||||
return pos_tags
|
return pos_tags
|
||||||
|
except ImportError:
|
||||||
|
raise ImportError(
|
||||||
|
"NLTK is required for POS tagging. Install with 'pip install cognee[nlp]' to use this feature."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def get_anonymous_id():
|
def get_anonymous_id():
|
||||||
|
|
|
||||||
|
|
@ -25,7 +25,6 @@ dependencies = [
|
||||||
"pydantic>=2.10.5,<3.0.0",
|
"pydantic>=2.10.5,<3.0.0",
|
||||||
"pydantic-settings>=2.2.1,<3",
|
"pydantic-settings>=2.2.1,<3",
|
||||||
"typing_extensions>=4.12.2,<5.0.0",
|
"typing_extensions>=4.12.2,<5.0.0",
|
||||||
"nltk>=3.9.1,<4.0.0",
|
|
||||||
"numpy>=1.26.4, <=4.0.0",
|
"numpy>=1.26.4, <=4.0.0",
|
||||||
"sqlalchemy>=2.0.39,<3.0.0",
|
"sqlalchemy>=2.0.39,<3.0.0",
|
||||||
"aiosqlite>=0.20.0,<1.0.0",
|
"aiosqlite>=0.20.0,<1.0.0",
|
||||||
|
|
@ -40,14 +39,11 @@ dependencies = [
|
||||||
"jinja2>=3.1.3,<4",
|
"jinja2>=3.1.3,<4",
|
||||||
"lancedb>=0.24.0,<1.0.0",
|
"lancedb>=0.24.0,<1.0.0",
|
||||||
"alembic>=1.13.3,<2",
|
"alembic>=1.13.3,<2",
|
||||||
"pre-commit>=4.0.1,<5",
|
|
||||||
"scikit-learn>=1.6.1,<2",
|
|
||||||
"limits>=4.4.1,<5",
|
"limits>=4.4.1,<5",
|
||||||
"fastapi>=0.115.7,<1.0.0",
|
"fastapi>=0.115.7,<1.0.0",
|
||||||
"python-multipart>=0.0.20,<1.0.0",
|
"python-multipart>=0.0.20,<1.0.0",
|
||||||
"fastapi-users[sqlalchemy]>=14.0.1,<15.0.0",
|
"fastapi-users[sqlalchemy]>=14.0.1,<15.0.0",
|
||||||
"structlog>=25.2.0,<26",
|
"structlog>=25.2.0,<26",
|
||||||
"pympler>=1.1,<2.0.0",
|
|
||||||
"pylance>=0.22.0,<1.0.0",
|
"pylance>=0.22.0,<1.0.0",
|
||||||
"kuzu (==0.11.0)",
|
"kuzu (==0.11.0)",
|
||||||
"python-magic-bin<0.5 ; platform_system == 'Windows'", # Only needed for Windows
|
"python-magic-bin<0.5 ; platform_system == 'Windows'", # Only needed for Windows
|
||||||
|
|
@ -135,6 +131,7 @@ dev = [
|
||||||
debug = ["debugpy>=1.8.9,<2.0.0"]
|
debug = ["debugpy>=1.8.9,<2.0.0"]
|
||||||
visualization = ["networkx>=3.4.2,<4", "matplotlib>=3.8.3,<4"]
|
visualization = ["networkx>=3.4.2,<4", "matplotlib>=3.8.3,<4"]
|
||||||
monitoring = ["sentry-sdk[fastapi]>=2.9.0,<3", "langfuse>=2.32.0,<3"]
|
monitoring = ["sentry-sdk[fastapi]>=2.9.0,<3", "langfuse>=2.32.0,<3"]
|
||||||
|
nlp = ["nltk>=3.9.1,<4.0.0"]
|
||||||
|
|
||||||
[project.urls]
|
[project.urls]
|
||||||
Homepage = "https://www.cognee.ai"
|
Homepage = "https://www.cognee.ai"
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue