fix: download nltk files when needed
This commit is contained in:
parent
bdd664a2aa
commit
e58251b00c
8 changed files with 54 additions and 40 deletions
|
|
@ -5,7 +5,6 @@ import dlt
|
|||
import duckdb
|
||||
import cognee.modules.ingestion as ingestion
|
||||
from cognee.infrastructure import infrastructure_config
|
||||
from cognee.infrastructure.files import get_file_metadata
|
||||
from cognee.infrastructure.files.storage import LocalStorage
|
||||
from cognee.modules.discovery import discover_directory_datasets
|
||||
|
||||
|
|
@ -85,7 +84,7 @@ async def add_files(file_paths: List[str], dataset_name: str):
|
|||
|
||||
data_id = ingestion.identify(classified_data)
|
||||
|
||||
file_metadata = get_file_metadata(classified_data.get_data())
|
||||
file_metadata = classified_data.get_metadata()
|
||||
|
||||
yield {
|
||||
"id": data_id,
|
||||
|
|
|
|||
|
|
@ -1,13 +1,11 @@
|
|||
import nltk
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from cognee.utils import extract_pos_tags
|
||||
|
||||
def extract_keywords(text: str) -> list[str]:
|
||||
if len(text) == 0:
|
||||
raise ValueError("extract_keywords cannot extract keywords from empty text.")
|
||||
|
||||
tokens = nltk.word_tokenize(text)
|
||||
|
||||
tags = nltk.pos_tag(tokens)
|
||||
tags = extract_pos_tags(text)
|
||||
nouns = [word for (word, tag) in tags if tag == "NN"]
|
||||
|
||||
vectorizer = TfidfVectorizer()
|
||||
|
|
|
|||
|
|
@ -56,9 +56,9 @@ async def add_cognitive_layer_graphs(
|
|||
dict(relationship_name = "contains"),
|
||||
))
|
||||
|
||||
pos_tags = await extract_pos_tags(node.entity_description)
|
||||
named_entities = await extract_named_entities(node.entity_description)
|
||||
sentiment = await extract_sentiment_vader(node.entity_description)
|
||||
pos_tags = extract_pos_tags(node.entity_description)
|
||||
named_entities = extract_named_entities(node.entity_description)
|
||||
sentiment = extract_sentiment_vader(node.entity_description)
|
||||
|
||||
graph_nodes.append((
|
||||
node_id,
|
||||
|
|
|
|||
|
|
@ -13,9 +13,14 @@ class BinaryData(IngestionData):
|
|||
self.data = data
|
||||
|
||||
def get_identifier(self):
|
||||
metadata = self.get_metadata()
|
||||
|
||||
return metadata["mime_type"] + "_" + "|".join(metadata["keywords"])
|
||||
|
||||
def get_metadata(self):
|
||||
self.ensure_metadata()
|
||||
|
||||
return self.metadata["mime_type"] + "_" + "|".join(self.metadata["keywords"])
|
||||
return self.metadata
|
||||
|
||||
def ensure_metadata(self):
|
||||
if self.metadata is None:
|
||||
|
|
|
|||
|
|
@ -8,3 +8,6 @@ class IngestionData(Protocol):
|
|||
|
||||
def get_identifier(self):
|
||||
raise NotImplementedError()
|
||||
|
||||
def get_metadata(self):
|
||||
raise NotImplementedError()
|
||||
|
|
|
|||
|
|
@ -7,14 +7,24 @@ def create_text_data(data: str):
|
|||
|
||||
class TextData(IngestionData):
|
||||
data: str = None
|
||||
metadata = None
|
||||
|
||||
def __init__(self, data: BinaryIO):
|
||||
self.data = data
|
||||
|
||||
def get_identifier(self):
|
||||
keywords = extract_keywords(self.data)
|
||||
keywords = self.get_metadata()["keywords"]
|
||||
|
||||
return "text/plain" + "_" + "|".join(keywords)
|
||||
|
||||
def get_metadata(self):
|
||||
self.ensure_metadata()
|
||||
|
||||
return self.metadata
|
||||
|
||||
def ensure_metadata(self):
|
||||
if self.metadata is None:
|
||||
self.metadata = dict(keywords = extract_keywords(self.data))
|
||||
|
||||
def get_data(self):
|
||||
return self.data
|
||||
|
|
|
|||
|
|
@ -7,13 +7,11 @@ import numpy as np
|
|||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
import tiktoken
|
||||
from nltk.sentiment import SentimentIntensityAnalyzer
|
||||
import nltk
|
||||
from nltk.tokenize import word_tokenize
|
||||
from nltk.tag import pos_tag
|
||||
from nltk.chunk import ne_chunk
|
||||
from cognee.config import Config
|
||||
|
||||
config = Config()
|
||||
config.load()
|
||||
|
||||
def get_document_names(doc_input):
|
||||
"""
|
||||
|
|
@ -93,8 +91,6 @@ def trim_text_to_max_tokens(text: str, max_tokens: int, encoding_name: str) -> s
|
|||
return trimmed_text
|
||||
|
||||
|
||||
|
||||
|
||||
def format_dict(d):
|
||||
"""Format a dictionary as a string."""
|
||||
# Initialize an empty list to store formatted items
|
||||
|
|
@ -117,9 +113,6 @@ def format_dict(d):
|
|||
return formatted_string
|
||||
|
||||
|
||||
config = Config()
|
||||
config.load()
|
||||
|
||||
def generate_color_palette(unique_layers):
|
||||
colormap = plt.cm.get_cmap("viridis", len(unique_layers))
|
||||
colors = [colormap(i) for i in range(len(unique_layers))]
|
||||
|
|
@ -140,8 +133,8 @@ def prepare_nodes(graph, include_size=False):
|
|||
nodes_data = []
|
||||
for node in graph.nodes:
|
||||
node_info = graph.nodes[node]
|
||||
description = node_info.get('layer_description', {}).get('layer', 'Default Layer') if isinstance(
|
||||
node_info.get('layer_description'), dict) else node_info.get('layer_description', 'Default Layer')
|
||||
description = node_info.get("layer_description", {}).get("layer", "Default Layer") if isinstance(
|
||||
node_info.get("layer_description"), dict) else node_info.get("layer_description", "Default Layer")
|
||||
# description = node_info['layer_description']['layer'] if isinstance(node_info.get('layer_description'), dict) and 'layer' in node_info['layer_description'] else node_info.get('layer_description', node)
|
||||
# if isinstance(node_info.get('layer_description'), dict) and 'layer' in node_info.get('layer_description'):
|
||||
# description = node_info['layer_description']['layer']
|
||||
|
|
@ -161,8 +154,6 @@ def prepare_nodes(graph, include_size=False):
|
|||
return pd.DataFrame(nodes_data)
|
||||
|
||||
|
||||
|
||||
|
||||
async def render_graph(graph, include_nodes=False, include_color=False, include_size=False, include_labels=False):
|
||||
await register_graphistry()
|
||||
edges = prepare_edges(graph)
|
||||
|
|
@ -174,7 +165,7 @@ async def render_graph(graph, include_nodes=False, include_color=False, include_
|
|||
|
||||
|
||||
if include_size:
|
||||
plotter = plotter.bind(point_size='size')
|
||||
plotter = plotter.bind(point_size="size")
|
||||
|
||||
|
||||
if include_color:
|
||||
|
|
@ -185,7 +176,7 @@ async def render_graph(graph, include_nodes=False, include_color=False, include_
|
|||
|
||||
|
||||
if include_labels:
|
||||
plotter = plotter.bind(point_label='layer_description')
|
||||
plotter = plotter.bind(point_label = "layer_description")
|
||||
|
||||
|
||||
|
||||
|
|
@ -199,14 +190,23 @@ def sanitize_df(df):
|
|||
return df.replace([np.inf, -np.inf, np.nan], None)
|
||||
|
||||
|
||||
|
||||
# # Ensure that the necessary NLTK resources are downloaded
|
||||
# nltk.download('maxent_ne_chunker')
|
||||
# nltk.download('words')
|
||||
def get_entities(tagged_tokens):
|
||||
nltk.download("maxent_ne_chunker")
|
||||
from nltk.chunk import ne_chunk
|
||||
return ne_chunk(tagged_tokens)
|
||||
|
||||
|
||||
async def extract_pos_tags(sentence):
|
||||
def extract_pos_tags(sentence):
|
||||
"""Extract Part-of-Speech (POS) tags for words in a sentence."""
|
||||
|
||||
# Ensure that the necessary NLTK resources are downloaded
|
||||
nltk.download("words")
|
||||
nltk.download("punkt")
|
||||
nltk.download("averaged_perceptron_tagger")
|
||||
|
||||
from nltk.tag import pos_tag
|
||||
from nltk.tokenize import word_tokenize
|
||||
|
||||
# Tokenize the sentence into words
|
||||
tokens = word_tokenize(sentence)
|
||||
|
||||
|
|
@ -216,20 +216,18 @@ async def extract_pos_tags(sentence):
|
|||
return pos_tags
|
||||
|
||||
|
||||
async def extract_named_entities(sentence):
|
||||
def extract_named_entities(sentence):
|
||||
"""Extract Named Entities from a sentence."""
|
||||
# Tokenize the sentence into words
|
||||
tokens = word_tokenize(sentence)
|
||||
|
||||
# Perform POS tagging on the tokenized sentence
|
||||
tagged = pos_tag(tokens)
|
||||
tagged_tokens = extract_pos_tags(sentence)
|
||||
|
||||
# Perform Named Entity Recognition (NER) on the tagged tokens
|
||||
entities = ne_chunk(tagged)
|
||||
entities = get_entities(tagged_tokens)
|
||||
|
||||
return entities
|
||||
|
||||
async def extract_sentiment_vader(text):
|
||||
|
||||
def extract_sentiment_vader(text):
|
||||
"""
|
||||
Analyzes the sentiment of a given text using the VADER Sentiment Intensity Analyzer.
|
||||
|
||||
|
|
@ -239,6 +237,7 @@ async def extract_sentiment_vader(text):
|
|||
Returns:
|
||||
dict: A dictionary containing the polarity scores for the text.
|
||||
"""
|
||||
from nltk.sentiment import SentimentIntensityAnalyzer
|
||||
|
||||
nltk.download("vader_lexicon")
|
||||
|
||||
|
|
|
|||
|
|
@ -151,10 +151,10 @@
|
|||
"graph_client = await get_graph_client(GraphDBType.NETWORKX)\n",
|
||||
"graph = graph_client.graph\n",
|
||||
"\n",
|
||||
"results = await search_similarity(\"At My Window was released by which American singer-songwriter?\", graph)\n",
|
||||
"results = await search_similarity(\"Who is Ernie Grunwald?\", graph)\n",
|
||||
"\n",
|
||||
"for result in results:\n",
|
||||
" print(\"At My Window\" in result)\n",
|
||||
" print(\"Ernie Grunwald\" in result)\n",
|
||||
" print(result)"
|
||||
]
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue