From e58251b00c4c6ae58826b2e156a1217e37361b94 Mon Sep 17 00:00:00 2001 From: Boris Arzentar Date: Sun, 21 Apr 2024 22:03:18 +0200 Subject: [PATCH] fix: download nltk files when needed --- cognee/api/v1/add/add.py | 3 +- .../data/utils/extract_keywords.py | 6 +-- .../graph/add_cognitive_layer_graphs.py | 6 +-- .../ingestion/data_types/BinaryData.py | 7 ++- .../ingestion/data_types/IngestionData.py | 3 ++ .../modules/ingestion/data_types/TextData.py | 12 ++++- cognee/utils.py | 53 +++++++++---------- notebooks/full_run.ipynb | 4 +- 8 files changed, 54 insertions(+), 40 deletions(-) diff --git a/cognee/api/v1/add/add.py b/cognee/api/v1/add/add.py index 5e6c85311..fbb2f54c1 100644 --- a/cognee/api/v1/add/add.py +++ b/cognee/api/v1/add/add.py @@ -5,7 +5,6 @@ import dlt import duckdb import cognee.modules.ingestion as ingestion from cognee.infrastructure import infrastructure_config -from cognee.infrastructure.files import get_file_metadata from cognee.infrastructure.files.storage import LocalStorage from cognee.modules.discovery import discover_directory_datasets @@ -85,7 +84,7 @@ async def add_files(file_paths: List[str], dataset_name: str): data_id = ingestion.identify(classified_data) - file_metadata = get_file_metadata(classified_data.get_data()) + file_metadata = classified_data.get_metadata() yield { "id": data_id, diff --git a/cognee/infrastructure/data/utils/extract_keywords.py b/cognee/infrastructure/data/utils/extract_keywords.py index c69f590e1..76940fdfc 100644 --- a/cognee/infrastructure/data/utils/extract_keywords.py +++ b/cognee/infrastructure/data/utils/extract_keywords.py @@ -1,13 +1,11 @@ -import nltk from sklearn.feature_extraction.text import TfidfVectorizer +from cognee.utils import extract_pos_tags def extract_keywords(text: str) -> list[str]: if len(text) == 0: raise ValueError("extract_keywords cannot extract keywords from empty text.") - tokens = nltk.word_tokenize(text) - - tags = nltk.pos_tag(tokens) + tags = extract_pos_tags(text) nouns = [word for (word, tag) in tags if tag == "NN"] vectorizer = TfidfVectorizer() diff --git a/cognee/modules/cognify/graph/add_cognitive_layer_graphs.py b/cognee/modules/cognify/graph/add_cognitive_layer_graphs.py index 806b26ee3..11cc83a19 100644 --- a/cognee/modules/cognify/graph/add_cognitive_layer_graphs.py +++ b/cognee/modules/cognify/graph/add_cognitive_layer_graphs.py @@ -56,9 +56,9 @@ async def add_cognitive_layer_graphs( dict(relationship_name = "contains"), )) - pos_tags = await extract_pos_tags(node.entity_description) - named_entities = await extract_named_entities(node.entity_description) - sentiment = await extract_sentiment_vader(node.entity_description) + pos_tags = extract_pos_tags(node.entity_description) + named_entities = extract_named_entities(node.entity_description) + sentiment = extract_sentiment_vader(node.entity_description) graph_nodes.append(( node_id, diff --git a/cognee/modules/ingestion/data_types/BinaryData.py b/cognee/modules/ingestion/data_types/BinaryData.py index 82afb6dd1..72e0be3f0 100644 --- a/cognee/modules/ingestion/data_types/BinaryData.py +++ b/cognee/modules/ingestion/data_types/BinaryData.py @@ -13,9 +13,14 @@ class BinaryData(IngestionData): self.data = data def get_identifier(self): + metadata = self.get_metadata() + + return metadata["mime_type"] + "_" + "|".join(metadata["keywords"]) + + def get_metadata(self): self.ensure_metadata() - return self.metadata["mime_type"] + "_" + "|".join(self.metadata["keywords"]) + return self.metadata def ensure_metadata(self): if self.metadata is None: diff --git a/cognee/modules/ingestion/data_types/IngestionData.py b/cognee/modules/ingestion/data_types/IngestionData.py index 147dbda4e..ecd240234 100644 --- a/cognee/modules/ingestion/data_types/IngestionData.py +++ b/cognee/modules/ingestion/data_types/IngestionData.py @@ -8,3 +8,6 @@ class IngestionData(Protocol): def get_identifier(self): raise NotImplementedError() + + def get_metadata(self): + raise NotImplementedError() diff --git a/cognee/modules/ingestion/data_types/TextData.py b/cognee/modules/ingestion/data_types/TextData.py index 8b152e986..e04376051 100644 --- a/cognee/modules/ingestion/data_types/TextData.py +++ b/cognee/modules/ingestion/data_types/TextData.py @@ -7,14 +7,24 @@ def create_text_data(data: str): class TextData(IngestionData): data: str = None + metadata = None def __init__(self, data: BinaryIO): self.data = data def get_identifier(self): - keywords = extract_keywords(self.data) + keywords = self.get_metadata()["keywords"] return "text/plain" + "_" + "|".join(keywords) + def get_metadata(self): + self.ensure_metadata() + + return self.metadata + + def ensure_metadata(self): + if self.metadata is None: + self.metadata = dict(keywords = extract_keywords(self.data)) + def get_data(self): return self.data diff --git a/cognee/utils.py b/cognee/utils.py index 42409d42b..a9eec6318 100644 --- a/cognee/utils.py +++ b/cognee/utils.py @@ -7,13 +7,11 @@ import numpy as np import pandas as pd import matplotlib.pyplot as plt import tiktoken -from nltk.sentiment import SentimentIntensityAnalyzer import nltk -from nltk.tokenize import word_tokenize -from nltk.tag import pos_tag -from nltk.chunk import ne_chunk from cognee.config import Config +config = Config() +config.load() def get_document_names(doc_input): """ @@ -93,8 +91,6 @@ def trim_text_to_max_tokens(text: str, max_tokens: int, encoding_name: str) -> s return trimmed_text - - def format_dict(d): """Format a dictionary as a string.""" # Initialize an empty list to store formatted items @@ -117,9 +113,6 @@ def format_dict(d): return formatted_string -config = Config() -config.load() - def generate_color_palette(unique_layers): colormap = plt.cm.get_cmap("viridis", len(unique_layers)) colors = [colormap(i) for i in range(len(unique_layers))] @@ -140,8 +133,8 @@ def prepare_nodes(graph, include_size=False): nodes_data = [] for node in graph.nodes: node_info = graph.nodes[node] - description = node_info.get('layer_description', {}).get('layer', 'Default Layer') if isinstance( - node_info.get('layer_description'), dict) else node_info.get('layer_description', 'Default Layer') + description = node_info.get("layer_description", {}).get("layer", "Default Layer") if isinstance( + node_info.get("layer_description"), dict) else node_info.get("layer_description", "Default Layer") # description = node_info['layer_description']['layer'] if isinstance(node_info.get('layer_description'), dict) and 'layer' in node_info['layer_description'] else node_info.get('layer_description', node) # if isinstance(node_info.get('layer_description'), dict) and 'layer' in node_info.get('layer_description'): # description = node_info['layer_description']['layer'] @@ -161,8 +154,6 @@ def prepare_nodes(graph, include_size=False): return pd.DataFrame(nodes_data) - - async def render_graph(graph, include_nodes=False, include_color=False, include_size=False, include_labels=False): await register_graphistry() edges = prepare_edges(graph) @@ -174,7 +165,7 @@ async def render_graph(graph, include_nodes=False, include_color=False, include_ if include_size: - plotter = plotter.bind(point_size='size') + plotter = plotter.bind(point_size="size") if include_color: @@ -185,7 +176,7 @@ async def render_graph(graph, include_nodes=False, include_color=False, include_ if include_labels: - plotter = plotter.bind(point_label='layer_description') + plotter = plotter.bind(point_label = "layer_description") @@ -199,14 +190,23 @@ def sanitize_df(df): return df.replace([np.inf, -np.inf, np.nan], None) - -# # Ensure that the necessary NLTK resources are downloaded -# nltk.download('maxent_ne_chunker') -# nltk.download('words') +def get_entities(tagged_tokens): + nltk.download("maxent_ne_chunker") + from nltk.chunk import ne_chunk + return ne_chunk(tagged_tokens) -async def extract_pos_tags(sentence): +def extract_pos_tags(sentence): """Extract Part-of-Speech (POS) tags for words in a sentence.""" + + # Ensure that the necessary NLTK resources are downloaded + nltk.download("words") + nltk.download("punkt") + nltk.download("averaged_perceptron_tagger") + + from nltk.tag import pos_tag + from nltk.tokenize import word_tokenize + # Tokenize the sentence into words tokens = word_tokenize(sentence) @@ -216,20 +216,18 @@ async def extract_pos_tags(sentence): return pos_tags -async def extract_named_entities(sentence): +def extract_named_entities(sentence): """Extract Named Entities from a sentence.""" # Tokenize the sentence into words - tokens = word_tokenize(sentence) - - # Perform POS tagging on the tokenized sentence - tagged = pos_tag(tokens) + tagged_tokens = extract_pos_tags(sentence) # Perform Named Entity Recognition (NER) on the tagged tokens - entities = ne_chunk(tagged) + entities = get_entities(tagged_tokens) return entities -async def extract_sentiment_vader(text): + +def extract_sentiment_vader(text): """ Analyzes the sentiment of a given text using the VADER Sentiment Intensity Analyzer. @@ -239,6 +237,7 @@ async def extract_sentiment_vader(text): Returns: dict: A dictionary containing the polarity scores for the text. """ + from nltk.sentiment import SentimentIntensityAnalyzer nltk.download("vader_lexicon") diff --git a/notebooks/full_run.ipynb b/notebooks/full_run.ipynb index f0b3a0050..1a3192d0b 100644 --- a/notebooks/full_run.ipynb +++ b/notebooks/full_run.ipynb @@ -151,10 +151,10 @@ "graph_client = await get_graph_client(GraphDBType.NETWORKX)\n", "graph = graph_client.graph\n", "\n", - "results = await search_similarity(\"At My Window was released by which American singer-songwriter?\", graph)\n", + "results = await search_similarity(\"Who is Ernie Grunwald?\", graph)\n", "\n", "for result in results:\n", - " print(\"At My Window\" in result)\n", + " print(\"Ernie Grunwald\" in result)\n", " print(result)" ] }