From e58251b00c4c6ae58826b2e156a1217e37361b94 Mon Sep 17 00:00:00 2001
From: Boris Arzentar <borisarzentar@gmail.com>
Date: Sun, 21 Apr 2024 22:03:18 +0200
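Subject: [PATCH] fix: download nltk files when needed

Download the NLTK resources (punkt, averaged_perceptron_tagger, words,
maxent_ne_chunker) inside the helpers in cognee/utils.py that use them,
and import the matching nltk submodules inside those helpers rather
than at module import time.

Also in this change:
- extract_pos_tags, extract_named_entities and extract_sentiment_vader
  become plain functions; add_cognitive_layer_graphs no longer awaits
  them.
- extract_keywords reuses extract_pos_tags instead of calling nltk
  directly.
- IngestionData gains get_metadata(), implemented by BinaryData and
  TextData; add_files uses it in place of get_file_metadata.
- The full_run notebook searches for a different example query.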

---
 cognee/api/v1/add/add.py                      |  3 +-
 .../data/utils/extract_keywords.py            |  6 +--
 .../graph/add_cognitive_layer_graphs.py       |  6 +--
 .../ingestion/data_types/BinaryData.py        |  7 ++-
 .../ingestion/data_types/IngestionData.py     |  3 ++
 .../modules/ingestion/data_types/TextData.py  | 12 ++++-
 cognee/utils.py                               | 53 +++++++++----------
 notebooks/full_run.ipynb                      |  4 +-
 8 files changed, 54 insertions(+), 40 deletions(-)

diff --git a/cognee/api/v1/add/add.py b/cognee/api/v1/add/add.py
index 5e6c85311..fbb2f54c1 100644
--- a/cognee/api/v1/add/add.py
+++ b/cognee/api/v1/add/add.py
@@ -5,7 +5,6 @@ import dlt
 import duckdb
 import cognee.modules.ingestion as ingestion
 from cognee.infrastructure import infrastructure_config
-from cognee.infrastructure.files import get_file_metadata
 from cognee.infrastructure.files.storage import LocalStorage
 from cognee.modules.discovery import discover_directory_datasets
 
@@ -85,7 +84,7 @@ async def add_files(file_paths: List[str], dataset_name: str):
 
                 data_id = ingestion.identify(classified_data)
 
-                file_metadata = get_file_metadata(classified_data.get_data())
+                file_metadata = classified_data.get_metadata()
 
                 yield {
                     "id": data_id,
diff --git a/cognee/infrastructure/data/utils/extract_keywords.py b/cognee/infrastructure/data/utils/extract_keywords.py
index c69f590e1..76940fdfc 100644
--- a/cognee/infrastructure/data/utils/extract_keywords.py
+++ b/cognee/infrastructure/data/utils/extract_keywords.py
@@ -1,13 +1,11 @@
-import nltk
 from sklearn.feature_extraction.text import TfidfVectorizer
+from cognee.utils import extract_pos_tags
 
 def extract_keywords(text: str) -> list[str]:
     if len(text) == 0:
         raise ValueError("extract_keywords cannot extract keywords from empty text.")
 
-    tokens = nltk.word_tokenize(text)
-
-    tags = nltk.pos_tag(tokens)
+    tags = extract_pos_tags(text)
     nouns = [word for (word, tag) in tags if tag == "NN"]
 
     vectorizer = TfidfVectorizer()
diff --git a/cognee/modules/cognify/graph/add_cognitive_layer_graphs.py b/cognee/modules/cognify/graph/add_cognitive_layer_graphs.py
index 806b26ee3..11cc83a19 100644
--- a/cognee/modules/cognify/graph/add_cognitive_layer_graphs.py
+++ b/cognee/modules/cognify/graph/add_cognitive_layer_graphs.py
@@ -56,9 +56,9 @@ async def add_cognitive_layer_graphs(
                     dict(relationship_name = "contains"),
                 ))
 
-            pos_tags = await extract_pos_tags(node.entity_description)
-            named_entities = await extract_named_entities(node.entity_description)
-            sentiment = await extract_sentiment_vader(node.entity_description)
+            pos_tags = extract_pos_tags(node.entity_description)
+            named_entities = extract_named_entities(node.entity_description)
+            sentiment = extract_sentiment_vader(node.entity_description)
 
             graph_nodes.append((
                 node_id,
diff --git a/cognee/modules/ingestion/data_types/BinaryData.py b/cognee/modules/ingestion/data_types/BinaryData.py
index 82afb6dd1..72e0be3f0 100644
--- a/cognee/modules/ingestion/data_types/BinaryData.py
+++ b/cognee/modules/ingestion/data_types/BinaryData.py
@@ -13,9 +13,14 @@ class BinaryData(IngestionData):
         self.data = data
 
     def get_identifier(self):
+        metadata = self.get_metadata()
+
+        return metadata["mime_type"] + "_" + "|".join(metadata["keywords"])
+
+    def get_metadata(self):
         self.ensure_metadata()
 
-        return self.metadata["mime_type"] + "_" + "|".join(self.metadata["keywords"])
+        return self.metadata
 
     def ensure_metadata(self):
         if self.metadata is None:
diff --git a/cognee/modules/ingestion/data_types/IngestionData.py b/cognee/modules/ingestion/data_types/IngestionData.py
index 147dbda4e..ecd240234 100644
--- a/cognee/modules/ingestion/data_types/IngestionData.py
+++ b/cognee/modules/ingestion/data_types/IngestionData.py
@@ -8,3 +8,6 @@ class IngestionData(Protocol):
 
     def get_identifier(self):
         raise NotImplementedError()
+
+    def get_metadata(self):
+        raise NotImplementedError()
diff --git a/cognee/modules/ingestion/data_types/TextData.py b/cognee/modules/ingestion/data_types/TextData.py
index 8b152e986..e04376051 100644
--- a/cognee/modules/ingestion/data_types/TextData.py
+++ b/cognee/modules/ingestion/data_types/TextData.py
@@ -7,14 +7,24 @@ def create_text_data(data: str):
 
 class TextData(IngestionData):
     data: str = None
+    metadata = None
 
     def __init__(self, data: BinaryIO):
         self.data = data
 
     def get_identifier(self):
-        keywords = extract_keywords(self.data)
+        keywords = self.get_metadata()["keywords"]
 
         return "text/plain" + "_" + "|".join(keywords)
 
+    def get_metadata(self):
+        self.ensure_metadata()
+
+        return self.metadata
+
+    def ensure_metadata(self):
+        if self.metadata is None:
+            self.metadata = dict(keywords = extract_keywords(self.data))
+
     def get_data(self):
         return self.data
diff --git a/cognee/utils.py b/cognee/utils.py
index 42409d42b..a9eec6318 100644
--- a/cognee/utils.py
+++ b/cognee/utils.py
@@ -7,13 +7,11 @@ import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt
 import tiktoken
-from nltk.sentiment import SentimentIntensityAnalyzer
 import nltk
-from nltk.tokenize import word_tokenize
-from nltk.tag import pos_tag
-from nltk.chunk import ne_chunk
 from cognee.config import Config
 
+config = Config()
+config.load()
 
 def get_document_names(doc_input):
     """
@@ -93,8 +91,6 @@ def trim_text_to_max_tokens(text: str, max_tokens: int, encoding_name: str) -> s
     return trimmed_text
 
 
-
-
 def format_dict(d):
     """Format a dictionary as a string."""
     # Initialize an empty list to store formatted items
@@ -117,9 +113,6 @@ def format_dict(d):
     return formatted_string
 
 
-config = Config()
-config.load()
-
 def generate_color_palette(unique_layers):
     colormap = plt.cm.get_cmap("viridis", len(unique_layers))
     colors = [colormap(i) for i in range(len(unique_layers))]
@@ -140,8 +133,8 @@ def prepare_nodes(graph, include_size=False):
     nodes_data = []
     for node in graph.nodes:
         node_info = graph.nodes[node]
-        description = node_info.get('layer_description', {}).get('layer', 'Default Layer') if isinstance(
-            node_info.get('layer_description'), dict) else node_info.get('layer_description', 'Default Layer')
+        description = node_info.get("layer_description", {}).get("layer", "Default Layer") if isinstance(
+            node_info.get("layer_description"), dict) else node_info.get("layer_description", "Default Layer")
         # description = node_info['layer_description']['layer'] if isinstance(node_info.get('layer_description'), dict) and 'layer' in node_info['layer_description'] else node_info.get('layer_description', node)
         # if isinstance(node_info.get('layer_description'), dict) and 'layer' in node_info.get('layer_description'):
         #     description = node_info['layer_description']['layer']
@@ -161,8 +154,6 @@ def prepare_nodes(graph, include_size=False):
     return pd.DataFrame(nodes_data)
 
 
-
-
 async def render_graph(graph, include_nodes=False, include_color=False, include_size=False, include_labels=False):
     await register_graphistry()
     edges = prepare_edges(graph)
@@ -174,7 +165,7 @@ async def render_graph(graph, include_nodes=False, include_color=False, include_
 
 
         if include_size:
-            plotter = plotter.bind(point_size='size')
+            plotter = plotter.bind(point_size="size")
 
 
         if include_color:
@@ -185,7 +176,7 @@ async def render_graph(graph, include_nodes=False, include_color=False, include_
 
 
         if include_labels:
-            plotter = plotter.bind(point_label='layer_description')
+            plotter = plotter.bind(point_label = "layer_description")
 
 
 
@@ -199,14 +190,23 @@ def sanitize_df(df):
     return df.replace([np.inf, -np.inf, np.nan], None)
 
 
-
-# # Ensure that the necessary NLTK resources are downloaded
-# nltk.download('maxent_ne_chunker')
-# nltk.download('words')
+def get_entities(tagged_tokens):
+    nltk.download("maxent_ne_chunker")
+    from nltk.chunk import ne_chunk
+    return ne_chunk(tagged_tokens)
 
 
-async def extract_pos_tags(sentence):
+def extract_pos_tags(sentence):
     """Extract Part-of-Speech (POS) tags for words in a sentence."""
+
+    # Ensure that the necessary NLTK resources are downloaded
+    nltk.download("words")
+    nltk.download("punkt")
+    nltk.download("averaged_perceptron_tagger")
+
+    from nltk.tag import pos_tag
+    from nltk.tokenize import word_tokenize
+
     # Tokenize the sentence into words
     tokens = word_tokenize(sentence)
 
@@ -216,20 +216,18 @@ async def extract_pos_tags(sentence):
     return pos_tags
 
 
-async def extract_named_entities(sentence):
+def extract_named_entities(sentence):
     """Extract Named Entities from a sentence."""
     # Tokenize the sentence into words
-    tokens = word_tokenize(sentence)
-
-    # Perform POS tagging on the tokenized sentence
-    tagged = pos_tag(tokens)
+    tagged_tokens = extract_pos_tags(sentence)
 
     # Perform Named Entity Recognition (NER) on the tagged tokens
-    entities = ne_chunk(tagged)
+    entities = get_entities(tagged_tokens)
 
     return entities
 
-async def extract_sentiment_vader(text):
+
+def extract_sentiment_vader(text):
     """
     Analyzes the sentiment of a given text using the VADER Sentiment Intensity Analyzer.
 
@@ -239,6 +237,7 @@ async def extract_sentiment_vader(text):
     Returns:
     dict: A dictionary containing the polarity scores for the text.
     """
+    from nltk.sentiment import SentimentIntensityAnalyzer
 
     nltk.download("vader_lexicon")
 
diff --git a/notebooks/full_run.ipynb b/notebooks/full_run.ipynb
index f0b3a0050..1a3192d0b 100644
--- a/notebooks/full_run.ipynb
+++ b/notebooks/full_run.ipynb
@@ -151,10 +151,10 @@
     "graph_client = await get_graph_client(GraphDBType.NETWORKX)\n",
     "graph = graph_client.graph\n",
     "\n",
-    "results = await search_similarity(\"At My Window was released by which American singer-songwriter?\", graph)\n",
+    "results = await search_similarity(\"Who is Ernie Grunwald?\", graph)\n",
     "\n",
     "for result in results:\n",
-    "    print(\"At My Window\" in result)\n",
+    "    print(\"Ernie Grunwald\" in result)\n",
     "    print(result)"
    ]
   }