diff --git a/cognee/eval_framework/modal_eval_dashboard.py b/cognee/eval_framework/modal_eval_dashboard.py index 9ff6f543c..e2ed8e453 100644 --- a/cognee/eval_framework/modal_eval_dashboard.py +++ b/cognee/eval_framework/modal_eval_dashboard.py @@ -16,7 +16,7 @@ metrics_volume = modal.Volume.from_name("evaluation_dashboard_results", create_i image = ( modal.Image.debian_slim(python_version="3.11") - .pip_install("streamlit", "plotly") + .pip_install("streamlit", "pandas", "plotly") .add_local_file(__file__, "/root/serve_dashboard.py") ) diff --git a/cognee/infrastructure/data/utils/extract_keywords.py b/cognee/infrastructure/data/utils/extract_keywords.py deleted file mode 100644 index c3e47c4c8..000000000 --- a/cognee/infrastructure/data/utils/extract_keywords.py +++ /dev/null @@ -1,45 +0,0 @@ -from cognee.infrastructure.data.exceptions.exceptions import KeywordExtractionError - - -# def extract_keywords(text: str) -> list[str]: -# """ -# Extract keywords from the provided text string. - -# This function raises an KeyWordExtractionError if the input text is empty. It processes the -# text to extract parts of speech, focusing on nouns, and uses TF-IDF to identify the most -# relevant keywords based on their frequency. The function returns a list of up to 15 -# keywords, each having more than 3 characters. - -# Parameters: -# ----------- - -# - text (str): The input text from which to extract keywords. - -# Returns: -# -------- - -# - list[str]: A list of keywords extracted from the text, containing up to 15 nouns -# with more than 3 characters. -# """ -# if len(text) == 0: -# raise KeywordExtractionError() - -# tags = extract_pos_tags(text) -# nouns = [word for (word, tag) in tags if tag == "NN"] - -# vectorizer = TfidfVectorizer() -# tfidf = vectorizer.fit_transform(nouns) - -# top_nouns = sorted( -# vectorizer.vocabulary_, key=lambda x: tfidf[0, vectorizer.vocabulary_[x]], reverse=True -# ) - -# keywords = [] - -# for word in top_nouns: -# if len(word) > 3: -# keywords.append(word) -# if len(keywords) >= 15: -# break - -# return keywords diff --git a/cognee/infrastructure/llm/config.py b/cognee/infrastructure/llm/config.py index c5240935c..9e5120456 100644 --- a/cognee/infrastructure/llm/config.py +++ b/cognee/infrastructure/llm/config.py @@ -75,6 +75,13 @@ class LLMConfig(BaseSettings): def model_post_init(self, __context) -> None: """Initialize the BAML registry after the model is created.""" + # Check if BAML is selected as structured output framework but not available + if self.structured_output_framework == "baml" and ClientRegistry is None: + raise ImportError( + "BAML is selected as structured output framework but not available. " + "Please install with 'pip install cognee[baml]' to use BAML extraction features." + ) + if ClientRegistry is not None: self.baml_registry = ClientRegistry() self.baml_registry.add_llm_client( diff --git a/cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/extract_summary.py b/cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/extract_summary.py index 89889d294..697a52a45 100644 --- a/cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/extract_summary.py +++ b/cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/extract_summary.py @@ -37,11 +37,6 @@ async def extract_summary(content: str, response_model: Type[BaseModel]): """ config = get_llm_config() - if config.baml_registry is None: - raise ImportError( - "BAML is not available. Please install with 'pip install cognee[baml]' to use BAML extraction features." - ) - # Use BAML's SummarizeContent function summary_result = await b.SummarizeContent( content, baml_options={"client_registry": config.baml_registry} @@ -82,11 +77,6 @@ async def extract_code_summary(content: str): try: config = get_llm_config() - if config.baml_registry is None: - raise ImportError( - "BAML is not available. Please install with 'pip install cognee[baml]' to use BAML extraction features." - ) - result = await b.SummarizeCode( content, baml_options={"client_registry": config.baml_registry} ) diff --git a/cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/knowledge_graph/extract_content_graph.py b/cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/knowledge_graph/extract_content_graph.py index f87d87d1b..abff07e09 100644 --- a/cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/knowledge_graph/extract_content_graph.py +++ b/cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/knowledge_graph/extract_content_graph.py @@ -16,11 +16,6 @@ async def extract_content_graph( get_logger(level="INFO") - if config.baml_registry is None: - raise ImportError( - "BAML is not available. Please install with 'pip install cognee[baml]' to use BAML extraction features." - ) - # if response_model: # # tb = TypeBuilder() # # country = tb.union \ diff --git a/cognee/shared/utils.py b/cognee/shared/utils.py index 16d0961cc..3071a82cb 100644 --- a/cognee/shared/utils.py +++ b/cognee/shared/utils.py @@ -17,46 +17,6 @@ from cognee.infrastructure.databases.graph import get_graph_engine proxy_url = "https://test.prometh.ai" -def get_entities(tagged_tokens): - try: - import nltk - - nltk.download("maxent_ne_chunker", quiet=True) - from nltk.chunk import ne_chunk - - return ne_chunk(tagged_tokens) - except ImportError: - raise ImportError( - "NLTK is required for entity extraction. Install with 'pip install cognee[nlp]' to use this feature." - ) - - -def extract_pos_tags(sentence): - """Extract Part-of-Speech (POS) tags for words in a sentence.""" - try: - import nltk - - # Ensure that the necessary NLTK resources are downloaded - nltk.download("words", quiet=True) - nltk.download("punkt", quiet=True) - nltk.download("averaged_perceptron_tagger", quiet=True) - - from nltk.tag import pos_tag - from nltk.tokenize import word_tokenize - - # Tokenize the sentence into words - tokens = word_tokenize(sentence) - - # Tag each word with its corresponding POS tag - pos_tags = pos_tag(tokens) - - return pos_tags - except ImportError: - raise ImportError( - "NLTK is required for POS tagging. Install with 'pip install cognee[nlp]' to use this feature." - ) - - def get_anonymous_id(): """Creates or reads a anonymous user id""" tracking_id = os.getenv("TRACKING_ID", None) diff --git a/pyproject.toml b/pyproject.toml index 6667819c8..2a4c36132 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,7 +49,8 @@ dependencies = [ "python-magic-bin<0.5 ; platform_system == 'Windows'", # Only needed for Windows "fastembed<=0.6.0 ", "networkx>=3.4.2,<4", - "matplotlib>=3.8.3,<4" + "matplotlib>=3.8.3,<4", + "baml-py>=0.201.0" ] @@ -66,18 +67,12 @@ distributed = [ # Database backends neo4j = ["neo4j>=5.28.0,<6"] neptune = ["langchain_aws>=0.2.22"] -# PostgreSQL support (binary - no compilation required) postgres = [ + "psycopg2>=2.9.10,<3", "psycopg2-binary>=2.9.10,<3.0.0", # Pre-compiled binary, no PostgreSQL headers needed "pgvector>=0.3.5,<0.4", "asyncpg>=0.30.0,<1.0.0", ] -# PostgreSQL support (source - requires PostgreSQL development headers) -postgres-source = [ - "psycopg2>=2.9.10,<3 ; platform_system != 'Windows'", # Requires libpq-dev, build tools - "pgvector>=0.3.5,<0.4", - "asyncpg>=0.30.0,<1.0.0", -] notebook = ["notebook>=7.1.0,<8"] langchain = [ "langsmith>=0.2.3,<1.0.0",