Merge branch 'dev' into feature/cog-3014-refactor-delete-feature

This commit is contained in:
Boris 2025-10-21 12:43:22 +02:00 committed by GitHub
commit 423e49c834
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
38 changed files with 1621 additions and 2772 deletions

View file

@ -28,11 +28,10 @@ EMBEDDING_ENDPOINT=""
EMBEDDING_API_VERSION="" EMBEDDING_API_VERSION=""
EMBEDDING_DIMENSIONS=3072 EMBEDDING_DIMENSIONS=3072
EMBEDDING_MAX_TOKENS=8191 EMBEDDING_MAX_TOKENS=8191
EMBEDDING_BATCH_SIZE=36
# If embedding key is not provided same key set for LLM_API_KEY will be used # If embedding key is not provided same key set for LLM_API_KEY will be used
#EMBEDDING_API_KEY="your_api_key" #EMBEDDING_API_KEY="your_api_key"
# Note: OpenAI support up to 2048 elements and Gemini supports a maximum of 100 elements in an embedding batch,
# Cognee sets the optimal batch size for OpenAI and Gemini, but a custom size can be defined if necessary for other models
#EMBEDDING_BATCH_SIZE=2048
# If using BAML structured output these env variables will be used # If using BAML structured output these env variables will be used
BAML_LLM_PROVIDER=openai BAML_LLM_PROVIDER=openai
@ -248,10 +247,10 @@ LITELLM_LOG="ERROR"
#LLM_PROVIDER="ollama" #LLM_PROVIDER="ollama"
#LLM_ENDPOINT="http://localhost:11434/v1" #LLM_ENDPOINT="http://localhost:11434/v1"
#EMBEDDING_PROVIDER="ollama" #EMBEDDING_PROVIDER="ollama"
#EMBEDDING_MODEL="avr/sfr-embedding-mistral:latest" #EMBEDDING_MODEL="nomic-embed-text:latest"
#EMBEDDING_ENDPOINT="http://localhost:11434/api/embeddings" #EMBEDDING_ENDPOINT="http://localhost:11434/api/embeddings"
#EMBEDDING_DIMENSIONS=4096 #EMBEDDING_DIMENSIONS=768
#HUGGINGFACE_TOKENIZER="Salesforce/SFR-Embedding-Mistral" #HUGGINGFACE_TOKENIZER="nomic-ai/nomic-embed-text-v1.5"
########## OpenRouter (also free) ######################################################### ########## OpenRouter (also free) #########################################################

View file

@ -110,6 +110,81 @@ jobs:
EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
run: uv run python ./examples/python/dynamic_steps_example.py run: uv run python ./examples/python/dynamic_steps_example.py
test-temporal-example:
name: Run Temporal Tests
runs-on: ubuntu-22.04
steps:
- name: Check out repository
uses: actions/checkout@v4
- name: Cognee Setup
uses: ./.github/actions/cognee_setup
with:
python-version: '3.11.x'
- name: Run Temporal Example
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
LLM_MODEL: ${{ secrets.LLM_MODEL }}
LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
run: uv run python ./examples/python/temporal_example.py
test-ontology-example:
name: Run Ontology Tests
runs-on: ubuntu-22.04
steps:
- name: Check out repository
uses: actions/checkout@v4
- name: Cognee Setup
uses: ./.github/actions/cognee_setup
with:
python-version: '3.11.x'
- name: Run Ontology Demo Example
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
LLM_MODEL: ${{ secrets.LLM_MODEL }}
LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
run: uv run python ./examples/python/ontology_demo_example.py
test-agentic-reasoning:
name: Run Agentic Reasoning Tests
runs-on: ubuntu-22.04
steps:
- name: Check out repository
uses: actions/checkout@v4
- name: Cognee Setup
uses: ./.github/actions/cognee_setup
with:
python-version: '3.11.x'
- name: Run Agentic Reasoning Example
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
LLM_MODEL: ${{ secrets.LLM_MODEL }}
LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
run: uv run python ./examples/python/agentic_reasoning_procurement_example.py
test-memify: test-memify:
name: Run Memify Example name: Run Memify Example
runs-on: ubuntu-22.04 runs-on: ubuntu-22.04

View file

@ -9,7 +9,7 @@ on:
python-versions: python-versions:
required: false required: false
type: string type: string
default: '["3.10.x", "3.11.x", "3.12.x"]' default: '["3.10.x", "3.12.x", "3.13.x"]'
secrets: secrets:
LLM_PROVIDER: LLM_PROVIDER:
required: true required: true

View file

@ -85,7 +85,7 @@ jobs:
needs: [basic-tests, e2e-tests] needs: [basic-tests, e2e-tests]
uses: ./.github/workflows/test_different_operating_systems.yml uses: ./.github/workflows/test_different_operating_systems.yml
with: with:
python-versions: '["3.10.x", "3.11.x", "3.12.x"]' python-versions: '["3.10.x", "3.11.x", "3.12.x", "3.13.x"]'
secrets: inherit secrets: inherit
# Matrix-based vector database tests # Matrix-based vector database tests

View file

@ -71,7 +71,7 @@ Build dynamic memory for Agents and replace RAG using scalable, modular ECL (Ext
## Get Started ## Get Started
Get started quickly with a Google Colab <a href="https://colab.research.google.com/drive/1jHbWVypDgCLwjE71GSXhRL3YxYhCZzG1?usp=sharing">notebook</a> , <a href="https://deepnote.com/workspace/cognee-382213d0-0444-4c89-8265-13770e333c02/project/cognee-demo-78ffacb9-5832-4611-bb1a-560386068b30/notebook/Notebook-1-75b24cda566d4c24ab348f7150792601?utm_source=share-modal&utm_medium=product-shared-content&utm_campaign=notebook&utm_content=78ffacb9-5832-4611-bb1a-560386068b30">Deepnote notebook</a> or <a href="https://github.com/topoteretes/cognee/tree/main/cognee-starter-kit">starter repo</a> Get started quickly with a Google Colab <a href="https://colab.research.google.com/drive/12Vi9zID-M3fpKpKiaqDBvkk98ElkRPWy?usp=sharing">notebook</a> , <a href="https://deepnote.com/workspace/cognee-382213d0-0444-4c89-8265-13770e333c02/project/cognee-demo-78ffacb9-5832-4611-bb1a-560386068b30/notebook/Notebook-1-75b24cda566d4c24ab348f7150792601?utm_source=share-modal&utm_medium=product-shared-content&utm_campaign=notebook&utm_content=78ffacb9-5832-4611-bb1a-560386068b30">Deepnote notebook</a> or <a href="https://github.com/topoteretes/cognee/tree/main/cognee-starter-kit">starter repo</a>
## About cognee ## About cognee
@ -224,12 +224,12 @@ We now have a paper you can cite:
```bibtex ```bibtex
@misc{markovic2025optimizinginterfaceknowledgegraphs, @misc{markovic2025optimizinginterfaceknowledgegraphs,
title={Optimizing the Interface Between Knowledge Graphs and LLMs for Complex Reasoning}, title={Optimizing the Interface Between Knowledge Graphs and LLMs for Complex Reasoning},
author={Vasilije Markovic and Lazar Obradovic and Laszlo Hajdu and Jovan Pavlovic}, author={Vasilije Markovic and Lazar Obradovic and Laszlo Hajdu and Jovan Pavlovic},
year={2025}, year={2025},
eprint={2505.24478}, eprint={2505.24478},
archivePrefix={arXiv}, archivePrefix={arXiv},
primaryClass={cs.AI}, primaryClass={cs.AI},
url={https://arxiv.org/abs/2505.24478}, url={https://arxiv.org/abs/2505.24478},
} }
``` ```

View file

@ -44,6 +44,7 @@ async def cognify(
graph_model: BaseModel = KnowledgeGraph, graph_model: BaseModel = KnowledgeGraph,
chunker=TextChunker, chunker=TextChunker,
chunk_size: int = None, chunk_size: int = None,
chunks_per_batch: int = None,
config: Config = None, config: Config = None,
vector_db_config: dict = None, vector_db_config: dict = None,
graph_db_config: dict = None, graph_db_config: dict = None,
@ -106,6 +107,7 @@ async def cognify(
Formula: min(embedding_max_completion_tokens, llm_max_completion_tokens // 2) Formula: min(embedding_max_completion_tokens, llm_max_completion_tokens // 2)
Default limits: ~512-8192 tokens depending on models. Default limits: ~512-8192 tokens depending on models.
Smaller chunks = more granular but potentially fragmented knowledge. Smaller chunks = more granular but potentially fragmented knowledge.
chunks_per_batch: Number of chunks to be processed in a single batch in Cognify tasks.
vector_db_config: Custom vector database configuration for embeddings storage. vector_db_config: Custom vector database configuration for embeddings storage.
graph_db_config: Custom graph database configuration for relationship storage. graph_db_config: Custom graph database configuration for relationship storage.
run_in_background: If True, starts processing asynchronously and returns immediately. run_in_background: If True, starts processing asynchronously and returns immediately.
@ -210,10 +212,18 @@ async def cognify(
} }
if temporal_cognify: if temporal_cognify:
tasks = await get_temporal_tasks(user, chunker, chunk_size) tasks = await get_temporal_tasks(
user=user, chunker=chunker, chunk_size=chunk_size, chunks_per_batch=chunks_per_batch
)
else: else:
tasks = await get_default_tasks( tasks = await get_default_tasks(
user, graph_model, chunker, chunk_size, config, custom_prompt user=user,
graph_model=graph_model,
chunker=chunker,
chunk_size=chunk_size,
config=config,
custom_prompt=custom_prompt,
chunks_per_batch=chunks_per_batch,
) )
# By calling get pipeline executor we get a function that will have the run_pipeline run in the background or a function that we will need to wait for # By calling get pipeline executor we get a function that will have the run_pipeline run in the background or a function that we will need to wait for
@ -240,6 +250,7 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's
chunk_size: int = None, chunk_size: int = None,
config: Config = None, config: Config = None,
custom_prompt: Optional[str] = None, custom_prompt: Optional[str] = None,
chunks_per_batch: int = 100,
) -> list[Task]: ) -> list[Task]:
if config is None: if config is None:
ontology_config = get_ontology_env_config() ontology_config = get_ontology_env_config()
@ -258,6 +269,9 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's
"ontology_config": {"ontology_resolver": get_default_ontology_resolver()} "ontology_config": {"ontology_resolver": get_default_ontology_resolver()}
} }
if chunks_per_batch is None:
chunks_per_batch = 100
default_tasks = [ default_tasks = [
Task(classify_documents), Task(classify_documents),
Task(check_permissions_on_dataset, user=user, permissions=["write"]), Task(check_permissions_on_dataset, user=user, permissions=["write"]),
@ -271,20 +285,20 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's
graph_model=graph_model, graph_model=graph_model,
config=config, config=config,
custom_prompt=custom_prompt, custom_prompt=custom_prompt,
task_config={"batch_size": 10}, task_config={"batch_size": chunks_per_batch},
), # Generate knowledge graphs from the document chunks. ), # Generate knowledge graphs from the document chunks.
Task( Task(
summarize_text, summarize_text,
task_config={"batch_size": 10}, task_config={"batch_size": chunks_per_batch},
), ),
Task(add_data_points, task_config={"batch_size": 10}), Task(add_data_points, task_config={"batch_size": chunks_per_batch}),
] ]
return default_tasks return default_tasks
async def get_temporal_tasks( async def get_temporal_tasks(
user: User = None, chunker=TextChunker, chunk_size: int = None user: User = None, chunker=TextChunker, chunk_size: int = None, chunks_per_batch: int = 10
) -> list[Task]: ) -> list[Task]:
""" """
Builds and returns a list of temporal processing tasks to be executed in sequence. Builds and returns a list of temporal processing tasks to be executed in sequence.
@ -301,10 +315,14 @@ async def get_temporal_tasks(
user (User, optional): The user requesting task execution, used for permission checks. user (User, optional): The user requesting task execution, used for permission checks.
chunker (Callable, optional): A text chunking function/class to split documents. Defaults to TextChunker. chunker (Callable, optional): A text chunking function/class to split documents. Defaults to TextChunker.
chunk_size (int, optional): Maximum token size per chunk. If not provided, uses system default. chunk_size (int, optional): Maximum token size per chunk. If not provided, uses system default.
chunks_per_batch (int, optional): Number of chunks to process in a single batch in Cognify
Returns: Returns:
list[Task]: A list of Task objects representing the temporal processing pipeline. list[Task]: A list of Task objects representing the temporal processing pipeline.
""" """
if chunks_per_batch is None:
chunks_per_batch = 10
temporal_tasks = [ temporal_tasks = [
Task(classify_documents), Task(classify_documents),
Task(check_permissions_on_dataset, user=user, permissions=["write"]), Task(check_permissions_on_dataset, user=user, permissions=["write"]),
@ -313,9 +331,9 @@ async def get_temporal_tasks(
max_chunk_size=chunk_size or get_max_chunk_tokens(), max_chunk_size=chunk_size or get_max_chunk_tokens(),
chunker=chunker, chunker=chunker,
), ),
Task(extract_events_and_timestamps, task_config={"chunk_size": 10}), Task(extract_events_and_timestamps, task_config={"batch_size": chunks_per_batch}),
Task(extract_knowledge_graph_from_events), Task(extract_knowledge_graph_from_events),
Task(add_data_points, task_config={"batch_size": 10}), Task(add_data_points, task_config={"batch_size": chunks_per_batch}),
] ]
return temporal_tasks return temporal_tasks

View file

@ -1,7 +1,6 @@
from uuid import UUID from uuid import UUID
from typing import Union, Optional, List, Type from typing import Union, Optional, List, Type
from cognee.infrastructure.databases.graph import get_graph_engine
from cognee.modules.engine.models.node_set import NodeSet from cognee.modules.engine.models.node_set import NodeSet
from cognee.modules.users.models import User from cognee.modules.users.models import User
from cognee.modules.search.types import SearchResult, SearchType, CombinedSearchResult from cognee.modules.search.types import SearchResult, SearchType, CombinedSearchResult
@ -9,9 +8,6 @@ from cognee.modules.users.methods import get_default_user
from cognee.modules.search.methods import search as search_function from cognee.modules.search.methods import search as search_function
from cognee.modules.data.methods import get_authorized_existing_datasets from cognee.modules.data.methods import get_authorized_existing_datasets
from cognee.modules.data.exceptions import DatasetNotFoundError from cognee.modules.data.exceptions import DatasetNotFoundError
from cognee.shared.logging_utils import get_logger
logger = get_logger()
async def search( async def search(
@ -179,13 +175,6 @@ async def search(
if not datasets: if not datasets:
raise DatasetNotFoundError(message="No datasets found.") raise DatasetNotFoundError(message="No datasets found.")
graph_engine = await get_graph_engine()
is_empty = await graph_engine.is_empty()
if is_empty:
logger.warning("Search attempt on an empty knowledge graph")
return []
filtered_search_results = await search_function( filtered_search_results = await search_function(
query_text=query_text, query_text=query_text,
query_type=query_type, query_type=query_type,

View file

@ -162,5 +162,5 @@ def create_graph_engine(
raise EnvironmentError( raise EnvironmentError(
f"Unsupported graph database provider: {graph_database_provider}. " f"Unsupported graph database provider: {graph_database_provider}. "
f"Supported providers are: {', '.join(list(supported_databases.keys()) + ['neo4j', 'kuzu', 'kuzu-remote', 'memgraph', 'neptune', 'neptune_analytics'])}" f"Supported providers are: {', '.join(list(supported_databases.keys()) + ['neo4j', 'kuzu', 'kuzu-remote', 'neptune', 'neptune_analytics'])}"
) )

View file

@ -39,11 +39,6 @@ class GraphDBInterface(ABC):
- get_connections - get_connections
""" """
@abstractmethod
async def is_empty(self) -> bool:
logger.warning("is_empty() is not implemented")
return True
@abstractmethod @abstractmethod
async def query(self, query: str, params: dict) -> List[Any]: async def query(self, query: str, params: dict) -> List[Any]:
""" """

View file

@ -197,15 +197,6 @@ class KuzuAdapter(GraphDBInterface):
except FileNotFoundError: except FileNotFoundError:
logger.warning(f"Kuzu S3 storage file not found: {self.db_path}") logger.warning(f"Kuzu S3 storage file not found: {self.db_path}")
async def is_empty(self) -> bool:
query = """
MATCH (n)
RETURN true
LIMIT 1;
"""
query_result = await self.query(query)
return len(query_result) == 0
async def query(self, query: str, params: Optional[dict] = None) -> List[Tuple]: async def query(self, query: str, params: Optional[dict] = None) -> List[Tuple]:
""" """
Execute a Kuzu query asynchronously with automatic reconnection. Execute a Kuzu query asynchronously with automatic reconnection.

View file

@ -86,15 +86,6 @@ class Neo4jAdapter(GraphDBInterface):
async with self.driver.session(database=self.graph_database_name) as session: async with self.driver.session(database=self.graph_database_name) as session:
yield session yield session
async def is_empty(self) -> bool:
query = """
RETURN EXISTS {
MATCH (n)
} AS node_exists;
"""
query_result = await self.query(query)
return not query_result[0]["node_exists"]
@deadlock_retry() @deadlock_retry()
async def query( async def query(
self, self,
@ -1073,7 +1064,7 @@ class Neo4jAdapter(GraphDBInterface):
query_nodes = f""" query_nodes = f"""
MATCH (n) MATCH (n)
WHERE {where_clause} WHERE {where_clause}
RETURN ID(n) AS id, labels(n) AS labels, properties(n) AS properties RETURN n.id AS id, labels(n) AS labels, properties(n) AS properties
""" """
result_nodes = await self.query(query_nodes) result_nodes = await self.query(query_nodes)
@ -1088,7 +1079,7 @@ class Neo4jAdapter(GraphDBInterface):
query_edges = f""" query_edges = f"""
MATCH (n)-[r]->(m) MATCH (n)-[r]->(m)
WHERE {where_clause} AND {where_clause.replace("n.", "m.")} WHERE {where_clause} AND {where_clause.replace("n.", "m.")}
RETURN ID(n) AS source, ID(m) AS target, TYPE(r) AS type, properties(r) AS properties RETURN n.id AS source, n.id AS target, TYPE(r) AS type, properties(r) AS properties
""" """
result_edges = await self.query(query_edges) result_edges = await self.query(query_edges)

View file

@ -1,8 +1,17 @@
from cognee.shared.logging_utils import get_logger import os
import logging
from typing import List, Optional from typing import List, Optional
from fastembed import TextEmbedding from fastembed import TextEmbedding
import litellm import litellm
import os from tenacity import (
retry,
stop_after_delay,
wait_exponential_jitter,
retry_if_not_exception_type,
before_sleep_log,
)
from cognee.shared.logging_utils import get_logger
from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine
from cognee.infrastructure.databases.exceptions import EmbeddingException from cognee.infrastructure.databases.exceptions import EmbeddingException
from cognee.infrastructure.llm.tokenizer.TikToken import ( from cognee.infrastructure.llm.tokenizer.TikToken import (
@ -57,6 +66,13 @@ class FastembedEmbeddingEngine(EmbeddingEngine):
enable_mocking = str(enable_mocking).lower() enable_mocking = str(enable_mocking).lower()
self.mock = enable_mocking in ("true", "1", "yes") self.mock = enable_mocking in ("true", "1", "yes")
@retry(
stop=stop_after_delay(128),
wait=wait_exponential_jitter(2, 128),
retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError),
before_sleep=before_sleep_log(logger, logging.DEBUG),
reraise=True,
)
async def embed_text(self, text: List[str]) -> List[List[float]]: async def embed_text(self, text: List[str]) -> List[List[float]]:
""" """
Embed the given text into numerical vectors. Embed the given text into numerical vectors.

View file

@ -1,15 +1,21 @@
import asyncio import asyncio
import logging
from cognee.shared.logging_utils import get_logger from cognee.shared.logging_utils import get_logger
from typing import List, Optional from typing import List, Optional
import numpy as np import numpy as np
import math import math
from tenacity import (
retry,
stop_after_delay,
wait_exponential_jitter,
retry_if_not_exception_type,
before_sleep_log,
)
import litellm import litellm
import os import os
from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine
from cognee.infrastructure.databases.exceptions import EmbeddingException from cognee.infrastructure.databases.exceptions import EmbeddingException
from cognee.infrastructure.llm.tokenizer.Gemini import (
GeminiTokenizer,
)
from cognee.infrastructure.llm.tokenizer.HuggingFace import ( from cognee.infrastructure.llm.tokenizer.HuggingFace import (
HuggingFaceTokenizer, HuggingFaceTokenizer,
) )
@ -19,10 +25,6 @@ from cognee.infrastructure.llm.tokenizer.Mistral import (
from cognee.infrastructure.llm.tokenizer.TikToken import ( from cognee.infrastructure.llm.tokenizer.TikToken import (
TikTokenTokenizer, TikTokenTokenizer,
) )
from cognee.infrastructure.databases.vector.embeddings.embedding_rate_limiter import (
embedding_rate_limit_async,
embedding_sleep_and_retry_async,
)
litellm.set_verbose = False litellm.set_verbose = False
logger = get_logger("LiteLLMEmbeddingEngine") logger = get_logger("LiteLLMEmbeddingEngine")
@ -76,8 +78,13 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine):
enable_mocking = str(enable_mocking).lower() enable_mocking = str(enable_mocking).lower()
self.mock = enable_mocking in ("true", "1", "yes") self.mock = enable_mocking in ("true", "1", "yes")
@embedding_sleep_and_retry_async() @retry(
@embedding_rate_limit_async stop=stop_after_delay(128),
wait=wait_exponential_jitter(2, 128),
retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError),
before_sleep=before_sleep_log(logger, logging.DEBUG),
reraise=True,
)
async def embed_text(self, text: List[str]) -> List[List[float]]: async def embed_text(self, text: List[str]) -> List[List[float]]:
""" """
Embed a list of text strings into vector representations. Embed a list of text strings into vector representations.

View file

@ -3,8 +3,16 @@ from cognee.shared.logging_utils import get_logger
import aiohttp import aiohttp
from typing import List, Optional from typing import List, Optional
import os import os
import litellm
import logging
import aiohttp.http_exceptions import aiohttp.http_exceptions
from tenacity import (
retry,
stop_after_delay,
wait_exponential_jitter,
retry_if_not_exception_type,
before_sleep_log,
)
from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine
from cognee.infrastructure.llm.tokenizer.HuggingFace import ( from cognee.infrastructure.llm.tokenizer.HuggingFace import (
@ -69,7 +77,6 @@ class OllamaEmbeddingEngine(EmbeddingEngine):
enable_mocking = str(enable_mocking).lower() enable_mocking = str(enable_mocking).lower()
self.mock = enable_mocking in ("true", "1", "yes") self.mock = enable_mocking in ("true", "1", "yes")
@embedding_rate_limit_async
async def embed_text(self, text: List[str]) -> List[List[float]]: async def embed_text(self, text: List[str]) -> List[List[float]]:
""" """
Generate embedding vectors for a list of text prompts. Generate embedding vectors for a list of text prompts.
@ -92,7 +99,13 @@ class OllamaEmbeddingEngine(EmbeddingEngine):
embeddings = await asyncio.gather(*[self._get_embedding(prompt) for prompt in text]) embeddings = await asyncio.gather(*[self._get_embedding(prompt) for prompt in text])
return embeddings return embeddings
@embedding_sleep_and_retry_async() @retry(
stop=stop_after_delay(128),
wait=wait_exponential_jitter(2, 128),
retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError),
before_sleep=before_sleep_log(logger, logging.DEBUG),
reraise=True,
)
async def _get_embedding(self, prompt: str) -> List[float]: async def _get_embedding(self, prompt: str) -> List[float]:
""" """
Internal method to call the Ollama embeddings endpoint for a single prompt. Internal method to call the Ollama embeddings endpoint for a single prompt.

View file

@ -24,11 +24,10 @@ class EmbeddingConfig(BaseSettings):
model_config = SettingsConfigDict(env_file=".env", extra="allow") model_config = SettingsConfigDict(env_file=".env", extra="allow")
def model_post_init(self, __context) -> None: def model_post_init(self, __context) -> None:
# If embedding batch size is not defined use 2048 as default for OpenAI and 100 for all other embedding models
if not self.embedding_batch_size and self.embedding_provider.lower() == "openai": if not self.embedding_batch_size and self.embedding_provider.lower() == "openai":
self.embedding_batch_size = 2048 self.embedding_batch_size = 36
elif not self.embedding_batch_size: elif not self.embedding_batch_size:
self.embedding_batch_size = 100 self.embedding_batch_size = 36
def to_dict(self) -> dict: def to_dict(self) -> dict:
""" """

View file

@ -124,6 +124,12 @@ def guess_file_type(file: BinaryIO) -> filetype.Type:
""" """
file_type = filetype.guess(file) file_type = filetype.guess(file)
# If file type could not be determined consider it a plain text file as they don't have magic number encoding
if file_type is None:
from filetype.types.base import Type
file_type = Type("text/plain", "txt")
if file_type is None: if file_type is None:
raise FileTypeException(f"Unknown file detected: {file.name}.") raise FileTypeException(f"Unknown file detected: {file.name}.")

View file

@ -1,19 +1,24 @@
import logging
from typing import Type from typing import Type
from pydantic import BaseModel from pydantic import BaseModel
import litellm
import instructor import instructor
from cognee.shared.logging_utils import get_logger
from tenacity import (
retry,
stop_after_delay,
wait_exponential_jitter,
retry_if_not_exception_type,
before_sleep_log,
)
from cognee.infrastructure.llm.exceptions import MissingSystemPromptPathError
from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.llm_interface import ( from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.llm_interface import (
LLMInterface, LLMInterface,
) )
from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.rate_limiter import (
rate_limit_async,
sleep_and_retry_async,
)
from cognee.infrastructure.llm.LLMGateway import LLMGateway
from cognee.infrastructure.llm.config import get_llm_config from cognee.infrastructure.llm.config import get_llm_config
logger = get_logger()
class AnthropicAdapter(LLMInterface): class AnthropicAdapter(LLMInterface):
""" """
@ -35,8 +40,13 @@ class AnthropicAdapter(LLMInterface):
self.model = model self.model = model
self.max_completion_tokens = max_completion_tokens self.max_completion_tokens = max_completion_tokens
@sleep_and_retry_async() @retry(
@rate_limit_async stop=stop_after_delay(128),
wait=wait_exponential_jitter(2, 128),
retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError),
before_sleep=before_sleep_log(logger, logging.DEBUG),
reraise=True,
)
async def acreate_structured_output( async def acreate_structured_output(
self, text_input: str, system_prompt: str, response_model: Type[BaseModel] self, text_input: str, system_prompt: str, response_model: Type[BaseModel]
) -> BaseModel: ) -> BaseModel:

View file

@ -12,11 +12,18 @@ from cognee.infrastructure.llm.exceptions import ContentPolicyFilterError
from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.llm_interface import ( from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.llm_interface import (
LLMInterface, LLMInterface,
) )
from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.rate_limiter import ( import logging
rate_limit_async, from cognee.shared.logging_utils import get_logger
sleep_and_retry_async, from tenacity import (
retry,
stop_after_delay,
wait_exponential_jitter,
retry_if_not_exception_type,
before_sleep_log,
) )
logger = get_logger()
class GeminiAdapter(LLMInterface): class GeminiAdapter(LLMInterface):
""" """
@ -58,8 +65,13 @@ class GeminiAdapter(LLMInterface):
self.aclient = instructor.from_litellm(litellm.acompletion, mode=instructor.Mode.JSON) self.aclient = instructor.from_litellm(litellm.acompletion, mode=instructor.Mode.JSON)
@sleep_and_retry_async() @retry(
@rate_limit_async stop=stop_after_delay(128),
wait=wait_exponential_jitter(2, 128),
retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError),
before_sleep=before_sleep_log(logger, logging.DEBUG),
reraise=True,
)
async def acreate_structured_output( async def acreate_structured_output(
self, text_input: str, system_prompt: str, response_model: Type[BaseModel] self, text_input: str, system_prompt: str, response_model: Type[BaseModel]
) -> BaseModel: ) -> BaseModel:

View file

@ -12,11 +12,18 @@ from cognee.infrastructure.llm.exceptions import ContentPolicyFilterError
from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.llm_interface import ( from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.llm_interface import (
LLMInterface, LLMInterface,
) )
from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.rate_limiter import ( import logging
rate_limit_async, from cognee.shared.logging_utils import get_logger
sleep_and_retry_async, from tenacity import (
retry,
stop_after_delay,
wait_exponential_jitter,
retry_if_not_exception_type,
before_sleep_log,
) )
logger = get_logger()
class GenericAPIAdapter(LLMInterface): class GenericAPIAdapter(LLMInterface):
""" """
@ -58,8 +65,13 @@ class GenericAPIAdapter(LLMInterface):
self.aclient = instructor.from_litellm(litellm.acompletion, mode=instructor.Mode.JSON) self.aclient = instructor.from_litellm(litellm.acompletion, mode=instructor.Mode.JSON)
@sleep_and_retry_async() @retry(
@rate_limit_async stop=stop_after_delay(128),
wait=wait_exponential_jitter(2, 128),
retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError),
before_sleep=before_sleep_log(logger, logging.DEBUG),
reraise=True,
)
async def acreate_structured_output( async def acreate_structured_output(
self, text_input: str, system_prompt: str, response_model: Type[BaseModel] self, text_input: str, system_prompt: str, response_model: Type[BaseModel]
) -> BaseModel: ) -> BaseModel:

View file

@ -1,20 +1,23 @@
import litellm import litellm
import instructor import instructor
from pydantic import BaseModel from pydantic import BaseModel
from typing import Type, Optional from typing import Type
from litellm import acompletion, JSONSchemaValidationError from litellm import JSONSchemaValidationError
from cognee.shared.logging_utils import get_logger from cognee.shared.logging_utils import get_logger
from cognee.modules.observability.get_observe import get_observe from cognee.modules.observability.get_observe import get_observe
from cognee.infrastructure.llm.exceptions import MissingSystemPromptPathError
from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.llm_interface import ( from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.llm_interface import (
LLMInterface, LLMInterface,
) )
from cognee.infrastructure.llm.LLMGateway import LLMGateway
from cognee.infrastructure.llm.config import get_llm_config from cognee.infrastructure.llm.config import get_llm_config
from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.rate_limiter import (
rate_limit_async, import logging
sleep_and_retry_async, from tenacity import (
retry,
stop_after_delay,
wait_exponential_jitter,
retry_if_not_exception_type,
before_sleep_log,
) )
logger = get_logger() logger = get_logger()
@ -47,8 +50,13 @@ class MistralAdapter(LLMInterface):
api_key=get_llm_config().llm_api_key, api_key=get_llm_config().llm_api_key,
) )
@sleep_and_retry_async() @retry(
@rate_limit_async stop=stop_after_delay(128),
wait=wait_exponential_jitter(2, 128),
retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError),
before_sleep=before_sleep_log(logger, logging.DEBUG),
reraise=True,
)
async def acreate_structured_output( async def acreate_structured_output(
self, text_input: str, system_prompt: str, response_model: Type[BaseModel] self, text_input: str, system_prompt: str, response_model: Type[BaseModel]
) -> BaseModel: ) -> BaseModel:
@ -99,31 +107,3 @@ class MistralAdapter(LLMInterface):
logger.error(f"Schema validation failed: {str(e)}") logger.error(f"Schema validation failed: {str(e)}")
logger.debug(f"Raw response: {e.raw_response}") logger.debug(f"Raw response: {e.raw_response}")
raise ValueError(f"Response failed schema validation: {str(e)}") raise ValueError(f"Response failed schema validation: {str(e)}")
def show_prompt(self, text_input: str, system_prompt: str) -> str:
"""
Format and display the prompt for a user query.
Parameters:
-----------
- text_input (str): Input text from the user to be included in the prompt.
- system_prompt (str): The system prompt that will be shown alongside the user input.
Returns:
--------
- str: The formatted prompt string combining system prompt and user input.
"""
if not text_input:
text_input = "No user input provided."
if not system_prompt:
raise MissingSystemPromptPathError()
system_prompt = LLMGateway.read_query_prompt(system_prompt)
formatted_prompt = (
f"""System Prompt:\n{system_prompt}\n\nUser Input:\n{text_input}\n"""
if system_prompt
else None
)
return formatted_prompt

View file

@ -1,4 +1,6 @@
import base64 import base64
import litellm
import logging
import instructor import instructor
from typing import Type from typing import Type
from openai import OpenAI from openai import OpenAI
@ -7,11 +9,17 @@ from pydantic import BaseModel
from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.llm_interface import ( from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.llm_interface import (
LLMInterface, LLMInterface,
) )
from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.rate_limiter import (
rate_limit_async,
sleep_and_retry_async,
)
from cognee.infrastructure.files.utils.open_data_file import open_data_file from cognee.infrastructure.files.utils.open_data_file import open_data_file
from cognee.shared.logging_utils import get_logger
from tenacity import (
retry,
stop_after_delay,
wait_exponential_jitter,
retry_if_not_exception_type,
before_sleep_log,
)
logger = get_logger()
class OllamaAPIAdapter(LLMInterface): class OllamaAPIAdapter(LLMInterface):
@ -47,8 +55,13 @@ class OllamaAPIAdapter(LLMInterface):
OpenAI(base_url=self.endpoint, api_key=self.api_key), mode=instructor.Mode.JSON OpenAI(base_url=self.endpoint, api_key=self.api_key), mode=instructor.Mode.JSON
) )
@sleep_and_retry_async() @retry(
@rate_limit_async stop=stop_after_delay(128),
wait=wait_exponential_jitter(2, 128),
retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError),
before_sleep=before_sleep_log(logger, logging.DEBUG),
reraise=True,
)
async def acreate_structured_output( async def acreate_structured_output(
self, text_input: str, system_prompt: str, response_model: Type[BaseModel] self, text_input: str, system_prompt: str, response_model: Type[BaseModel]
) -> BaseModel: ) -> BaseModel:
@ -90,7 +103,13 @@ class OllamaAPIAdapter(LLMInterface):
return response return response
@rate_limit_async @retry(
stop=stop_after_delay(128),
wait=wait_exponential_jitter(2, 128),
retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError),
before_sleep=before_sleep_log(logger, logging.DEBUG),
reraise=True,
)
async def create_transcript(self, input_file: str) -> str: async def create_transcript(self, input_file: str) -> str:
""" """
Generate an audio transcript from a user query. Generate an audio transcript from a user query.
@ -123,7 +142,13 @@ class OllamaAPIAdapter(LLMInterface):
return transcription.text return transcription.text
@rate_limit_async @retry(
stop=stop_after_delay(128),
wait=wait_exponential_jitter(2, 128),
retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError),
before_sleep=before_sleep_log(logger, logging.DEBUG),
reraise=True,
)
async def transcribe_image(self, input_file: str) -> str: async def transcribe_image(self, input_file: str) -> str:
""" """
Transcribe content from an image using base64 encoding. Transcribe content from an image using base64 encoding.

View file

@ -7,6 +7,15 @@ from openai import ContentFilterFinishReasonError
from litellm.exceptions import ContentPolicyViolationError from litellm.exceptions import ContentPolicyViolationError
from instructor.core import InstructorRetryException from instructor.core import InstructorRetryException
import logging
from tenacity import (
retry,
stop_after_delay,
wait_exponential_jitter,
retry_if_not_exception_type,
before_sleep_log,
)
from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.llm_interface import ( from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.llm_interface import (
LLMInterface, LLMInterface,
) )
@ -14,19 +23,13 @@ from cognee.infrastructure.llm.exceptions import (
ContentPolicyFilterError, ContentPolicyFilterError,
) )
from cognee.infrastructure.files.utils.open_data_file import open_data_file from cognee.infrastructure.files.utils.open_data_file import open_data_file
from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.rate_limiter import (
rate_limit_async,
rate_limit_sync,
sleep_and_retry_async,
sleep_and_retry_sync,
)
from cognee.modules.observability.get_observe import get_observe from cognee.modules.observability.get_observe import get_observe
from cognee.shared.logging_utils import get_logger from cognee.shared.logging_utils import get_logger
observe = get_observe()
logger = get_logger() logger = get_logger()
observe = get_observe()
class OpenAIAdapter(LLMInterface): class OpenAIAdapter(LLMInterface):
""" """
@ -97,8 +100,13 @@ class OpenAIAdapter(LLMInterface):
self.fallback_endpoint = fallback_endpoint self.fallback_endpoint = fallback_endpoint
@observe(as_type="generation") @observe(as_type="generation")
@sleep_and_retry_async() @retry(
@rate_limit_async stop=stop_after_delay(128),
wait=wait_exponential_jitter(2, 128),
retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError),
before_sleep=before_sleep_log(logger, logging.DEBUG),
reraise=True,
)
async def acreate_structured_output( async def acreate_structured_output(
self, text_input: str, system_prompt: str, response_model: Type[BaseModel] self, text_input: str, system_prompt: str, response_model: Type[BaseModel]
) -> BaseModel: ) -> BaseModel:
@ -148,10 +156,7 @@ class OpenAIAdapter(LLMInterface):
InstructorRetryException, InstructorRetryException,
) as e: ) as e:
if not (self.fallback_model and self.fallback_api_key): if not (self.fallback_model and self.fallback_api_key):
raise ContentPolicyFilterError( raise e
f"The provided input contains content that is not aligned with our content policy: {text_input}"
) from e
try: try:
return await self.aclient.chat.completions.create( return await self.aclient.chat.completions.create(
model=self.fallback_model, model=self.fallback_model,
@ -186,8 +191,13 @@ class OpenAIAdapter(LLMInterface):
) from error ) from error
@observe @observe
@sleep_and_retry_sync() @retry(
@rate_limit_sync stop=stop_after_delay(128),
wait=wait_exponential_jitter(2, 128),
retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError),
before_sleep=before_sleep_log(logger, logging.DEBUG),
reraise=True,
)
def create_structured_output( def create_structured_output(
self, text_input: str, system_prompt: str, response_model: Type[BaseModel] self, text_input: str, system_prompt: str, response_model: Type[BaseModel]
) -> BaseModel: ) -> BaseModel:
@ -231,7 +241,13 @@ class OpenAIAdapter(LLMInterface):
max_retries=self.MAX_RETRIES, max_retries=self.MAX_RETRIES,
) )
@rate_limit_async @retry(
stop=stop_after_delay(128),
wait=wait_exponential_jitter(2, 128),
retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError),
before_sleep=before_sleep_log(logger, logging.DEBUG),
reraise=True,
)
async def create_transcript(self, input): async def create_transcript(self, input):
""" """
Generate an audio transcript from a user query. Generate an audio transcript from a user query.
@ -263,7 +279,13 @@ class OpenAIAdapter(LLMInterface):
return transcription return transcription
@rate_limit_async @retry(
stop=stop_after_delay(128),
wait=wait_exponential_jitter(2, 128),
retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError),
before_sleep=before_sleep_log(logger, logging.DEBUG),
reraise=True,
)
async def transcribe_image(self, input) -> BaseModel: async def transcribe_image(self, input) -> BaseModel:
""" """
Generate a transcription of an image from a user query. Generate a transcription of an image from a user query.

View file

@ -105,7 +105,6 @@ class LoaderEngine:
async def load_file( async def load_file(
self, self,
file_path: str, file_path: str,
file_stream: Optional[Any],
preferred_loaders: Optional[List[str]] = None, preferred_loaders: Optional[List[str]] = None,
**kwargs, **kwargs,
): ):

View file

@ -14,14 +14,6 @@ from cognee.infrastructure.loaders.external.pypdf_loader import PyPdfLoader
logger = get_logger(__name__) logger = get_logger(__name__)
try:
from unstructured.partition.pdf import partition_pdf
except ImportError as e:
logger.info(
"unstructured[pdf] not installed, can't use AdvancedPdfLoader, will use PyPdfLoader instead."
)
raise ImportError from e
@dataclass @dataclass
class _PageBuffer: class _PageBuffer:
@ -88,6 +80,8 @@ class AdvancedPdfLoader(LoaderInterface):
**kwargs, **kwargs,
} }
# Use partition to extract elements # Use partition to extract elements
from unstructured.partition.pdf import partition_pdf
elements = partition_pdf(**partition_kwargs) elements = partition_pdf(**partition_kwargs)
# Process elements into text content # Process elements into text content

View file

@ -88,6 +88,7 @@ async def run_tasks_distributed(
pipeline_name: str = "unknown_pipeline", pipeline_name: str = "unknown_pipeline",
context: Optional[dict] = None, context: Optional[dict] = None,
incremental_loading: bool = False, incremental_loading: bool = False,
data_per_batch: int = 20,
): ):
if not user: if not user:
user = await get_default_user() user = await get_default_user()

View file

@ -1,6 +1,6 @@
from cognee.shared.logging_utils import get_logger import asyncio
from cognee.infrastructure.databases.exceptions import EmbeddingException from cognee.shared.logging_utils import get_logger
from cognee.infrastructure.databases.vector import get_vector_engine from cognee.infrastructure.databases.vector import get_vector_engine
from cognee.infrastructure.engine import DataPoint from cognee.infrastructure.engine import DataPoint
@ -33,18 +33,23 @@ async def index_data_points(data_points: list[DataPoint]):
indexed_data_point.metadata["index_fields"] = [field_name] indexed_data_point.metadata["index_fields"] = [field_name]
index_points[index_name].append(indexed_data_point) index_points[index_name].append(indexed_data_point)
for index_name_and_field, indexable_points in index_points.items(): tasks: list[asyncio.Task] = []
first_occurence = index_name_and_field.index("_") batch_size = vector_engine.embedding_engine.get_batch_size()
index_name = index_name_and_field[:first_occurence]
field_name = index_name_and_field[first_occurence + 1 :] for index_name_and_field, points in index_points.items():
try: first = index_name_and_field.index("_")
# In case the amount of indexable points is too large we need to send them in batches index_name = index_name_and_field[:first]
batch_size = vector_engine.embedding_engine.get_batch_size() field_name = index_name_and_field[first + 1 :]
for i in range(0, len(indexable_points), batch_size):
batch = indexable_points[i : i + batch_size] # Create embedding requests per batch to run in parallel later
await vector_engine.index_data_points(index_name, field_name, batch) for i in range(0, len(points), batch_size):
except EmbeddingException as e: batch = points[i : i + batch_size]
logger.warning(f"Failed to index data points for {index_name}.{field_name}: {e}") tasks.append(
asyncio.create_task(vector_engine.index_data_points(index_name, field_name, batch))
)
# Run all embedding requests in parallel
await asyncio.gather(*tasks)
return data_points return data_points

View file

@ -1,3 +1,5 @@
import asyncio
from cognee.modules.engine.utils.generate_edge_id import generate_edge_id from cognee.modules.engine.utils.generate_edge_id import generate_edge_id
from cognee.shared.logging_utils import get_logger from cognee.shared.logging_utils import get_logger
from collections import Counter from collections import Counter
@ -76,15 +78,20 @@ async def index_graph_edges(
indexed_data_point.metadata["index_fields"] = [field_name] indexed_data_point.metadata["index_fields"] = [field_name]
index_points[index_name].append(indexed_data_point) index_points[index_name].append(indexed_data_point)
# Get maximum batch size for embedding model
batch_size = vector_engine.embedding_engine.get_batch_size()
tasks: list[asyncio.Task] = []
for index_name, indexable_points in index_points.items(): for index_name, indexable_points in index_points.items():
index_name, field_name = index_name.split(".") index_name, field_name = index_name.split(".")
# Get maximum batch size for embedding model # Create embedding tasks to run in parallel later
batch_size = vector_engine.embedding_engine.get_batch_size()
# We save the data in batches of {batch_size} to not put a lot of pressure on the database
for start in range(0, len(indexable_points), batch_size): for start in range(0, len(indexable_points), batch_size):
batch = indexable_points[start : start + batch_size] batch = indexable_points[start : start + batch_size]
await vector_engine.index_data_points(index_name, field_name, batch) tasks.append(vector_engine.index_data_points(index_name, field_name, batch))
# Start all embedding tasks and wait for completion
await asyncio.gather(*tasks)
return None return None

View file

@ -1,7 +1,6 @@
from typing import List from typing import List
from cognee.infrastructure.engine import DataPoint from cognee.infrastructure.engine import DataPoint
from cognee.tasks.storage.add_data_points import add_data_points from cognee.tasks.storage.add_data_points import add_data_points
from cognee.infrastructure.databases.graph.get_graph_engine import create_graph_engine
import cognee import cognee
from cognee.infrastructure.databases.graph import get_graph_engine from cognee.infrastructure.databases.graph import get_graph_engine
import json import json
@ -64,7 +63,6 @@ async def create_connected_test_graph():
async def get_metrics(provider: str, include_optional=True): async def get_metrics(provider: str, include_optional=True):
create_graph_engine.cache_clear()
cognee.config.set_graph_database_provider(provider) cognee.config.set_graph_database_provider(provider)
graph_engine = await get_graph_engine() graph_engine = await get_graph_engine()
await graph_engine.delete_graph() await graph_engine.delete_graph()

View file

@ -1,7 +1,12 @@
from cognee.tests.tasks.descriptive_metrics.metrics_test_utils import assert_metrics
import asyncio import asyncio
async def main():
from cognee.tests.tasks.descriptive_metrics.metrics_test_utils import assert_metrics
await assert_metrics(provider="neo4j", include_optional=False)
await assert_metrics(provider="neo4j", include_optional=True)
if __name__ == "__main__": if __name__ == "__main__":
asyncio.run(assert_metrics(provider="neo4j", include_optional=False)) asyncio.run(main())
asyncio.run(assert_metrics(provider="neo4j", include_optional=True))

View file

@ -47,26 +47,10 @@ async def main():
pathlib.Path(__file__).parent, "test_data/Quantum_computers.txt" pathlib.Path(__file__).parent, "test_data/Quantum_computers.txt"
) )
from cognee.infrastructure.databases.graph import get_graph_engine
graph_engine = await get_graph_engine()
is_empty = await graph_engine.is_empty()
assert is_empty, "Kuzu graph database is not empty"
await cognee.add([explanation_file_path_quantum], dataset_name) await cognee.add([explanation_file_path_quantum], dataset_name)
is_empty = await graph_engine.is_empty()
assert is_empty, "Kuzu graph database should be empty before cognify"
await cognee.cognify([dataset_name]) await cognee.cognify([dataset_name])
is_empty = await graph_engine.is_empty()
assert not is_empty, "Kuzu graph database should not be empty"
from cognee.infrastructure.databases.vector import get_vector_engine from cognee.infrastructure.databases.vector import get_vector_engine
vector_engine = get_vector_engine() vector_engine = get_vector_engine()
@ -130,10 +114,11 @@ async def main():
assert not os.path.isdir(data_root_directory), "Local data files are not deleted" assert not os.path.isdir(data_root_directory), "Local data files are not deleted"
await cognee.prune.prune_system(metadata=True) await cognee.prune.prune_system(metadata=True)
from cognee.infrastructure.databases.graph import get_graph_engine
is_empty = await graph_engine.is_empty() graph_engine = await get_graph_engine()
nodes, edges = await graph_engine.get_graph_data()
assert is_empty, "Kuzu graph database is not empty" assert len(nodes) == 0 and len(edges) == 0, "Kuzu graph database is not empty"
finally: finally:
# Ensure cleanup even if tests fail # Ensure cleanup even if tests fail

View file

@ -1,105 +0,0 @@
import os
import pathlib
import cognee
from cognee.infrastructure.files.storage import get_storage_config
from cognee.modules.search.operations import get_history
from cognee.modules.users.methods import get_default_user
from cognee.shared.logging_utils import get_logger
from cognee.modules.search.types import SearchType
logger = get_logger()
async def main():
cognee.config.set_graph_database_provider("memgraph")
data_directory_path = str(
pathlib.Path(
os.path.join(pathlib.Path(__file__).parent, ".data_storage/test_memgraph")
).resolve()
)
cognee.config.data_root_directory(data_directory_path)
cognee_directory_path = str(
pathlib.Path(
os.path.join(pathlib.Path(__file__).parent, ".cognee_system/test_memgraph")
).resolve()
)
cognee.config.system_root_directory(cognee_directory_path)
await cognee.prune.prune_data()
await cognee.prune.prune_system(metadata=True)
dataset_name = "cs_explanations"
explanation_file_path_nlp = os.path.join(
pathlib.Path(__file__).parent, "test_data/Natural_language_processing.txt"
)
await cognee.add([explanation_file_path_nlp], dataset_name)
explanation_file_path_quantum = os.path.join(
pathlib.Path(__file__).parent, "test_data/Quantum_computers.txt"
)
await cognee.add([explanation_file_path_quantum], dataset_name)
await cognee.cognify([dataset_name])
from cognee.infrastructure.databases.vector import get_vector_engine
vector_engine = get_vector_engine()
random_node = (await vector_engine.search("Entity_name", "Quantum computer"))[0]
random_node_name = random_node.payload["text"]
search_results = await cognee.search(
query_type=SearchType.GRAPH_COMPLETION, query_text=random_node_name
)
assert len(search_results) != 0, "The search results list is empty."
print("\n\nExtracted sentences are:\n")
for result in search_results:
print(f"{result}\n")
search_results = await cognee.search(query_type=SearchType.CHUNKS, query_text=random_node_name)
assert len(search_results) != 0, "The search results list is empty."
print("\n\nExtracted chunks are:\n")
for result in search_results:
print(f"{result}\n")
search_results = await cognee.search(
query_type=SearchType.SUMMARIES, query_text=random_node_name
)
assert len(search_results) != 0, "Query related summaries don't exist."
print("\nExtracted results are:\n")
for result in search_results:
print(f"{result}\n")
search_results = await cognee.search(
query_type=SearchType.NATURAL_LANGUAGE,
query_text=f"Find nodes connected to node with name {random_node_name}",
)
assert len(search_results) != 0, "Query related natural language don't exist."
print("\nExtracted results are:\n")
for result in search_results:
print(f"{result}\n")
user = await get_default_user()
history = await get_history(user.id)
assert len(history) == 8, "Search history is not correct."
await cognee.prune.prune_data()
data_root_directory = get_storage_config()["data_root_directory"]
assert not os.path.isdir(data_root_directory), "Local data files are not deleted"
await cognee.prune.prune_system(metadata=True)
from cognee.infrastructure.databases.graph import get_graph_engine
graph_engine = await get_graph_engine()
nodes, edges = await graph_engine.get_graph_data()
assert len(nodes) == 0 and len(edges) == 0, "Memgraph graph database is not empty"
if __name__ == "__main__":
import asyncio
asyncio.run(main())

View file

@ -35,14 +35,6 @@ async def main():
explanation_file_path_nlp = os.path.join( explanation_file_path_nlp = os.path.join(
pathlib.Path(__file__).parent, "test_data/Natural_language_processing.txt" pathlib.Path(__file__).parent, "test_data/Natural_language_processing.txt"
) )
from cognee.infrastructure.databases.graph import get_graph_engine
graph_engine = await get_graph_engine()
is_empty = await graph_engine.is_empty()
assert is_empty, "Graph has to be empty"
await cognee.add([explanation_file_path_nlp], dataset_name) await cognee.add([explanation_file_path_nlp], dataset_name)
explanation_file_path_quantum = os.path.join( explanation_file_path_quantum = os.path.join(
@ -50,16 +42,9 @@ async def main():
) )
await cognee.add([explanation_file_path_quantum], dataset_name) await cognee.add([explanation_file_path_quantum], dataset_name)
is_empty = await graph_engine.is_empty()
assert is_empty, "Graph has to be empty before cognify"
await cognee.cognify([dataset_name]) await cognee.cognify([dataset_name])
is_empty = await graph_engine.is_empty()
assert not is_empty, "Graph shouldn't be empty"
from cognee.infrastructure.databases.vector import get_vector_engine from cognee.infrastructure.databases.vector import get_vector_engine
vector_engine = get_vector_engine() vector_engine = get_vector_engine()
@ -132,8 +117,11 @@ async def main():
assert not os.path.isdir(data_root_directory), "Local data files are not deleted" assert not os.path.isdir(data_root_directory), "Local data files are not deleted"
await cognee.prune.prune_system(metadata=True) await cognee.prune.prune_system(metadata=True)
is_empty = await graph_engine.is_empty() from cognee.infrastructure.databases.graph import get_graph_engine
assert is_empty, "Neo4j graph database is not empty"
graph_engine = await get_graph_engine()
nodes, edges = await graph_engine.get_graph_data()
assert len(nodes) == 0 and len(edges) == 0, "Neo4j graph database is not empty"
if __name__ == "__main__": if __name__ == "__main__":

View file

@ -1,21 +0,0 @@
import pytest
import cognee
@pytest.mark.asyncio
async def test_empty_search_raises_SearchOnEmptyGraphError_on_empty_graph():
await cognee.prune.prune_data()
await cognee.prune.prune_system(metadata=True)
await cognee.add("Sample input")
result = await cognee.search("Sample query")
assert result == []
@pytest.mark.asyncio
async def test_empty_search_doesnt_raise_SearchOnEmptyGraphError():
await cognee.prune.prune_data()
await cognee.prune.prune_system(metadata=True)
await cognee.add("Sample input")
await cognee.cognify()
result = await cognee.search("Sample query")
assert result != []

View file

@ -83,16 +83,16 @@
] ]
}, },
{ {
"metadata": {},
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [ "source": [
"import os\n", "import os\n",
"import pathlib\n", "import pathlib\n",
"from cognee import config, add, cognify, search, SearchType, prune, visualize_graph\n", "from cognee import config, add, cognify, search, SearchType, prune, visualize_graph\n",
"from dotenv import load_dotenv" "from dotenv import load_dotenv"
], ]
"outputs": [],
"execution_count": null
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
@ -106,7 +106,9 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [],
"source": [ "source": [
"# load environment variables from file .env\n", "# load environment variables from file .env\n",
"load_dotenv()\n", "load_dotenv()\n",
@ -145,9 +147,7 @@
" \"vector_db_url\": f\"neptune-graph://{graph_identifier}\", # Neptune Analytics endpoint with the format neptune-graph://<GRAPH_ID>\n", " \"vector_db_url\": f\"neptune-graph://{graph_identifier}\", # Neptune Analytics endpoint with the format neptune-graph://<GRAPH_ID>\n",
" }\n", " }\n",
")" ")"
], ]
"outputs": [],
"execution_count": null
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
@ -159,19 +159,19 @@
] ]
}, },
{ {
"metadata": {},
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [ "source": [
"# Prune data and system metadata before running, only if we want \"fresh\" state.\n", "# Prune data and system metadata before running, only if we want \"fresh\" state.\n",
"await prune.prune_data()\n", "await prune.prune_data()\n",
"await prune.prune_system(metadata=True)" "await prune.prune_system(metadata=True)"
], ]
"outputs": [],
"execution_count": null
}, },
{ {
"metadata": {},
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {},
"source": [ "source": [
"## Setup data and cognify\n", "## Setup data and cognify\n",
"\n", "\n",
@ -180,7 +180,9 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [],
"source": [ "source": [
"# Add sample text to the dataset\n", "# Add sample text to the dataset\n",
"sample_text_1 = \"\"\"Neptune Analytics is a memory-optimized graph database engine for analytics. With Neptune\n", "sample_text_1 = \"\"\"Neptune Analytics is a memory-optimized graph database engine for analytics. With Neptune\n",
@ -205,9 +207,7 @@
"\n", "\n",
"# Cognify the text data.\n", "# Cognify the text data.\n",
"await cognify([dataset_name])" "await cognify([dataset_name])"
], ]
"outputs": [],
"execution_count": null
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
@ -215,14 +215,16 @@
"source": [ "source": [
"## Graph Memory visualization\n", "## Graph Memory visualization\n",
"\n", "\n",
"Initialize Memgraph as a Graph Memory store and save to .artefacts/graph_visualization.html\n", "Initialize Neptune as a Graph Memory store and save to .artefacts/graph_visualization.html\n",
"\n", "\n",
"![visualization](./neptune_analytics_demo.png)" "![visualization](./neptune_analytics_demo.png)"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [],
"source": [ "source": [
"# Get a graphistry url (Register for a free account at https://www.graphistry.com)\n", "# Get a graphistry url (Register for a free account at https://www.graphistry.com)\n",
"# url = await render_graph()\n", "# url = await render_graph()\n",
@ -235,9 +237,7 @@
" ).resolve()\n", " ).resolve()\n",
")\n", ")\n",
"await visualize_graph(graph_file_path)" "await visualize_graph(graph_file_path)"
], ]
"outputs": [],
"execution_count": null
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
@ -250,19 +250,19 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [],
"source": [ "source": [
"# Completion query that uses graph data to form context.\n", "# Completion query that uses graph data to form context.\n",
"graph_completion = await search(query_text=\"What is Neptune Analytics?\", query_type=SearchType.GRAPH_COMPLETION)\n", "graph_completion = await search(query_text=\"What is Neptune Analytics?\", query_type=SearchType.GRAPH_COMPLETION)\n",
"print(\"\\nGraph completion result is:\")\n", "print(\"\\nGraph completion result is:\")\n",
"print(graph_completion)" "print(graph_completion)"
], ]
"outputs": [],
"execution_count": null
}, },
{ {
"metadata": {},
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {},
"source": [ "source": [
"## SEARCH: RAG Completion\n", "## SEARCH: RAG Completion\n",
"\n", "\n",
@ -271,19 +271,19 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [],
"source": [ "source": [
"# Completion query that uses document chunks to form context.\n", "# Completion query that uses document chunks to form context.\n",
"rag_completion = await search(query_text=\"What is Neptune Analytics?\", query_type=SearchType.RAG_COMPLETION)\n", "rag_completion = await search(query_text=\"What is Neptune Analytics?\", query_type=SearchType.RAG_COMPLETION)\n",
"print(\"\\nRAG Completion result is:\")\n", "print(\"\\nRAG Completion result is:\")\n",
"print(rag_completion)" "print(rag_completion)"
], ]
"outputs": [],
"execution_count": null
}, },
{ {
"metadata": {},
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {},
"source": [ "source": [
"## SEARCH: Graph Insights\n", "## SEARCH: Graph Insights\n",
"\n", "\n",
@ -291,8 +291,10 @@
] ]
}, },
{ {
"metadata": {},
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [ "source": [
"# Search graph insights\n", "# Search graph insights\n",
"insights_results = await search(query_text=\"Neptune Analytics\", query_type=SearchType.GRAPH_COMPLETION)\n", "insights_results = await search(query_text=\"Neptune Analytics\", query_type=SearchType.GRAPH_COMPLETION)\n",
@ -302,13 +304,11 @@
" tgt_node = result[2].get(\"name\", result[2][\"type\"])\n", " tgt_node = result[2].get(\"name\", result[2][\"type\"])\n",
" relationship = result[1].get(\"relationship_name\", \"__relationship__\")\n", " relationship = result[1].get(\"relationship_name\", \"__relationship__\")\n",
" print(f\"- {src_node} -[{relationship}]-> {tgt_node}\")" " print(f\"- {src_node} -[{relationship}]-> {tgt_node}\")"
], ]
"outputs": [],
"execution_count": null
}, },
{ {
"metadata": {},
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {},
"source": [ "source": [
"## SEARCH: Entity Summaries\n", "## SEARCH: Entity Summaries\n",
"\n", "\n",
@ -316,8 +316,10 @@
] ]
}, },
{ {
"metadata": {},
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [ "source": [
"# Query all summaries related to query.\n", "# Query all summaries related to query.\n",
"summaries = await search(query_text=\"Neptune Analytics\", query_type=SearchType.SUMMARIES)\n", "summaries = await search(query_text=\"Neptune Analytics\", query_type=SearchType.SUMMARIES)\n",
@ -326,13 +328,11 @@
" type = summary[\"type\"]\n", " type = summary[\"type\"]\n",
" text = summary[\"text\"]\n", " text = summary[\"text\"]\n",
" print(f\"- {type}: {text}\")" " print(f\"- {type}: {text}\")"
], ]
"outputs": [],
"execution_count": null
}, },
{ {
"metadata": {},
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {},
"source": [ "source": [
"## SEARCH: Chunks\n", "## SEARCH: Chunks\n",
"\n", "\n",
@ -340,8 +340,10 @@
] ]
}, },
{ {
"metadata": {},
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [ "source": [
"chunks = await search(query_text=\"Neptune Analytics\", query_type=SearchType.CHUNKS)\n", "chunks = await search(query_text=\"Neptune Analytics\", query_type=SearchType.CHUNKS)\n",
"print(\"\\nChunk results are:\")\n", "print(\"\\nChunk results are:\")\n",
@ -349,9 +351,7 @@
" type = chunk[\"type\"]\n", " type = chunk[\"type\"]\n",
" text = chunk[\"text\"]\n", " text = chunk[\"text\"]\n",
" print(f\"- {type}: {text}\")" " print(f\"- {type}: {text}\")"
], ]
"outputs": [],
"execution_count": null
} }
], ],
"metadata": { "metadata": {

12
poetry.lock generated
View file

@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand. # This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand.
[[package]] [[package]]
name = "accelerate" name = "accelerate"
@ -6631,7 +6631,7 @@ description = "Fundamental package for array computing in Python"
optional = false optional = false
python-versions = ">=3.11" python-versions = ">=3.11"
groups = ["main"] groups = ["main"]
markers = "python_version == \"3.12\" or python_full_version == \"3.13.0\"" markers = "python_version >= \"3.12\""
files = [ files = [
{file = "numpy-2.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0ffc4f5caba7dfcbe944ed674b7eef683c7e94874046454bb79ed7ee0236f59d"}, {file = "numpy-2.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0ffc4f5caba7dfcbe944ed674b7eef683c7e94874046454bb79ed7ee0236f59d"},
{file = "numpy-2.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e7e946c7170858a0295f79a60214424caac2ffdb0063d4d79cb681f9aa0aa569"}, {file = "numpy-2.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e7e946c7170858a0295f79a60214424caac2ffdb0063d4d79cb681f9aa0aa569"},
@ -11199,7 +11199,7 @@ description = "Easily download, build, install, upgrade, and uninstall Python pa
optional = true optional = true
python-versions = ">=3.9" python-versions = ">=3.9"
groups = ["main"] groups = ["main"]
markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and (extra == \"docs\" or extra == \"docling\" or extra == \"notebook\" or extra == \"dev\" or extra == \"llama-index\" or extra == \"deepeval\" or extra == \"dlt\") or python_version == \"3.12\" and (extra == \"notebook\" or extra == \"dev\" or extra == \"llama-index\" or extra == \"deepeval\" or extra == \"dlt\" or extra == \"docs\" or extra == \"docling\") or python_full_version == \"3.13.0\" and (extra == \"notebook\" or extra == \"dev\" or extra == \"llama-index\" or extra == \"deepeval\" or extra == \"dlt\" or extra == \"docs\" or extra == \"docling\") or extra == \"notebook\" or extra == \"dev\" or extra == \"llama-index\" or extra == \"deepeval\" or extra == \"dlt\"" markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and (extra == \"docs\" or extra == \"docling\" or extra == \"notebook\" or extra == \"dev\" or extra == \"llama-index\" or extra == \"deepeval\" or extra == \"dlt\") or python_version >= \"3.12\" and (extra == \"notebook\" or extra == \"dev\" or extra == \"llama-index\" or extra == \"deepeval\" or extra == \"dlt\" or extra == \"docs\" or extra == \"docling\") or extra == \"notebook\" or extra == \"dev\" or extra == \"llama-index\" or extra == \"deepeval\" or extra == \"dlt\""
files = [ files = [
{file = "setuptools-80.9.0-py3-none-any.whl", hash = "sha256:062d34222ad13e0cc312a4c02d73f059e86a4acbfbdea8f8f76b28c99f306922"}, {file = "setuptools-80.9.0-py3-none-any.whl", hash = "sha256:062d34222ad13e0cc312a4c02d73f059e86a4acbfbdea8f8f76b28c99f306922"},
{file = "setuptools-80.9.0.tar.gz", hash = "sha256:f36b47402ecde768dbfafc46e8e4207b4360c654f1f3bb84475f0a28628fb19c"}, {file = "setuptools-80.9.0.tar.gz", hash = "sha256:f36b47402ecde768dbfafc46e8e4207b4360c654f1f3bb84475f0a28628fb19c"},
@ -13506,7 +13506,7 @@ dev = ["coverage", "deptry", "gitpython", "mkdocs-material", "mkdocs-minify-plug
distributed = ["modal"] distributed = ["modal"]
dlt = ["dlt"] dlt = ["dlt"]
docling = ["docling", "transformers"] docling = ["docling", "transformers"]
docs = ["unstructured"] docs = ["lxml", "unstructured"]
evals = ["gdown", "matplotlib", "pandas", "plotly", "scikit-learn"] evals = ["gdown", "matplotlib", "pandas", "plotly", "scikit-learn"]
graphiti = ["graphiti-core"] graphiti = ["graphiti-core"]
groq = ["groq"] groq = ["groq"]
@ -13527,5 +13527,5 @@ scraping = ["APScheduler", "beautifulsoup4", "lxml", "playwright", "protego", "t
[metadata] [metadata]
lock-version = "2.1" lock-version = "2.1"
python-versions = ">=3.10,<=3.13" python-versions = ">=3.10,<3.14"
content-hash = "8d8172ac8ddc3c30ca79a1677ecf2a28897d52c0a564d8fb5646c8565c313a0f" content-hash = "bcab5420339473ec08b89cde588899b60999762fb8ca9a011240d47ea86198e3"

View file

@ -7,7 +7,7 @@ authors = [
{ name = "Vasilije Markovic" }, { name = "Vasilije Markovic" },
{ name = "Boris Arzentar" }, { name = "Boris Arzentar" },
] ]
requires-python = ">=3.10,<=3.13" requires-python = ">=3.10,<3.14"
readme = "README.md" readme = "README.md"
license = "Apache-2.0" license = "Apache-2.0"
classifiers = [ classifiers = [
@ -56,6 +56,7 @@ dependencies = [
"gunicorn>=20.1.0,<24", "gunicorn>=20.1.0,<24",
"websockets>=15.0.1,<16.0.0", "websockets>=15.0.1,<16.0.0",
"mistralai>=1.9.10", "mistralai>=1.9.10",
"tenacity>=9.0.0",
] ]
[project.optional-dependencies] [project.optional-dependencies]
@ -64,14 +65,16 @@ api=[]
distributed = [ distributed = [
"modal>=1.0.5,<2.0.0", "modal>=1.0.5,<2.0.0",
] ]
scraping = [ scraping = [
"tavily-python>=0.7.0", "tavily-python>=0.7.12",
"beautifulsoup4>=4.13.1", "beautifulsoup4>=4.13.1",
"playwright>=1.9.0", "playwright>=1.9.0",
"lxml>=4.9.3,<5.0.0", "lxml>=4.9.3",
"protego>=0.1", "protego>=0.1",
"APScheduler>=3.10.0,<=3.11.0" "APScheduler>=3.10.0,<=3.11.0"
] ]
neo4j = ["neo4j>=5.28.0,<6"] neo4j = ["neo4j>=5.28.0,<6"]
neptune = ["langchain_aws>=0.2.22"] neptune = ["langchain_aws>=0.2.22"]
postgres = [ postgres = [
@ -101,7 +104,7 @@ chromadb = [
"chromadb>=0.6,<0.7", "chromadb>=0.6,<0.7",
"pypika==0.48.9", "pypika==0.48.9",
] ]
docs = ["unstructured[csv, doc, docx, epub, md, odt, org, ppt, pptx, rst, rtf, tsv, xlsx, pdf]>=0.18.1,<19"] docs = ["lxml<6.0.0", "unstructured[csv, doc, docx, epub, md, odt, org, ppt, pptx, rst, rtf, tsv, xlsx, pdf]>=0.18.1,<19"]
codegraph = [ codegraph = [
"fastembed<=0.6.0 ; python_version < '3.13'", "fastembed<=0.6.0 ; python_version < '3.13'",
"transformers>=4.46.3,<5", "transformers>=4.46.3,<5",

2458
uv.lock generated

File diff suppressed because it is too large Load diff