Merge branch 'main' into COG-698

This commit is contained in:
Vasilije 2024-12-11 13:17:15 +01:00 committed by GitHub
commit 0f0e34e097
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
60 changed files with 460 additions and 1399 deletions

.github/dependabot.yaml vendored Normal file

@ -0,0 +1,35 @@
# Configuration: https://dependabot.com/docs/config-file/
# Docs: https://docs.github.com/en/github/administering-a-repository/keeping-your-dependencies-updated-automatically
version: 2
updates:
  - package-ecosystem: "pip"
    directory: "/"
    schedule:
      interval: "weekly"
    allow:
      - dependency-type: "all"
    commit-message:
      prefix: ":arrow_up:"
    open-pull-requests-limit: 50
  - package-ecosystem: "github-actions"
    directory: "/"
    schedule:
      interval: "weekly"
    allow:
      - dependency-type: "all"
    commit-message:
      prefix: ":arrow_up:"
    open-pull-requests-limit: 50
  - package-ecosystem: "docker"
    directory: "/"
    schedule:
      interval: "weekly"
    allow:
      - dependency-type: "all"
    commit-message:
      prefix: ":arrow_up:"
    open-pull-requests-limit: 50


@ -0,0 +1,16 @@
name: community | Greetings
on: [pull_request, issues]
jobs:
  greeting:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/first-interaction@v1
        with:
          repo-token: ${{ secrets.GITHUB_TOKEN }}
          pr-message: 'Hello @${{ github.actor }}, thank you for submitting a PR! We will respond as soon as possible.'
          issue-message: |
            Hello @${{ github.actor }}, thank you for your interest in our work!
            If this is a bug report, please provide screenshots and **minimum viable code to reproduce your issue**, otherwise we cannot help you.


@ -1,36 +0,0 @@
name: analytics | Update Cognee Stats Daily
on:
  schedule:
    - cron: '0 1 * * *' # Runs every day at 01:00 UTC
jobs:
  update_stats:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout Repository
        uses: actions/checkout@v3
        with:
          persist-credentials: false
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.x'
      - name: Install Dependencies
        run: |
          pip install requests posthog
      - name: Run Update Script
        env:
          POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
          POSTHOG_API_HOST: ${{ secrets.POSTHOG_API_HOST }}
        run: |
          cd tools # Change to the 'tools' directory
          echo "Current working directory after changing to tools:"
          pwd # Print the working directory again
          echo "List of folders in the tools directory:"
          ls -la # List all files and folders in the 'tools' directory
          python daily_pypi_downloads.py # Run the script


@ -1,44 +0,0 @@
name: analytics | Push GitHub Data to PostHog
on:
  schedule:
    - cron: '0 0 * * *' # Runs every day at midnight
  workflow_dispatch:
jobs:
  push-data:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.x'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install requests posthog
      - name: Print working directory, list folders, and run script
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
          POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
          GITHUB_REPOSITORY: ${{ github.repository }}
        run: |
          echo "Current working directory:"
          pwd # Print the current working directory
          echo "List of folders in the current directory:"
          ls -la # List all files and folders in the current directory
          echo "Changing to tools directory..."
          cd tools # Change to the 'tools' directory
          echo "Current working directory after changing to tools:"
          pwd # Print the working directory again
          echo "List of folders in the tools directory:"
          ls -la # List all files and folders in the 'tools' directory
          python push_to_posthog.py # Run the script


@ -46,7 +46,7 @@ jobs:
      - name: Load cached venv
        id: cached-poetry-dependencies
        uses: actions/cache@v3
        uses: actions/cache@v4
        with:
          path: .venv
          key: venv-${{ matrix.os }}-${{ matrix.python-version }}-${{ hashFiles('**/poetry.lock') }}


@ -11,7 +11,7 @@ jobs:
      - name: Checkout
        uses: actions/checkout@v3
      - name: Github Releases To Discord
        uses: SethCohen/github-releases-to-discord@v1.15.0
        uses: SethCohen/github-releases-to-discord@v1.16.2
        with:
          webhook_url: ${{ secrets.WEBHOOK_URL }}
          color: "2105893"


@ -40,7 +40,7 @@ class CogneeGraph(CogneeAbstractGraph):
            edge.node1.add_skeleton_edge(edge)
            edge.node2.add_skeleton_edge(edge)
        else:
            raise EntityAlreadyExistsError(message=f"Edge {edge} already exists in the graph.")
            print(f"Edge {edge} already exists in the graph.")

    def get_node(self, node_id: str) -> Node:
        return self.nodes.get(node_id, None)


@ -65,6 +65,12 @@ class Node:
    def get_attribute(self, key: str) -> Union[str, int, float]:
        return self.attributes[key]

    def get_skeleton_edges(self):
        return self.skeleton_edges

    def get_skeleton_neighbours(self):
        return self.skeleton_neighbours

    def __repr__(self) -> str:
        return f"Node({self.id}, attributes={self.attributes})"
@ -109,8 +115,14 @@ class Edge:
    def add_attribute(self, key: str, value: Any) -> None:
        self.attributes[key] = value

    def get_attribute(self, key: str, value: Any) -> Union[str, int, float]:
        return self.attributes[key]

    def get_attribute(self, key: str) -> Optional[Union[str, int, float]]:
        return self.attributes.get(key)

    def get_source_node(self):
        return self.node1

    def get_destination_node(self):
        return self.node2

    def __repr__(self) -> str:
        direction = "->" if self.directed else "--"


@ -0,0 +1,116 @@
import asyncio
import logging
from typing import Set, List

from cognee.infrastructure.databases.graph import get_graph_engine
from cognee.infrastructure.databases.vector import get_vector_engine
from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph
from cognee.modules.users.methods import get_default_user
from cognee.modules.users.models import User
from cognee.shared.utils import send_telemetry


async def code_description_to_code_part_search(query: str, user: User = None, top_k: int = 2) -> list:
    if user is None:
        user = await get_default_user()

    if user is None:
        raise PermissionError("No user found in the system. Please create a user.")

    retrieved_codeparts = await code_description_to_code_part(query, user, top_k)
    return retrieved_codeparts


async def code_description_to_code_part(
    query: str,
    user: User,
    top_k: int
) -> List[str]:
    """
    Maps a code description query to relevant code parts using a CodeGraph pipeline.

    Args:
        query (str): The search query describing the code parts.
        user (User): The user performing the search.
        top_k (int): Number of code-graph descriptions to match (the number of corresponding code parts returned can be higher).

    Returns:
        List[str]: A list of unique code parts matching the query.

    Raises:
        ValueError: If arguments are invalid.
        RuntimeError: If an unexpected error occurs during execution.
    """
    if not query or not isinstance(query, str):
        raise ValueError("The query must be a non-empty string.")
    if not isinstance(top_k, int) or top_k <= 0:
        raise ValueError("top_k must be a positive integer.")

    try:
        vector_engine = get_vector_engine()
        graph_engine = await get_graph_engine()
    except Exception as init_error:
        logging.error("Failed to initialize engines: %s", init_error, exc_info=True)
        raise RuntimeError("System initialization error. Please try again later.") from init_error

    send_telemetry("code_description_to_code_part_search EXECUTION STARTED", user.id)
    logging.info("Search initiated by user %s with query: '%s' and top_k: %d", user.id, query, top_k)

    try:
        results = await vector_engine.search(
            "code_summary_text", query_text=query, limit=top_k
        )
        if not results:
            logging.warning("No results found for query: '%s' by user: %s", query, user.id)
            return []

        memory_fragment = CogneeGraph()
        await memory_fragment.project_graph_from_db(
            graph_engine,
            node_properties_to_project=['id', 'type', 'text', 'source_code'],
            edge_properties_to_project=['relationship_name']
        )

        code_pieces_to_return = set()

        for node in results:
            node_id = str(node.id)
            node_to_search_from = memory_fragment.get_node(node_id)

            if not node_to_search_from:
                logging.debug("Node %s not found in memory fragment graph", node_id)
                continue

            for code_file in node_to_search_from.get_skeleton_neighbours():
                for code_file_edge in code_file.get_skeleton_edges():
                    if code_file_edge.get_attribute('relationship_name') == 'contains':
                        code_pieces_to_return.add(code_file_edge.get_destination_node())

        logging.info("Search completed for user: %s, query: '%s'. Found %d code pieces.",
                     user.id, query, len(code_pieces_to_return))

        return list(code_pieces_to_return)

    except Exception as exec_error:
        logging.error(
            "Error during code description to code part search for user: %s, query: '%s'. Error: %s",
            user.id, query, exec_error, exc_info=True
        )
        send_telemetry("code_description_to_code_part_search EXECUTION FAILED", user.id)
        raise RuntimeError("An error occurred while processing your request.") from exec_error


if __name__ == "__main__":
    async def main():
        query = "I am looking for a class with blue eyes"
        user = None

        try:
            results = await code_description_to_code_part_search(query, user)
            print("Retrieved Code Parts:", results)
        except Exception as e:
            print(f"An error occurred: {e}")

    asyncio.run(main())


@ -14,6 +14,7 @@ class TextSummary(DataPoint):
class CodeSummary(DataPoint):
    __tablename__ = "code_summary"
    text: str
    made_from: CodeFile


@ -42,19 +42,6 @@ def test_add_edge_success(setup_graph):
    assert edge in node2.skeleton_edges


def test_add_duplicate_edge(setup_graph):
    """Test adding a duplicate edge raises an exception."""
    graph = setup_graph
    node1 = Node("node1")
    node2 = Node("node2")
    graph.add_node(node1)
    graph.add_node(node2)
    edge = Edge(node1, node2)
    graph.add_edge(edge)
    with pytest.raises(EntityAlreadyExistsError, match="Edge .* already exists in the graph."):
        graph.add_edge(edge)


def test_get_node_success(setup_graph):
    """Test retrieving an existing node."""
    graph = setup_graph


@ -0,0 +1,76 @@
import pytest
from unittest.mock import AsyncMock, patch


@pytest.mark.asyncio
async def test_code_description_to_code_part_no_results():
    """Test that code_description_to_code_part handles no search results."""
    mock_user = AsyncMock()
    mock_user.id = "user123"
    mock_vector_engine = AsyncMock()
    mock_vector_engine.search.return_value = []

    with patch("cognee.modules.retrieval.description_to_codepart_search.get_vector_engine", return_value=mock_vector_engine), \
         patch("cognee.modules.retrieval.description_to_codepart_search.get_graph_engine", return_value=AsyncMock()), \
         patch("cognee.modules.retrieval.description_to_codepart_search.CogneeGraph", return_value=AsyncMock()):

        from cognee.modules.retrieval.description_to_codepart_search import code_description_to_code_part

        result = await code_description_to_code_part("search query", mock_user, 2)

        assert result == []


@pytest.mark.asyncio
async def test_code_description_to_code_part_invalid_query():
    """Test that code_description_to_code_part raises ValueError for invalid query."""
    mock_user = AsyncMock()

    with pytest.raises(ValueError, match="The query must be a non-empty string."):
        from cognee.modules.retrieval.description_to_codepart_search import code_description_to_code_part
        await code_description_to_code_part("", mock_user, 2)


@pytest.mark.asyncio
async def test_code_description_to_code_part_invalid_top_k():
    """Test that code_description_to_code_part raises ValueError for invalid top_k."""
    mock_user = AsyncMock()

    with pytest.raises(ValueError, match="top_k must be a positive integer."):
        from cognee.modules.retrieval.description_to_codepart_search import code_description_to_code_part
        await code_description_to_code_part("search query", mock_user, 0)


@pytest.mark.asyncio
async def test_code_description_to_code_part_initialization_error():
    """Test that code_description_to_code_part raises RuntimeError for engine initialization errors."""
    mock_user = AsyncMock()

    with patch("cognee.modules.retrieval.description_to_codepart_search.get_vector_engine", side_effect=Exception("Engine init failed")), \
         patch("cognee.modules.retrieval.description_to_codepart_search.get_graph_engine", return_value=AsyncMock()):

        from cognee.modules.retrieval.description_to_codepart_search import code_description_to_code_part

        with pytest.raises(RuntimeError, match="System initialization error. Please try again later."):
            await code_description_to_code_part("search query", mock_user, 2)


@pytest.mark.asyncio
async def test_code_description_to_code_part_execution_error():
    """Test that code_description_to_code_part raises RuntimeError for execution errors."""
    mock_user = AsyncMock()
    mock_user.id = "user123"
    mock_vector_engine = AsyncMock()
    mock_vector_engine.search.side_effect = Exception("Execution error")

    with patch("cognee.modules.retrieval.description_to_codepart_search.get_vector_engine", return_value=mock_vector_engine), \
         patch("cognee.modules.retrieval.description_to_codepart_search.get_graph_engine", return_value=AsyncMock()), \
         patch("cognee.modules.retrieval.description_to_codepart_search.CogneeGraph", return_value=AsyncMock()):

        from cognee.modules.retrieval.description_to_codepart_search import code_description_to_code_part

        with pytest.raises(RuntimeError, match="An error occurred while processing your request."):
            await code_description_to_code_part("search query", mock_user, 2)


@ -1,299 +0,0 @@
# Cognee API Reference
## Overview
The Cognee API provides a set of endpoints for managing datasets, performing cognitive tasks, and configuring various settings in the system. The API is built on FastAPI and includes multiple routes to handle different functionalities. This reference outlines the available endpoints and their usage.
## Base URL
The base URL for all API requests is determined by the server's deployment environment. Typically, this will be:
- **Development**: `http://localhost:8000`
- **Production**: Depending on your server setup.
## Endpoints
### 1. Root
- **URL**: `/`
- **Method**: `GET`
- **Auth Required**: No
- **Description**: Root endpoint that returns a welcome message.
**Response**:
```json
{
"status": 200,
"body": {
"message": "Hello, World, I am alive!"
}
}
```
### 2. Health Check
- **URL**: `/health`
- **Method**: `GET`
- **Auth Required**: No
- **Description**: Health check endpoint that returns the server status.
**Response**:
```json
{
"status": 200
}
```
### 3. Get Datasets
- **URL**: `/datasets`
- **Method**: `GET`
- **Auth Required**: No
- **Description**: Retrieve a list of available datasets.
**Response**:
```json
{
"status": 200,
"body": [
{
"id": "dataset_id_1",
"name": "Dataset Name 1",
"description": "Description of Dataset 1",
...
},
...
]
}
```
### 4. Delete Dataset
- **URL**: `/datasets/{dataset_id}`
- **Method**: `DELETE`
- **Auth Required**: No
- **Description**: Delete a specific dataset by its ID.
**Path Parameters**:
- `dataset_id`: The ID of the dataset to delete.
**Response**:
```json
{
"status": 200
}
```
### 5. Get Dataset Graph
- **URL**: `/datasets/{dataset_id}/graph`
- **Method**: `GET`
- **Auth Required**: No
- **Description**: Retrieve the graph visualization URL for a specific dataset.
**Path Parameters**:
- `dataset_id`: The ID of the dataset.
**Response**:
```json
"http://example.com/path/to/graph"
```
### 6. Get Dataset Data
- **URL**: `/datasets/{dataset_id}/data`
- **Method**: `GET`
- **Auth Required**: No
- **Description**: Retrieve data associated with a specific dataset.
**Path Parameters**:
- `dataset_id`: The ID of the dataset.
**Response**:
```json
{
"status": 200,
"body": [
{
"data_id": "data_id_1",
"content": "Data content here",
...
},
...
]
}
```
### 7. Get Dataset Status
- **URL**: `/datasets/status`
- **Method**: `GET`
- **Auth Required**: No
- **Description**: Retrieve the status of one or more datasets.
**Query Parameters**:
- `dataset`: A list of dataset IDs to check status for.
**Response**:
```json
{
"status": 200,
"body": {
"dataset_id_1": "Status 1",
"dataset_id_2": "Status 2",
...
}
}
```
### 8. Get Raw Data
- **URL**: `/datasets/{dataset_id}/data/{data_id}/raw`
- **Method**: `GET`
- **Auth Required**: No
- **Description**: Retrieve the raw data file for a specific data entry in a dataset.
**Path Parameters**:
- `dataset_id`: The ID of the dataset.
- `data_id`: The ID of the data entry.
**Response**: Raw file download.
### 9. Add Data
- **URL**: `/add`
- **Method**: `POST`
- **Auth Required**: No
- **Description**: Add new data to a dataset. The data can be uploaded from a file or a URL.
**Form Parameters**:
- `datasetId`: The ID of the dataset to add data to.
- `data`: A list of files to upload.
**Request**
```json
{
"dataset_id": "ID_OF_THE_DATASET_TO_PUT_DATA_IN", // Optional, we use "main" as default.
"files": File[]
}
```
**Response**:
```json
{
"status": 200
}
```
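For illustration, a minimal client call for this endpoint might look like the following sketch. It assumes a local server at `http://localhost:8000` and uses the form field names listed above; adjust both for your deployment.
```python
import requests

# Illustrative upload to the /add endpoint. "datasetId" and "data" are the
# form parameters documented above; the base URL is an assumption for a
# local development server.
BASE_URL = "http://localhost:8000"

with open("example.pdf", "rb") as file:
    response = requests.post(
        f"{BASE_URL}/add",
        data={"datasetId": "main"},
        files=[("data", ("example.pdf", file, "application/pdf"))],
    )

print(response.status_code)
```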
### 10. Cognify
- **URL**: `/cognify`
- **Method**: `POST`
- **Auth Required**: No
- **Description**: Perform cognitive processing on the specified datasets.
**Request Body**:
```json
{
"datasets": ["ID_OF_THE_DATASET_1", "ID_OF_THE_DATASET_2", ...]
}
```
**Response**:
```json
{
"status": 200
}
```
### 11. Search
- **URL**: `/search`
- **Method**: `POST`
- **Auth Required**: No
- **Description**: Search for nodes in the graph based on the provided query parameters.
**Request Body**:
```json
{
"searchType": "INSIGHTS", // Or "SUMMARIES" or "CHUNKS"
"query": "QUERY_TO_MATCH_DATA"
}
```
**Response**
For "INSIGHTS" search type:
```json
{
"status": 200,
"body": [[
{ "name" "source_node_name" },
{ "relationship_name" "between_nodes_relationship_name" },
{ "name" "target_node_name" },
]]
}
```
For "SUMMARIES" search type:
```json
{
"status": 200,
"body": [
{ "text" "summary_text" },
{ "text" "summary_text" },
{ "text" "summary_text" },
...
]
}
```
For "CHUNKS" search type:
```json
{
"status": 200,
"body": [
{ "text" "chunk_text" },
{ "text" "chunk_text" },
{ "text" "chunk_text" },
...
]
}
```
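To tie the last few endpoints together, here is a hedged end-to-end sketch: trigger processing with `/cognify`, then query with `/search`. The base URL, dataset ID, and response handling are assumptions based on the examples above.
```python
import requests

# Illustrative sketch of driving the /cognify and /search endpoints.
BASE_URL = "http://localhost:8000"  # assumption: local development server

# Process the "main" dataset (dataset ID is illustrative).
requests.post(f"{BASE_URL}/cognify", json={"datasets": ["main"]})

# Query the knowledge graph for insights.
search_response = requests.post(
    f"{BASE_URL}/search",
    json={"searchType": "INSIGHTS", "query": "Tell me about NLP"},
)

# Response shape follows the examples documented above.
for item in search_response.json()["body"]:
    print(item)
```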
### 12. Get Settings
- **URL**: `/settings`
- **Method**: `GET`
- **Auth Required**: No
- **Description**: Retrieve the current system settings.
**Response**:
```json
{
"status": 200,
"body": {
"llm": {...},
"vectorDB": {...},
...
}
}
```
### 13. Save Settings
- **URL**: `/settings`
- **Method**: `POST`
- **Auth Required**: No
- **Description**: Save new settings for the system, including LLM and vector DB configurations.
**Request Body**:
- `llm`: Optional. The configuration for the LLM provider.
- `vectorDB`: Optional. The configuration for the vector database provider.
**Response**:
```json
{
"status": 200
}
```


@ -1,6 +0,0 @@
Graph data models are fundamental structures used to represent and store data in the form of graphs, which consist of nodes (or vertices) and edges (or links). This model is particularly effective for illustrating relationships and connections among various data entities, making it invaluable in domains such as social networks, recommendation systems, logistics, biological networks, and more. Here's an overview of key concepts and types of graph data models:
Key Concepts:
Nodes (Vertices): Represent entities or objects within the graph, such as people in a social network, stations in a transportation map, or proteins in biological networks.
Edges (Links): Depict the relationships or interactions between nodes. Edges can be directed (indicating a one-way relationship) or undirected (indicating a mutual relationship).
Properties: Both nodes and edges can have properties (key-value pairs) that provide additional information, such as weights, types, or other attributes relevant to the application.
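To make these terms concrete, here is a minimal, dependency-free sketch of a property graph in Python; the node and edge names are invented purely for illustration.
```python
# Nodes: entities identified by a key, each carrying properties (key-value pairs).
nodes = {
    "alice": {"type": "person", "age": 34},
    "bob": {"type": "person", "age": 29},
    "acme": {"type": "company", "industry": "software"},
}

# Edges: directed relationships between nodes, also carrying properties.
edges = [
    ("alice", "bob", {"relationship": "knows", "since": 2018}),
    ("alice", "acme", {"relationship": "works_at", "role": "engineer"}),
]

# Traversal example: list Alice's outgoing relationships.
for source, target, properties in edges:
    if source == "alice":
        print(f"alice -[{properties['relationship']}]-> {target}")
```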


@ -1,8 +0,0 @@
Function calling in the context of Large Language Models (LLMs) like GPT-3, GPT-4, and their derivatives extends beyond traditional programming paradigms. In this scenario, function calling involves prompting the LLM to simulate the behavior of a function within its generated output. This capability allows users to interact with LLMs in a structured way, effectively requesting specific operations or information retrieval tasks by framing their prompts as function calls.
How LLM Function Calling Works:
Prompt Construction: The user constructs a prompt that mimics a function call in programming. This prompt includes the "name" of the function (often a description of the task) and the "arguments" (the specific inputs or conditions for the task). For example, a prompt might look like "Generate a summary for the following article:" followed by the article text.
LLM Interpretation: The LLM interprets this structured prompt and understands it as a request to perform a specific task, similar to how a function in a program would be invoked. The model then generates an output that aligns with the expected behavior of the function described in the prompt.
Parameters and Outputs: In LLM function calling, the parameters are the details provided in the prompt, and the output is the generated text that the model produces in response. This output is intended to fulfill the function's "purpose" as inferred from the prompt.
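In practice, this structured prompting is often expressed as a schema the model is asked to satisfy. The sketch below shows one common shape of such a definition (an OpenAI-style tool schema, used here only to illustrate the pattern; it is not part of this document's stack).
```python
# A function definition the model can "call": the name and parameter schema
# describe the task, and the model returns arguments matching this schema.
summarize_tool = {
    "type": "function",
    "function": {
        "name": "summarize_article",
        "description": "Generate a short summary of the provided article text.",
        "parameters": {
            "type": "object",
            "properties": {
                "summary": {"type": "string", "description": "Three-sentence summary."},
                "keywords": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "Main topics mentioned in the article.",
                },
            },
            "required": ["summary", "keywords"],
        },
    },
}

print(summarize_tool["function"]["name"])
```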


@ -1 +0,0 @@
A multilayer graph network is a sophisticated structure used to model complex systems where entities and their interactions can exist in multiple layers, each representing a different type of relationship, context, or domain. Unlike traditional graphs that capture connections in a single, uniform setting, multilayer graphs provide a more nuanced framework, allowing for the representation of diverse interconnections and dependencies across various dimensions or layers.


@ -1,11 +0,0 @@
Propositions are fundamental elements in the study of logic, linguistics, and natural language processing. They represent atomic expressions within texts that encapsulate distinct factoids, conveying specific pieces of information. In essence, a proposition is a declarative statement that can either be true or false, but not both simultaneously.
This binary nature makes propositions crucial for logical deductions, reasoning, and the construction of arguments.
In a natural language context, propositions are presented in a concise and self-contained format.
They are designed to convey information clearly and unambiguously, making them easily interpretable by humans and computable by machines. For example, the statement "The Eiffel Tower is in Paris" is a proposition because it presents a specific fact about the location of the Eiffel Tower, and its truth value can be assessed as either true or false.
The concept of propositions extends beyond mere statements of fact to include assertions about concepts, relationships, and conditions.
For instance, "If it rains, the ground gets wet" is a conditional proposition that establishes a cause-and-effect relationship between two events.
In computational linguistics and natural language processing, propositions are vital for tasks such as information extraction, knowledge representation, and question answering.


@ -1,87 +0,0 @@
# Conceptual Overview - cognee
## Introduction
!!! info "What is cognee?"
cognee is a data processing framework that enables LLMs to produce deterministic and traceable outputs.
cognee assists developers in introducing greater predictability and management into their Retrieval-Augmented Generation (RAG) workflows through the use of graph architectures, vector stores, and auto-optimizing pipelines.
Displaying information as a graph is the clearest way to grasp the content of your documents. Crucially, graphs allow systematic navigation and extraction of data from documents based on their hierarchy.
## Core Concepts
### Concept 1: Data Pipelines
Most of the data we provide to a system can be categorized as unstructured, semi-structured, or structured. Rows from a database would belong to structured data, JSON documents to semi-structured data, and logs that we feed into the system could be considered unstructured.
To organize and process this data, we need to ensure we have custom loaders for all data types, which can help us unify and organize it properly.
<figure markdown>
![Data Pipelines](img/pipelines.png)
<figcaption>Data Pipeline Example</figcaption>
</figure>
In the example above, we have a pipeline in which data has been imported from various sources, normalized, and stored in a database. Relevant identifiers and relationships between the data are also created in this process.
To create an effective data pipeline for processing various types of data—structured, semi-structured, and unstructured—it's crucial to understand each type's specific handling and processing needs. Let's expand on the concepts involved in setting up such a data pipeline.
Data Types and Their Handling
- Structured Data: This includes data that adheres to a fixed schema, such as rows in a relational database or data in CSV files. The processing of structured data typically involves SQL queries for extraction, transformations through simple functions or procedures, and loading into destination tables or databases.
- Semi-structured Data: JSON files, XML, or even some APIs' data fit this category. These data types don't have a rigid schema but have some organizational properties that can be exploited. Semi-structured data often requires parsers that can navigate its structure (like trees for XML or key-value pairs for JSON) to extract necessary information. Libraries such as json in Python or lxml for XML handling can be very useful here.
- Unstructured Data: This category includes text files, logs, or even images and videos.
### Concept 2: Data Enrichment with LLMs
LLMs are adept at processing unstructured data. They can easily extract summaries, keywords, and other useful information from documents. We use function calling with Pydantic models to extract information from the unstructured data.
<figure markdown>
![Data Enrichment](img/enrichment.png)
<figcaption>Data Enrichment Example</figcaption>
</figure>
We decompose the loaded content into graphs, allowing us to more precisely map out the relationships between entities and concepts.
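As an illustration of the idea, a Pydantic schema that function calling could target might look like the sketch below. The class and field names are invented for this example and are not cognee's actual data models.
```python
from typing import List

from pydantic import BaseModel


class ExtractedEdge(BaseModel):
    source: str
    relationship: str
    target: str


class ExtractedGraph(BaseModel):
    summary: str
    keywords: List[str]
    edges: List[ExtractedEdge]


# The LLM is prompted (via function calling) to return JSON matching this
# schema; the validated result can then be turned into graph nodes and edges.
graph = ExtractedGraph(
    summary="NLP is an interdisciplinary subfield of computer science.",
    keywords=["NLP", "computer science"],
    edges=[ExtractedEdge(source="NLP", relationship="subfield_of", target="computer science")],
)
print(graph.edges[0])
```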
### Concept 3: Graphs
Knowledge graphs simply map out knowledge, linking specific facts and their connections.
When Large Language Models (LLMs) process text, they infer these links, leading to occasional inaccuracies due to their probabilistic nature.
Clearly defined relationships enhance their accuracy.
This structured approach can extend beyond concepts to document layouts, pages, or other organizational schemas.
<figure markdown>
![Graph structure](img/graph_structure.png)
<figcaption>Graph Structure</figcaption>
</figure>
### Concept 4: Vector and Graph Retrieval
Cognee lets you use multiple vector and graph retrieval methods to find the most relevant information.
!!! info "Learn more?"
Check out learning materials to see how you can use these methods in your projects.
### Concept 5: Auto-Optimizing Pipelines
Integrating knowledge graphs into Retrieval-Augmented Generation (RAG) pipelines leads to an intriguing outcome: the system's adeptness at contextual understanding allows it to be evaluated in a way Machine Learning (ML) engineers are accustomed to.
This involves bombarding the RAG system with hundreds of synthetic questions, enabling the knowledge graph to evolve and refine its context autonomously over time.
This method paves the way for developing self-improving memory engines that can adapt to new data and user feedback.
## Architecture Overview
A high-level diagram of cognee's architecture, illustrating the main components and their interactions.
<figure markdown>
![Architecture](img/architecture.png)
<figcaption>Architecture</figcaption>
</figure>
Main components:
- **Data Pipelines**: Responsible for ingesting, processing, and transforming data from various sources.
- **LLMs**: Large Language Models that process unstructured data and generate text.
- **Graph Store**: Knowledge graphs that represent relationships between entities and concepts.
- **Vector Store**: Database that stores vector representations of data for efficient retrieval.
- **Search**: Retrieves relevant information from the knowledge graph and vector stores.
## How It Fits Into Your Projects
!!! info "How cognee fits into your projects"
cognee is a self-contained library that simplifies the process of loading and structuring data in LLMs.
By integrating cognee into your data pipelines, you can leverage the power of LLMs, knowledge graphs, and vector retrieval to create accurate and explainable AI solutions.
cognee provides a self-contained library that simplifies the process of loading and structuring LLM context, enabling you to create accurate and explainable AI solutions.


@ -1,93 +0,0 @@
# Configuration
## 🚀 Configure Vector and Graph Stores
You can configure the vector and graph stores using the environment variables in your .env file or programmatically.
We use [Pydantic Settings](https://docs.pydantic.dev/latest/concepts/pydantic_settings/#dotenv-env-support)
We have a global configuration object (cognee.config) and individual configurations at the pipeline and data store levels.
Check available configuration options:
``` python
from cognee.infrastructure.databases.vector import get_vectordb_config
from cognee.infrastructure.databases.graph.config import get_graph_config
from cognee.infrastructure.databases.relational import get_relational_config
from cognee.infrastructure.llm.config import get_llm_config
print(get_vectordb_config().to_dict())
print(get_graph_config().to_dict())
print(get_relational_config().to_dict())
print(get_llm_config().to_dict())
```
Set the environment variables in your .env file and Pydantic will pick them up:
```bash
GRAPH_DATABASE_PROVIDER = 'lancedb'
```
Otherwise, you can set the configuration yourself:
```python
cognee.config.set_llm_provider('ollama')
```
## 🚀 Getting Started with Local Models
You'll need to run the local model on your machine or use one of the providers hosting the model.
!!! note "We had some success with mixtral, but 7b models did not work well. We recommend using mixtral for now."
### Ollama
Set up Ollama by following instructions on [Ollama website](https://ollama.com/)
Set the environment variable in your .env to use the model
```bash
LLM_PROVIDER = 'ollama'
```
Otherwise, you can set the configuration for the model:
```python
cognee.config.set_llm_provider('ollama')
```
You can also set the HOST and model name:
```python
cognee.config.set_llm_endpoint("http://localhost:11434/v1")
cognee.config.set_llm_model("mistral:instruct")
```
### Anyscale
```bash
LLM_PROVIDER = 'custom'
```
Otherwise, you can set the configuration for the model:
```bash
cognee.config.set_llm_provider('custom')
```
You can also set the HOST and model name:
```bash
LLM_MODEL = "mistralai/Mixtral-8x7B-Instruct-v0.1"
LLM_ENDPOINT = "https://api.endpoints.anyscale.com/v1"
LLM_API_KEY = "your_api_key"
```
You can set the host and model name in the same way for any other provider that exposes an API endpoint.


@ -1,32 +0,0 @@
# Data Engineering and LLMOps
!!! tip "This is a work in progress and any feedback is welcome"
## Table of Contents
1. [Data Engineering](#data-engineering)
2. [Large Language Model Operations (LLM Ops)](#large-language-model-operations-llm-ops)
## Data Engineering
Data Engineering focuses on managing and analyzing big data. It revolves around five key aspects:
### Volume
The size and amount of data that companies manage and analyze.
### Value
The insights and patterns derived from data that lead to business benefits.
### Variety
The diversity of data types, including unstructured, semi-structured, and raw data.
### Velocity
The speed at which data is received, stored, and managed.
### Veracity
The accuracy or truthfulness of data.
## Large Language Model Operations (LLM Ops)
The emerging field of Large Language Model Operations (LLM Ops) inherits many practices from data engineering. LLM Ops involves the deployment, monitoring, and maintenance of systems that use LLMs to manage and build a new generation of AI-powered applications.
For more in-depth information on LLM Ops, see [Resource Name](link-to-resource).


@ -1,46 +0,0 @@
# How data ingestion with cognee works
# Why bother with data ingestion?
In order to use cognee, you need to ingest data into the cognee data store.
This data can be events, customer data, or third-party data.
In order to build reliable models and pipelines, we need to structure and process various types of datasets and data sources in the same way.
Some of the operations like normalization, deduplication, and data cleaning are common across all data sources.
This is where cognee comes in. It provides a unified interface to ingest data from various sources and process it in a consistent way.
For this we use dlt (Data Loading Tool) which is a part of cognee infrastructure.
# Example
Let's say you have a dataset of customer reviews in a PDF file. You want to ingest this data into cognee and use it to train a model.
You can use the following code to ingest the data:
```python
dataset_name = "artificial_intelligence"
ai_text_file_path = os.path.join(pathlib.Path(__file__).parent, "test_data/artificial-intelligence.pdf")
await cognee.add([ai_text_file_path], dataset_name)
```
cognee uses dlt to ingest the data and allows you to use:
1. SQL databases. Supports PostgreSQL, MySQL, MS SQL Server, BigQuery, Redshift, and more.
2. REST API generic source. Loads data from REST APIs using declarative configuration.
3. OpenAPI source generator. Generates a source from an OpenAPI 3.x spec using the REST API source.
4. Cloud and local storage. Retrieves data from AWS S3, Google Cloud Storage, Azure Blob Storage, local files, and more.
# What happens under the hood?
We use dlt as a loader to ingest data into the cognee metadata store. We can ingest data from various sources like SQL databases, REST APIs, OpenAPI specs, and cloud storage.
This enables us to have a common data model we can then use to build models and pipelines.
The models and pipelines we build in this way end up in the cognee data store, which is a unified interface to access the data.
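For intuition, this is roughly what a standalone dlt load looks like. It is a generic dlt example, not cognee's internal wiring, and the pipeline, dataset, and table names are made up.
```python
import dlt

# A generic dlt pipeline: normalize a batch of records and load them into a
# local DuckDB destination. cognee drives dlt similarly to land ingested data
# in its metadata store.
pipeline = dlt.pipeline(
    pipeline_name="customer_reviews",
    destination="duckdb",
    dataset_name="raw_reviews",
)

reviews = [
    {"id": 1, "text": "Great product", "rating": 5},
    {"id": 2, "text": "Could be better", "rating": 3},
]

load_info = pipeline.run(reviews, table_name="reviews")
print(load_info)
```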


@ -1,31 +0,0 @@
# New to cognee?
The getting started guide covers adding a cognee data store to your AI app, sending data, identifying users, extracting actions and insights, and interconnecting separate datasets.
[Get started](quickstart.md)
## Ingest Data
Learn how to manage the ingestion of events, customer data, or third-party data for use with cognee.
[Explore](data_ingestion.md)
## Tasks and Pipelines
Analyze and enrich your data and improve LLM answers with a series of tasks and pipelines.
[Learn about tasks](templates.md)
## API
Push or pull data to build custom functionality or create bespoke views for your business needs.
[Explore](api_reference.md)
## Resources
### Resources
- [Research](research.md)
- [Community](https://discord.gg/52QTb5JK){:target="_blank"}


@ -1,60 +0,0 @@
# Running cognee with local models
## 🚀 Getting Started with Local Models
You'll need to run the local model on your machine or use one of the providers hosting the model.
!!! note "We had some success with mixtral, but 7b models did not work well. We recommend using mixtral for now."
### Ollama
Set up Ollama by following instructions on [Ollama website](https://ollama.com/)
Set the environment variable in your .env to use the model
```bash
LLM_PROVIDER = 'ollama'
```
Otherwise, you can set the configuration for the model:
```python
cognee.config.llm_provider = 'ollama'
```
You can also set the HOST and model name:
```python
cognee.config.llm_endpoint = "http://localhost:11434/v1"
cognee.config.llm_model = "mistral:instruct"
```
### Anyscale
```bash
LLM_PROVIDER = 'custom'
```
Otherwise, you can set the configuration for the model:
```python
cognee.config.llm_provider = 'custom'
```
You can also set the HOST and model name:
```bash
LLM_MODEL = "mistralai/Mixtral-8x7B-Instruct-v0.1"
LLM_ENDPOINT = "https://api.endpoints.anyscale.com/v1"
LLM_API_KEY = "your_api_key"
```
You can set the host and model name in the same way for any other provider that exposes an API endpoint.


@ -1,17 +0,0 @@
{% extends "base.html" %}
{% block meta %}
{{ super() }}
<meta property="og:type" content="website" />
<meta property="og:site_name" content="cognee" />
<meta property="og:description" content="Deterministic data engine for LLMs" />
<meta property="og:title" content="{{ page.title|striptags }}" />
<meta property="og:url" content="{{ page.canonical_url }}" />
<meta name="twitter:card" content="summary_large_image" />
<meta name="twitter:site" content="@tricalt" />
<meta name="twitter:creator" content="@tricalt" />
<meta name="twitter:title" content="{{ page.title|striptags }}" />
<meta name="twitter:description" content="desc" />
{% endblock %}


@ -1,15 +0,0 @@
<script>
var segmentKey = "{{ config.extra.analytics.key }}"
/* Wait for page to load and application to mount */
document.addEventListener("DOMContentLoaded", function() {
try {
!function(){var i="analytics",analytics=window[i]=window[i]||[];if(!analytics.initialize)if(analytics.invoked)window.console&&console.error&&console.error("Segment snippet included twice.");else{analytics.invoked=!0;analytics.methods=["trackSubmit","trackClick","trackLink","trackForm","pageview","identify","reset","group","track","ready","alias","debug","page","screen","once","off","on","addSourceMiddleware","addIntegrationMiddleware","setAnonymousId","addDestinationMiddleware","register"];analytics.factory=function(e){return function(){if(window[i].initialized)return window[i][e].apply(window[i],arguments);var n=Array.prototype.slice.call(arguments);if(["track","screen","alias","group","page","identify"].indexOf(e)>-1){var c=document.querySelector("link[rel='canonical']");n.push({__t:"bpc",c:c&&c.getAttribute("href")||void 0,p:location.pathname,u:location.href,s:location.search,t:document.title,r:document.referrer})}n.unshift(e);analytics.push(n);return analytics}};for(var n=0;n<analytics.methods.length;n++){var key=analytics.methods[n];analytics[key]=analytics.factory(key)}analytics.load=function(key,n){var t=document.createElement("script");t.type="text/javascript";t.async=!0;t.setAttribute("data-global-segment-analytics-key",i);t.src="https://cdn.segment.com/analytics.js/v1/" + key + "/analytics.min.js";var r=document.getElementsByTagName("script")[0];r.parentNode.insertBefore(t,r);analytics._loadOptions=n};analytics._writeKey=segmentKey;;analytics.SNIPPET_VERSION="5.2.0";
analytics.load(segmentKey);
analytics.page();
}}();
} catch (error) {
console.error("Failed to load Segment analytics", error);
}
});
</script>


@ -1,81 +0,0 @@
# PIPELINES
Cognee uses [tasks](https://github.com/topoteretes/cognee/blob/main/cognee/modules/pipelines/tasks/Task.py) grouped into pipelines that populate graph and vector stores. [These tasks](https://github.com/topoteretes/cognee/tree/main/cognee/tasks) analyze and enrich data, enhancing the quality of answers produced by Large Language Models (LLMs).
The tasks are managed and executed asynchronously using the `run_tasks` and `run_tasks_parallel` functions.
```python
pipeline = run_tasks(tasks, documents)
async for result in pipeline:
print(result)
```
## Main pipeline: [cognee.cognify](https://github.com/topoteretes/cognee/blob/168cb5d1bf1964b5b0c645b2f3d8638d84554fda/cognee/api/v1/cognify/cognify_v2.py#L38)
This is the main pipeline currently implemented in cognee. It is designed to process data in a structured way and populate the graph and vector stores.
This function is the entry point for processing datasets. It handles dataset retrieval, user authorization, and manages the execution of a pipeline of tasks that process documents.
### Parameters
- `datasets: Union[str, list[str]] = None`: A string or list of dataset names to be processed.
- `user: User = None`: The user requesting the processing. If not provided, the default user is retrieved.
### Steps in the Function
#### User Authentication
```python
if user is None:
    user = await get_default_user()
```
If no user is provided, the function retrieves the default user.
#### Handling Empty or String Dataset Input
```python
existing_datasets = await get_datasets(user.id)

if datasets is None or len(datasets) == 0:
    datasets = existing_datasets

if type(datasets[0]) == str:
    datasets = await get_datasets_by_name(datasets, user.id)
```
If no datasets are provided, the function retrieves all datasets owned by the user. If a list of dataset names (strings) is provided, they are converted into dataset objects.
#### Selecting datasets from the input list that are owned by the user
```python
existing_datasets_map = {
    generate_dataset_name(dataset.name): True for dataset in existing_datasets
}
```
#### Run Cognify Pipeline for Each Dataset
```python
awaitables = []

for dataset in datasets:
    dataset_name = generate_dataset_name(dataset.name)

    if dataset_name in existing_datasets_map:
        awaitables.append(run_cognify_pipeline(dataset, user))

return await asyncio.gather(*awaitables)
```
The `run_cognify_pipeline` function is defined within `cognify` and is responsible for processing a single dataset. This is where most of the heavy lifting occurs. The function processes multiple datasets concurrently using `asyncio.gather`.
#### Pipeline Tasks
The pipeline consists of several tasks, each responsible for different parts of the processing (a minimal composition sketch follows the list):
- `classify_documents`: Converts each of the documents into one of the specific Document types: PdfDocument, AudioDocument, ImageDocument or TextDocument
- `check_permissions_on_documents`: Checks if the user has the necessary permissions to access the documents. In this case, it checks for "write" permission.
- `extract_chunks_from_documents`: Extracts text chunks based on the document type.
- `add_data_points`: Creates nodes and edges from the chunks and their properties. Adds them to the graph engine.
- `extract_graph_from_data`: Generates knowledge graphs from the document chunks.
- `summarize_text`: Extracts a summary for each chunk using an LLM.
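Conceptually, each task is a callable whose output feeds the next task. The self-contained toy sketch below mimics that pattern; it is not cognee's actual `Task`/`run_tasks` implementation, and the helper names are invented for illustration.
```python
import asyncio


async def classify_documents(documents):
    # Toy stand-in: tag each document with a type.
    return [{"text": doc, "type": "TextDocument"} for doc in documents]


async def extract_chunks_from_documents(documents):
    # Toy stand-in: split each document's text into fixed-size chunks.
    return [doc["text"][i:i + 20] for doc in documents for i in range(0, len(doc["text"]), 20)]


async def run_tasks_sketch(tasks, data):
    # Each task receives the previous task's output, mirroring the run_tasks
    # idea described above (heavily simplified).
    for task in tasks:
        data = await task(data)
        yield data


async def main():
    documents = ["Natural language processing is an interdisciplinary subfield."]
    async for intermediate in run_tasks_sketch([classify_documents, extract_chunks_from_documents], documents):
        print(intermediate)


asyncio.run(main())
```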


@ -1,69 +0,0 @@
# QUICKSTART
!!! tip "To understand how cognee works check out the [conceptual overview](conceptual_overview.md)"
## Setup
To run cognee, you will need the following:
1. OpenAI API key (Ollama or Anyscale could work as [well](local_models.md))
Add your LLM API key to the environment variables
```
import os
os.environ["LLM_API_KEY"] = "YOUR_OPENAI_API_KEY"
```
or
```
cognee.config.llm_api_key = "YOUR_OPENAI_API_KEY"
```
If you are using Networkx, create an account on Graphistry to visualize results:
```
cognee.config.set_graphistry_config({
"username": "YOUR_USERNAME",
"password": "YOUR_PASSWORD"
})
```
If you want to run Postgres instead of SQLite, start the Postgres Docker container.
Navigate to cognee folder and run:
```
docker compose up postgres
```
Add the following environment variables to your .env file:
```
DB_HOST=127.0.0.1
DB_PORT=5432
DB_USERNAME=cognee # or any username you want
DB_PASSWORD=cognee # or any password you want
DB_NAME=cognee_db # or any db name you want
DB_PROVIDER=postgres
```
## Run
cognee is asynchronous by design, meaning that operations like adding information, processing it, and querying it can run concurrently without blocking the execution of other tasks.
Make sure to await the results of the functions that you call.
```
import cognee
from cognee import SearchType
text = """Natural language processing (NLP) is an interdisciplinary
subfield of computer science and information retrieval"""
await cognee.add(text) # Add a new piece of information
await cognee.cognify() # Use LLMs and cognee to create knowledge
search_results = await cognee.search(SearchType.INSIGHTS, query_text='Tell me about NLP') # Query cognee for the knowledge
for result_text in search_results:
print(result_text)
```
In the example above, we add a piece of information to cognee, use LLMs to create a GraphRAG, and then query cognee for the knowledge.
cognee is composable, and you can build your own cognee pipelines using our [templates](templates.md).


@ -1,78 +0,0 @@
## RAG Stack
Core elements of a RAG stack are the building blocks that we can use to get to more personalized and deterministic outputs.
!!! tip "This is a work in progress and any feedback is welcome"
## What is a RAG?
!!! note "What is RAG?"
RAG stands for Retrieval Augmented Generation. It is a model that combines the power of large language models (LLMs) like GPT-4 with the efficiency of information retrieval systems. The goal of RAG is to generate text that is both fluent and factually accurate by retrieving relevant information from a knowledge base.
To build a simple RAG and understand its limitations, check out this guide with examples: [RAGs: Retrieval-Augmented Generation Explained](rag/rag_explained.md)
## The Building Blocks of a RAG Stack
### 1. Data Sources
You can get your data from a variety of sources, including:
- APIs like Twitter, Reddit, and Google
- Web scraping tools like Scrapy and Beautiful Soup
- Documents like PDFs, Word, and Excel files
- Relational databases like DuckDB, PSQL and MySQL
- Data warehouses like Snowflake and Databricks
- Customer data platforms like Segment
<figure markdown>
![Data Sources](img/sources.png)
<figcaption>Some data sources</figcaption>
</figure>
The goal here is to give the data structure and connect it so that it can be used in your deterministic LLM stack.
### 2. Data Loaders
<figure markdown>
![Data Loader](img/loaders.png)
<figcaption>Data Loaders</figcaption>
</figure>
Data loading into a data lake or warehouse involves using tools like Apache Airflow, dlt, dbt, and Databricks. The process includes data extraction, transformation, and loading for model usage, aiming for a clean, structured dataset ready for enrichment.
Check out how we do it with dlt: [Data Loading Tool (dlt)](dlt/dlt.md)
### 3. Vector Computation and Vector Stores
Data is transformed into vectors using OpenAI or custom models. Understanding where to run these models and integrating your computing infrastructure with tools like custom spark pipelines is essential. The aim is to achieve ready-to-use pipelines and models.
<figure markdown>
![Vector Stores](img/vector_dbs.png)
<figcaption>Vector Stores </figcaption>
</figure>
Image [Source](https://blog.det.life/why-you-shouldnt-invest-in-vector-databases-c0cd3f59d23c)
### 4. Graph Computation and Graph Stores
Creating a knowledge graph from your data allows for querying and information retrieval. It's essential to know how to construct, maintain, and use it for text generation. The aim is an accurate, current, and easily queried knowledge graph.
<figure markdown>
![Graph Stores](img/graph_example.png)
<figcaption>Graph Example</figcaption>
</figure>
### 5. Search
The process involves querying and retrieving vectors from Vector DBs or hybrid DBs, and using search tools to rank these vectors. The aim is to index vectors and search for relevant ones as needed.
#### Vector Similarity Search
Identifies objects with vector representations closest to the query vector, finding the most similar items based on various dimensions of comparison.
#### Image Search
Utilizes images as the input for conducting a similarity search, analyzing the content of the image to find similar images based on visual features.
#### Keyword Search
Employs the BM25F algorithm for ranking results based on keyword matches. Relevance is calculated using term frequency, inverse document frequency, and field-length normalization.
#### Hybrid Search
Merges the BM25 algorithm with vector similarity search techniques to enhance the relevance and accuracy of search results. Leverages both textual and vector-based features for ranking.
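As a rough illustration of the idea (not any particular engine's implementation), a hybrid score can be computed as a weighted blend of a normalized keyword score and a vector-similarity score:
```python
def hybrid_score(bm25_score: float, cosine_similarity: float, alpha: float = 0.5) -> float:
    """Blend a keyword (BM25) score with a vector-similarity score.

    alpha = 1.0 ranks purely by vector similarity, alpha = 0.0 purely by keywords.
    Both scores are assumed to be normalized to [0, 1] beforehand.
    """
    return alpha * cosine_similarity + (1 - alpha) * bm25_score


# Example: re-rank three candidate documents by their blended score.
candidates = {
    "doc_a": {"bm25": 0.82, "cosine": 0.35},
    "doc_b": {"bm25": 0.40, "cosine": 0.91},
    "doc_c": {"bm25": 0.65, "cosine": 0.60},
}

ranked = sorted(
    candidates.items(),
    key=lambda item: hybrid_score(item[1]["bm25"], item[1]["cosine"]),
    reverse=True,
)
print(ranked)
```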
#### Generative Search
Utilizes the outputs of search results as prompts for a Large Language Model (LLM). Can generate summaries, extrapolations, or new content based on the aggregated search results.
#### Reranking
Involves the application of a reranker module to adjust the initial ranking of search results. Optimizes result relevance based on additional criteria or more complex models.
#### Aggregation
Involves compiling and summarizing data from a set of search results. Provides insights or overviews based on the collective information found.
#### Filters
Apply constraints or conditions to the search process to narrow down the results. Filters can be based on specific attributes, metadata, or other criteria relevant to the search domain.
#### Graph Search
Involves traversing a graph data structure to find specific nodes or paths. It can be used to find relationships between different entities in a knowledge graph.


@ -1,62 +0,0 @@
# Research
This page collects research gathered over the past year from various sources.
This is not an exhaustive list, and PRs are welcome.
### Research Papers
- [2024/06/04] [Symbolic reasoning](https://arxiv.org/abs/2402.01817)
- [2024/06/04] [Transformers and episodic memory](https://arxiv.org/abs/2405.14992)
- [2024/03/24] [Graph Chain-of-Thought: Augmenting Large Language Models by Reasoning on Graphs](https://arxiv.org/abs/2404.07103)
- [2024/03/24] [Leave No Context Behind: Efficient Infinite Context Transformers with Infini-attention](https://arxiv.org/abs/2404.07143)
- [2024/03/24] [Compound AI systems](https://bair.berkeley.edu/blog/2024/02/18/compound-ai-systems/)
- [2015/07/30] [Multilayer Network of Language](https://arxiv.org/abs/1507.08539)
- [2023/12/12] [Dense X Retrieval: What Retrieval Granularity Should We Use?](https://arxiv.org/pdf/2312.06648.pdf)
- [2024/01/05] [Retrieval-Augmented Generation for Large Language Models: A Survey](https://arxiv.org/pdf/2312.10997.pdf)
- [2022/10/20] [Cognitive modelling with multilayer networks: Insights, advancements and future challenges](https://arxiv.org/pdf/2210.00500.pdf)
- [2023/09/20] CoAla framework and relevant literature [literature](https://github.com/ysymyth/awesome-language-agents)
- [2023/06/09] [Mind2Web: Towards a Generalist Agent for the Web](https://arxiv.org/pdf/2306.06070.pdf), Xiang Deng, et al. [[code]](https://github.com/OSU-NLP-Group/Mind2Web) [[demo]](https://osu-nlp-group.github.io/Mind2Web/)
- [2023/06/28] AI Agents in Langchain [https://docs.google.com/presentation/d/1L_CHsg26sDxPmKj285Ob5T2xsAUejBlfiGQSnsSHTk0/edit#slide=id.g254e571859c_0_164](https://docs.google.com/presentation/d/1L_CHsg26sDxPmKj285Ob5T2xsAUejBlfiGQSnsSHTk0/edit#slide=id.g254e571859c_0_164)
- [2023/06/27] Agent infra [https://lilianweng.github.io/posts/2023-06-23-agent/](https://lilianweng.github.io/posts/2023-06-23-agent/)
- [2023/06/05] [Orca: Progressive Learning from Complex Explanation Traces of GPT-4](https://arxiv.org/pdf/2306.02707.pdf), Subhabrata Mukherjee et al.
- [2023/05/25] 📚[Voyager: An Open-Ended Embodied Agent with Large Language Models](https://arxiv.org/pdf/2305.16291.pdf), Guanzhi Wang, et al. [[code]](https://github.com/MineDojo/Voyager) [[website]](https://voyager.minedojo.org/)
- [2023/05/24] 📚[Gorilla: Large Language Model Connected with Massive APIs](https://arxiv.org/abs/2305.15334), Shishir G. Patil, et al.
- [2023/05/17] 📚[Tree of Thoughts: Deliberate Problem Solving with Large Language Models](https://arxiv.org/abs/2305.10601), Shunyu Yao, et al.[[code]](https://github.com/kyegomez/tree-of-thoughts) [[code-orig]](https://github.com/ysymyth/tree-of-thought-llm)
- [2023/05/12] 📚[MEGABYTE: Predicting Million-byte Sequences with Multiscale Transformers](https://arxiv.org/abs/2305.07185), Lili Yu, et al.
- [2023/05/09] 📚[FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance](https://arxiv.org/abs/2305.05176), Lingjiao Chen, et al.
- [2023/05/01] 📚[Learning to Reason and Memorize with Self-Notes](https://arxiv.org/abs/2305.00833), Jack Lanchantin, et al.
- [2023/04/24] 📚[WizardLM: Empowering Large Language Models to Follow Complex Instructions](https://arxiv.org/abs/2304.12244), Can Xu, et al.
- [2023/04/22] 📚[LLM+P: Empowering Large Language Models with Optimal Planning Proficiency](https://arxiv.org/abs/2304.11477), Bo Liu, et al.
- [2023/04/07] 📚[Generative Agents: Interactive Simulacra of Human Behavior](https://arxiv.org/abs/2304.03442), Joon Sung Park, et al. [[code]](https://github.com/mkturkcan/generative-agents)
- [2023/03/30] [Self-Refine: Iterative Refinement with Self-Feedback](https://arxiv.org/abs/2303.17651), Aman Madaan, et al.[[code]](https://github.com/madaan/self-refine)
- [2023/03/30] [HuggingGPT: Solving AI Tasks with ChatGPT and its Friends in HuggingFace](https://arxiv.org/pdf/2303.17580.pdf), Yongliang Shen, et al. [[code]](https://github.com/microsoft/JARVIS) [[demo]](https://huggingface.co/spaces/microsoft/HuggingGPT)
- [2023/03/20] [Reflexion: Language Agents with Verbal Reinforcement Learning](https://arxiv.org/pdf/2303.11366.pdf), Noah Shinn , et al. [[code]](https://github.com/noahshinn024/reflexion)
- [2023/02/23] 📚[Not what you've signed up for: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection](https://arxiv.org/abs/2302.12173), Sahar Abdelnabi, et al.
- [2023/02/09] 📚[Toolformer: Language Models Can Teach Themselves to Use Tools](https://arxiv.org/pdf/2302.04761.pdf), Timo Schick, et al. [[code]](https://github.com/lucidrains/toolformer-pytorch)
- [2022/12/12] 📚[LMQL: Prompting Is Programming: A Query Language for Large Language Models](https://arxiv.org/abs/2212.06094), Luca Beurer-Kellner, et al.
- [2022/10/06] [ReAct: Synergizing Reasoning and Acting in Language Models](https://arxiv.org/pdf/2210.03629.pdf), Shunyu Yao, et al. [[code]](https://github.com/ysymyth/ReAct)
- [2022/07/12] 📚[Inner Monologue: Embodied Reasoning through Planning with Language Models](https://arxiv.org/pdf/2207.05608.pdf), Wenlong Huang, et al. [[demo]](https://innermonologue.github.io/)
- [2022/04/04] [Do As I Can, Not As I Say: Grounding Language in Robotic Affordances](https://github.com/Significant-Gravitas/Nexus/wiki/Awesome-Resources), Michael Ahn, e al. [[demo]](https://say-can.github.io/)
- [2021/12/17] [WebGPT: Browser-assisted question-answering with human feedback](https://arxiv.org/pdf/2112.09332.pdf), Reiichiro Nakano, et al.
- [2021/06/17] 📚[LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685), Edward J. Hu, et al.
- [2023/04/03] [Generative Agents](https://arxiv.org/abs/2304.03442)
- [2023/05/17] [Tree of Thoughts: Deliberate Problem Solving with Large Language Models](https://arxiv.org/abs/2305.10601)
### Knowledge Graphs
- [2023/06/09] [Taxonomies: Overview](https://www.brighttalk.com/webcast/9273/605659?utm_source=brighttalk-portal&utm_medium=web&utm_campaign=topic&utm_content=upcoming)
### Blog Articles
- [2023/04/29] [AUTO-GPT: UNLEASHING THE POWER OF AUTONOMOUS AI AGENTS](https://www.leewayhertz.com/autogpt/) By Akash Takyar
- [2023/04/20] [Conscious Machines: Experiments, Theory, and Implementations(Chinese)](https://pattern.swarma.org/article/230) By Jiang Zhang
- [2023/04/18] [Autonomous Agents & Agent Simulations](https://blog.langchain.dev/agents-round/) By Langchain
- [2023/04/16] [4 Autonomous AI Agents you need to know](https://towardsdatascience.com/4-autonomous-ai-agents-you-need-to-know-d612a643fa92) By Sophia Yang
- [2023/03/31] [ChatGPT that learns to use tools](https://zhuanlan.zhihu.com/p/618448188) By Haojie Pan
### Talks
- [2023/06/05] [Two Paths to Intelligence](https://www.youtube.com/watch?v=rGgGOccMEiY&t=1497s) by Geoffrey Hinton
- [2023/05/24] [State of GPT](https://www.youtube.com/watch?v=bZQun8Y4L2A) by Andrej Karpathy | OpenAI
- [2024/03/15] Podcast on AI, Memory by Bill Gurley


@ -1,21 +0,0 @@
## Cognee Search Module
This module contains the search function that is used to search for nodes in the graph. It supports various search types and integrates with user permissions to filter results accordingly.
### Search Types
The `SearchType` enum defines the different types of searches that can be performed:
- `INSIGHTS`: Search for insights from the knowledge graph.
- `SUMMARIES`: Search for summaries of the texts provided.
- `CHUNKS`: Search for whole chunks of data.
### Search Function
The `search` function is the main entry point for performing a search. It handles user authentication, retrieves document IDs for the user, and filters the search results based on user permissions.
```python
from cognee import search, SearchType
await search(SearchType.INSIGHTS, "your_query")
```
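A slightly fuller sketch of a typical flow: ingest data, build the graph, then query it with each search type. `cognee.add` and `cognee.cognify` are assumed here to be available from the top-level `cognee` package; adjust the calls to match your installed version.

```python
import cognee
from cognee import search, SearchType

# Ingest a document and build the LLM-enriched knowledge graph (assumed API).
await cognee.add("Cognee turns your documents into a knowledge graph.")
await cognee.cognify()

# Query the same graph in three different ways.
insights = await search(SearchType.INSIGHTS, "knowledge graph")
summaries = await search(SearchType.SUMMARIES, "knowledge graph")
chunks = await search(SearchType.CHUNKS, "knowledge graph")
```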

View file

@ -1,51 +0,0 @@
[data-md-color-scheme = "cognee"] {
color-scheme: dark;
--md-default-bg-color: #0C0121;
--md-default-bg-color--light: #240067;
--md-default-fg-color: #57DFD7;
--md-default-fg-color--light: #85ded8;
--md-default-fg-color--dark: #4dc6be;
/* --md-primary-fg-color: #0C0121; */
--md-primary-fg-color: #7233BA;
--md-primary-fg-color--light: #8a49d4;
--md-primary-fg-color--dark: #522488;
/* --md-primary-bg-color: hsla(0, 0%, 100%, 1);
--md-primary-bg-color--light: */
--md-accent-fg-color: #41a29b;
--md-typeset-color: white;
--md-typeset-a-color: #57DFD7;
--md-footer-bg-color: #0C0121;
--md-footer-bg-color--dark: #0C0121;
}
.md-header {
background-color: var(--md-default-bg-color);
}
/* Remove unnecessary title from the header */
.md-header__title {
display: none;
}
/* Spread header elements evenly when there is no title */
.md-header__inner {
justify-content: space-between;
}
.md-tabs {
background-color: var(--md-default-bg-color);
}
.md-button--primary:hover {
background-color: #8a49d4 !important;
}
.md-typeset .md-button {
border-radius: 32px;
}

View file

@ -1,4 +0,0 @@
# Team
![About us](img/team.png)

View file

@ -1,29 +0,0 @@
# Why use cognee?
cognee is one of the first OSS tools that enables easy, scalable and flexible use of LLMs to process large volumes of documents using a GraphRAG approach.
LLMs don't have a semantic layer, and they don't have a way to understand the data they are processing. This is where cognee comes in.
We let you define logical structures for your data and then use these structures to guide the LLMs to process the data in a way that makes sense to you.
cognee helps you avoid the overly complicated set of tools and processes that would otherwise be needed to get reasonably reliable output.
***From***
![Bad Architecture](img/bad_architecture.png)
***To***
![Good Architecture](img/good_architecture.png)
??? note "Why use cognee?"
It's hard to answer the question of why to use cognee without answering why you need thin LLM frameworks in the first place. :)
- **Cost-effective** — cognee extends the capabilities of your LLMs without the need for expensive data processing tools.
- **Self-contained** — cognee runs as a simple-to-use library, meaning you can add it to your application easily.
- **Easy to use** — Navigate graphs instead of embeddings to understand your data faster and better.
- **Flexible** — cognee lets you control your input and provide your own Pydantic data models (see the sketch below).
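A minimal sketch of the kind of user-defined Pydantic models cognee can be guided with. The model names and fields below are purely illustrative, not part of the cognee API.

```python
from typing import List
from pydantic import BaseModel

# Hypothetical domain models: structures like these can be used to guide
# how the LLM organizes extracted entities and relations.
class Person(BaseModel):
    name: str
    role: str

class Organization(BaseModel):
    name: str
    members: List[Person]
```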

View file

@ -1,14 +1,72 @@
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams
from deepeval.metrics import BaseMetric, GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from evals.official_hotpot_metrics import exact_match_score, f1_score
correctness_metric = GEval(
name="Correctness",
model="gpt-4o-mini",
evaluation_params=[
LLMTestCaseParams.ACTUAL_OUTPUT,
LLMTestCaseParams.EXPECTED_OUTPUT
],
evaluation_steps=[
"Determine whether the actual output is factually correct based on the expected output."
]
)
name="Correctness",
model="gpt-4o-mini",
evaluation_params=[
LLMTestCaseParams.ACTUAL_OUTPUT,
LLMTestCaseParams.EXPECTED_OUTPUT
],
evaluation_steps=[
"Determine whether the actual output is factually correct based on the expected output."
]
)
class f1_score_metric(BaseMetric):
    """F1 score taken directly from the official hotpot benchmark
    implementation and wrapped into a deepeval metric."""

    def __init__(self, threshold: float = 0.5):
        self.threshold = threshold

    def measure(self, test_case: LLMTestCase):
        f1, precision, recall = f1_score(
            prediction=test_case.actual_output,
            ground_truth=test_case.expected_output,
        )
        self.score = f1
        self.success = self.score >= self.threshold
        return self.score

    # Reusing regular measure as async F1 score is not implemented
    async def a_measure(self, test_case: LLMTestCase):
        return self.measure(test_case)

    def is_successful(self):
        return self.success

    @property
    def __name__(self):
        return "Official hotpot F1 score"


class em_score_metric(BaseMetric):
    """Exact Match score taken directly from the official hotpot benchmark
    implementation and wrapped into a deepeval metric."""

    def __init__(self, threshold: float = 0.5):
        self.threshold = threshold

    def measure(self, test_case: LLMTestCase):
        self.score = exact_match_score(
            prediction=test_case.actual_output,
            ground_truth=test_case.expected_output,
        )
        self.success = self.score >= self.threshold
        return self.score

    # Reusing regular measure as async EM score is not implemented
    async def a_measure(self, test_case: LLMTestCase):
        return self.measure(test_case)

    def is_successful(self):
        return self.success

    @property
    def __name__(self):
        return "Official hotpot EM score"

View file

@ -111,7 +111,9 @@ if __name__ == "__main__":
parser.add_argument("--with_cognee", action="store_true")
parser.add_argument("--num_samples", type=int, default=500)
parser.add_argument("--metric", type=str, default="correctness_metric")
parser.add_argument("--metric", type=str, default="correctness_metric",
help="Valid options are Deepeval metrics (e.g. AnswerRelevancyMetric) \
and metrics defined in evals/deepeval_metrics.py, e.g. f1_score_metric")
args = parser.parse_args()
@ -120,6 +122,8 @@ if __name__ == "__main__":
metric = metric_cls()
except AttributeError:
metric = getattr(evals.deepeval_metrics, args.metric)
if isinstance(metric, type):
metric = metric()
if args.with_cognee:
answer_provider = answer_with_cognee
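The hunk above lets `--metric` name either a built-in deepeval metric or one of the custom metrics in `evals/deepeval_metrics.py`, instantiating the latter when the lookup returns a class rather than an instance. A standalone sketch of that resolution logic (the helper function name is hypothetical):

```python
import deepeval.metrics
import evals.deepeval_metrics

def resolve_metric(name: str):
    # Prefer deepeval's built-in metrics, e.g. "AnswerRelevancyMetric".
    try:
        return getattr(deepeval.metrics, name)()
    except AttributeError:
        # Fall back to evals/deepeval_metrics.py, e.g. "f1_score_metric"
        # or the pre-built "correctness_metric" instance.
        metric = getattr(evals.deepeval_metrics, name)
        if isinstance(metric, type):  # class reference -> instantiate it
            metric = metric()
        return metric
```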

View file

@ -0,0 +1,86 @@
"""
These are the official evaluation metrics for HotpotQA taken from https://hotpotqa.github.io/
"""
import re
import string
import sys
from collections import Counter
import ujson as json
def normalize_answer(s):

    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def f1_score(prediction, ground_truth):
    normalized_prediction = normalize_answer(prediction)
    normalized_ground_truth = normalize_answer(ground_truth)

    ZERO_METRIC = (0, 0, 0)

    if normalized_prediction in ['yes', 'no', 'noanswer'] and normalized_prediction != normalized_ground_truth:
        return ZERO_METRIC
    if normalized_ground_truth in ['yes', 'no', 'noanswer'] and normalized_prediction != normalized_ground_truth:
        return ZERO_METRIC

    prediction_tokens = normalized_prediction.split()
    ground_truth_tokens = normalized_ground_truth.split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return ZERO_METRIC
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1, precision, recall


def exact_match_score(prediction, ground_truth):
    return (normalize_answer(prediction) == normalize_answer(ground_truth))


def update_answer(metrics, prediction, gold):
    em = exact_match_score(prediction, gold)
    f1, prec, recall = f1_score(prediction, gold)
    metrics['em'] += float(em)
    metrics['f1'] += f1
    metrics['prec'] += prec
    metrics['recall'] += recall
    return em, prec, recall


def update_sp(metrics, prediction, gold):
    cur_sp_pred = set(map(tuple, prediction))
    gold_sp_pred = set(map(tuple, gold))
    tp, fp, fn = 0, 0, 0
    for e in cur_sp_pred:
        if e in gold_sp_pred:
            tp += 1
        else:
            fp += 1
    for e in gold_sp_pred:
        if e not in cur_sp_pred:
            fn += 1
    prec = 1.0 * tp / (tp + fp) if tp + fp > 0 else 0.0
    recall = 1.0 * tp / (tp + fn) if tp + fn > 0 else 0.0
    f1 = 2 * prec * recall / (prec + recall) if prec + recall > 0 else 0.0
    em = 1.0 if fp + fn == 0 else 0.0
    metrics['sp_em'] += em
    metrics['sp_f1'] += f1
    metrics['sp_prec'] += prec
    metrics['sp_recall'] += recall
    return em, prec, recall
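A small usage example for the answer-level metrics above, assuming the module is importable as `evals.official_hotpot_metrics`:

```python
from evals.official_hotpot_metrics import exact_match_score, f1_score

prediction = "The Eiffel Tower is in Paris."
ground_truth = "Paris"

print(exact_match_score(prediction, ground_truth))  # False: normalized strings differ
f1, precision, recall = f1_score(prediction, ground_truth)
print(f1, precision, recall)  # ~0.33, 0.2, 1.0: one shared token out of five predicted
```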

View file

@ -1,152 +0,0 @@
site_name: cognee
site_author: Vasilije Markovic
site_description: desc
repo_name: cognee
repo_url: https://github.com/topoteretes/cognee
site_url: https://www.congee.ai
edit_uri: edit/main/docs/
copyright: Copyright &copy; 2024 cognee
theme:
name: material
logo: assets/logo.png
favicon: assets/favicon.png
icon:
repo: fontawesome/brands/github
edit: material/pencil
view: material/eye
theme:
admonition:
note: octicons/tag-16
abstract: octicons/checklist-16
info: octicons/info-16
tip: octicons/squirrel-16
success: octicons/check-16
question: octicons/question-16
warning: octicons/alert-16
failure: octicons/x-circle-16
danger: octicons/zap-16
bug: octicons/bug-16
example: octicons/beaker-16
quote: octicons/quote-16
features:
- announce.dismiss
- content.action.edit
- content.action.view
- content.code.annotate
- content.code.copy
- content.code.select
- content.tabs.link
- content.tooltips
- header.autohide
- navigation.expand
- navigation.footer
- navigation.indexes
- navigation.instant
- navigation.instant.prefetch
- navigation.instant.progress
- navigation.prune
- navigation.sections
- navigation.tabs
- navigation.top
- navigation.tracking
- navigation.path
- search.highlight
- search.share
- search.suggest
- toc.follow
# - toc.integrate
palette:
- scheme: cognee
primary: custom
font:
text: Roboto
code: Roboto Mono
custom_dir: docs/overrides
extra:
analytics:
provider: segment
key: !ENV DOCS_SEGMENT_KEY
extra_css:
- stylesheets/extra.css
# Extensions
markdown_extensions:
- abbr
- admonition
- pymdownx.details
- attr_list
- def_list
- footnotes
- md_in_html
- toc:
permalink: true
- pymdownx.arithmatex:
generic: true
- pymdownx.betterem:
smart_enable: all
- pymdownx.caret
- pymdownx.details
- pymdownx.emoji:
emoji_generator: !!python/name:material.extensions.emoji.to_svg
emoji_index: !!python/name:material.extensions.emoji.twemoji
- pymdownx.highlight:
anchor_linenums: true
line_spans: __span
pygments_lang_class: true
- pymdownx.inlinehilite
- pymdownx.keys
- pymdownx.magiclink:
normalize_issue_symbols: true
repo_url_shorthand: true
user: tricalt
repo: cognee
- pymdownx.mark
- pymdownx.smartsymbols
- pymdownx.snippets:
auto_append:
- includes/mkdocs.md
- pymdownx.superfences:
custom_fences:
- name: mermaid
class: mermaid
format: !!python/name:pymdownx.superfences.fence_code_format
- pymdownx.tabbed:
alternate_style: true
combine_header_slug: true
- pymdownx.tasklist:
custom_checkbox: true
nav:
- Overview:
- Overview: 'index.md'
- Start here:
- Installation: 'quickstart.md'
- Add data: 'data_ingestion.md'
- Create LLM enriched data store: 'templates.md'
- Explore data: 'search.md'
- Configuration: 'configuration.md'
- What is cognee:
- Introduction: 'conceptual_overview.md'
- API reference: 'api_reference.md'
plugins:
- search:
separator: '[\s\u200b\-_,:!=\[\]()"`/]+|\.(?!\d)|&[lg]t;|(?!\b)(?=[A-Z][a-z])'
- minify:
minify_html: true
minify_js: true
minify_css: true
htmlmin_opts:
remove_comments: true
cache_safe: true
- mkdocstrings:
handlers:
python:
options:
members_order: alphabetical
allow_inspection: true
show_bases: true

71
poetry.lock generated
View file

@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
[[package]]
name = "aiofiles"
@ -2893,6 +2893,8 @@ optional = false
python-versions = "*"
files = [
{file = "jsonpath-ng-1.7.0.tar.gz", hash = "sha256:f6f5f7fd4e5ff79c785f1573b394043b39849fb2bb47bcead935d12b00beab3c"},
{file = "jsonpath_ng-1.7.0-py2-none-any.whl", hash = "sha256:898c93fc173f0c336784a3fa63d7434297544b7198124a68f9a3ef9597b0ae6e"},
{file = "jsonpath_ng-1.7.0-py3-none-any.whl", hash = "sha256:f3d7f9e848cba1b6da28c55b1c26ff915dc9e0b1ba7e752a53d6da8d5cbd00b6"},
]
[package.dependencies]
@ -3103,13 +3105,13 @@ test = ["jupyter-server (>=2.0.0)", "pytest (>=7.0)", "pytest-jupyter[server] (>
[[package]]
name = "jupyterlab"
version = "4.2.6"
version = "4.3.3"
description = "JupyterLab computational environment"
optional = true
python-versions = ">=3.8"
files = [
{file = "jupyterlab-4.2.6-py3-none-any.whl", hash = "sha256:78dd42cae5b460f377624b03966a8730e3b0692102ddf5933a2a3730c1bc0a20"},
{file = "jupyterlab-4.2.6.tar.gz", hash = "sha256:625f3ac19da91f9706baf66df25723b2f1307c1159fc7293035b066786d62a4a"},
{file = "jupyterlab-4.3.3-py3-none-any.whl", hash = "sha256:32a8fd30677e734ffcc3916a4758b9dab21b02015b668c60eb36f84357b7d4b1"},
{file = "jupyterlab-4.3.3.tar.gz", hash = "sha256:76fa39e548fdac94dc1204af5956c556f54c785f70ee26aa47ea08eda4d5bbcd"},
]
[package.dependencies]
@ -3124,15 +3126,15 @@ jupyter-server = ">=2.4.0,<3"
jupyterlab-server = ">=2.27.1,<3"
notebook-shim = ">=0.2"
packaging = "*"
setuptools = ">=40.1.0"
setuptools = ">=40.8.0"
tomli = {version = ">=1.2.2", markers = "python_version < \"3.11\""}
tornado = ">=6.2.0"
traitlets = "*"
[package.extras]
dev = ["build", "bump2version", "coverage", "hatch", "pre-commit", "pytest-cov", "ruff (==0.3.5)"]
docs = ["jsx-lexer", "myst-parser", "pydata-sphinx-theme (>=0.13.0)", "pytest", "pytest-check-links", "pytest-jupyter", "sphinx (>=1.8,<7.3.0)", "sphinx-copybutton"]
docs-screenshots = ["altair (==5.3.0)", "ipython (==8.16.1)", "ipywidgets (==8.1.2)", "jupyterlab-geojson (==3.4.0)", "jupyterlab-language-pack-zh-cn (==4.1.post2)", "matplotlib (==3.8.3)", "nbconvert (>=7.0.0)", "pandas (==2.2.1)", "scipy (==1.12.0)", "vega-datasets (==0.9.0)"]
dev = ["build", "bump2version", "coverage", "hatch", "pre-commit", "pytest-cov", "ruff (==0.6.9)"]
docs = ["jsx-lexer", "myst-parser", "pydata-sphinx-theme (>=0.13.0)", "pytest", "pytest-check-links", "pytest-jupyter", "sphinx (>=1.8,<8.1.0)", "sphinx-copybutton"]
docs-screenshots = ["altair (==5.4.1)", "ipython (==8.16.1)", "ipywidgets (==8.1.5)", "jupyterlab-geojson (==3.4.0)", "jupyterlab-language-pack-zh-cn (==4.2.post3)", "matplotlib (==3.9.2)", "nbconvert (>=7.0.0)", "pandas (==2.2.3)", "scipy (==1.14.1)", "vega-datasets (==0.9.0)"]
test = ["coverage", "pytest (>=7.0)", "pytest-check-links (>=0.7)", "pytest-console-scripts", "pytest-cov", "pytest-jupyter (>=0.5.3)", "pytest-timeout", "pytest-tornasync", "requests", "requests-cache", "virtualenv"]
upgrade-extension = ["copier (>=9,<10)", "jinja2-time (<0.3)", "pydantic (<3.0)", "pyyaml-include (<3.0)", "tomli-w (<2.0)"]
@ -4529,18 +4531,18 @@ twitter = ["twython"]
[[package]]
name = "notebook"
version = "7.2.2"
version = "7.3.1"
description = "Jupyter Notebook - A web-based notebook environment for interactive computing"
optional = true
python-versions = ">=3.8"
files = [
{file = "notebook-7.2.2-py3-none-any.whl", hash = "sha256:c89264081f671bc02eec0ed470a627ed791b9156cad9285226b31611d3e9fe1c"},
{file = "notebook-7.2.2.tar.gz", hash = "sha256:2ef07d4220421623ad3fe88118d687bc0450055570cdd160814a59cf3a1c516e"},
{file = "notebook-7.3.1-py3-none-any.whl", hash = "sha256:212e1486b2230fe22279043f33c7db5cf9a01d29feb063a85cb139747b7c9483"},
{file = "notebook-7.3.1.tar.gz", hash = "sha256:84381c2a82d867517fd25b86e986dae1fe113a70b98f03edff9b94e499fec8fa"},
]
[package.dependencies]
jupyter-server = ">=2.4.0,<3"
jupyterlab = ">=4.2.0,<4.3"
jupyterlab = ">=4.3.2,<4.4"
jupyterlab-server = ">=2.27.1,<3"
notebook-shim = ">=0.2,<0.3"
tornado = ">=6.2.0"
@ -6889,28 +6891,29 @@ files = [
[[package]]
name = "ruff"
version = "0.2.2"
version = "0.8.2"
description = "An extremely fast Python linter and code formatter, written in Rust."
optional = false
python-versions = ">=3.7"
files = [
{file = "ruff-0.2.2-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:0a9efb032855ffb3c21f6405751d5e147b0c6b631e3ca3f6b20f917572b97eb6"},
{file = "ruff-0.2.2-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:d450b7fbff85913f866a5384d8912710936e2b96da74541c82c1b458472ddb39"},
{file = "ruff-0.2.2-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ecd46e3106850a5c26aee114e562c329f9a1fbe9e4821b008c4404f64ff9ce73"},
{file = "ruff-0.2.2-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5e22676a5b875bd72acd3d11d5fa9075d3a5f53b877fe7b4793e4673499318ba"},
{file = "ruff-0.2.2-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1695700d1e25a99d28f7a1636d85bafcc5030bba9d0578c0781ba1790dbcf51c"},
{file = "ruff-0.2.2-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:b0c232af3d0bd8f521806223723456ffebf8e323bd1e4e82b0befb20ba18388e"},
{file = "ruff-0.2.2-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f63d96494eeec2fc70d909393bcd76c69f35334cdbd9e20d089fb3f0640216ca"},
{file = "ruff-0.2.2-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6a61ea0ff048e06de273b2e45bd72629f470f5da8f71daf09fe481278b175001"},
{file = "ruff-0.2.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5e1439c8f407e4f356470e54cdecdca1bd5439a0673792dbe34a2b0a551a2fe3"},
{file = "ruff-0.2.2-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:940de32dc8853eba0f67f7198b3e79bc6ba95c2edbfdfac2144c8235114d6726"},
{file = "ruff-0.2.2-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:0c126da55c38dd917621552ab430213bdb3273bb10ddb67bc4b761989210eb6e"},
{file = "ruff-0.2.2-py3-none-musllinux_1_2_i686.whl", hash = "sha256:3b65494f7e4bed2e74110dac1f0d17dc8e1f42faaa784e7c58a98e335ec83d7e"},
{file = "ruff-0.2.2-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:1ec49be4fe6ddac0503833f3ed8930528e26d1e60ad35c2446da372d16651ce9"},
{file = "ruff-0.2.2-py3-none-win32.whl", hash = "sha256:d920499b576f6c68295bc04e7b17b6544d9d05f196bb3aac4358792ef6f34325"},
{file = "ruff-0.2.2-py3-none-win_amd64.whl", hash = "sha256:cc9a91ae137d687f43a44c900e5d95e9617cb37d4c989e462980ba27039d239d"},
{file = "ruff-0.2.2-py3-none-win_arm64.whl", hash = "sha256:c9d15fc41e6054bfc7200478720570078f0b41c9ae4f010bcc16bd6f4d1aacdd"},
{file = "ruff-0.2.2.tar.gz", hash = "sha256:e62ed7f36b3068a30ba39193a14274cd706bc486fad521276458022f7bccb31d"},
{file = "ruff-0.8.2-py3-none-linux_armv6l.whl", hash = "sha256:c49ab4da37e7c457105aadfd2725e24305ff9bc908487a9bf8d548c6dad8bb3d"},
{file = "ruff-0.8.2-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:ec016beb69ac16be416c435828be702ee694c0d722505f9c1f35e1b9c0cc1bf5"},
{file = "ruff-0.8.2-py3-none-macosx_11_0_arm64.whl", hash = "sha256:f05cdf8d050b30e2ba55c9b09330b51f9f97d36d4673213679b965d25a785f3c"},
{file = "ruff-0.8.2-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:60f578c11feb1d3d257b2fb043ddb47501ab4816e7e221fbb0077f0d5d4e7b6f"},
{file = "ruff-0.8.2-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:cbd5cf9b0ae8f30eebc7b360171bd50f59ab29d39f06a670b3e4501a36ba5897"},
{file = "ruff-0.8.2-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b402ddee3d777683de60ff76da801fa7e5e8a71038f57ee53e903afbcefdaa58"},
{file = "ruff-0.8.2-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:705832cd7d85605cb7858d8a13d75993c8f3ef1397b0831289109e953d833d29"},
{file = "ruff-0.8.2-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:32096b41aaf7a5cc095fa45b4167b890e4c8d3fd217603f3634c92a541de7248"},
{file = "ruff-0.8.2-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e769083da9439508833cfc7c23e351e1809e67f47c50248250ce1ac52c21fb93"},
{file = "ruff-0.8.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5fe716592ae8a376c2673fdfc1f5c0c193a6d0411f90a496863c99cd9e2ae25d"},
{file = "ruff-0.8.2-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:81c148825277e737493242b44c5388a300584d73d5774defa9245aaef55448b0"},
{file = "ruff-0.8.2-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:d261d7850c8367704874847d95febc698a950bf061c9475d4a8b7689adc4f7fa"},
{file = "ruff-0.8.2-py3-none-musllinux_1_2_i686.whl", hash = "sha256:1ca4e3a87496dc07d2427b7dd7ffa88a1e597c28dad65ae6433ecb9f2e4f022f"},
{file = "ruff-0.8.2-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:729850feed82ef2440aa27946ab39c18cb4a8889c1128a6d589ffa028ddcfc22"},
{file = "ruff-0.8.2-py3-none-win32.whl", hash = "sha256:ac42caaa0411d6a7d9594363294416e0e48fc1279e1b0e948391695db2b3d5b1"},
{file = "ruff-0.8.2-py3-none-win_amd64.whl", hash = "sha256:2aae99ec70abf43372612a838d97bfe77d45146254568d94926e8ed5bbb409ea"},
{file = "ruff-0.8.2-py3-none-win_arm64.whl", hash = "sha256:fb88e2a506b70cfbc2de6fae6681c4f944f7dd5f2fe87233a7233d888bad73e8"},
{file = "ruff-0.8.2.tar.gz", hash = "sha256:b84f4f414dda8ac7f75075c1fa0b905ac0ff25361f42e6d5da681a465e0f78e5"},
]
[[package]]
@ -7183,13 +7186,13 @@ win32 = ["pywin32"]
[[package]]
name = "sentry-sdk"
version = "2.19.0"
version = "2.19.2"
description = "Python client for Sentry (https://sentry.io)"
optional = false
python-versions = ">=3.6"
files = [
{file = "sentry_sdk-2.19.0-py2.py3-none-any.whl", hash = "sha256:7b0b3b709dee051337244a09a30dbf6e95afe0d34a1f8b430d45e0982a7c125b"},
{file = "sentry_sdk-2.19.0.tar.gz", hash = "sha256:ee4a4d2ae8bfe3cac012dcf3e4607975904c137e1738116549fc3dbbb6ff0e36"},
{file = "sentry_sdk-2.19.2-py2.py3-none-any.whl", hash = "sha256:ebdc08228b4d131128e568d696c210d846e5b9d70aa0327dec6b1272d9d40b84"},
{file = "sentry_sdk-2.19.2.tar.gz", hash = "sha256:467df6e126ba242d39952375dd816fbee0f217d119bf454a8ce74cf1e7909e8d"},
]
[package.dependencies]
@ -8868,4 +8871,4 @@ weaviate = ["weaviate-client"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.9.0,<3.12"
content-hash = "c1f30981f79db94213a89aec3207f0b4775944968e97dda8aa49c3aa143ce7b5"
content-hash = "18d78e556471b4b63c948138233ef4b38bba02f649a469180c2b1c292f0d61df"

View file

@ -103,7 +103,7 @@ notebook = {version = "^7.1.0", optional = true}
deptry = "^0.20.0"
debugpy = "1.8.2"
pylint = "^3.0.3"
ruff = "^0.2.2"
ruff = ">=0.2.2,<0.9.0"
tweepy = "4.14.0"
gitpython = "^3.1.43"