Merge remote-tracking branch 'origin/main' into feat/COG-184-add-falkordb

Boris Arzentar 2024-11-07 11:29:09 +01:00
commit 758698a35b
49 changed files with 2109 additions and 1674 deletions

.github/workflows/auto-comment.yml (new file)
View file

@ -0,0 +1,81 @@
name: Issue and PR Auto Comments
on:
issues:
types:
- opened
- closed
- assigned
pull_request_target:
types:
- opened
- closed
permissions:
contents: read
jobs:
auto-comment:
permissions:
issues: write
pull-requests: write
runs-on: ubuntu-latest
steps:
# configuration for auto-comment actions
- name: Configure Auto Comments
uses: wow-actions/auto-comment@v1
with:
GITHUB_TOKEN: ${{ secrets.GH_TOKEN }}
issuesOpened: |
👋 @{{ author }}
Thank you for raising an issue. We will investigate the matter and get back to you as soon as possible.
To help us address your issue efficiently, please ensure you have provided:
- A clear description of the problem
- Steps to reproduce (if applicable)
- Expected vs actual behavior
- Any relevant screenshots or error messages
Our team typically responds within 2-3 business days.
issuesClosed: |
✅ @{{ author }}
This issue has been closed. If you have any further questions or if the issue resurfaces,
please feel free to:
- Add a comment to this thread
- Open a new issue with reference to this one
Thank you for helping us improve!
pullRequestOpened: |
👍 @{{ author }}
Thank you for your pull request and for contributing to our community!
Please ensure you have:
- [ ] Followed our contributing guidelines
- [ ] Added/updated tests (if applicable)
- [ ] Updated documentation (if applicable)
- [ ] Added a descriptive PR title
Our team will review your contribution as soon as possible. Feel free to reach out if you need any assistance.
# Separate action for merged PRs
- name: Handle Merged Pull Requests
if: github.event.pull_request.merged == true
uses: actions-cool/pr-welcome@v1.2.1
with:
token: ${{ secrets.GH_TOKEN }}
comment: |
🎉 Fantastic work @${{ github.event.pull_request.user.login }}! 🎉
Your pull request has been merged successfully. Thank you for your valuable contribution!
We appreciate the time and effort you've put into improving our project.
Your changes will be included in our next release.
Keep up the great work! 💪
emoji: 'rocket'
pr-emoji: '+1, heart, rocket'

View file

@ -20,10 +20,16 @@ jobs:
uses: docker/setup-buildx-action@v3
- name: Build Docker images
env:
ENVIRONMENT: dev
ENV: dev
run: |
docker compose -f docker-compose.yml build
- name: Run Docker Compose
env:
ENVIRONMENT: dev
ENV: dev
run: |
docker compose -f docker-compose.yml up -d

View file

@ -25,7 +25,7 @@ RUN pip install poetry
RUN poetry config virtualenvs.create false
# Install the dependencies
RUN poetry install --no-root --no-dev
RUN poetry install --all-extras --no-root --no-dev
# Set the PYTHONPATH environment variable to include the /app directory

View file

@ -29,6 +29,10 @@ If you have questions, join our <a href="https://discord.gg/NQPKmU5CCg">Discord
pip install cognee
```
### With pip with PostgreSQL support
```bash
pip install cognee[postgres]
```
### With poetry
@ -36,6 +40,11 @@ pip install cognee
poetry add cognee
```
### With poetry with PostgreSQL support
```bash
poetry add cognee -E postgres
```
## 💻 Basic Usage
@ -50,7 +59,7 @@ os.environ["LLM_API_KEY"] = "YOUR OPENAI_API_KEY"
or
```
import cognee
cognee.config.llm_api_key = "YOUR_OPENAI_API_KEY"
cognee.config.set_llm_api_key("YOUR_OPENAI_API_KEY")
```
You can also set the variables by creating a .env file; here is our <a href="https://github.com/topoteretes/cognee/blob/main/.env.template">template.</a>
To use different LLM providers, check out our <a href="https://topoteretes.github.io/cognee">documentation</a> for more info.
@ -73,26 +82,54 @@ docker-compose up
```
Then navigate to localhost:3000
If you want to use the UI with PostgreSQL through docker-compose, make sure to set the following values in the .env file:
```
DB_PROVIDER=postgres
DB_HOST=postgres
DB_PORT=5432
DB_NAME=cognee_db
DB_USERNAME=cognee
DB_PASSWORD=cognee
```
### Simple example
Run the default cognee pipeline:
First, copy `.env.template` to `.env` and add your OpenAI API key to the LLM_API_KEY field.
Optionally, set `VECTOR_DB_PROVIDER="lancedb"` in `.env` to simplify setup.
This script will run the default pipeline:
```python
import cognee
import asyncio
from cognee.api.v1.search import SearchType
text = """Natural language processing (NLP) is an interdisciplinary
subfield of computer science and information retrieval"""
async def main():
await cognee.prune.prune_data() # Reset cognee data
await cognee.prune.prune_system(metadata=True) # Reset cognee system state
await cognee.add(text) # Add a new piece of information
text = """
Natural language processing (NLP) is an interdisciplinary
subfield of computer science and information retrieval.
"""
await cognee.cognify() # Use LLMs and cognee to create a knowledge graph
await cognee.add(text) # Add text to cognee
await cognee.cognify() # Use LLMs and cognee to create knowledge graph
search_results = await cognee.search("INSIGHTS", {'query': 'NLP'}) # Query cognee for the insights
search_results = await cognee.search( # Search cognee for insights
SearchType.INSIGHTS,
{'query': 'Tell me about NLP'}
)
for result in search_results:
do_something_with_result(result)
for result_text in search_results: # Display results
print(result_text)
asyncio.run(main())
```
A version of this example is here: `examples/python/simple_example.py`
### Create your own memory store

View file

@ -1,24 +1,11 @@
""" FastAPI server for the Cognee API. """
from datetime import datetime
import os
from uuid import UUID
import aiohttp
import uvicorn
import logging
import sentry_sdk
from typing import List, Union, Optional, Literal
from typing_extensions import Annotated
from fastapi import FastAPI, HTTPException, Form, UploadFile, Query, Depends
from fastapi.responses import JSONResponse, FileResponse, Response
from fastapi import FastAPI
from fastapi.responses import JSONResponse, Response
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from cognee.api.DTO import InDTO, OutDTO
from cognee.api.v1.search import SearchType
from cognee.modules.users.models import User
from cognee.modules.users.methods import get_authenticated_user
from cognee.modules.pipelines.models import PipelineRunStatus
# Set up logging
logging.basicConfig(
@ -65,9 +52,12 @@ app.add_middleware(
from cognee.api.v1.users.routers import get_auth_router, get_register_router,\
get_reset_password_router, get_verify_router, get_users_router
from cognee.api.v1.permissions.get_permissions_router import get_permissions_router
from cognee.api.v1.permissions.routers import get_permissions_router
from cognee.api.v1.settings.routers import get_settings_router
from cognee.api.v1.datasets.routers import get_datasets_router
from cognee.api.v1.cognify.routers import get_cognify_router
from cognee.api.v1.search.routers import get_search_router
from cognee.api.v1.add.routers import get_add_router
from fastapi import Request
from fastapi.encoders import jsonable_encoder
@ -137,261 +127,35 @@ def health_check():
"""
return Response(status_code = 200)
app.include_router(
get_datasets_router(),
prefix="/api/v1/datasets",
tags=["datasets"]
)
class ErrorResponseDTO(BaseModel):
message: str
app.include_router(
get_add_router(),
prefix="/api/v1/add",
tags=["add"]
)
app.include_router(
get_cognify_router(),
prefix="/api/v1/cognify",
tags=["cognify"]
)
class DatasetDTO(OutDTO):
id: UUID
name: str
created_at: datetime
updated_at: Optional[datetime]
owner_id: UUID
@app.get("/api/v1/datasets", response_model = list[DatasetDTO])
async def get_datasets(user: User = Depends(get_authenticated_user)):
try:
from cognee.modules.data.methods import get_datasets
datasets = await get_datasets(user.id)
return datasets
except Exception as error:
logger.error(f"Error retrieving datasets: {str(error)}")
raise HTTPException(status_code = 500, detail = f"Error retrieving datasets: {str(error)}") from error
@app.delete("/api/v1/datasets/{dataset_id}", response_model = None, responses = { 404: { "model": ErrorResponseDTO }})
async def delete_dataset(dataset_id: str, user: User = Depends(get_authenticated_user)):
from cognee.modules.data.methods import get_dataset, delete_dataset
dataset = await get_dataset(user.id, dataset_id)
if dataset is None:
raise HTTPException(
status_code = 404,
detail = f"Dataset ({dataset_id}) not found."
)
await delete_dataset(dataset)
@app.get("/api/v1/datasets/{dataset_id}/graph", response_model = str)
async def get_dataset_graph(dataset_id: str, user: User = Depends(get_authenticated_user)):
from cognee.shared.utils import render_graph
from cognee.infrastructure.databases.graph import get_graph_engine
try:
graph_client = await get_graph_engine()
graph_url = await render_graph(graph_client.graph)
return JSONResponse(
status_code = 200,
content = str(graph_url),
)
except:
return JSONResponse(
status_code = 409,
content = "Graphistry credentials are not set. Please set them in your .env file.",
)
class DataDTO(OutDTO):
id: UUID
name: str
created_at: datetime
updated_at: Optional[datetime]
extension: str
mime_type: str
raw_data_location: str
@app.get("/api/v1/datasets/{dataset_id}/data", response_model = list[DataDTO], responses = { 404: { "model": ErrorResponseDTO }})
async def get_dataset_data(dataset_id: str, user: User = Depends(get_authenticated_user)):
from cognee.modules.data.methods import get_dataset_data, get_dataset
dataset = await get_dataset(user.id, dataset_id)
if dataset is None:
return JSONResponse(
status_code = 404,
content = ErrorResponseDTO(f"Dataset ({dataset_id}) not found."),
)
dataset_data = await get_dataset_data(dataset_id = dataset.id)
if dataset_data is None:
return []
return dataset_data
@app.get("/api/v1/datasets/status", response_model = dict[str, PipelineRunStatus])
async def get_dataset_status(datasets: Annotated[List[str], Query(alias="dataset")] = None, user: User = Depends(get_authenticated_user)):
from cognee.api.v1.datasets.datasets import datasets as cognee_datasets
try:
datasets_statuses = await cognee_datasets.get_status(datasets)
return datasets_statuses
except Exception as error:
return JSONResponse(
status_code = 409,
content = {"error": str(error)}
)
@app.get("/api/v1/datasets/{dataset_id}/data/{data_id}/raw", response_class = FileResponse)
async def get_raw_data(dataset_id: str, data_id: str, user: User = Depends(get_authenticated_user)):
from cognee.modules.data.methods import get_dataset, get_dataset_data
dataset = await get_dataset(user.id, dataset_id)
if dataset is None:
return JSONResponse(
status_code = 404,
content = {
"detail": f"Dataset ({dataset_id}) not found."
}
)
dataset_data = await get_dataset_data(dataset.id)
if dataset_data is None:
raise HTTPException(status_code = 404, detail = f"Dataset ({dataset_id}) not found.")
data = [data for data in dataset_data if str(data.id) == data_id][0]
if data is None:
return JSONResponse(
status_code = 404,
content = {
"detail": f"Data ({data_id}) not found in dataset ({dataset_id})."
}
)
return data.raw_data_location
@app.post("/api/v1/add", response_model = None)
async def add(
data: List[UploadFile],
datasetId: str = Form(...),
user: User = Depends(get_authenticated_user),
):
""" This endpoint is responsible for adding data to the graph."""
from cognee.api.v1.add import add as cognee_add
try:
if isinstance(data, str) and data.startswith("http"):
if "github" in data:
# Perform git clone if the URL is from GitHub
repo_name = data.split("/")[-1].replace(".git", "")
os.system(f"git clone {data} .data/{repo_name}")
await cognee_add(
"data://.data/",
f"{repo_name}",
)
else:
# Fetch and store the data from other types of URL using curl
async with aiohttp.ClientSession() as session:
async with session.get(data) as resp:
if resp.status == 200:
file_data = await resp.read()
with open(f".data/{data.split('/')[-1]}", "wb") as f:
f.write(file_data)
await cognee_add(
"data://.data/",
f"{data.split('/')[-1]}",
)
else:
await cognee_add(
data,
datasetId,
user = user,
)
except Exception as error:
return JSONResponse(
status_code = 409,
content = {"error": str(error)}
)
class CognifyPayloadDTO(BaseModel):
datasets: List[str]
@app.post("/api/v1/cognify", response_model = None)
async def cognify(payload: CognifyPayloadDTO, user: User = Depends(get_authenticated_user)):
""" This endpoint is responsible for the cognitive processing of the content."""
from cognee.api.v1.cognify.cognify_v2 import cognify as cognee_cognify
try:
await cognee_cognify(payload.datasets, user)
except Exception as error:
return JSONResponse(
status_code = 409,
content = {"error": str(error)}
)
class SearchPayloadDTO(InDTO):
search_type: SearchType
query: str
@app.post("/api/v1/search", response_model = list)
async def search(payload: SearchPayloadDTO, user: User = Depends(get_authenticated_user)):
""" This endpoint is responsible for searching for nodes in the graph."""
from cognee.api.v1.search import search as cognee_search
try:
results = await cognee_search(payload.search_type, payload.query, user)
return results
except Exception as error:
return JSONResponse(
status_code = 409,
content = {"error": str(error)}
)
from cognee.modules.settings.get_settings import LLMConfig, VectorDBConfig
class LLMConfigDTO(OutDTO, LLMConfig):
pass
class VectorDBConfigDTO(OutDTO, VectorDBConfig):
pass
class SettingsDTO(OutDTO):
llm: LLMConfigDTO
vector_db: VectorDBConfigDTO
@app.get("/api/v1/settings", response_model = SettingsDTO)
async def get_settings(user: User = Depends(get_authenticated_user)):
from cognee.modules.settings import get_settings as get_cognee_settings
return get_cognee_settings()
class LLMConfigDTO(InDTO):
provider: Union[Literal["openai"], Literal["ollama"], Literal["anthropic"]]
model: str
api_key: str
class VectorDBConfigDTO(InDTO):
provider: Union[Literal["lancedb"], Literal["qdrant"], Literal["weaviate"], Literal["pgvector"]]
url: str
api_key: str
class SettingsPayloadDTO(InDTO):
llm: Optional[LLMConfigDTO] = None
vector_db: Optional[VectorDBConfigDTO] = None
@app.post("/api/v1/settings", response_model = None)
async def save_settings(new_settings: SettingsPayloadDTO, user: User = Depends(get_authenticated_user)):
from cognee.modules.settings import save_llm_config, save_vector_db_config
if new_settings.llm is not None:
await save_llm_config(new_settings.llm)
if new_settings.vector_db is not None:
await save_vector_db_config(new_settings.vector_db)
app.include_router(
get_search_router(),
prefix="/api/v1/search",
tags=["search"]
)
app.include_router(
get_settings_router(),
prefix="/api/v1/settings",
tags=["settings"]
)
def start_api_server(host: str = "0.0.0.0", port: int = 8000):
"""

View file

@ -0,0 +1 @@
from .get_add_router import get_add_router

View file

@ -0,0 +1,60 @@
from fastapi import Form, UploadFile, Depends
from fastapi.responses import JSONResponse
from fastapi import APIRouter
from typing import List
import aiohttp
import subprocess
import logging
import os
from cognee.modules.users.models import User
from cognee.modules.users.methods import get_authenticated_user
logger = logging.getLogger(__name__)
def get_add_router() -> APIRouter:
router = APIRouter()
@router.post("/", response_model=None)
async def add(
data: List[UploadFile],
datasetId: str = Form(...),
user: User = Depends(get_authenticated_user),
):
""" This endpoint is responsible for adding data to the graph."""
from cognee.api.v1.add import add as cognee_add
try:
if isinstance(data, str) and data.startswith("http"):
if "github" in data:
# Perform git clone if the URL is from GitHub
repo_name = data.split("/")[-1].replace(".git", "")
subprocess.run(["git", "clone", data, f".data/{repo_name}"], check=True)
await cognee_add(
"data://.data/",
f"{repo_name}",
)
else:
# Fetch and store the data from other types of URL using curl
async with aiohttp.ClientSession() as session:
async with session.get(data) as resp:
if resp.status == 200:
file_data = await resp.read()
filename = os.path.basename(data)
with open(f".data/{filename}", "wb") as f:
f.write(file_data)
await cognee_add(
"data://.data/",
f"{data.split('/')[-1]}",
)
else:
await cognee_add(
data,
datasetId,
user=user,
)
except Exception as error:
return JSONResponse(
status_code=409,
content={"error": str(error)}
)
return router
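For orientation, a client could exercise this endpoint roughly as follows. This is a hedged sketch: the `/api/v1/add` prefix comes from the `include_router` call earlier in this diff, while the host, port, and file name are hypothetical, and authentication (fastapi-users) is omitted for brevity.
```python
import requests  # any HTTP client works; requests is used here only for illustration

# Hypothetical local deployment; authentication is required in practice and omitted here.
with open("notes.txt", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/api/v1/add",
        files=[("data", ("notes.txt", f, "text/plain"))],  # field name matches the List[UploadFile] parameter "data"
        data={"datasetId": "my-dataset"},                   # matches the Form(...) parameter
    )
print(resp.status_code)
```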

View file

@ -0,0 +1 @@
from .get_cognify_router import get_cognify_router

View file

@ -0,0 +1,27 @@
from fastapi import APIRouter
from typing import List
from pydantic import BaseModel
from cognee.modules.users.models import User
from fastapi.responses import JSONResponse
from cognee.modules.users.methods import get_authenticated_user
from fastapi import Depends
class CognifyPayloadDTO(BaseModel):
datasets: List[str]
def get_cognify_router() -> APIRouter:
router = APIRouter()
@router.post("/", response_model=None)
async def cognify(payload: CognifyPayloadDTO, user: User = Depends(get_authenticated_user)):
""" This endpoint is responsible for the cognitive processing of the content."""
from cognee.api.v1.cognify.cognify_v2 import cognify as cognee_cognify
try:
await cognee_cognify(payload.datasets, user)
except Exception as error:
return JSONResponse(
status_code=409,
content={"error": str(error)}
)
return router

View file

@ -5,6 +5,7 @@ from cognee.modules.cognify.config import get_cognify_config
from cognee.infrastructure.data.chunking.config import get_chunk_config
from cognee.infrastructure.databases.vector import get_vectordb_config
from cognee.infrastructure.databases.graph.config import get_graph_config
from cognee.infrastructure.llm.config import get_llm_config
from cognee.infrastructure.databases.relational import get_relational_config
from cognee.infrastructure.files.storage import LocalStorage
@ -55,19 +56,36 @@ class config():
graph_config.graph_database_provider = graph_database_provider
@staticmethod
def llm_provider(llm_provider: str):
graph_config = get_graph_config()
graph_config.llm_provider = llm_provider
def set_llm_provider(llm_provider: str):
llm_config = get_llm_config()
llm_config.llm_provider = llm_provider
@staticmethod
def llm_endpoint(llm_endpoint: str):
graph_config = get_graph_config()
graph_config.llm_endpoint = llm_endpoint
def set_llm_endpoint(llm_endpoint: str):
llm_config = get_llm_config()
llm_config.llm_endpoint = llm_endpoint
@staticmethod
def llm_model(llm_model: str):
graph_config = get_graph_config()
graph_config.llm_model = llm_model
def set_llm_model(llm_model: str):
llm_config = get_llm_config()
llm_config.llm_model = llm_model
@staticmethod
def set_llm_api_key(llm_api_key: str):
llm_config = get_llm_config()
llm_config.llm_api_key = llm_api_key
@staticmethod
def set_llm_config(config_dict: dict):
"""
Updates the llm config with values from config_dict.
"""
llm_config = get_llm_config()
for key, value in config_dict.items():
if hasattr(llm_config, key):
object.__setattr__(llm_config, key, value)
else:
raise AttributeError(f"'{key}' is not a valid attribute of the config.")
@staticmethod
def set_chunk_strategy(chunk_strategy: object):
@ -137,5 +155,5 @@ class config():
if "username" not in graphistry_config or "password" not in graphistry_config:
raise ValueError("graphistry_config dictionary must contain 'username' and 'password' keys.")
base_config.graphistry_username = graphistry_config.username
base_config.graphistry_password = graphistry_config.password
base_config.graphistry_username = graphistry_config.get("username")
base_config.graphistry_password = graphistry_config.get("password")
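As a quick illustration of the setter-based API introduced above (a minimal sketch; the setter names and config keys are taken from this diff, while the concrete provider and model values are placeholders):
```python
import cognee

# Individual setters added in this diff
cognee.config.set_llm_provider("openai")
cognee.config.set_llm_model("gpt-4o-mini")            # placeholder model name
cognee.config.set_llm_api_key("YOUR_OPENAI_API_KEY")

# Bulk update; per set_llm_config above, unknown keys raise AttributeError
cognee.config.set_llm_config({
    "llm_provider": "ollama",
    "llm_endpoint": "http://localhost:11434/v1",
    "llm_model": "mistral:instruct",
})
```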

View file

@ -0,0 +1 @@
from .get_datasets_router import get_datasets_router

View file

@ -0,0 +1,178 @@
import logging
from fastapi import APIRouter
from datetime import datetime
from uuid import UUID
from typing import List, Optional
from typing_extensions import Annotated
from fastapi import HTTPException, Query, Depends
from fastapi.responses import JSONResponse, FileResponse
from pydantic import BaseModel
from cognee.api.DTO import OutDTO
from cognee.modules.users.models import User
from cognee.modules.users.methods import get_authenticated_user
from cognee.modules.pipelines.models import PipelineRunStatus
logger = logging.getLogger(__name__)
class ErrorResponseDTO(BaseModel):
message: str
class DatasetDTO(OutDTO):
id: UUID
name: str
created_at: datetime
updated_at: Optional[datetime] = None
owner_id: UUID
class DataDTO(OutDTO):
id: UUID
name: str
created_at: datetime
updated_at: Optional[datetime] = None
extension: str
mime_type: str
raw_data_location: str
def get_datasets_router() -> APIRouter:
router = APIRouter()
@router.get("/", response_model=list[DatasetDTO])
async def get_datasets(user: User = Depends(get_authenticated_user)):
try:
from cognee.modules.data.methods import get_datasets
datasets = await get_datasets(user.id)
return datasets
except Exception as error:
logger.error(f"Error retrieving datasets: {str(error)}")
raise HTTPException(status_code=500, detail=f"Error retrieving datasets: {str(error)}") from error
@router.delete("/{dataset_id}", response_model=None, responses={404: {"model": ErrorResponseDTO}})
async def delete_dataset(dataset_id: str, user: User = Depends(get_authenticated_user)):
from cognee.modules.data.methods import get_dataset, delete_dataset
dataset = await get_dataset(user.id, dataset_id)
if dataset is None:
raise HTTPException(
status_code=404,
detail=f"Dataset ({dataset_id}) not found."
)
await delete_dataset(dataset)
@router.delete("/{dataset_id}/data/{data_id}", response_model=None, responses={404: {"model": ErrorResponseDTO}})
async def delete_data(dataset_id: str, data_id: str, user: User = Depends(get_authenticated_user)):
from cognee.modules.data.methods import get_data, delete_data
from cognee.modules.data.methods import get_dataset
# Check if user has permission to access dataset and data by trying to get the dataset
dataset = await get_dataset(user.id, dataset_id)
#TODO: Handle situation differently if user doesn't have permission to access data?
if dataset is None:
raise HTTPException(
status_code=404,
detail=f"Dataset ({dataset_id}) not found."
)
data = await get_data(data_id)
if data is None:
raise HTTPException(
status_code=404,
detail=f"Dataset ({data_id}) not found."
)
await delete_data(data)
@router.get("/{dataset_id}/graph", response_model=str)
async def get_dataset_graph(dataset_id: str, user: User = Depends(get_authenticated_user)):
from cognee.shared.utils import render_graph
from cognee.infrastructure.databases.graph import get_graph_engine
try:
graph_client = await get_graph_engine()
graph_url = await render_graph(graph_client.graph)
return JSONResponse(
status_code=200,
content=str(graph_url),
)
except:
return JSONResponse(
status_code=409,
content="Graphistry credentials are not set. Please set them in your .env file.",
)
@router.get("/{dataset_id}/data", response_model=list[DataDTO],
responses={404: {"model": ErrorResponseDTO}})
async def get_dataset_data(dataset_id: str, user: User = Depends(get_authenticated_user)):
from cognee.modules.data.methods import get_dataset_data, get_dataset
dataset = await get_dataset(user.id, dataset_id)
if dataset is None:
return JSONResponse(
status_code=404,
content=ErrorResponseDTO(f"Dataset ({dataset_id}) not found."),
)
dataset_data = await get_dataset_data(dataset_id=dataset.id)
if dataset_data is None:
return []
return dataset_data
@router.get("/status", response_model=dict[str, PipelineRunStatus])
async def get_dataset_status(datasets: Annotated[List[str], Query(alias="dataset")] = None,
user: User = Depends(get_authenticated_user)):
from cognee.api.v1.datasets.datasets import datasets as cognee_datasets
try:
datasets_statuses = await cognee_datasets.get_status(datasets)
return datasets_statuses
except Exception as error:
return JSONResponse(
status_code=409,
content={"error": str(error)}
)
@router.get("/{dataset_id}/data/{data_id}/raw", response_class=FileResponse)
async def get_raw_data(dataset_id: str, data_id: str, user: User = Depends(get_authenticated_user)):
from cognee.modules.data.methods import get_dataset, get_dataset_data
dataset = await get_dataset(user.id, dataset_id)
if dataset is None:
return JSONResponse(
status_code=404,
content={
"detail": f"Dataset ({dataset_id}) not found."
}
)
dataset_data = await get_dataset_data(dataset.id)
if dataset_data is None:
raise HTTPException(status_code=404, detail=f"No data found in dataset ({dataset_id}).")
matching_data = [data for data in dataset_data if str(data.id) == data_id]
# Check if matching_data contains an element
if len(matching_data) == 0:
return JSONResponse(
status_code=404,
content={
"detail": f"Data ({data_id}) not found in dataset ({dataset_id})."
}
)
data = matching_data[0]
return data.raw_data_location
return router

View file

@ -0,0 +1 @@
from .get_permissions_router import get_permissions_router

View file

@ -0,0 +1 @@
from .get_search_router import get_search_router

View file

@ -0,0 +1,31 @@
from cognee.api.v1.search import SearchType
from fastapi.responses import JSONResponse
from cognee.modules.users.models import User
from fastapi import Depends, APIRouter
from cognee.api.DTO import InDTO
from cognee.modules.users.methods import get_authenticated_user
class SearchPayloadDTO(InDTO):
search_type: SearchType
query: str
def get_search_router() -> APIRouter:
router = APIRouter()
@router.post("/", response_model = list)
async def search(payload: SearchPayloadDTO, user: User = Depends(get_authenticated_user)):
""" This endpoint is responsible for searching for nodes in the graph."""
from cognee.api.v1.search import search as cognee_search
try:
results = await cognee_search(payload.search_type, payload.query, user)
return results
except Exception as error:
return JSONResponse(
status_code = 409,
content = {"error": str(error)}
)
return router

View file

@ -0,0 +1 @@
from .get_settings_router import get_settings_router

View file

@ -0,0 +1,51 @@
from fastapi import APIRouter
from cognee.api.DTO import InDTO, OutDTO
from typing import Union, Optional, Literal
from cognee.modules.users.methods import get_authenticated_user
from fastapi import Depends
from cognee.modules.users.models import User
from cognee.modules.settings.get_settings import LLMConfig, VectorDBConfig
class LLMConfigOutputDTO(OutDTO, LLMConfig):
pass
class VectorDBConfigOutputDTO(OutDTO, VectorDBConfig):
pass
class SettingsDTO(OutDTO):
llm: LLMConfigOutputDTO
vector_db: VectorDBConfigOutputDTO
class LLMConfigInputDTO(InDTO):
provider: Union[Literal["openai"], Literal["ollama"], Literal["anthropic"]]
model: str
api_key: str
class VectorDBConfigInputDTO(InDTO):
provider: Union[Literal["lancedb"], Literal["qdrant"], Literal["weaviate"], Literal["pgvector"]]
url: str
api_key: str
class SettingsPayloadDTO(InDTO):
llm: Optional[LLMConfigInputDTO] = None
vector_db: Optional[VectorDBConfigInputDTO] = None
def get_settings_router() -> APIRouter:
router = APIRouter()
@router.get("/", response_model=SettingsDTO)
async def get_settings(user: User = Depends(get_authenticated_user)):
from cognee.modules.settings import get_settings as get_cognee_settings
return get_cognee_settings()
@router.post("/", response_model=None)
async def save_settings(new_settings: SettingsPayloadDTO, user: User = Depends(get_authenticated_user)):
from cognee.modules.settings import save_llm_config, save_vector_db_config
if new_settings.llm is not None:
await save_llm_config(new_settings.llm)
if new_settings.vector_db is not None:
await save_vector_db_config(new_settings.vector_db)
return router

View file

@ -89,10 +89,22 @@ class SQLAlchemyAdapter():
"""
Delete data in the given table based on id. The table must have an id column.
"""
async with self.get_async_session() as session:
TableModel = await self.get_table(table_name, schema_name)
await session.execute(TableModel.delete().where(TableModel.c.id == data_id))
await session.commit()
if self.engine.dialect.name == "sqlite":
async with self.get_async_session() as session:
TableModel = await self.get_table(table_name, schema_name)
# Foreign key constraints are disabled by default in SQLite (for backwards compatibility),
# so must be enabled for each database connection/session separately.
await session.execute(text("PRAGMA foreign_keys = ON;"))
await session.execute(TableModel.delete().where(TableModel.c.id == data_id))
await session.commit()
else:
async with self.get_async_session() as session:
TableModel = await self.get_table(table_name, schema_name)
await session.execute(TableModel.delete().where(TableModel.c.id == data_id))
await session.commit()
async def get_table(self, table_name: str, schema_name: Optional[str] = "public") -> Table:
"""

View file

@ -1,25 +0,0 @@
from typing import List, Optional
from fastembed import TextEmbedding
from cognee.root_dir import get_absolute_path
from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine
class FastembedEmbeddingEngine(EmbeddingEngine):
embedding_model: str
embedding_dimensions: int
def __init__(
self,
embedding_model: Optional[str] = "BAAI/bge-large-en-v1.5",
embedding_dimensions: Optional[int] = 1024,
):
self.embedding_model = embedding_model
self.embedding_dimensions = embedding_dimensions
async def embed_text(self, text: List[str]) -> List[float]:
embedding_model = TextEmbedding(model_name = self.embedding_model, cache_dir = get_absolute_path("cache/embeddings"))
embeddings_list = list(map(lambda embedding: embedding.tolist(), embedding_model.embed(text)))
return embeddings_list
def get_vector_size(self) -> int:
return self.embedding_dimensions

View file

@ -193,7 +193,10 @@ class LanceDBAdapter(VectorDBInterface):
async def delete_data_points(self, collection_name: str, data_point_ids: list[str]):
connection = await self.get_connection()
collection = await connection.open_table(collection_name)
results = await collection.delete(f"id IN {tuple(data_point_ids)}")
if len(data_point_ids) == 1:
results = await collection.delete(f"id = '{data_point_ids[0]}'")
else:
results = await collection.delete(f"id IN {tuple(data_point_ids)}")
return results
async def create_vector_index(self, index_name: str, index_property_name: str):
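A note on the single-id branch above: formatting a one-element Python tuple into the filter string yields a trailing comma (`('a1',)`) that the SQL-style `IN` predicate rejects, hence the equality fallback. A small standalone sketch of the resulting predicate strings (not the adapter itself):
```python
def delete_predicate(ids: list[str]) -> str:
    # tuple(["a1"]) renders as ('a1',) -> "id IN ('a1',)", which the filter parser rejects,
    # so a single id falls back to a plain equality filter.
    if len(ids) == 1:
        return f"id = '{ids[0]}'"
    return f"id IN {tuple(ids)}"

print(delete_predicate(["a1"]))        # id = 'a1'
print(delete_predicate(["a1", "b2"]))  # id IN ('a1', 'b2')
```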

View file

@ -33,7 +33,6 @@ class PGVectorAdapter(SQLAlchemyAdapter, VectorDBInterface):
self.api_key = api_key
self.embedding_engine = embedding_engine
self.db_uri: str = connection_string
self.engine = create_async_engine(self.db_uri)
self.sessionmaker = async_sessionmaker(bind=self.engine, expire_on_commit=False)

View file

@ -1,7 +1,8 @@
from typing import BinaryIO
from pypdf import PdfReader
import filetype
def extract_text_from_file(file: BinaryIO, file_type) -> str:
def extract_text_from_file(file: BinaryIO, file_type: filetype.Type) -> str:
"""Extract text from a file"""
if file_type.extension == "pdf":
reader = PdfReader(stream = file)

View file

@ -1,5 +0,0 @@
import os
def get_file_size(file_path: str):
"""Get the size of a file"""
return os.path.getsize(file_path)

View file

@ -1,4 +1,3 @@
import dsp
import dspy
from dspy.evaluate.evaluate import Evaluate
from dspy.primitives.example import Example

View file

@ -1,4 +1,3 @@
import dsp
import dspy
from dspy.teleprompt import BootstrapFewShot
from dspy.primitives.example import Example

View file

@ -6,6 +6,8 @@ from .get_dataset import get_dataset
from .get_datasets import get_datasets
from .get_datasets_by_name import get_datasets_by_name
from .get_dataset_data import get_dataset_data
from .get_data import get_data
# Delete
from .delete_dataset import delete_dataset
from .delete_data import delete_data

View file

@ -0,0 +1,19 @@
from cognee.modules.data.models import Data
from cognee.infrastructure.databases.relational import get_relational_engine
async def delete_data(data: Data):
"""Delete a data record from the database.
Args:
data (Data): The data object to be deleted.
Raises:
ValueError: If the data object is invalid.
"""
if not hasattr(data, '__tablename__'):
raise ValueError("The provided data object is missing the required '__tablename__' attribute.")
db_engine = get_relational_engine()
return await db_engine.delete_data_by_id(data.__tablename__, data.id)

View file

@ -4,4 +4,4 @@ from cognee.infrastructure.databases.relational import get_relational_engine
async def delete_dataset(dataset: Dataset):
db_engine = get_relational_engine()
return await db_engine.delete_table(dataset.id)
return await db_engine.delete_data_by_id(dataset.__tablename__, dataset.id)

View file

@ -0,0 +1,20 @@
from uuid import UUID
from typing import Optional
from cognee.infrastructure.databases.relational import get_relational_engine
from ..models import Data
async def get_data(data_id: UUID) -> Optional[Data]:
"""Retrieve data by ID.
Args:
data_id (UUID): ID of the data to retrieve
Returns:
Optional[Data]: The requested data object if found, None otherwise
"""
db_engine = get_relational_engine()
async with db_engine.get_async_session() as session:
data = await session.get(Data, data_id)
return data

View file

@ -1,8 +1,9 @@
from typing import Optional
from uuid import UUID
from cognee.infrastructure.databases.relational import get_relational_engine
from ..models import Dataset
async def get_dataset(user_id: UUID, dataset_id: UUID) -> Dataset:
async def get_dataset(user_id: UUID, dataset_id: UUID) -> Optional[Dataset]:
db_engine = get_relational_engine()
async with db_engine.get_async_session() as session:

View file

@ -20,9 +20,11 @@ class Data(Base):
updated_at = Column(DateTime(timezone = True), onupdate = lambda: datetime.now(timezone.utc))
datasets: Mapped[List["Dataset"]] = relationship(
"Dataset",
secondary = DatasetData.__tablename__,
back_populates = "data",
lazy = "noload",
cascade="all, delete"
)
def to_json(self) -> dict:

View file

@ -19,9 +19,11 @@ class Dataset(Base):
owner_id = Column(UUID, index = True)
data: Mapped[List["Data"]] = relationship(
"Data",
secondary = DatasetData.__tablename__,
back_populates = "datasets",
lazy = "noload",
cascade="all, delete"
)
def to_json(self) -> dict:

View file

@ -7,5 +7,5 @@ class DatasetData(Base):
created_at = Column(DateTime(timezone = True), default = lambda: datetime.now(timezone.utc))
dataset_id = Column(UUID, ForeignKey("datasets.id"), primary_key = True)
data_id = Column(UUID, ForeignKey("data.id"), primary_key = True)
dataset_id = Column(UUID, ForeignKey("datasets.id", ondelete="CASCADE"), primary_key = True)
data_id = Column(UUID, ForeignKey("data.id", ondelete="CASCADE"), primary_key = True)

View file

@ -5,7 +5,7 @@ from .models.Task import Task
class PipelineConfig(BaseModel):
batch_count: int = 10
description: Optional[str]
description: Optional[str] = None
class Pipeline():
id: UUID = uuid4()

View file

@ -1,8 +1,8 @@
from typing import Any, Callable, Generator
from typing import Any, Callable, Generator, List
import asyncio
from ..tasks.Task import Task
def run_tasks_parallel(tasks: [Task]) -> Callable[[Any], Generator[Any, Any, Any]]:
def run_tasks_parallel(tasks: List[Task]) -> Callable[[Any], Generator[Any, Any, Any]]:
async def parallel_run(*args, **kwargs):
parallel_tasks = [asyncio.create_task(task.run(*args, **kwargs)) for task in tasks]

View file

@ -18,7 +18,7 @@ class Directory(BaseModel):
directories: List['Directory'] = []
# Allows recursive Directory Model
Directory.update_forward_refs()
Directory.model_rebuild()
class RepositoryProperties(BaseModel):
custom_properties: Optional[Dict[str, Any]] = None

View file

@ -6,15 +6,15 @@ class BaseClass(BaseModel):
name: str
type: Literal["Class"] = "Class"
description: str
constructor_parameters: Optional[List[str]]
constructor_parameters: Optional[List[str]] = None
class Class(BaseModel):
id: str
name: str
type: Literal["Class"] = "Class"
description: str
constructor_parameters: Optional[List[str]]
from_class: Optional[BaseClass]
constructor_parameters: Optional[List[str]] = None
from_class: Optional[BaseClass] = None
class ClassInstance(BaseModel):
id: str
@ -28,7 +28,7 @@ class Function(BaseModel):
name: str
type: Literal["Function"] = "Function"
description: str
parameters: Optional[List[str]]
parameters: Optional[List[str]] = None
return_type: str
is_static: Optional[bool] = False
@ -38,7 +38,7 @@ class Variable(BaseModel):
type: Literal["Variable"] = "Variable"
description: str
is_static: Optional[bool] = False
default_value: Optional[str]
default_value: Optional[str] = None
class Operator(BaseModel):
id: str

View file

@ -21,7 +21,6 @@ async def chunk_naive_llm_classifier(data_chunks: list[DocumentChunk], classific
for chunk_index, chunk in enumerate(data_chunks):
chunk_classification = chunk_classifications[chunk_index]
classification_data_points.append(uuid5(NAMESPACE_OID, chunk_classification.label.type))
classification_data_points.append(uuid5(NAMESPACE_OID, chunk_classification.label.type))
for classification_subclass in chunk_classification.label.subclass:
classification_data_points.append(uuid5(NAMESPACE_OID, classification_subclass.value))
@ -39,7 +38,7 @@ async def chunk_naive_llm_classifier(data_chunks: list[DocumentChunk], classific
if await vector_engine.has_collection(collection_name):
existing_data_points = await vector_engine.retrieve(
collection_name,
list(set(classification_data_points)),
[str(classification_data) for classification_data in list(set(classification_data_points))],
) if len(classification_data_points) > 0 else []
existing_points_map = {point.id: True for point in existing_data_points}
@ -60,7 +59,7 @@ async def chunk_naive_llm_classifier(data_chunks: list[DocumentChunk], classific
data_points.append(
DataPoint[Keyword](
id=str(classification_type_id),
payload=Keyword.parse_obj({
payload=Keyword.model_validate({
"uuid": str(classification_type_id),
"text": classification_type_label,
"chunk_id": str(data_chunk.chunk_id),
@ -99,7 +98,7 @@ async def chunk_naive_llm_classifier(data_chunks: list[DocumentChunk], classific
data_points.append(
DataPoint[Keyword](
id=str(classification_subtype_id),
payload=Keyword.parse_obj({
payload=Keyword.model_validate({
"uuid": str(classification_subtype_id),
"text": classification_subtype_label,
"chunk_id": str(data_chunk.chunk_id),

View file

@ -56,7 +56,7 @@ class OntologyEngine:
for item in items:
flat_list.extend(await self.recursive_flatten(item, parent_id))
elif isinstance(items, dict):
model = NodeModel.parse_obj(items)
model = NodeModel.model_validate(items)
flat_list.append(await self.flatten_model(model, parent_id))
for child in model.children:
flat_list.extend(await self.recursive_flatten(child, model.node_id))
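The `parse_obj` → `model_validate` edits here (like the `update_forward_refs` → `model_rebuild` change earlier) track the pydantic v1 → v2 API renames that come with the `pydantic = "2.8.2"` pin. A tiny standalone illustration with a hypothetical model:
```python
from pydantic import BaseModel

class Keyword(BaseModel):      # hypothetical stand-in, not the project's model
    uuid: str
    text: str

payload = {"uuid": "123", "text": "NLP"}
kw = Keyword.model_validate(payload)   # pydantic v2
# kw = Keyword.parse_obj(payload)      # pydantic v1 spelling, deprecated in v2
print(kw.text)
```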

View file

@ -0,0 +1,31 @@
from typing import Any, Dict, List, Optional, Union
from pydantic import BaseModel, Field
class RelationshipModel(BaseModel):
type: str
source: str
target: str
class NodeModel(BaseModel):
node_id: str
name: str
default_relationship: Optional[RelationshipModel] = None
children: List[Union[Dict[str, Any], "NodeModel"]] = Field(default_factory=list)
NodeModel.model_rebuild()
class OntologyNode(BaseModel):
id: str = Field(..., description = "Unique identifier made from node name.")
name: str
description: str
class OntologyEdge(BaseModel):
id: str
source_id: str
target_id: str
relationship_type: str
class GraphOntology(BaseModel):
nodes: list[OntologyNode]
edges: list[OntologyEdge]

View file

@ -49,7 +49,7 @@ async def main():
search_results = await cognee.search(SearchType.SUMMARIES, query = random_node_name)
assert len(search_results) != 0, "Query related summaries don't exist."
print("\n\Extracted summaries are:\n")
print("\nExtracted summaries are:\n")
for result in search_results:
print(f"{result}\n")

View file

@ -53,7 +53,7 @@ async def main():
search_results = await cognee.search(SearchType.SUMMARIES, query = random_node_name)
assert len(search_results) != 0, "Query related summaries don't exist."
print("\n\Extracted summaries are:\n")
print("\nExtracted summaries are:\n")
for result in search_results:
print(f"{result}\n")

View file

@ -54,7 +54,7 @@ async def main():
search_results = await cognee.search(SearchType.SUMMARIES, query = random_node_name)
assert len(search_results) != 0, "Query related summaries don't exist."
print("\n\Extracted summaries are:\n")
print("\nExtracted summaries are:\n")
for result in search_results:
print(f"{result}\n")

View file

@ -52,7 +52,7 @@ async def main():
search_results = await cognee.search(SearchType.SUMMARIES, query = random_node_name)
assert len(search_results) != 0, "Query related summaries don't exist."
print("\n\Extracted summaries are:\n")
print("\nExtracted summaries are:\n")
for result in search_results:
print(f"{result}\n")

View file

@ -14,9 +14,11 @@ Check available configuration options:
from cognee.infrastructure.databases.vector import get_vectordb_config
from cognee.infrastructure.databases.graph.config import get_graph_config
from cognee.infrastructure.databases.relational import get_relational_config
from cognee.infrastructure.llm.config import get_llm_config
print(get_vectordb_config().to_dict())
print(get_graph_config().to_dict())
print(get_relational_config().to_dict())
print(get_llm_config().to_dict())
```
@ -29,8 +31,7 @@ GRAPH_DATABASE_PROVIDER = 'lancedb'
Otherwise, you can set the configuration yourself:
```python
cognee.config.llm_provider = 'ollama'
cognee.config.set_llm_provider('ollama')
```
## 🚀 Getting Started with Local Models
@ -52,15 +53,14 @@ LLM_PROVIDER = 'ollama'
Otherwise, you can set the configuration for the model:
```python
cognee.config.llm_provider = 'ollama'
cognee.config.set_llm_provider('ollama')
```
You can also set the HOST and model name:
```python
cognee.config.llm_endpoint = "http://localhost:11434/v1"
cognee.config.llm_model = "mistral:instruct"
cognee.config.set_llm_endpoint("http://localhost:11434/v1")
cognee.config.set_llm_model("mistral:instruct")
```
@ -73,7 +73,7 @@ LLM_PROVIDER = 'custom'
Otherwise, you can set the configuration for the model:
```python
cognee.config.llm_provider = 'custom'
cognee.config.set_llm_provider('custom')
```
You can also set the HOST and model name:

View file

@ -0,0 +1,39 @@
import cognee
import asyncio
from cognee.api.v1.search import SearchType
# Prerequisites:
# 1. Copy `.env.template` and rename it to `.env`.
# 2. Add your OpenAI API key to the `.env` file in the `LLM_API_KEY` field:
# LLM_API_KEY = "your_key_here"
# 3. (Optional) To minimize setup effort, set `VECTOR_DB_PROVIDER="lancedb"` in `.env`.
async def main():
# Create a clean slate for cognee -- reset data and system state
await cognee.prune.prune_data()
await cognee.prune.prune_system(metadata=True)
# cognee knowledge graph will be created based on this text
text = """
Natural language processing (NLP) is an interdisciplinary
subfield of computer science and information retrieval.
"""
# Add the text, and make it available for cognify
await cognee.add(text)
# Use LLMs and cognee to create knowledge graph
await cognee.cognify()
# Query cognee for insights on the added text
search_results = await cognee.search(
SearchType.INSIGHTS,
{'query': 'Tell me about NLP'}
)
# Display search results
for result_text in search_results:
print(result_text)
if __name__ == '__main__':
asyncio.run(main())

poetry.lock (generated)

File diff suppressed because it is too large.

View file

@ -19,60 +19,57 @@ classifiers = [
[tool.poetry.dependencies]
python = ">=3.9.0,<3.12"
openai = "1.27.0"
openai = "1.52.0"
pydantic = "2.8.2"
python-dotenv = "1.0.1"
fastapi = "^0.109.2"
uvicorn = "0.22.0"
requests = "2.32.3"
aiohttp = "3.10.10"
typing_extensions = "4.12.2"
dspy = "2.5.25"
nest_asyncio = "1.6.0"
numpy = "1.26.4"
datasets = "3.1.0"
falkordb = "1.0.9"
boto3 = "^1.26.125"
botocore="^1.35.54"
gunicorn = "^20.1.0"
sqlalchemy = "2.0.35"
instructor = "1.3.5"
instructor = "1.6.3"
networkx = "^3.2.1"
debugpy = "1.8.2"
pyarrow = "15.0.0"
pylint = "^3.0.3"
aiosqlite = "^0.20.0"
pandas = "2.0.3"
greenlet = "^3.0.3"
ruff = "^0.2.2"
filetype = "^1.2.0"
nltk = "^3.8.1"
dlt = {extras = ["sqlalchemy"], version = "^1.2.0"}
overrides = "^7.7.0"
aiofiles = "^23.2.1"
qdrant-client = "^1.9.0"
graphistry = "^0.33.5"
tenacity = "^8.2.3"
tenacity = "^9.0.0"
weaviate-client = "4.6.7"
scikit-learn = "^1.5.0"
fastembed = "0.2.7"
pypdf = "^4.1.0"
neo4j = "^5.20.0"
jinja2 = "^3.1.3"
matplotlib = "^3.8.3"
structlog = "^24.1.0"
tiktoken = "0.7.0"
langchain_text_splitters = "0.3.2"
langsmith = "0.1.139"
langdetect = "1.0.9"
posthog = "^3.5.0"
lancedb = "0.15.0"
litellm = "1.38.10"
litellm = "1.49.1"
groq = "0.8.0"
tantivy = "^0.22.0"
tokenizers ="0.15.2"
transformers ="4.39.0"
python-multipart = "^0.0.9"
langfuse = "^2.32.0"
protobuf = "<5.0.0"
pydantic-settings = "^2.2.1"
anthropic = "^0.26.1"
pdfplumber = "^0.11.1"
sentry-sdk = {extras = ["fastapi"], version = "^2.9.0"}
fastapi-users = { version = "*", extras = ["sqlalchemy"] }
fastapi-users = {version = "*", extras = ["sqlalchemy"]}
asyncpg = "^0.29.0"
alembic = "^1.13.3"
pgvector = "^0.3.5"
psycopg2 = {version = "^2.9.10", optional = true}
falkordb = "^1.0.9"
[tool.poetry.extras]
filesystem = ["s3fs", "botocore"]
@ -89,6 +86,11 @@ pytest-asyncio = "^0.21.1"
coverage = "^7.3.2"
mypy = "^1.7.1"
notebook = "^7.1.1"
deptry = "^0.20.0"
debugpy = "1.8.2"
pylint = "^3.0.3"
ruff = "^0.2.2"
tweepy = "4.14.0"
[tool.poetry.group.docs.dependencies]
mkdocs-material = "^9.5.42"