Merge branch 'main' into COG-533-pydantic-unit-tests

Leon Luithlen 2024-11-18 11:24:51 +01:00
commit fde56f0c3b
57 changed files with 877 additions and 166 deletions

BIN
.DS_Store vendored

Binary file not shown.

View file

@ -0,0 +1,63 @@
name: test | llama index notebook
on:
workflow_dispatch:
pull_request:
branches:
- main
types: [labeled, synchronize]
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
env:
RUNTIME__LOG_LEVEL: ERROR
jobs:
get_docs_changes:
name: docs changes
uses: ./.github/workflows/get_docs_changes.yml
run_notebook_test:
name: test
needs: get_docs_changes
if: needs.get_docs_changes.outputs.changes_outside_docs == 'true' && github.event.label.name == 'run-checks'
runs-on: ubuntu-latest
defaults:
run:
shell: bash
steps:
- name: Check out
uses: actions/checkout@master
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: '3.11.x'
- name: Install Poetry
uses: snok/install-poetry@v1.3.2
with:
virtualenvs-create: true
virtualenvs-in-project: true
installer-parallel: true
- name: Install dependencies
run: |
poetry install --no-interaction --all-extras --no-root
poetry add jupyter --no-interaction
- name: Execute Jupyter Notebook
env:
ENV: 'dev'
LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }}
GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }}
run: |
poetry run jupyter nbconvert \
--to notebook \
--execute notebooks/cognee_llama_index.ipynb \
--output executed_notebook.ipynb \
--ExecutePreprocessor.timeout=1200
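
The same check can be reproduced locally; a minimal sketch under the assumption that the project dependencies plus jupyter are installed (as in the workflow) and that ENV, LLM_API_KEY and the GRAPHISTRY_* variables are exported:

import nbformat
from nbconvert.preprocessors import ExecutePreprocessor

# Execute the notebook with the same 1200 s timeout as the CI step above.
nb = nbformat.read("notebooks/cognee_llama_index.ipynb", as_version=4)
ExecutePreprocessor(timeout=1200).preprocess(nb)
nbformat.write(nb, "executed_notebook.ipynb")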

View file

@ -0,0 +1,8 @@
import { fetch } from '@/utils';
export default function getHistory() {
return fetch(
'/v1/search',
)
.then((response) => response.json());
}

View file

@ -1,9 +1,12 @@
'use client';
import { v4 } from 'uuid';
import classNames from 'classnames';
import { useCallback, useState } from 'react';
import { useCallback, useEffect, useState } from 'react';
import { CTAButton, Stack, Text, DropdownSelect, TextArea, useBoolean } from 'ohmy-ui';
import { fetch } from '@/utils';
import styles from './SearchView.module.css';
import getHistory from '@/modules/chat/getHistory';
interface Message {
id: string;
@ -52,6 +55,14 @@ export default function SearchView() {
}, 300);
}, []);
useEffect(() => {
getHistory()
.then((history) => {
setMessages(history);
scrollToBottom();
});
}, [scrollToBottom]);
const handleSearchSubmit = useCallback((event: React.FormEvent<HTMLFormElement>) => {
event.preventDefault();
@ -78,7 +89,7 @@ export default function SearchView() {
'Content-Type': 'application/json',
},
body: JSON.stringify({
query: inputValue,
query: inputValue.trim(),
searchType: searchTypeValue,
}),
})

BIN
cognee/.DS_Store vendored

Binary file not shown.

View file

@ -2,7 +2,7 @@ from .api.v1.config.config import config
from .api.v1.add import add
from .api.v1.cognify import cognify
from .api.v1.datasets.datasets import datasets
from .api.v1.search import search, SearchType
from .api.v1.search import search, SearchType, get_search_history
from .api.v1.prune import prune
# Pipelines

View file

@ -21,4 +21,4 @@ async def add(data: Union[BinaryIO, list[BinaryIO], str, list[str]], dataset_nam
pipeline = run_tasks(tasks, data, "add_pipeline")
async for result in pipeline:
print(result)
print(result)

View file

@ -1 +1,2 @@
from .search_v2 import search, SearchType
from .get_search_history import get_search_history

View file

@ -0,0 +1,9 @@
from cognee.modules.search.operations import get_history
from cognee.modules.users.methods import get_default_user
from cognee.modules.users.models import User
async def get_search_history(user: User = None) -> list:
if not user:
user = await get_default_user()
return await get_history(user.id)
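
For context, a minimal usage sketch of the new top-level API, assuming a default user exists and some searches have already been logged:

import asyncio
import cognee

async def main():
    # Falls back to the default user when no User object is passed.
    history = await cognee.get_search_history()
    for item in history:
        print(item)

asyncio.run(main())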

View file

@ -1,8 +1,11 @@
from cognee.api.v1.search import SearchType
from fastapi.responses import JSONResponse
from cognee.modules.users.models import User
from uuid import UUID
from datetime import datetime
from fastapi import Depends, APIRouter
from cognee.api.DTO import InDTO
from fastapi.responses import JSONResponse
from cognee.api.v1.search import SearchType
from cognee.api.DTO import InDTO, OutDTO
from cognee.modules.users.models import User
from cognee.modules.search.operations import get_history
from cognee.modules.users.methods import get_authenticated_user
@ -13,6 +16,24 @@ class SearchPayloadDTO(InDTO):
def get_search_router() -> APIRouter:
router = APIRouter()
class SearchHistoryItem(OutDTO):
id: UUID
text: str
user: str
created_at: datetime
@router.get("/", response_model = list[SearchHistoryItem])
async def get_search_history(user: User = Depends(get_authenticated_user)):
try:
history = await get_history(user.id)
return history
except Exception as error:
return JSONResponse(
status_code = 500,
content = {"error": str(error)}
)
@router.post("/", response_model = list)
async def search(payload: SearchPayloadDTO, user: User = Depends(get_authenticated_user)):
""" This endpoint is responsible for searching for nodes in the graph."""
@ -28,4 +49,4 @@ def get_search_router() -> APIRouter:
content = {"error": str(error)}
)
return router
return router
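
A hedged sketch of calling the new history endpoint from a client; only the /v1/search route and the SearchHistoryItem fields (id, text, user, created_at) come from the code above, while the base URL and bearer auth are placeholders for whatever get_authenticated_user expects in a given deployment:

import httpx

response = httpx.get(
    "http://localhost:8000/v1/search",            # placeholder base URL
    headers={"Authorization": "Bearer <token>"},  # placeholder credentials
)
for item in response.json():
    print(item["created_at"], item["user"], item["text"])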

View file

@ -1,6 +1,9 @@
import json
from uuid import UUID
from enum import Enum
from typing import Callable, Dict
from cognee.modules.search.operations import log_query, log_result
from cognee.modules.storage.utils import JSONEncoder
from cognee.shared.utils import send_telemetry
from cognee.modules.users.models import User
from cognee.modules.users.methods import get_default_user
@ -14,15 +17,17 @@ class SearchType(Enum):
INSIGHTS = "INSIGHTS"
CHUNKS = "CHUNKS"
async def search(search_type: SearchType, query: str, user: User = None) -> list:
async def search(query_type: SearchType, query_text: str, user: User = None) -> list:
if user is None:
user = await get_default_user()
if user is None:
raise PermissionError("No user found in the system. Please create a user.")
query = await log_query(query_text, str(query_type), user.id)
own_document_ids = await get_document_ids_for_user(user.id)
search_results = await specific_search(search_type, query, user)
search_results = await specific_search(query_type, query_text, user)
filtered_search_results = []
@ -33,19 +38,21 @@ async def search(search_type: SearchType, query: str, user: User = None) -> list
if document_id is None or document_id in own_document_ids:
filtered_search_results.append(search_result)
await log_result(query.id, json.dumps(filtered_search_results, cls = JSONEncoder), user.id)
return filtered_search_results
async def specific_search(search_type: SearchType, query: str, user) -> list:
async def specific_search(query_type: SearchType, query: str, user) -> list:
search_tasks: Dict[SearchType, Callable] = {
SearchType.SUMMARIES: query_summaries,
SearchType.INSIGHTS: query_graph_connections,
SearchType.CHUNKS: query_chunks,
}
search_task = search_tasks.get(search_type)
search_task = search_tasks.get(query_type)
if search_task is None:
raise ValueError(f"Unsupported search type: {search_type}")
raise ValueError(f"Unsupported search type: {query_type}")
send_telemetry("cognee.search EXECUTION STARTED", user.id)
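
With the rename, callers pass query_type and query_text, and every call is now logged through log_query/log_result; a short sketch of the updated call pattern, mirroring the test changes further down (the query string is a placeholder):

import asyncio
import cognee
from cognee import SearchType

async def main():
    results = await cognee.search(SearchType.CHUNKS, query_text="example query")
    print(results)
    # Both the query and its result are recorded in the per-user history.
    history = await cognee.get_search_history()
    print(len(history))

asyncio.run(main())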

View file

@ -2,6 +2,3 @@ from .ModelBase import Base
from .config import get_relational_config
from .create_db_and_tables import create_db_and_tables
from .get_relational_engine import get_relational_engine
# Global data types
from .data_types.UUID import UUID

View file

@ -1,45 +0,0 @@
import uuid
from sqlalchemy.types import TypeDecorator, BINARY
from sqlalchemy.dialects.postgresql import UUID as psqlUUID
class UUID(TypeDecorator):
"""Platform-independent GUID type.
Uses Postgresql's UUID type, otherwise uses
BINARY(16), to store UUID.
"""
impl = BINARY
def load_dialect_impl(self, dialect):
if dialect.name == 'postgresql':
return dialect.type_descriptor(psqlUUID())
else:
return dialect.type_descriptor(BINARY(16))
def process_bind_param(self, value, dialect):
if value is None:
return value
else:
if not isinstance(value, uuid.UUID):
if isinstance(value, bytes):
value = uuid.UUID(bytes = value)
elif isinstance(value, int):
value = uuid.UUID(int = value)
elif isinstance(value, str):
value = uuid.UUID(value)
if dialect.name == 'postgresql':
return str(value)
else:
return value.bytes
def process_result_value(self, value, dialect):
if value is None:
return value
if dialect.name == 'postgresql':
if isinstance(value, uuid.UUID):
return value
return uuid.UUID(value)
else:
return uuid.UUID(bytes = value)
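
This custom GUID TypeDecorator is dropped because the pinned SQLAlchemy 2.0 release already ships a dialect-aware UUID type (native on PostgreSQL, emulated elsewhere). A minimal sketch of the replacement pattern used across the model files below; the Example table is hypothetical:

from uuid import uuid4
from sqlalchemy import Column, UUID
from sqlalchemy.orm import DeclarativeBase

class Base(DeclarativeBase):
    pass

class Example(Base):
    __tablename__ = "examples"
    # sqlalchemy.UUID replaces the hand-rolled TypeDecorator in every model.
    id = Column(UUID, primary_key = True, default = uuid4)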

View file

@ -1,4 +1,4 @@
from datetime import datetime
from datetime import datetime, timezone
from sqlalchemy.orm import Mapped, MappedColumn
from sqlalchemy import Column, DateTime, ForeignKey, Enum, JSON
from cognee.infrastructure.databases.relational import Base, UUID
@ -24,4 +24,4 @@ class Operation(Base):
data_id = Column(UUID, ForeignKey("data.id"))
meta_data: Mapped[dict] = MappedColumn(type_ = JSON)
created_at = Column(DateTime, default = datetime.utcnow)
created_at = Column(DateTime, default = lambda: datetime.now(timezone.utc))

View file

@ -2,8 +2,8 @@ from uuid import uuid4
from typing import List
from datetime import datetime, timezone
from sqlalchemy.orm import relationship, Mapped
from sqlalchemy import Column, String, DateTime
from cognee.infrastructure.databases.relational import Base, UUID
from sqlalchemy import Column, String, DateTime, UUID
from cognee.infrastructure.databases.relational import Base
from .DatasetData import DatasetData
class Data(Base):

View file

@ -2,8 +2,8 @@ from uuid import uuid4
from typing import List
from datetime import datetime, timezone
from sqlalchemy.orm import relationship, Mapped
from sqlalchemy import Column, Text, DateTime
from cognee.infrastructure.databases.relational import Base, UUID
from sqlalchemy import Column, Text, DateTime, UUID
from cognee.infrastructure.databases.relational import Base
from .DatasetData import DatasetData
class Dataset(Base):

View file

@ -1,6 +1,6 @@
from datetime import datetime, timezone
from sqlalchemy import Column, DateTime, ForeignKey
from cognee.infrastructure.databases.relational import Base, UUID
from sqlalchemy import Column, DateTime, ForeignKey, UUID
from cognee.infrastructure.databases.relational import Base
class DatasetData(Base):
__tablename__ = "dataset_data"

View file

@ -1,9 +1,10 @@
from uuid import uuid4
from datetime import datetime, timezone
from sqlalchemy import Column, DateTime, String, Text
from sqlalchemy import Column, DateTime, String, Text, UUID
from sqlalchemy.orm import relationship, Mapped
from cognee.infrastructure.databases.relational import Base, UUID
from cognee.infrastructure.databases.relational import Base
from .PipelineTask import PipelineTask
from .Task import Task
class Pipeline(Base):
__tablename__ = "pipelines"

View file

@ -1,8 +1,8 @@
import enum
from uuid import uuid4
from datetime import datetime, timezone
from sqlalchemy import Column, DateTime, JSON, Enum
from cognee.infrastructure.databases.relational import Base, UUID
from sqlalchemy import Column, DateTime, JSON, Enum, UUID
from cognee.infrastructure.databases.relational import Base
class PipelineRunStatus(enum.Enum):
DATASET_PROCESSING_STARTED = "DATASET_PROCESSING_STARTED"

View file

@ -1,6 +1,6 @@
from datetime import datetime, timezone
from sqlalchemy import Column, DateTime, ForeignKey
from cognee.infrastructure.databases.relational import Base, UUID
from sqlalchemy import Column, DateTime, ForeignKey, UUID
from cognee.infrastructure.databases.relational import Base
class PipelineTask(Base):
__tablename__ = "pipeline_task"

View file

@ -0,0 +1,16 @@
from uuid import uuid4
from datetime import datetime, timezone
from sqlalchemy import Column, DateTime, String, UUID
from cognee.infrastructure.databases.relational import Base
class Query(Base):
__tablename__ = "queries"
id = Column(UUID, primary_key = True, default = uuid4)
text = Column(String)
query_type = Column(String)
user_id = Column(UUID)
created_at = Column(DateTime(timezone = True), default = lambda: datetime.now(timezone.utc))
updated_at = Column(DateTime(timezone = True), onupdate = lambda: datetime.now(timezone.utc))

View file

@ -0,0 +1,16 @@
from datetime import datetime, timezone
from uuid import uuid4
from sqlalchemy import Column, DateTime, Text, UUID
from cognee.infrastructure.databases.relational import Base
class Result(Base):
__tablename__ = "results"
id = Column(UUID, primary_key = True, default = uuid4)
value = Column(Text)
query_id = Column(UUID)
user_id = Column(UUID, index = True)
created_at = Column(DateTime(timezone = True), default = lambda: datetime.now(timezone.utc))
updated_at = Column(DateTime(timezone = True), onupdate = lambda: datetime.now(timezone.utc))

View file

@ -0,0 +1,3 @@
from .log_query import log_query
from .log_result import log_result
from .get_history import get_history

View file

@ -0,0 +1,31 @@
from uuid import UUID
from sqlalchemy import literal, select
from cognee.infrastructure.databases.relational import get_relational_engine
from ..models.Query import Query
from ..models.Result import Result
async def get_history(user_id: UUID, limit: int = 10) -> list[Result]:
db_engine = get_relational_engine()
queries_query = select(
Query.id,
Query.text.label("text"),
Query.created_at,
literal("user").label("user")
) \
.filter(Query.user_id == user_id)
results_query = select(
Result.id,
Result.value.label("text"),
Result.created_at,
literal("system").label("user")
) \
.filter(Result.user_id == user_id)
history_query = queries_query.union(results_query).order_by("created_at").limit(limit)
async with db_engine.get_async_session() as session:
history = (await session.execute(history_query)).all()
return history
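
A hedged usage sketch (the user id is a placeholder and a configured relational engine is assumed); each returned row carries id, text, created_at and a literal "user" column that is "user" for queries and "system" for results:

import asyncio
from uuid import UUID
from cognee.modules.search.operations import get_history

async def main():
    user_id = UUID("00000000-0000-0000-0000-000000000000")  # placeholder id
    for row in await get_history(user_id, limit = 20):
        print(row.created_at, row.user, row.text)

asyncio.run(main())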

View file

@ -0,0 +1,17 @@
from uuid import UUID
from sqlalchemy import select
from cognee.infrastructure.databases.relational import get_relational_engine
from ..models.Query import Query
async def get_queries(user_id: UUID, limit: int) -> list[Query]:
db_engine = get_relational_engine()
async with db_engine.get_async_session() as session:
queries = (await session.scalars(
select(Query)
.filter(Query.user_id == user_id)
.order_by(Query.created_at.desc())
.limit(limit)
)).all()
return queries

View file

@ -0,0 +1,17 @@
from uuid import UUID
from sqlalchemy import select
from cognee.infrastructure.databases.relational import get_relational_engine
from ..models.Result import Result
async def get_results(user_id: UUID, limit: int = 10) -> list[Result]:
db_engine = get_relational_engine()
async with db_engine.get_async_session() as session:
results = (await session.scalars(
select(Result)
.filter(Result.user_id == user_id)
.order_by(Result.created_at.desc())
.limit(limit)
)).all()
return results

View file

@ -0,0 +1,19 @@
from uuid import UUID
from cognee.infrastructure.databases.relational import get_relational_engine
from ..models.Query import Query
async def log_query(query_text: str, query_type: str, user_id: UUID) -> Query:
db_engine = get_relational_engine()
async with db_engine.get_async_session() as session:
query = Query(
text = query_text,
query_type = query_type,
user_id = user_id,
)
session.add(query)
await session.commit()
return query

View file

@ -0,0 +1,15 @@
from uuid import UUID
from cognee.infrastructure.databases.relational import get_relational_engine
from ..models.Result import Result
async def log_result(query_id: UUID, result: str, user_id: UUID):
db_engine = get_relational_engine()
async with db_engine.get_async_session() as session:
session.add(Result(
value = result,
query_id = query_id,
user_id = user_id,
))
await session.commit()

View file

@ -1,8 +1,8 @@
from uuid import uuid4
from datetime import datetime, timezone
from sqlalchemy.orm import relationship, Mapped
from sqlalchemy import Column, ForeignKey, DateTime
from cognee.infrastructure.databases.relational import Base, UUID
from sqlalchemy import Column, ForeignKey, DateTime, UUID
from cognee.infrastructure.databases.relational import Base
from .ACLResources import ACLResources
class ACL(Base):

View file

@ -1,6 +1,6 @@
from datetime import datetime, timezone
from sqlalchemy import Column, ForeignKey, DateTime
from cognee.infrastructure.databases.relational import Base, UUID
from sqlalchemy import Column, ForeignKey, DateTime, UUID
from cognee.infrastructure.databases.relational import Base
class ACLResources(Base):
__tablename__ = "acl_resources"

View file

@ -1,6 +1,5 @@
from sqlalchemy.orm import relationship, Mapped
from sqlalchemy import Column, String, ForeignKey
from cognee.infrastructure.databases.relational import UUID
from sqlalchemy import Column, String, ForeignKey, UUID
from .Principal import Principal
from .UserGroup import UserGroup

View file

@ -1,8 +1,8 @@
from uuid import uuid4
from datetime import datetime, timezone
# from sqlalchemy.orm import relationship
from sqlalchemy import Column, DateTime, String
from cognee.infrastructure.databases.relational import Base, UUID
from sqlalchemy import Column, DateTime, String, UUID
from cognee.infrastructure.databases.relational import Base
class Permission(Base):
__tablename__ = "permissions"

View file

@ -1,7 +1,7 @@
from uuid import uuid4
from datetime import datetime, timezone
from sqlalchemy import Column, String, DateTime
from cognee.infrastructure.databases.relational import Base, UUID
from sqlalchemy import Column, String, DateTime, UUID
from cognee.infrastructure.databases.relational import Base
class Principal(Base):
__tablename__ = "principals"

View file

@ -1,8 +1,8 @@
from uuid import uuid4
from datetime import datetime, timezone
from sqlalchemy.orm import relationship
from sqlalchemy import Column, DateTime
from cognee.infrastructure.databases.relational import Base, UUID
from sqlalchemy import Column, DateTime, UUID
from cognee.infrastructure.databases.relational import Base
from .ACLResources import ACLResources
class Resource(Base):

View file

@ -1,10 +1,10 @@
from uuid import UUID as uuid_UUID
from sqlalchemy import ForeignKey, Column
from sqlalchemy import ForeignKey, Column, UUID
from sqlalchemy.orm import relationship, Mapped
from fastapi_users.db import SQLAlchemyBaseUserTableUUID
from cognee.infrastructure.databases.relational import UUID
from .Principal import Principal
from .UserGroup import UserGroup
from .Group import Group
class User(SQLAlchemyBaseUserTableUUID, Principal):
__tablename__ = "users"
@ -25,7 +25,6 @@ class User(SQLAlchemyBaseUserTableUUID, Principal):
from fastapi_users import schemas
class UserRead(schemas.BaseUser[uuid_UUID]):
# groups: list[uuid_UUID] # Add groups attribute
pass
class UserCreate(schemas.BaseUserCreate):

View file

@ -1,6 +1,6 @@
from datetime import datetime, timezone
from sqlalchemy import Column, ForeignKey, DateTime
from cognee.infrastructure.databases.relational import Base, UUID
from sqlalchemy import Column, ForeignKey, DateTime, UUID
from cognee.infrastructure.databases.relational import Base
class UserGroup(Base):
__tablename__ = "user_groups"

View file

@ -33,4 +33,4 @@ async def check_permission_on_documents(user: User, permission_type: str, docume
has_permissions = all(document_id in resource_ids for document_id in document_ids)
if not has_permissions:
raise PermissionDeniedException(f"User {user.username} does not have {permission_type} permission on documents")
raise PermissionDeniedException(f"User {user.email} does not have {permission_type} permission on documents")

View file

@ -28,7 +28,7 @@ class Class(DataPoint):
description: str
constructor_parameters: List[Variable]
extended_from_class: Optional["Class"] = None
has_methods: list["Function"]
has_methods: List["Function"]
_metadata = {
"index_fields": ["name"]
@ -89,7 +89,8 @@ class SourceCodeGraph(DataPoint):
Operator,
Expression,
]]
Class.model_rebuild()
ClassInstance.model_rebuild()
Expression.model_rebuild()
FunctionCall.model_rebuild()
SourceCodeGraph.model_rebuild()

View file

@ -1,5 +1,6 @@
""" This module contains utility functions for the cognee. """
import os
import requests
from datetime import datetime, timezone
import graphistry
import networkx as nx
@ -8,7 +9,6 @@ import pandas as pd
import matplotlib.pyplot as plt
import tiktoken
import nltk
from posthog import Posthog
from cognee.base_config import get_base_config
from cognee.infrastructure.databases.graph import get_graph_engine
@ -16,6 +16,9 @@ from cognee.infrastructure.databases.graph import get_graph_engine
from uuid import uuid4
import pathlib
# Analytics Proxy Url, currently hosted by Vercel
vercel_url = "https://proxyanalytics.vercel.app"
def get_anonymous_id():
"""Creates or reads a anonymous user id"""
home_dir = str(pathlib.Path(pathlib.Path(__file__).parent.parent.parent.resolve()))
@ -40,25 +43,24 @@ def send_telemetry(event_name: str, user_id, additional_properties: dict = {}):
if env in ["test", "dev"]:
return
posthog = Posthog(
project_api_key = "phc_UB1YVere1KtJg1MFxAo6ABfpkwN3OxCvGNDkMTjvH0",
host = "https://eu.i.posthog.com"
)
current_time = datetime.now(timezone.utc)
properties = {
"time": current_time.strftime("%m/%d/%Y"),
"user_id": user_id,
**additional_properties,
payload = {
"anonymous_id": str(get_anonymous_id()),
"event_name": event_name,
"user_properties": {
"user_id": str(user_id),
},
"properties": {
"time": current_time.strftime("%m/%d/%Y"),
"user_id": str(user_id),
**additional_properties
},
}
# Needed to forward properties to PostHog along with id
posthog.identify(get_anonymous_id(), properties)
response = requests.post(vercel_url, json=payload)
try:
posthog.capture(get_anonymous_id(), event_name, properties)
except Exception as e:
print("ERROR sending telemetric data to Posthog. See exception: %s", e)
if response.status_code != 200:
print(f"Error sending telemetry through proxy: {response.status_code}")
def num_tokens_from_string(string: str, encoding_name: str) -> int:
"""Returns the number of tokens in a text string."""

View file

@ -1,2 +1,4 @@
from .ingest_data import ingest_data
from .save_data_to_storage import save_data_to_storage
from .save_data_item_to_storage import save_data_item_to_storage
from .save_data_item_with_metadata_to_storage import save_data_item_with_metadata_to_storage

View file

@ -3,7 +3,7 @@ import cognee.modules.ingestion as ingestion
from cognee.shared.utils import send_telemetry
from cognee.modules.users.models import User
from cognee.infrastructure.databases.relational import get_relational_config, get_relational_engine
from cognee.infrastructure.databases.relational import get_relational_engine
from cognee.modules.data.methods import create_dataset
from cognee.modules.users.permissions.methods import give_permission_on_document
from .get_dlt_destination import get_dlt_destination

View file

@ -0,0 +1,92 @@
import dlt
import cognee.modules.ingestion as ingestion
from typing import Any
from cognee.shared.utils import send_telemetry
from cognee.modules.users.models import User
from cognee.infrastructure.databases.relational import get_relational_engine
from cognee.modules.data.methods import create_dataset
from cognee.modules.users.permissions.methods import give_permission_on_document
from .get_dlt_destination import get_dlt_destination
from .save_data_item_with_metadata_to_storage import save_data_item_with_metadata_to_storage
async def ingest_data_with_metadata(data: Any, dataset_name: str, user: User):
destination = get_dlt_destination()
pipeline = dlt.pipeline(
pipeline_name = "file_load_from_filesystem",
destination = destination,
)
@dlt.resource(standalone = True, merge_key = "id")
async def data_resources(data: Any, user: User):
if not isinstance(data, list):
# Convert data to a list as we work with lists further down.
data = [data]
# Process data
for data_item in data:
file_path = save_data_item_with_metadata_to_storage(data_item, dataset_name)
# Ingest data and add metadata
with open(file_path.replace("file://", ""), mode = "rb") as file:
classified_data = ingestion.classify(file)
data_id = ingestion.identify(classified_data)
file_metadata = classified_data.get_metadata()
from sqlalchemy import select
from cognee.modules.data.models import Data
db_engine = get_relational_engine()
async with db_engine.get_async_session() as session:
dataset = await create_dataset(dataset_name, user.id, session)
data_point = (await session.execute(
select(Data).filter(Data.id == data_id)
)).scalar_one_or_none()
if data_point is not None:
data_point.name = file_metadata["name"]
data_point.raw_data_location = file_metadata["file_path"]
data_point.extension = file_metadata["extension"]
data_point.mime_type = file_metadata["mime_type"]
await session.merge(data_point)
await session.commit()
else:
data_point = Data(
id = data_id,
name = file_metadata["name"],
raw_data_location = file_metadata["file_path"],
extension = file_metadata["extension"],
mime_type = file_metadata["mime_type"],
)
dataset.data.append(data_point)
await session.commit()
yield {
"id": data_id,
"name": file_metadata["name"],
"file_path": file_metadata["file_path"],
"extension": file_metadata["extension"],
"mime_type": file_metadata["mime_type"],
}
await give_permission_on_document(user, data_id, "read")
await give_permission_on_document(user, data_id, "write")
send_telemetry("cognee.add EXECUTION STARTED", user_id = user.id)
run_info = pipeline.run(
data_resources(data, user),
table_name = "file_metadata",
dataset_name = dataset_name,
write_disposition = "merge",
)
send_telemetry("cognee.add EXECUTION COMPLETED", user_id = user.id)
return run_info
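
A minimal usage sketch mirroring the llama-index notebook added further down; it assumes the llama-index extra is installed and the relational tables have been created:

import asyncio
from llama_index.core import Document
from cognee.modules.users.methods import get_default_user
from cognee.tasks.ingestion.ingest_data_with_metadata import ingest_data_with_metadata

async def main():
    documents = [Document(text="Example title: example article body.")]  # placeholder content
    user = await get_default_user()
    await ingest_data_with_metadata(documents, "main_dataset", user)

asyncio.run(main())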

View file

@ -0,0 +1,20 @@
from typing import Union, BinaryIO
from cognee.modules.ingestion import save_data_to_file
def save_data_item_to_storage(data_item: Union[BinaryIO, str], dataset_name: str) -> str:
# data is a file object coming from upload.
if hasattr(data_item, "file"):
file_path = save_data_to_file(data_item.file, dataset_name, filename=data_item.filename)
elif isinstance(data_item, str):
# data is a file path
if data_item.startswith("file://") or data_item.startswith("/"):
file_path = data_item.replace("file://", "")
# data is text
else:
file_path = save_data_to_file(data_item, dataset_name)
else:
raise ValueError(f"Data type not supported: {type(data_item)}")
return file_path

View file

@ -0,0 +1,28 @@
from typing import Union, BinaryIO, Any
from cognee.modules.ingestion import save_data_to_file
def save_data_item_with_metadata_to_storage(data_item: Union[BinaryIO, str, Any], dataset_name: str) -> str:
# Dynamic import is used because the llama_index module is optional.
# For the same reason Any is accepted as a data item
from llama_index.core import Document
from .transform_data import get_data_from_llama_index
# Check if data is of type Document or any of it's subclasses
if isinstance(data_item, Document):
file_path = get_data_from_llama_index(data_item, dataset_name)
# data is a file object coming from upload.
elif hasattr(data_item, "file"):
file_path = save_data_to_file(data_item.file, dataset_name, filename=data_item.filename)
elif isinstance(data_item, str):
# data is a file path
if data_item.startswith("file://") or data_item.startswith("/"):
file_path = data_item.replace("file://", "")
# data is text
else:
file_path = save_data_to_file(data_item, dataset_name)
else:
raise ValueError(f"Data type not supported: {type(data_item)}")
return file_path

View file

@ -1,5 +1,5 @@
from typing import Union, BinaryIO
from cognee.modules.ingestion import save_data_to_file
from cognee.tasks.ingestion.save_data_item_to_storage import save_data_item_to_storage
def save_data_to_storage(data: Union[BinaryIO, str], dataset_name) -> list[str]:
if not isinstance(data, list):
@ -9,19 +9,7 @@ def save_data_to_storage(data: Union[BinaryIO, str], dataset_name) -> list[str]:
file_paths = []
for data_item in data:
# data is a file object coming from upload.
if hasattr(data_item, "file"):
file_path = save_data_to_file(data_item.file, dataset_name, filename = data_item.filename)
file_paths.append(file_path)
if isinstance(data_item, str):
# data is a file path
if data_item.startswith("file://") or data_item.startswith("/"):
file_paths.append(data_item.replace("file://", ""))
# data is text
else:
file_path = save_data_to_file(data_item, dataset_name)
file_paths.append(file_path)
file_path = save_data_item_to_storage(data_item, dataset_name)
file_paths.append(file_path)
return file_paths

View file

@ -0,0 +1,18 @@
from llama_index.core import Document
from llama_index.core.schema import ImageDocument
from cognee.modules.ingestion import save_data_to_file
from typing import Union
def get_data_from_llama_index(data_point: Union[Document, ImageDocument], dataset_name: str) -> str:
# Specific type checking is used to ensure it's not a child class from Document
if type(data_point) == Document:
file_path = data_point.metadata.get("file_path")
if file_path is None:
file_path = save_data_to_file(data_point.text, dataset_name)
return file_path
return file_path
elif type(data_point) == ImageDocument:
if data_point.image_path is None:
file_path = save_data_to_file(data_point.text, dataset_name)
return file_path
return data_point.image_path

View file

@ -35,24 +35,27 @@ async def main():
random_node = (await vector_engine.search("Entity_name", "AI"))[0]
random_node_name = random_node.payload["text"]
search_results = await cognee.search(SearchType.INSIGHTS, query = random_node_name)
search_results = await cognee.search(SearchType.INSIGHTS, query_text = random_node_name)
assert len(search_results) != 0, "The search results list is empty."
print("\n\nExtracted sentences are:\n")
for result in search_results:
print(f"{result}\n")
search_results = await cognee.search(SearchType.CHUNKS, query = random_node_name)
search_results = await cognee.search(SearchType.CHUNKS, query_text = random_node_name)
assert len(search_results) != 0, "The search results list is empty."
print("\n\nExtracted chunks are:\n")
for result in search_results:
print(f"{result}\n")
search_results = await cognee.search(SearchType.SUMMARIES, query = random_node_name)
search_results = await cognee.search(SearchType.SUMMARIES, query_text = random_node_name)
assert len(search_results) != 0, "Query related summaries don't exist."
print("\nExtracted summaries are:\n")
for result in search_results:
print(f"{result}\n")
history = await cognee.get_search_history()
assert len(history) == 6, "Search history is not correct."
if __name__ == "__main__":
import asyncio

View file

@ -39,24 +39,27 @@ async def main():
random_node = (await vector_engine.search("Entity_name", "Quantum computer"))[0]
random_node_name = random_node.payload["text"]
search_results = await cognee.search(SearchType.INSIGHTS, query = random_node_name)
search_results = await cognee.search(SearchType.INSIGHTS, query_text = random_node_name)
assert len(search_results) != 0, "The search results list is empty."
print("\n\nExtracted sentences are:\n")
for result in search_results:
print(f"{result}\n")
search_results = await cognee.search(SearchType.CHUNKS, query = random_node_name)
search_results = await cognee.search(SearchType.CHUNKS, query_text = random_node_name)
assert len(search_results) != 0, "The search results list is empty."
print("\n\nExtracted chunks are:\n")
for result in search_results:
print(f"{result}\n")
search_results = await cognee.search(SearchType.SUMMARIES, query = random_node_name)
search_results = await cognee.search(SearchType.SUMMARIES, query_text = random_node_name)
assert len(search_results) != 0, "Query related summaries don't exist."
print("\nExtracted summaries are:\n")
for result in search_results:
print(f"{result}\n")
history = await cognee.get_search_history()
assert len(history) == 6, "Search history is not correct."
if __name__ == "__main__":
import asyncio

View file

@ -68,24 +68,27 @@ async def main():
random_node = (await vector_engine.search("Entity_name", "Quantum computer"))[0]
random_node_name = random_node.payload["text"]
search_results = await cognee.search(SearchType.INSIGHTS, query=random_node_name)
search_results = await cognee.search(SearchType.INSIGHTS, query_text = random_node_name)
assert len(search_results) != 0, "The search results list is empty."
print("\n\nExtracted sentences are:\n")
for result in search_results:
print(f"{result}\n")
search_results = await cognee.search(SearchType.CHUNKS, query=random_node_name)
search_results = await cognee.search(SearchType.CHUNKS, query_text = random_node_name)
assert len(search_results) != 0, "The search results list is empty."
print("\n\nExtracted chunks are:\n")
for result in search_results:
print(f"{result}\n")
search_results = await cognee.search(SearchType.SUMMARIES, query=random_node_name)
search_results = await cognee.search(SearchType.SUMMARIES, query_text = random_node_name)
assert len(search_results) != 0, "Query related summaries don't exist."
print("\n\nExtracted summaries are:\n")
for result in search_results:
print(f"{result}\n")
history = await cognee.get_search_history()
assert len(history) == 6, "Search history is not correct."
if __name__ == "__main__":
import asyncio

View file

@ -40,24 +40,27 @@ async def main():
random_node = (await vector_engine.search("Entity_name", "Quantum computer"))[0]
random_node_name = random_node.payload["text"]
search_results = await cognee.search(SearchType.INSIGHTS, query = random_node_name)
search_results = await cognee.search(SearchType.INSIGHTS, query_text = random_node_name)
assert len(search_results) != 0, "The search results list is empty."
print("\n\nExtracted sentences are:\n")
for result in search_results:
print(f"{result}\n")
search_results = await cognee.search(SearchType.CHUNKS, query = random_node_name)
search_results = await cognee.search(SearchType.CHUNKS, query_text = random_node_name)
assert len(search_results) != 0, "The search results list is empty."
print("\n\nExtracted chunks are:\n")
for result in search_results:
print(f"{result}\n")
search_results = await cognee.search(SearchType.SUMMARIES, query = random_node_name)
search_results = await cognee.search(SearchType.SUMMARIES, query_text = random_node_name)
assert len(search_results) != 0, "Query related summaries don't exist."
print("\nExtracted summaries are:\n")
for result in search_results:
print(f"{result}\n")
history = await cognee.get_search_history()
assert len(history) == 6, "Search history is not correct."
if __name__ == "__main__":
import asyncio

View file

@ -38,24 +38,27 @@ async def main():
random_node = (await vector_engine.search("Entity_name", "Quantum computer"))[0]
random_node_name = random_node.payload["text"]
search_results = await cognee.search(SearchType.INSIGHTS, query = random_node_name)
search_results = await cognee.search(SearchType.INSIGHTS, query_text = random_node_name)
assert len(search_results) != 0, "The search results list is empty."
print("\n\nExtracted sentences are:\n")
for result in search_results:
print(f"{result}\n")
search_results = await cognee.search(SearchType.CHUNKS, query = random_node_name)
search_results = await cognee.search(SearchType.CHUNKS, query_text = random_node_name)
assert len(search_results) != 0, "The search results list is empty."
print("\n\nExtracted chunks are:\n")
for result in search_results:
print(f"{result}\n")
search_results = await cognee.search(SearchType.SUMMARIES, query = random_node_name)
search_results = await cognee.search(SearchType.SUMMARIES, query_text = random_node_name)
assert len(search_results) != 0, "Query related summaries don't exist."
print("\nExtracted summaries are:\n")
for result in search_results:
print(f"{result}\n")
history = await cognee.get_search_history()
assert len(history) == 6, "Search history is not correct."
if __name__ == "__main__":
import asyncio

View file

@ -27,8 +27,7 @@ async def main():
# Query cognee for insights on the added text
search_results = await cognee.search(
SearchType.INSIGHTS,
{'query': 'Tell me about NLP'}
SearchType.INSIGHTS, query='Tell me about NLP'
)
# Display search results

View file

@ -791,7 +791,7 @@
"node = (await vector_engine.search(\"Entity_name\", \"sarah.nguyen@example.com\"))[0]\n",
"node_name = node.payload[\"text\"]\n",
"\n",
"search_results = await cognee.search(SearchType.SUMMARIES, query = node_name)\n",
"search_results = await cognee.search(SearchType.SUMMARIES, query_text = node_name)\n",
"print(\"\\n\\Extracted summaries are:\\n\")\n",
"for result in search_results:\n",
" print(f\"{result}\\n\")"
@ -812,7 +812,7 @@
"metadata": {},
"outputs": [],
"source": [
"search_results = await cognee.search(SearchType.CHUNKS, query = node_name)\n",
"search_results = await cognee.search(SearchType.CHUNKS, query_text = node_name)\n",
"print(\"\\n\\nExtracted chunks are:\\n\")\n",
"for result in search_results:\n",
" print(f\"{result}\\n\")"
@ -833,7 +833,7 @@
"metadata": {},
"outputs": [],
"source": [
"search_results = await cognee.search(SearchType.INSIGHTS, query = node_name)\n",
"search_results = await cognee.search(SearchType.INSIGHTS, query_text = node_name)\n",
"print(\"\\n\\nExtracted sentences are:\\n\")\n",
"for result in search_results:\n",
" print(f\"{result}\\n\")"

View file

@ -0,0 +1,229 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Cognee GraphRAG with LlamaIndex Documents"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%pip install llama-index-core"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load Data\n",
"\n",
"We will use a sample news article dataset retrieved from Diffbot, which Tomaz has conveniently made available on GitHub for easy access.\n",
"\n",
"The dataset contains 2,500 samples; for ease of experimentation, we will use 5 of these samples, which include the `title` and `text` of news articles."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from llama_index.core import Document\n",
"\n",
"news = pd.read_csv(\n",
" \"https://raw.githubusercontent.com/tomasonjo/blog-datasets/main/news_articles.csv\"\n",
")[:5]\n",
"\n",
"news.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prepare documents as required by LlamaIndex"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"documents = [\n",
" Document(text=f\"{row['title']}: {row['text']}\")\n",
" for i, row in news.iterrows()\n",
"]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Set environment variables"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"# Setting environment variables\n",
"if \"GRAPHISTRY_USERNAME\" not in os.environ: \n",
" os.environ[\"GRAPHISTRY_USERNAME\"] = \"\"\n",
"\n",
"if \"GRAPHISTRY_PASSWORD\" not in os.environ: \n",
" os.environ[\"GRAPHISTRY_PASSWORD\"] = \"\"\n",
"\n",
"if \"LLM_API_KEY\" not in os.environ:\n",
" os.environ[\"LLM_API_KEY\"] = \"\"\n",
"\n",
"# \"neo4j\" or \"networkx\"\n",
"os.environ[\"GRAPH_DATABASE_PROVIDER\"]=\"networkx\" \n",
"# Not needed if using networkx\n",
"#GRAPH_DATABASE_URL=\"\"\n",
"#GRAPH_DATABASE_USERNAME=\"\"\n",
"#GRAPH_DATABASE_PASSWORD=\"\"\n",
"\n",
"# \"qdrant\", \"weaviate\" or \"lancedb\"\n",
"os.environ[\"VECTOR_DB_PROVIDER\"]=\"lancedb\" \n",
"# Not needed if using \"lancedb\"\n",
"# os.environ[\"VECTOR_DB_URL\"]=\"\"\n",
"# os.environ[\"VECTOR_DB_KEY\"]=\"\"\n",
"\n",
"# Database provider\n",
"os.environ[\"DB_PROVIDER\"]=\"sqlite\" # or \"postgres\"\n",
"\n",
"# Database name\n",
"os.environ[\"DB_NAME\"]=\"cognee_db\"\n",
"\n",
"# Postgres specific parameters (Only if Postgres is run)\n",
"# os.environ[\"DB_HOST\"]=\"127.0.0.1\"\n",
"# os.environ[\"DB_PORT\"]=\"5432\"\n",
"# os.environ[\"DB_USERNAME\"]=\"cognee\"\n",
"# os.environ[\"DB_PASSWORD\"]=\"cognee\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Run Cognee with LlamaIndex Documents"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from typing import Union, BinaryIO\n",
"\n",
"from cognee.infrastructure.databases.vector.pgvector import create_db_and_tables as create_pgvector_db_and_tables\n",
"from cognee.infrastructure.databases.relational import create_db_and_tables as create_relational_db_and_tables\n",
"from cognee.infrastructure.databases.graph import get_graph_engine\n",
"from cognee.shared.utils import render_graph\n",
"from cognee.modules.users.models import User\n",
"from cognee.modules.users.methods import get_default_user\n",
"from cognee.tasks.ingestion.ingest_data_with_metadata import ingest_data_with_metadata\n",
"import cognee\n",
"\n",
"# Create a clean slate for cognee -- reset data and system state\n",
"await cognee.prune.prune_data()\n",
"await cognee.prune.prune_system(metadata=True)\n",
"\n",
"# Add the LlamaIndex documents, and make it available for cognify\n",
"async def add(data: Union[BinaryIO, list[BinaryIO], str, list[str]], dataset_name: str = \"main_dataset\", user: User = None):\n",
" await create_relational_db_and_tables()\n",
" await create_pgvector_db_and_tables()\n",
"\n",
" if user is None:\n",
" user = await get_default_user()\n",
"\n",
" await ingest_data_with_metadata(data, dataset_name, user)\n",
"\n",
"await add(documents)\n",
"\n",
"# Use LLMs and cognee to create knowledge graph\n",
"await cognee.cognify()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Query Cognee for summaries related to data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from cognee import SearchType\n",
"\n",
"# Query cognee for summaries\n",
"search_results = await cognee.search(\n",
" SearchType.SUMMARIES, query=\"What are the main news discussed in the document?\"\n",
")\n",
"# Display search results\n",
"print(\"\\n Summary of main news discussed:\\n\")\n",
"print(search_results[0][\"text\"])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Render Knowledge Graph generated from provided data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import graphistry\n",
"\n",
"# Get graph\n",
"graphistry.login(username=os.getenv(\"GRAPHISTRY_USERNAME\"), password=os.getenv(\"GRAPHISTRY_PASSWORD\"))\n",
"graph_engine = await get_graph_engine()\n",
"\n",
"graph_url = await render_graph(graph_engine.graph)\n",
"print(graph_url)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

poetry.lock generated (138 lines changed)
View file

@ -1125,6 +1125,21 @@ files = [
docs = ["ipython", "matplotlib", "numpydoc", "sphinx"]
tests = ["pytest", "pytest-cov", "pytest-xdist"]
[[package]]
name = "dataclasses-json"
version = "0.6.7"
description = "Easily serialize dataclasses to and from JSON."
optional = true
python-versions = "<4.0,>=3.7"
files = [
{file = "dataclasses_json-0.6.7-py3-none-any.whl", hash = "sha256:0dbf33f26c8d5305befd61b39d2b3414e8a407bedc2834dea9b8d642666fb40a"},
{file = "dataclasses_json-0.6.7.tar.gz", hash = "sha256:b6b3e528266ea45b9535223bc53ca645f5208833c29229e847b3f26a1cc55fc0"},
]
[package.dependencies]
marshmallow = ">=3.18.0,<4.0.0"
typing-inspect = ">=0.4.0,<1"
[[package]]
name = "datasets"
version = "3.1.0"
@ -1220,6 +1235,23 @@ files = [
{file = "defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69"},
]
[[package]]
name = "deprecated"
version = "1.2.14"
description = "Python @deprecated decorator to deprecate old python classes, functions or methods."
optional = true
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
files = [
{file = "Deprecated-1.2.14-py2.py3-none-any.whl", hash = "sha256:6fac8b097794a90302bdbb17b9b815e732d3c4720583ff1b198499d78470466c"},
{file = "Deprecated-1.2.14.tar.gz", hash = "sha256:e5323eb936458dccc2582dc6f9c322c852a775a27065ff2b0c4970b9d53d01b3"},
]
[package.dependencies]
wrapt = ">=1.10,<2"
[package.extras]
dev = ["PyTest", "PyTest-Cov", "bump2version (<1)", "sphinx (<2)", "tox"]
[[package]]
name = "deprecation"
version = "2.1.0"
@ -1275,6 +1307,17 @@ files = [
graph = ["objgraph (>=1.7.2)"]
profile = ["gprof2dot (>=2022.7.29)"]
[[package]]
name = "dirtyjson"
version = "1.0.8"
description = "JSON decoder for Python that can extract data from the muck"
optional = true
python-versions = "*"
files = [
{file = "dirtyjson-1.0.8-py3-none-any.whl", hash = "sha256:125e27248435a58acace26d5c2c4c11a1c0de0a9c5124c5a94ba78e517d74f53"},
{file = "dirtyjson-1.0.8.tar.gz", hash = "sha256:90ca4a18f3ff30ce849d100dcf4a003953c79d3a2348ef056f1d9c22231a25fd"},
]
[[package]]
name = "distro"
version = "1.9.0"
@ -2396,37 +2439,35 @@ files = [
[[package]]
name = "instructor"
version = "1.6.3"
version = "1.5.2"
description = "structured outputs for llm"
optional = false
python-versions = "<4.0,>=3.9"
files = [
{file = "instructor-1.6.3-py3-none-any.whl", hash = "sha256:a8f973fea621c0188009b65a3429a526c24aeb249fc24100b605ea496e92d622"},
{file = "instructor-1.6.3.tar.gz", hash = "sha256:399cd90e30b5bc7cbd47acd7399c9c4e84926a96c20c8b5d00c5a04b41ed41ab"},
{file = "instructor-1.5.2-py3-none-any.whl", hash = "sha256:da25abbf1ab792fb094992f1d9ce593e26fe458cb1f9a8e7ebf9d68f3f2c757a"},
{file = "instructor-1.5.2.tar.gz", hash = "sha256:fdd5ccbca21b4c558a24e9ba12c84afd907e65153a39d035f47c25800011a977"},
]
[package.dependencies]
aiohttp = ">=3.9.1,<4.0.0"
docstring-parser = ">=0.16,<0.17"
jinja2 = ">=3.1.4,<4.0.0"
jiter = ">=0.5.0,<0.6.0"
openai = ">=1.52.0,<2.0.0"
openai = ">=1.45.0,<2.0.0"
pydantic = ">=2.8.0,<3.0.0"
pydantic-core = ">=2.18.0,<3.0.0"
rich = ">=13.7.0,<14.0.0"
tenacity = ">=9.0.0,<10.0.0"
tenacity = ">=8.4.1,<9.0.0"
typer = ">=0.9.0,<1.0.0"
[package.extras]
anthropic = ["anthropic (>=0.36.2,<0.37.0)", "xmltodict (>=0.13.0,<0.14.0)"]
anthropic = ["anthropic (>=0.34.0,<0.35.0)", "xmltodict (>=0.13.0,<0.14.0)"]
cerebras-cloud-sdk = ["cerebras_cloud_sdk (>=1.5.0,<2.0.0)"]
cohere = ["cohere (>=5.1.8,<6.0.0)"]
fireworks-ai = ["fireworks-ai (>=0.15.4,<0.16.0)"]
google-generativeai = ["google-generativeai (>=0.8.2,<0.9.0)"]
groq = ["groq (>=0.4.2,<0.5.0)"]
litellm = ["litellm (>=1.35.31,<2.0.0)"]
mistralai = ["mistralai (>=1.0.3,<2.0.0)"]
test-docs = ["anthropic (>=0.36.2,<0.37.0)", "cohere (>=5.1.8,<6.0.0)", "diskcache (>=5.6.3,<6.0.0)", "fastapi (>=0.109.2,<0.110.0)", "groq (>=0.4.2,<0.5.0)", "litellm (>=1.35.31,<2.0.0)", "mistralai (>=1.0.3,<2.0.0)", "pandas (>=2.2.0,<3.0.0)", "pydantic_extra_types (>=2.6.0,<3.0.0)", "redis (>=5.0.1,<6.0.0)", "tabulate (>=0.9.0,<0.10.0)"]
test-docs = ["anthropic (>=0.34.0,<0.35.0)", "cohere (>=5.1.8,<6.0.0)", "diskcache (>=5.6.3,<6.0.0)", "fastapi (>=0.109.2,<0.110.0)", "groq (>=0.4.2,<0.5.0)", "litellm (>=1.35.31,<2.0.0)", "mistralai (>=1.0.3,<2.0.0)", "pandas (>=2.2.0,<3.0.0)", "pydantic_extra_types (>=2.6.0,<3.0.0)", "redis (>=5.0.1,<6.0.0)", "tabulate (>=0.9.0,<0.10.0)"]
vertexai = ["google-cloud-aiplatform (>=1.53.0,<2.0.0)", "jsonref (>=1.1.0,<2.0.0)"]
[[package]]
@ -3246,6 +3287,40 @@ tokenizers = "*"
extra-proxy = ["azure-identity (>=1.15.0,<2.0.0)", "azure-keyvault-secrets (>=4.8.0,<5.0.0)", "google-cloud-kms (>=2.21.3,<3.0.0)", "prisma (==0.11.0)", "resend (>=0.8.0,<0.9.0)"]
proxy = ["PyJWT (>=2.8.0,<3.0.0)", "apscheduler (>=3.10.4,<4.0.0)", "backoff", "cryptography (>=42.0.5,<43.0.0)", "fastapi (>=0.111.0,<0.112.0)", "fastapi-sso (>=0.10.0,<0.11.0)", "gunicorn (>=22.0.0,<23.0.0)", "orjson (>=3.9.7,<4.0.0)", "pynacl (>=1.5.0,<2.0.0)", "python-multipart (>=0.0.9,<0.0.10)", "pyyaml (>=6.0.1,<7.0.0)", "rq", "uvicorn (>=0.22.0,<0.23.0)"]
[[package]]
name = "llama-index-core"
version = "0.11.22"
description = "Interface between LLMs and your data"
optional = true
python-versions = "<4.0,>=3.8.1"
files = [
{file = "llama_index_core-0.11.22-py3-none-any.whl", hash = "sha256:5c59d95dec9bb0727f25b03de89392c69076b2e4aaa6acbd8773de1f07502e9e"},
{file = "llama_index_core-0.11.22.tar.gz", hash = "sha256:ddc30b9c873495de40ad8278d0c894ba09f32f6aa7fc638012b1b22b74c32553"},
]
[package.dependencies]
aiohttp = ">=3.8.6,<4.0.0"
dataclasses-json = "*"
deprecated = ">=1.2.9.3"
dirtyjson = ">=1.0.8,<2.0.0"
fsspec = ">=2023.5.0"
httpx = "*"
nest-asyncio = ">=1.5.8,<2.0.0"
networkx = ">=3.0"
nltk = ">3.8.1"
numpy = "<2.0.0"
pillow = ">=9.0.0"
pydantic = ">=2.7.0,<3.0.0"
PyYAML = ">=6.0.1"
requests = ">=2.31.0"
SQLAlchemy = {version = ">=1.4.49", extras = ["asyncio"]}
tenacity = ">=8.2.0,<8.4.0 || >8.4.0,<9.0.0"
tiktoken = ">=0.3.3"
tqdm = ">=4.66.1,<5.0.0"
typing-extensions = ">=4.5.0"
typing-inspect = ">=0.8.0"
wrapt = "*"
[[package]]
name = "makefun"
version = "1.15.6"
@ -3388,6 +3463,25 @@ files = [
{file = "markupsafe-3.0.2.tar.gz", hash = "sha256:ee55d3edf80167e48ea11a923c7386f4669df67d7994554387f84e7d8b0a2bf0"},
]
[[package]]
name = "marshmallow"
version = "3.23.1"
description = "A lightweight library for converting complex datatypes to and from native Python datatypes."
optional = true
python-versions = ">=3.9"
files = [
{file = "marshmallow-3.23.1-py3-none-any.whl", hash = "sha256:fece2eb2c941180ea1b7fcbd4a83c51bfdd50093fdd3ad2585ee5e1df2508491"},
{file = "marshmallow-3.23.1.tar.gz", hash = "sha256:3a8dfda6edd8dcdbf216c0ede1d1e78d230a6dc9c5a088f58c4083b974a0d468"},
]
[package.dependencies]
packaging = ">=17.0"
[package.extras]
dev = ["marshmallow[tests]", "pre-commit (>=3.5,<5.0)", "tox"]
docs = ["alabaster (==1.0.0)", "autodocsumm (==0.2.14)", "sphinx (==8.1.3)", "sphinx-issues (==5.0.0)", "sphinx-version-warning (==1.1.2)"]
tests = ["pytest", "simplejson"]
[[package]]
name = "matplotlib"
version = "3.9.2"
@ -4885,7 +4979,7 @@ test = ["pytest", "pytest-xdist", "setuptools"]
name = "psycopg2"
version = "2.9.10"
description = "psycopg2 - Python-PostgreSQL Database Adapter"
optional = false
optional = true
python-versions = ">=3.8"
files = [
{file = "psycopg2-2.9.10-cp310-cp310-win32.whl", hash = "sha256:5df2b672140f95adb453af93a7d669d7a7bf0a56bcd26f1502329166f4a61716"},
@ -6551,13 +6645,13 @@ full = ["httpx (>=0.22.0)", "itsdangerous", "jinja2", "python-multipart (>=0.0.7
[[package]]
name = "tenacity"
version = "9.0.0"
version = "8.5.0"
description = "Retry code until it succeeds"
optional = false
python-versions = ">=3.8"
files = [
{file = "tenacity-9.0.0-py3-none-any.whl", hash = "sha256:93de0c98785b27fcf659856aa9f54bfbd399e29969b0621bc7f762bd441b4539"},
{file = "tenacity-9.0.0.tar.gz", hash = "sha256:807f37ca97d62aa361264d497b0e31e92b8027044942bfa756160d908320d73b"},
{file = "tenacity-8.5.0-py3-none-any.whl", hash = "sha256:b594c2a5945830c267ce6b79a166228323ed52718f30302c1359836112346687"},
{file = "tenacity-8.5.0.tar.gz", hash = "sha256:8bc6c0c8a09b31e6cad13c47afbed1a567518250a9a171418582ed8d9c20ca78"},
]
[package.extras]
@ -6946,6 +7040,21 @@ files = [
{file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"},
]
[[package]]
name = "typing-inspect"
version = "0.9.0"
description = "Runtime inspection utilities for typing module."
optional = true
python-versions = "*"
files = [
{file = "typing_inspect-0.9.0-py3-none-any.whl", hash = "sha256:9ee6fc59062311ef8547596ab6b955e1b8aa46242d854bfc78f4f6b0eff35f9f"},
{file = "typing_inspect-0.9.0.tar.gz", hash = "sha256:b23fc42ff6f6ef6954e4852c1fb512cdd18dbea03134f91f856a95ccc9461f78"},
]
[package.dependencies]
mypy-extensions = ">=0.3.0"
typing-extensions = ">=3.7.4"
[[package]]
name = "tzdata"
version = "2024.2"
@ -7513,6 +7622,7 @@ type = ["pytest-mypy"]
[extras]
cli = []
filesystem = ["botocore"]
llama-index = ["llama-index-core"]
neo4j = ["neo4j"]
notebook = []
postgres = ["asyncpg", "pgvector", "psycopg2"]
@ -7522,4 +7632,4 @@ weaviate = ["weaviate-client"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.9.0,<3.12"
content-hash = "57a154a7bbdd990e0fbe2313fa24c412dad98e47b9cd05e41bf378a3f597713f"
content-hash = "f5874af8364839dd2a362b6b3209c4aae108f30dcc27be43d0d07f7b28160eda"

View file

@ -1,6 +1,6 @@
[tool.poetry]
name = "cognee"
version = "0.1.18"
version = "0.1.19"
description = "Cognee - is a library for enriching LLM context with a semantic layer for better understanding and reasoning."
authors = ["Vasilije Markovic", "Boris Arzentar"]
readme = "README.md"
@ -35,7 +35,7 @@ boto3 = "^1.26.125"
botocore="^1.35.54"
gunicorn = "^20.1.0"
sqlalchemy = "2.0.35"
instructor = "1.6.3"
instructor = "1.5.2"
networkx = "^3.2.1"
aiosqlite = "^0.20.0"
pandas = "2.0.3"
@ -45,7 +45,7 @@ dlt = {extras = ["sqlalchemy"], version = "^1.3.0"}
aiofiles = "^23.2.1"
qdrant-client = "^1.9.0"
graphistry = "^0.33.5"
tenacity = "^9.0.0"
tenacity = "^8.4.1"
weaviate-client = "4.6.7"
scikit-learn = "^1.5.0"
pypdf = "^4.1.0"
@ -68,7 +68,8 @@ fastapi-users = {version = "*", extras = ["sqlalchemy"]}
alembic = "^1.13.3"
asyncpg = "^0.29.0"
pgvector = "^0.3.5"
psycopg2 = "^2.9.10"
psycopg2 = {version = "^2.9.10", optional = true}
llama-index-core = {version = "^0.11.22", optional = true}
[tool.poetry.extras]
filesystem = ["s3fs", "botocore"]
@ -78,6 +79,7 @@ qdrant = ["qdrant-client"]
neo4j = ["neo4j"]
postgres = ["psycopg2", "pgvector", "asyncpg"]
notebook = ["ipykernel", "overrides", "ipywidgets", "jupyterlab", "jupyterlab_widgets", "jupyterlab-server", "jupyterlab-git"]
llama-index = ["llama-index-core"]
[tool.poetry.group.dev.dependencies]
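
With this extra in place, the optional LlamaIndex support should be installable via poetry install -E llama-index (or pip install cognee[llama-index]); without it, the dynamic import in save_data_item_with_metadata_to_storage is simply never exercised.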