Merge branch 'main' into COG-533-pydantic-unit-tests

This commit is contained in:
Leon Luithlen 2024-11-18 11:24:51 +01:00
commit fde56f0c3b
57 changed files with 877 additions and 166 deletions

BIN
.DS_Store vendored

Binary file not shown.

View file

@ -0,0 +1,63 @@
name: test | llama index notebook

on:
  workflow_dispatch:
  pull_request:
    branches:
      - main
    types: [labeled, synchronize]

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

env:
  RUNTIME__LOG_LEVEL: ERROR

jobs:
  get_docs_changes:
    name: docs changes
    uses: ./.github/workflows/get_docs_changes.yml

  run_notebook_test:
    name: test
    needs: get_docs_changes
    if: needs.get_docs_changes.outputs.changes_outside_docs == 'true' && github.event.label.name == 'run-checks'
    runs-on: ubuntu-latest
    defaults:
      run:
        shell: bash
    steps:
      - name: Check out
        # Pin to a released major version instead of the mutable `master`
        # ref, consistent with the pinned versions used for the other
        # actions in this workflow.
        uses: actions/checkout@v4
      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11.x'
      - name: Install Poetry
        uses: snok/install-poetry@v1.3.2
        with:
          virtualenvs-create: true
          virtualenvs-in-project: true
          installer-parallel: true
      - name: Install dependencies
        run: |
          poetry install --no-interaction --all-extras --no-root
          poetry add jupyter --no-interaction
      - name: Execute Jupyter Notebook
        env:
          ENV: 'dev'
          LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }}
          GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }}
        run: |
          poetry run jupyter nbconvert \
            --to notebook \
            --execute notebooks/cognee_llama_index.ipynb \
            --output executed_notebook.ipynb \
            --ExecutePreprocessor.timeout=1200

View file

@ -0,0 +1,8 @@
import { fetch } from '@/utils';
/**
 * Fetch the stored search history from the search endpoint.
 * Resolves with the parsed JSON body of the response.
 */
export default async function getHistory() {
  const response = await fetch('/v1/search');
  return response.json();
}

View file

@ -1,9 +1,12 @@
'use client';
import { v4 } from 'uuid'; import { v4 } from 'uuid';
import classNames from 'classnames'; import classNames from 'classnames';
import { useCallback, useState } from 'react'; import { useCallback, useEffect, useState } from 'react';
import { CTAButton, Stack, Text, DropdownSelect, TextArea, useBoolean } from 'ohmy-ui'; import { CTAButton, Stack, Text, DropdownSelect, TextArea, useBoolean } from 'ohmy-ui';
import { fetch } from '@/utils'; import { fetch } from '@/utils';
import styles from './SearchView.module.css'; import styles from './SearchView.module.css';
import getHistory from '@/modules/chat/getHistory';
interface Message { interface Message {
id: string; id: string;
@ -52,6 +55,14 @@ export default function SearchView() {
}, 300); }, 300);
}, []); }, []);
useEffect(() => {
getHistory()
.then((history) => {
setMessages(history);
scrollToBottom();
});
}, [scrollToBottom]);
const handleSearchSubmit = useCallback((event: React.FormEvent<HTMLFormElement>) => { const handleSearchSubmit = useCallback((event: React.FormEvent<HTMLFormElement>) => {
event.preventDefault(); event.preventDefault();
@ -78,7 +89,7 @@ export default function SearchView() {
'Content-Type': 'application/json', 'Content-Type': 'application/json',
}, },
body: JSON.stringify({ body: JSON.stringify({
query: inputValue, query: inputValue.trim(),
searchType: searchTypeValue, searchType: searchTypeValue,
}), }),
}) })

BIN
cognee/.DS_Store vendored

Binary file not shown.

View file

@ -2,7 +2,7 @@ from .api.v1.config.config import config
from .api.v1.add import add from .api.v1.add import add
from .api.v1.cognify import cognify from .api.v1.cognify import cognify
from .api.v1.datasets.datasets import datasets from .api.v1.datasets.datasets import datasets
from .api.v1.search import search, SearchType from .api.v1.search import search, SearchType, get_search_history
from .api.v1.prune import prune from .api.v1.prune import prune
# Pipelines # Pipelines

View file

@ -1 +1,2 @@
from .search_v2 import search, SearchType from .search_v2 import search, SearchType
from .get_search_history import get_search_history

View file

@ -0,0 +1,9 @@
from cognee.modules.search.operations import get_history
from cognee.modules.users.methods import get_default_user
from cognee.modules.users.models import User
async def get_search_history(user: User = None) -> list:
    """Return the search history for `user`.

    Falls back to the system's default user when no user is supplied.
    """
    # `or` mirrors the original falsy check on `user`.
    active_user = user or await get_default_user()
    return await get_history(active_user.id)

View file

@ -1,8 +1,11 @@
from cognee.api.v1.search import SearchType from uuid import UUID
from fastapi.responses import JSONResponse from datetime import datetime
from cognee.modules.users.models import User
from fastapi import Depends, APIRouter from fastapi import Depends, APIRouter
from cognee.api.DTO import InDTO from fastapi.responses import JSONResponse
from cognee.api.v1.search import SearchType
from cognee.api.DTO import InDTO, OutDTO
from cognee.modules.users.models import User
from cognee.modules.search.operations import get_history
from cognee.modules.users.methods import get_authenticated_user from cognee.modules.users.methods import get_authenticated_user
@ -13,6 +16,24 @@ class SearchPayloadDTO(InDTO):
def get_search_router() -> APIRouter: def get_search_router() -> APIRouter:
router = APIRouter() router = APIRouter()
class SearchHistoryItem(OutDTO):
id: UUID
text: str
user: str
created_at: datetime
@router.get("/", response_model = list[SearchHistoryItem])
async def get_search_history(user: User = Depends(get_authenticated_user)):
try:
history = await get_history(user.id)
return history
except Exception as error:
return JSONResponse(
status_code = 500,
content = {"error": str(error)}
)
@router.post("/", response_model = list) @router.post("/", response_model = list)
async def search(payload: SearchPayloadDTO, user: User = Depends(get_authenticated_user)): async def search(payload: SearchPayloadDTO, user: User = Depends(get_authenticated_user)):
""" This endpoint is responsible for searching for nodes in the graph.""" """ This endpoint is responsible for searching for nodes in the graph."""

View file

@ -1,6 +1,9 @@
import json
from uuid import UUID from uuid import UUID
from enum import Enum from enum import Enum
from typing import Callable, Dict from typing import Callable, Dict
from cognee.modules.search.operations import log_query, log_result
from cognee.modules.storage.utils import JSONEncoder
from cognee.shared.utils import send_telemetry from cognee.shared.utils import send_telemetry
from cognee.modules.users.models import User from cognee.modules.users.models import User
from cognee.modules.users.methods import get_default_user from cognee.modules.users.methods import get_default_user
@ -14,15 +17,17 @@ class SearchType(Enum):
INSIGHTS = "INSIGHTS" INSIGHTS = "INSIGHTS"
CHUNKS = "CHUNKS" CHUNKS = "CHUNKS"
async def search(search_type: SearchType, query: str, user: User = None) -> list: async def search(query_type: SearchType, query_text: str, user: User = None) -> list:
if user is None: if user is None:
user = await get_default_user() user = await get_default_user()
if user is None: if user is None:
raise PermissionError("No user found in the system. Please create a user.") raise PermissionError("No user found in the system. Please create a user.")
query = await log_query(query_text, str(query_type), user.id)
own_document_ids = await get_document_ids_for_user(user.id) own_document_ids = await get_document_ids_for_user(user.id)
search_results = await specific_search(search_type, query, user) search_results = await specific_search(query_type, query_text, user)
filtered_search_results = [] filtered_search_results = []
@ -33,19 +38,21 @@ async def search(search_type: SearchType, query: str, user: User = None) -> list
if document_id is None or document_id in own_document_ids: if document_id is None or document_id in own_document_ids:
filtered_search_results.append(search_result) filtered_search_results.append(search_result)
await log_result(query.id, json.dumps(filtered_search_results, cls = JSONEncoder), user.id)
return filtered_search_results return filtered_search_results
async def specific_search(search_type: SearchType, query: str, user) -> list: async def specific_search(query_type: SearchType, query: str, user) -> list:
search_tasks: Dict[SearchType, Callable] = { search_tasks: Dict[SearchType, Callable] = {
SearchType.SUMMARIES: query_summaries, SearchType.SUMMARIES: query_summaries,
SearchType.INSIGHTS: query_graph_connections, SearchType.INSIGHTS: query_graph_connections,
SearchType.CHUNKS: query_chunks, SearchType.CHUNKS: query_chunks,
} }
search_task = search_tasks.get(search_type) search_task = search_tasks.get(query_type)
if search_task is None: if search_task is None:
raise ValueError(f"Unsupported search type: {search_type}") raise ValueError(f"Unsupported search type: {query_type}")
send_telemetry("cognee.search EXECUTION STARTED", user.id) send_telemetry("cognee.search EXECUTION STARTED", user.id)

View file

@ -2,6 +2,3 @@ from .ModelBase import Base
from .config import get_relational_config from .config import get_relational_config
from .create_db_and_tables import create_db_and_tables from .create_db_and_tables import create_db_and_tables
from .get_relational_engine import get_relational_engine from .get_relational_engine import get_relational_engine
# Global data types
from .data_types.UUID import UUID

View file

@ -1,45 +0,0 @@
import uuid
from sqlalchemy.types import TypeDecorator, BINARY
from sqlalchemy.dialects.postgresql import UUID as psqlUUID
class UUID(TypeDecorator):
"""Platform-independent GUID type.
Uses Postgresql's UUID type, otherwise uses
BINARY(16), to store UUID.
"""
impl = BINARY
def load_dialect_impl(self, dialect):
if dialect.name == 'postgresql':
return dialect.type_descriptor(psqlUUID())
else:
return dialect.type_descriptor(BINARY(16))
def process_bind_param(self, value, dialect):
if value is None:
return value
else:
if not isinstance(value, uuid.UUID):
if isinstance(value, bytes):
value = uuid.UUID(bytes = value)
elif isinstance(value, int):
value = uuid.UUID(int = value)
elif isinstance(value, str):
value = uuid.UUID(value)
if dialect.name == 'postgresql':
return str(value)
else:
return value.bytes
def process_result_value(self, value, dialect):
if value is None:
return value
if dialect.name == 'postgresql':
if isinstance(value, uuid.UUID):
return value
return uuid.UUID(value)
else:
return uuid.UUID(bytes = value)

View file

@ -1,4 +1,4 @@
from datetime import datetime from datetime import datetime, timezone
from sqlalchemy.orm import Mapped, MappedColumn from sqlalchemy.orm import Mapped, MappedColumn
from sqlalchemy import Column, DateTime, ForeignKey, Enum, JSON from sqlalchemy import Column, DateTime, ForeignKey, Enum, JSON
from cognee.infrastructure.databases.relational import Base, UUID from cognee.infrastructure.databases.relational import Base, UUID
@ -24,4 +24,4 @@ class Operation(Base):
data_id = Column(UUID, ForeignKey("data.id")) data_id = Column(UUID, ForeignKey("data.id"))
meta_data: Mapped[dict] = MappedColumn(type_ = JSON) meta_data: Mapped[dict] = MappedColumn(type_ = JSON)
created_at = Column(DateTime, default = datetime.utcnow) created_at = Column(DateTime, default = datetime.now(timezone.utc))

View file

@ -2,8 +2,8 @@ from uuid import uuid4
from typing import List from typing import List
from datetime import datetime, timezone from datetime import datetime, timezone
from sqlalchemy.orm import relationship, Mapped from sqlalchemy.orm import relationship, Mapped
from sqlalchemy import Column, String, DateTime from sqlalchemy import Column, String, DateTime, UUID
from cognee.infrastructure.databases.relational import Base, UUID from cognee.infrastructure.databases.relational import Base
from .DatasetData import DatasetData from .DatasetData import DatasetData
class Data(Base): class Data(Base):

View file

@ -2,8 +2,8 @@ from uuid import uuid4
from typing import List from typing import List
from datetime import datetime, timezone from datetime import datetime, timezone
from sqlalchemy.orm import relationship, Mapped from sqlalchemy.orm import relationship, Mapped
from sqlalchemy import Column, Text, DateTime from sqlalchemy import Column, Text, DateTime, UUID
from cognee.infrastructure.databases.relational import Base, UUID from cognee.infrastructure.databases.relational import Base
from .DatasetData import DatasetData from .DatasetData import DatasetData
class Dataset(Base): class Dataset(Base):

View file

@ -1,6 +1,6 @@
from datetime import datetime, timezone from datetime import datetime, timezone
from sqlalchemy import Column, DateTime, ForeignKey from sqlalchemy import Column, DateTime, ForeignKey, UUID
from cognee.infrastructure.databases.relational import Base, UUID from cognee.infrastructure.databases.relational import Base
class DatasetData(Base): class DatasetData(Base):
__tablename__ = "dataset_data" __tablename__ = "dataset_data"

View file

@ -1,9 +1,10 @@
from uuid import uuid4 from uuid import uuid4
from datetime import datetime, timezone from datetime import datetime, timezone
from sqlalchemy import Column, DateTime, String, Text from sqlalchemy import Column, DateTime, String, Text, UUID
from sqlalchemy.orm import relationship, Mapped from sqlalchemy.orm import relationship, Mapped
from cognee.infrastructure.databases.relational import Base, UUID from cognee.infrastructure.databases.relational import Base
from .PipelineTask import PipelineTask from .PipelineTask import PipelineTask
from .Task import Task
class Pipeline(Base): class Pipeline(Base):
__tablename__ = "pipelines" __tablename__ = "pipelines"

View file

@ -1,8 +1,8 @@
import enum import enum
from uuid import uuid4 from uuid import uuid4
from datetime import datetime, timezone from datetime import datetime, timezone
from sqlalchemy import Column, DateTime, JSON, Enum from sqlalchemy import Column, DateTime, JSON, Enum, UUID
from cognee.infrastructure.databases.relational import Base, UUID from cognee.infrastructure.databases.relational import Base
class PipelineRunStatus(enum.Enum): class PipelineRunStatus(enum.Enum):
DATASET_PROCESSING_STARTED = "DATASET_PROCESSING_STARTED" DATASET_PROCESSING_STARTED = "DATASET_PROCESSING_STARTED"

View file

@ -1,6 +1,6 @@
from datetime import datetime, timezone from datetime import datetime, timezone
from sqlalchemy import Column, DateTime, ForeignKey from sqlalchemy import Column, DateTime, ForeignKey, UUID
from cognee.infrastructure.databases.relational import Base, UUID from cognee.infrastructure.databases.relational import Base
class PipelineTask(Base): class PipelineTask(Base):
__tablename__ = "pipeline_task" __tablename__ = "pipeline_task"

View file

@ -0,0 +1,16 @@
from uuid import uuid4
from datetime import datetime, timezone
from sqlalchemy import Column, DateTime, String, UUID
from cognee.infrastructure.databases.relational import Base
class Query(Base):
    """Relational model logging one submitted search query."""
    __tablename__ = "queries"

    id = Column(UUID, primary_key = True, default = uuid4)
    text = Column(String)  # raw query text submitted by the user
    query_type = Column(String)  # SearchType serialized via str() by log_query callers
    user_id = Column(UUID)  # NOTE(review): no ForeignKey to users — confirm intended
    created_at = Column(DateTime(timezone = True), default = lambda: datetime.now(timezone.utc))
    # NULL until the row is first updated (onupdate only, no insert default)
    updated_at = Column(DateTime(timezone = True), onupdate = lambda: datetime.now(timezone.utc))

View file

@ -0,0 +1,16 @@
from datetime import datetime, timezone
from uuid import uuid4
from sqlalchemy import Column, DateTime, Text, UUID
from cognee.infrastructure.databases.relational import Base
class Result(Base):
    """Relational model storing one search-result payload."""
    __tablename__ = "results"

    id = Column(UUID, primary_key = True, default = uuid4)
    value = Column(Text)  # JSON-encoded search results (written by log_result)
    query_id = Column(UUID)  # NOTE(review): no ForeignKey to queries — confirm intended
    user_id = Column(UUID, index = True)  # indexed for per-user history lookups
    created_at = Column(DateTime(timezone = True), default = lambda: datetime.now(timezone.utc))
    # NULL until the row is first updated (onupdate only, no insert default)
    updated_at = Column(DateTime(timezone = True), onupdate = lambda: datetime.now(timezone.utc))

View file

@ -0,0 +1,3 @@
from .log_query import log_query
from .log_result import log_result
from .get_history import get_history

View file

@ -0,0 +1,31 @@
from uuid import UUID
from sqlalchemy import literal, select
from cognee.infrastructure.databases.relational import get_relational_engine
from ..models.Query import Query
from ..models.Result import Result
async def get_history(user_id: UUID, limit: int = 10) -> list[Result]:
    """Return a merged, chat-style history for a user.

    Combines rows from `queries` (labelled "user") and `results`
    (labelled "system") into tuples of (id, text, created_at, user),
    ordered by creation time.

    NOTE(review): `union` (not `union_all`) deduplicates rows across the
    two selects — presumably duplicates cannot occur; confirm.
    NOTE(review): ascending order combined with `limit` returns the
    OLDEST entries, not the most recent — confirm that is intended.
    """
    db_engine = get_relational_engine()

    # The user's submitted queries, labelled as the "user" side.
    queries_query = select(
        Query.id,
        Query.text.label("text"),
        Query.created_at,
        literal("user").label("user")
    ) \
        .filter(Query.user_id == user_id)

    # Stored results for the user, labelled as the "system" side.
    results_query = select(
        Result.id,
        Result.value.label("text"),
        Result.created_at,
        literal("system").label("user")
    ) \
        .filter(Result.user_id == user_id)

    history_query = queries_query.union(results_query).order_by("created_at").limit(limit)

    async with db_engine.get_async_session() as session:
        history = (await session.execute(history_query)).all()

    return history

View file

@ -0,0 +1,17 @@
from uuid import UUID
from sqlalchemy import select
from cognee.infrastructure.databases.relational import get_relational_engine
from ..models.Query import Query
async def get_queries(user_id: UUID, limit: int) -> list[Query]:
    """Return the `limit` most recent queries logged for a user."""
    engine = get_relational_engine()

    # Newest first, capped at `limit` rows.
    statement = (
        select(Query)
        .filter(Query.user_id == user_id)
        .order_by(Query.created_at.desc())
        .limit(limit)
    )

    async with engine.get_async_session() as session:
        return (await session.scalars(statement)).all()

View file

@ -0,0 +1,17 @@
from uuid import UUID
from sqlalchemy import select
from cognee.infrastructure.databases.relational import get_relational_engine
from ..models.Result import Result
async def get_results(user_id: UUID, limit: int = 10) -> list[Result]:
    """Return the `limit` most recent stored results for a user."""
    engine = get_relational_engine()

    # Newest first, capped at `limit` rows.
    statement = (
        select(Result)
        .filter(Result.user_id == user_id)
        .order_by(Result.created_at.desc())
        .limit(limit)
    )

    async with engine.get_async_session() as session:
        return (await session.scalars(statement)).all()

View file

@ -0,0 +1,19 @@
from uuid import UUID
from cognee.infrastructure.databases.relational import get_relational_engine
from ..models.Query import Query
async def log_query(query_text: str, query_type: str, user_id: UUID) -> Query:
    """Persist a search query and return the stored row.

    Args:
        query_text: raw query text submitted by the user.
        query_type: search type serialized as a string.
        user_id: id of the user who issued the query.

    Returns:
        The persisted Query instance with generated fields populated.
    """
    db_engine = get_relational_engine()

    async with db_engine.get_async_session() as session:
        query = Query(
            text = query_text,
            query_type = query_type,
            user_id = user_id,
        )

        session.add(query)

        await session.commit()

        # Re-load generated/default fields (id, created_at) before the
        # session closes. Without this, sessions configured with
        # expire_on_commit=True would raise when callers access
        # `query.id` after this function returns.
        await session.refresh(query)

    return query

View file

@ -0,0 +1,15 @@
from uuid import UUID
from cognee.infrastructure.databases.relational import get_relational_engine
from ..models.Result import Result
async def log_result(query_id: UUID, result: str, user_id: UUID):
    """Persist a search result linked to its originating query and user."""
    engine = get_relational_engine()

    async with engine.get_async_session() as session:
        result_record = Result(
            value = result,
            query_id = query_id,
            user_id = user_id,
        )
        session.add(result_record)
        await session.commit()

View file

@ -1,8 +1,8 @@
from uuid import uuid4 from uuid import uuid4
from datetime import datetime, timezone from datetime import datetime, timezone
from sqlalchemy.orm import relationship, Mapped from sqlalchemy.orm import relationship, Mapped
from sqlalchemy import Column, ForeignKey, DateTime from sqlalchemy import Column, ForeignKey, DateTime, UUID
from cognee.infrastructure.databases.relational import Base, UUID from cognee.infrastructure.databases.relational import Base
from .ACLResources import ACLResources from .ACLResources import ACLResources
class ACL(Base): class ACL(Base):

View file

@ -1,6 +1,6 @@
from datetime import datetime, timezone from datetime import datetime, timezone
from sqlalchemy import Column, ForeignKey, DateTime from sqlalchemy import Column, ForeignKey, DateTime, UUID
from cognee.infrastructure.databases.relational import Base, UUID from cognee.infrastructure.databases.relational import Base
class ACLResources(Base): class ACLResources(Base):
__tablename__ = "acl_resources" __tablename__ = "acl_resources"

View file

@ -1,6 +1,5 @@
from sqlalchemy.orm import relationship, Mapped from sqlalchemy.orm import relationship, Mapped
from sqlalchemy import Column, String, ForeignKey from sqlalchemy import Column, String, ForeignKey, UUID
from cognee.infrastructure.databases.relational import UUID
from .Principal import Principal from .Principal import Principal
from .UserGroup import UserGroup from .UserGroup import UserGroup

View file

@ -1,8 +1,8 @@
from uuid import uuid4 from uuid import uuid4
from datetime import datetime, timezone from datetime import datetime, timezone
# from sqlalchemy.orm import relationship # from sqlalchemy.orm import relationship
from sqlalchemy import Column, DateTime, String from sqlalchemy import Column, DateTime, String, UUID
from cognee.infrastructure.databases.relational import Base, UUID from cognee.infrastructure.databases.relational import Base
class Permission(Base): class Permission(Base):
__tablename__ = "permissions" __tablename__ = "permissions"

View file

@ -1,7 +1,7 @@
from uuid import uuid4 from uuid import uuid4
from datetime import datetime, timezone from datetime import datetime, timezone
from sqlalchemy import Column, String, DateTime from sqlalchemy import Column, String, DateTime, UUID
from cognee.infrastructure.databases.relational import Base, UUID from cognee.infrastructure.databases.relational import Base
class Principal(Base): class Principal(Base):
__tablename__ = "principals" __tablename__ = "principals"

View file

@ -1,8 +1,8 @@
from uuid import uuid4 from uuid import uuid4
from datetime import datetime, timezone from datetime import datetime, timezone
from sqlalchemy.orm import relationship from sqlalchemy.orm import relationship
from sqlalchemy import Column, DateTime from sqlalchemy import Column, DateTime, UUID
from cognee.infrastructure.databases.relational import Base, UUID from cognee.infrastructure.databases.relational import Base
from .ACLResources import ACLResources from .ACLResources import ACLResources
class Resource(Base): class Resource(Base):

View file

@ -1,10 +1,10 @@
from uuid import UUID as uuid_UUID from uuid import UUID as uuid_UUID
from sqlalchemy import ForeignKey, Column from sqlalchemy import ForeignKey, Column, UUID
from sqlalchemy.orm import relationship, Mapped from sqlalchemy.orm import relationship, Mapped
from fastapi_users.db import SQLAlchemyBaseUserTableUUID from fastapi_users.db import SQLAlchemyBaseUserTableUUID
from cognee.infrastructure.databases.relational import UUID
from .Principal import Principal from .Principal import Principal
from .UserGroup import UserGroup from .UserGroup import UserGroup
from .Group import Group
class User(SQLAlchemyBaseUserTableUUID, Principal): class User(SQLAlchemyBaseUserTableUUID, Principal):
__tablename__ = "users" __tablename__ = "users"
@ -25,7 +25,6 @@ class User(SQLAlchemyBaseUserTableUUID, Principal):
from fastapi_users import schemas from fastapi_users import schemas
class UserRead(schemas.BaseUser[uuid_UUID]): class UserRead(schemas.BaseUser[uuid_UUID]):
# groups: list[uuid_UUID] # Add groups attribute
pass pass
class UserCreate(schemas.BaseUserCreate): class UserCreate(schemas.BaseUserCreate):

View file

@ -1,6 +1,6 @@
from datetime import datetime, timezone from datetime import datetime, timezone
from sqlalchemy import Column, ForeignKey, DateTime from sqlalchemy import Column, ForeignKey, DateTime, UUID
from cognee.infrastructure.databases.relational import Base, UUID from cognee.infrastructure.databases.relational import Base
class UserGroup(Base): class UserGroup(Base):
__tablename__ = "user_groups" __tablename__ = "user_groups"

View file

@ -33,4 +33,4 @@ async def check_permission_on_documents(user: User, permission_type: str, docume
has_permissions = all(document_id in resource_ids for document_id in document_ids) has_permissions = all(document_id in resource_ids for document_id in document_ids)
if not has_permissions: if not has_permissions:
raise PermissionDeniedException(f"User {user.username} does not have {permission_type} permission on documents") raise PermissionDeniedException(f"User {user.email} does not have {permission_type} permission on documents")

View file

@ -28,7 +28,7 @@ class Class(DataPoint):
description: str description: str
constructor_parameters: List[Variable] constructor_parameters: List[Variable]
extended_from_class: Optional["Class"] = None extended_from_class: Optional["Class"] = None
has_methods: list["Function"] has_methods: List["Function"]
_metadata = { _metadata = {
"index_fields": ["name"] "index_fields": ["name"]
@ -89,7 +89,8 @@ class SourceCodeGraph(DataPoint):
Operator, Operator,
Expression, Expression,
]] ]]
Class.model_rebuild() Class.model_rebuild()
ClassInstance.model_rebuild() ClassInstance.model_rebuild()
Expression.model_rebuild() Expression.model_rebuild()
FunctionCall.model_rebuild()
SourceCodeGraph.model_rebuild()

View file

@ -1,5 +1,6 @@
""" This module contains utility functions for the cognee. """ """ This module contains utility functions for the cognee. """
import os import os
import requests
from datetime import datetime, timezone from datetime import datetime, timezone
import graphistry import graphistry
import networkx as nx import networkx as nx
@ -8,7 +9,6 @@ import pandas as pd
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import tiktoken import tiktoken
import nltk import nltk
from posthog import Posthog
from cognee.base_config import get_base_config from cognee.base_config import get_base_config
from cognee.infrastructure.databases.graph import get_graph_engine from cognee.infrastructure.databases.graph import get_graph_engine
@ -16,6 +16,9 @@ from cognee.infrastructure.databases.graph import get_graph_engine
from uuid import uuid4 from uuid import uuid4
import pathlib import pathlib
# Analytics Proxy Url, currently hosted by Vercel
vercel_url = "https://proxyanalytics.vercel.app"
def get_anonymous_id(): def get_anonymous_id():
"""Creates or reads a anonymous user id""" """Creates or reads a anonymous user id"""
home_dir = str(pathlib.Path(pathlib.Path(__file__).parent.parent.parent.resolve())) home_dir = str(pathlib.Path(pathlib.Path(__file__).parent.parent.parent.resolve()))
@ -40,25 +43,24 @@ def send_telemetry(event_name: str, user_id, additional_properties: dict = {}):
if env in ["test", "dev"]: if env in ["test", "dev"]:
return return
posthog = Posthog(
project_api_key = "phc_UB1YVere1KtJg1MFxAo6ABfpkwN3OxCvGNDkMTjvH0",
host = "https://eu.i.posthog.com"
)
current_time = datetime.now(timezone.utc) current_time = datetime.now(timezone.utc)
properties = { payload = {
"time": current_time.strftime("%m/%d/%Y"), "anonymous_id": str(get_anonymous_id()),
"user_id": user_id, "event_name": event_name,
**additional_properties, "user_properties": {
"user_id": str(user_id),
},
"properties": {
"time": current_time.strftime("%m/%d/%Y"),
"user_id": str(user_id),
**additional_properties
},
} }
# Needed to forward properties to PostHog along with id response = requests.post(vercel_url, json=payload)
posthog.identify(get_anonymous_id(), properties)
try: if response.status_code != 200:
posthog.capture(get_anonymous_id(), event_name, properties) print(f"Error sending telemetry through proxy: {response.status_code}")
except Exception as e:
print("ERROR sending telemetric data to Posthog. See exception: %s", e)
def num_tokens_from_string(string: str, encoding_name: str) -> int: def num_tokens_from_string(string: str, encoding_name: str) -> int:
"""Returns the number of tokens in a text string.""" """Returns the number of tokens in a text string."""

View file

@ -1,2 +1,4 @@
from .ingest_data import ingest_data from .ingest_data import ingest_data
from .save_data_to_storage import save_data_to_storage from .save_data_to_storage import save_data_to_storage
from .save_data_item_to_storage import save_data_item_to_storage
from .save_data_item_with_metadata_to_storage import save_data_item_with_metadata_to_storage

View file

@ -3,7 +3,7 @@ import cognee.modules.ingestion as ingestion
from cognee.shared.utils import send_telemetry from cognee.shared.utils import send_telemetry
from cognee.modules.users.models import User from cognee.modules.users.models import User
from cognee.infrastructure.databases.relational import get_relational_config, get_relational_engine from cognee.infrastructure.databases.relational import get_relational_engine
from cognee.modules.data.methods import create_dataset from cognee.modules.data.methods import create_dataset
from cognee.modules.users.permissions.methods import give_permission_on_document from cognee.modules.users.permissions.methods import give_permission_on_document
from .get_dlt_destination import get_dlt_destination from .get_dlt_destination import get_dlt_destination

View file

@ -0,0 +1,92 @@
import dlt
import cognee.modules.ingestion as ingestion
from typing import Any
from cognee.shared.utils import send_telemetry
from cognee.modules.users.models import User
from cognee.infrastructure.databases.relational import get_relational_engine
from cognee.modules.data.methods import create_dataset
from cognee.modules.users.permissions.methods import give_permission_on_document
from .get_dlt_destination import get_dlt_destination
from .save_data_item_with_metadata_to_storage import save_data_item_with_metadata_to_storage
async def ingest_data_with_metadata(data: Any, dataset_name: str, user: User):
    """Ingest `data` into `dataset_name` via a dlt pipeline.

    For each data item: saves it to storage, classifies it, upserts its
    file metadata into the relational `data` table, yields the metadata
    row to dlt, and grants the ingesting user read/write permission on
    the resulting document. Returns the dlt run info.
    """
    destination = get_dlt_destination()

    pipeline = dlt.pipeline(
        pipeline_name = "file_load_from_filesystem",
        destination = destination,
    )

    # merge_key = "id" lets dlt upsert rows when the same item is re-ingested.
    @dlt.resource(standalone = True, merge_key = "id")
    async def data_resources(data: Any, user: User):
        if not isinstance(data, list):
            # Convert data to a list as we work with lists further down.
            data = [data]

        # Process data
        for data_item in data:
            file_path = save_data_item_with_metadata_to_storage(data_item, dataset_name)

            # Ingest data and add metadata
            with open(file_path.replace("file://", ""), mode = "rb") as file:
                classified_data = ingestion.classify(file)

                # Identity is derived from the classified content, so the
                # same item ingested twice maps to the same data_id.
                data_id = ingestion.identify(classified_data)

                file_metadata = classified_data.get_metadata()

                # Imported locally — presumably to avoid a module-level
                # circular import; TODO confirm.
                from sqlalchemy import select
                from cognee.modules.data.models import Data

                db_engine = get_relational_engine()

                async with db_engine.get_async_session() as session:
                    # Creates the dataset row if missing, otherwise returns it.
                    dataset = await create_dataset(dataset_name, user.id, session)

                    data_point = (await session.execute(
                        select(Data).filter(Data.id == data_id)
                    )).scalar_one_or_none()

                    if data_point is not None:
                        # Seen before: refresh the stored file metadata.
                        data_point.name = file_metadata["name"]
                        data_point.raw_data_location = file_metadata["file_path"]
                        data_point.extension = file_metadata["extension"]
                        data_point.mime_type = file_metadata["mime_type"]

                        await session.merge(data_point)
                        await session.commit()
                    else:
                        # First sighting: insert and attach to the dataset.
                        data_point = Data(
                            id = data_id,
                            name = file_metadata["name"],
                            raw_data_location = file_metadata["file_path"],
                            extension = file_metadata["extension"],
                            mime_type = file_metadata["mime_type"],
                        )

                        dataset.data.append(data_point)
                        await session.commit()

                # Row shape matches the "file_metadata" destination table.
                yield {
                    "id": data_id,
                    "name": file_metadata["name"],
                    "file_path": file_metadata["file_path"],
                    "extension": file_metadata["extension"],
                    "mime_type": file_metadata["mime_type"],
                }

                await give_permission_on_document(user, data_id, "read")
                await give_permission_on_document(user, data_id, "write")

    send_telemetry("cognee.add EXECUTION STARTED", user_id = user.id)

    run_info = pipeline.run(
        data_resources(data, user),
        table_name = "file_metadata",
        dataset_name = dataset_name,
        write_disposition = "merge",
    )

    send_telemetry("cognee.add EXECUTION COMPLETED", user_id = user.id)

    return run_info

View file

@ -0,0 +1,20 @@
from typing import Union, BinaryIO
from cognee.modules.ingestion import save_data_to_file
def save_data_item_to_storage(data_item: Union[BinaryIO, str], dataset_name: str) -> str:
    """Resolve a data item (upload, path, or raw text) to a local file path."""
    # Uploaded file objects expose a `.file` handle and a filename.
    if hasattr(data_item, "file"):
        return save_data_to_file(data_item.file, dataset_name, filename = data_item.filename)

    if isinstance(data_item, str):
        # Already a path on disk — normalize away any file:// scheme.
        if data_item.startswith("file://") or data_item.startswith("/"):
            return data_item.replace("file://", "")
        # Otherwise treat the string as raw text content to be saved.
        return save_data_to_file(data_item, dataset_name)

    raise ValueError(f"Data type not supported: {type(data_item)}")

View file

@ -0,0 +1,28 @@
from typing import Union, BinaryIO, Any
from cognee.modules.ingestion import save_data_to_file
def save_data_item_with_metadata_to_storage(data_item: Union[BinaryIO, str, Any], dataset_name: str) -> str:
    """Persist a single data item (including LlamaIndex documents) and return its local file path.

    Parameters:
        data_item: A LlamaIndex ``Document`` (or subclass), an uploaded file
            object (anything exposing ``.file`` and ``.filename``), a local
            file reference (``file://`` URI or absolute path), or raw text.
            ``Any`` is accepted because llama_index is an optional dependency.
        dataset_name: Name of the dataset the item belongs to.

    Returns:
        The local filesystem path of the stored (or referenced) data.

    Raises:
        ValueError: If ``data_item`` matches none of the supported kinds.
    """
    # Dynamic import is used because the llama_index module is optional;
    # importing at call time keeps cognee usable without it installed.
    from llama_index.core import Document
    from .transform_data import get_data_from_llama_index

    # LlamaIndex Document or any of its subclasses.
    if isinstance(data_item, Document):
        return get_data_from_llama_index(data_item, dataset_name)

    # Uploaded file object coming from an upload endpoint.
    if hasattr(data_item, "file"):
        return save_data_to_file(data_item.file, dataset_name, filename=data_item.filename)

    if isinstance(data_item, str):
        # Already a local file: strip only the leading scheme if present
        # (str.replace would also mangle later "file://" substrings).
        if data_item.startswith("file://"):
            return data_item[len("file://"):]
        if data_item.startswith("/"):
            return data_item
        # Otherwise the string is raw text content — persist it.
        return save_data_to_file(data_item, dataset_name)

    raise ValueError(f"Data type not supported: {type(data_item)}")

View file

@ -1,5 +1,5 @@
from typing import Union, BinaryIO from typing import Union, BinaryIO
from cognee.modules.ingestion import save_data_to_file from cognee.tasks.ingestion.save_data_item_to_storage import save_data_item_to_storage
def save_data_to_storage(data: Union[BinaryIO, str], dataset_name) -> list[str]: def save_data_to_storage(data: Union[BinaryIO, str], dataset_name) -> list[str]:
if not isinstance(data, list): if not isinstance(data, list):
@ -9,19 +9,7 @@ def save_data_to_storage(data: Union[BinaryIO, str], dataset_name) -> list[str]:
file_paths = [] file_paths = []
for data_item in data: for data_item in data:
# data is a file object coming from upload. file_path = save_data_item_to_storage(data_item, dataset_name)
if hasattr(data_item, "file"): file_paths.append(file_path)
file_path = save_data_to_file(data_item.file, dataset_name, filename = data_item.filename)
file_paths.append(file_path)
if isinstance(data_item, str):
# data is a file path
if data_item.startswith("file://") or data_item.startswith("/"):
file_paths.append(data_item.replace("file://", ""))
# data is text
else:
file_path = save_data_to_file(data_item, dataset_name)
file_paths.append(file_path)
return file_paths return file_paths

View file

@ -0,0 +1,18 @@
from llama_index.core import Document
from llama_index.core.schema import ImageDocument
from cognee.modules.ingestion import save_data_to_file
from typing import Union
def get_data_from_llama_index(data_point: Union[Document, ImageDocument], dataset_name: str) -> str:
    """Resolve a LlamaIndex document to a local file path, persisting its text if needed.

    Parameters:
        data_point: A LlamaIndex ``Document`` or ``ImageDocument`` instance.
        dataset_name: Dataset name forwarded to ``save_data_to_file`` when the
            document's content has to be written out.

    Returns:
        The file path recorded on the document if one exists, otherwise the
        path of a freshly saved file containing the document's text.

    Raises:
        ValueError: If ``data_point`` is neither exactly ``Document`` nor
            exactly ``ImageDocument``.
    """
    # Exact type checks (not isinstance) are used deliberately so that
    # subclasses of Document are not silently treated as plain documents.
    if type(data_point) is Document:
        file_path = data_point.metadata.get("file_path")
        if file_path is not None:
            return file_path
        return save_data_to_file(data_point.text, dataset_name)

    if type(data_point) is ImageDocument:
        if data_point.image_path is not None:
            return data_point.image_path
        return save_data_to_file(data_point.text, dataset_name)

    # Previously an unsupported subclass fell through and returned None,
    # violating the declared -> str contract; fail loudly instead.
    raise ValueError(f"Data type not supported: {type(data_point)}")

View file

@ -35,24 +35,27 @@ async def main():
random_node = (await vector_engine.search("Entity_name", "AI"))[0] random_node = (await vector_engine.search("Entity_name", "AI"))[0]
random_node_name = random_node.payload["text"] random_node_name = random_node.payload["text"]
search_results = await cognee.search(SearchType.INSIGHTS, query = random_node_name) search_results = await cognee.search(SearchType.INSIGHTS, query_text = random_node_name)
assert len(search_results) != 0, "The search results list is empty." assert len(search_results) != 0, "The search results list is empty."
print("\n\nExtracted sentences are:\n") print("\n\nExtracted sentences are:\n")
for result in search_results: for result in search_results:
print(f"{result}\n") print(f"{result}\n")
search_results = await cognee.search(SearchType.CHUNKS, query = random_node_name) search_results = await cognee.search(SearchType.CHUNKS, query_text = random_node_name)
assert len(search_results) != 0, "The search results list is empty." assert len(search_results) != 0, "The search results list is empty."
print("\n\nExtracted chunks are:\n") print("\n\nExtracted chunks are:\n")
for result in search_results: for result in search_results:
print(f"{result}\n") print(f"{result}\n")
search_results = await cognee.search(SearchType.SUMMARIES, query = random_node_name) search_results = await cognee.search(SearchType.SUMMARIES, query_text = random_node_name)
assert len(search_results) != 0, "Query related summaries don't exist." assert len(search_results) != 0, "Query related summaries don't exist."
print("\nExtracted summaries are:\n") print("\nExtracted summaries are:\n")
for result in search_results: for result in search_results:
print(f"{result}\n") print(f"{result}\n")
history = await cognee.get_search_history()
assert len(history) == 6, "Search history is not correct."
if __name__ == "__main__": if __name__ == "__main__":
import asyncio import asyncio

View file

@ -39,24 +39,27 @@ async def main():
random_node = (await vector_engine.search("Entity_name", "Quantum computer"))[0] random_node = (await vector_engine.search("Entity_name", "Quantum computer"))[0]
random_node_name = random_node.payload["text"] random_node_name = random_node.payload["text"]
search_results = await cognee.search(SearchType.INSIGHTS, query = random_node_name) search_results = await cognee.search(SearchType.INSIGHTS, query_text = random_node_name)
assert len(search_results) != 0, "The search results list is empty." assert len(search_results) != 0, "The search results list is empty."
print("\n\nExtracted sentences are:\n") print("\n\nExtracted sentences are:\n")
for result in search_results: for result in search_results:
print(f"{result}\n") print(f"{result}\n")
search_results = await cognee.search(SearchType.CHUNKS, query = random_node_name) search_results = await cognee.search(SearchType.CHUNKS, query_text = random_node_name)
assert len(search_results) != 0, "The search results list is empty." assert len(search_results) != 0, "The search results list is empty."
print("\n\nExtracted chunks are:\n") print("\n\nExtracted chunks are:\n")
for result in search_results: for result in search_results:
print(f"{result}\n") print(f"{result}\n")
search_results = await cognee.search(SearchType.SUMMARIES, query = random_node_name) search_results = await cognee.search(SearchType.SUMMARIES, query_text = random_node_name)
assert len(search_results) != 0, "Query related summaries don't exist." assert len(search_results) != 0, "Query related summaries don't exist."
print("\nExtracted summaries are:\n") print("\nExtracted summaries are:\n")
for result in search_results: for result in search_results:
print(f"{result}\n") print(f"{result}\n")
history = await cognee.get_search_history()
assert len(history) == 6, "Search history is not correct."
if __name__ == "__main__": if __name__ == "__main__":
import asyncio import asyncio

View file

@ -68,24 +68,27 @@ async def main():
random_node = (await vector_engine.search("Entity_name", "Quantum computer"))[0] random_node = (await vector_engine.search("Entity_name", "Quantum computer"))[0]
random_node_name = random_node.payload["text"] random_node_name = random_node.payload["text"]
search_results = await cognee.search(SearchType.INSIGHTS, query=random_node_name) search_results = await cognee.search(SearchType.INSIGHTS, query_text = random_node_name)
assert len(search_results) != 0, "The search results list is empty." assert len(search_results) != 0, "The search results list is empty."
print("\n\nExtracted sentences are:\n") print("\n\nExtracted sentences are:\n")
for result in search_results: for result in search_results:
print(f"{result}\n") print(f"{result}\n")
search_results = await cognee.search(SearchType.CHUNKS, query=random_node_name) search_results = await cognee.search(SearchType.CHUNKS, query_text = random_node_name)
assert len(search_results) != 0, "The search results list is empty." assert len(search_results) != 0, "The search results list is empty."
print("\n\nExtracted chunks are:\n") print("\n\nExtracted chunks are:\n")
for result in search_results: for result in search_results:
print(f"{result}\n") print(f"{result}\n")
search_results = await cognee.search(SearchType.SUMMARIES, query=random_node_name) search_results = await cognee.search(SearchType.SUMMARIES, query_text = random_node_name)
assert len(search_results) != 0, "Query related summaries don't exist." assert len(search_results) != 0, "Query related summaries don't exist."
print("\n\nExtracted summaries are:\n") print("\n\nExtracted summaries are:\n")
for result in search_results: for result in search_results:
print(f"{result}\n") print(f"{result}\n")
history = await cognee.get_search_history()
assert len(history) == 6, "Search history is not correct."
if __name__ == "__main__": if __name__ == "__main__":
import asyncio import asyncio

View file

@ -40,24 +40,27 @@ async def main():
random_node = (await vector_engine.search("Entity_name", "Quantum computer"))[0] random_node = (await vector_engine.search("Entity_name", "Quantum computer"))[0]
random_node_name = random_node.payload["text"] random_node_name = random_node.payload["text"]
search_results = await cognee.search(SearchType.INSIGHTS, query = random_node_name) search_results = await cognee.search(SearchType.INSIGHTS, query_text = random_node_name)
assert len(search_results) != 0, "The search results list is empty." assert len(search_results) != 0, "The search results list is empty."
print("\n\nExtracted sentences are:\n") print("\n\nExtracted sentences are:\n")
for result in search_results: for result in search_results:
print(f"{result}\n") print(f"{result}\n")
search_results = await cognee.search(SearchType.CHUNKS, query = random_node_name) search_results = await cognee.search(SearchType.CHUNKS, query_text = random_node_name)
assert len(search_results) != 0, "The search results list is empty." assert len(search_results) != 0, "The search results list is empty."
print("\n\nExtracted chunks are:\n") print("\n\nExtracted chunks are:\n")
for result in search_results: for result in search_results:
print(f"{result}\n") print(f"{result}\n")
search_results = await cognee.search(SearchType.SUMMARIES, query = random_node_name) search_results = await cognee.search(SearchType.SUMMARIES, query_text = random_node_name)
assert len(search_results) != 0, "Query related summaries don't exist." assert len(search_results) != 0, "Query related summaries don't exist."
print("\nExtracted summaries are:\n") print("\nExtracted summaries are:\n")
for result in search_results: for result in search_results:
print(f"{result}\n") print(f"{result}\n")
history = await cognee.get_search_history()
assert len(history) == 6, "Search history is not correct."
if __name__ == "__main__": if __name__ == "__main__":
import asyncio import asyncio

View file

@ -38,24 +38,27 @@ async def main():
random_node = (await vector_engine.search("Entity_name", "Quantum computer"))[0] random_node = (await vector_engine.search("Entity_name", "Quantum computer"))[0]
random_node_name = random_node.payload["text"] random_node_name = random_node.payload["text"]
search_results = await cognee.search(SearchType.INSIGHTS, query = random_node_name) search_results = await cognee.search(SearchType.INSIGHTS, query_text = random_node_name)
assert len(search_results) != 0, "The search results list is empty." assert len(search_results) != 0, "The search results list is empty."
print("\n\nExtracted sentences are:\n") print("\n\nExtracted sentences are:\n")
for result in search_results: for result in search_results:
print(f"{result}\n") print(f"{result}\n")
search_results = await cognee.search(SearchType.CHUNKS, query = random_node_name) search_results = await cognee.search(SearchType.CHUNKS, query_text = random_node_name)
assert len(search_results) != 0, "The search results list is empty." assert len(search_results) != 0, "The search results list is empty."
print("\n\nExtracted chunks are:\n") print("\n\nExtracted chunks are:\n")
for result in search_results: for result in search_results:
print(f"{result}\n") print(f"{result}\n")
search_results = await cognee.search(SearchType.SUMMARIES, query = random_node_name) search_results = await cognee.search(SearchType.SUMMARIES, query_text = random_node_name)
assert len(search_results) != 0, "Query related summaries don't exist." assert len(search_results) != 0, "Query related summaries don't exist."
print("\nExtracted summaries are:\n") print("\nExtracted summaries are:\n")
for result in search_results: for result in search_results:
print(f"{result}\n") print(f"{result}\n")
history = await cognee.get_search_history()
assert len(history) == 6, "Search history is not correct."
if __name__ == "__main__": if __name__ == "__main__":
import asyncio import asyncio

View file

@ -27,8 +27,7 @@ async def main():
# Query cognee for insights on the added text # Query cognee for insights on the added text
search_results = await cognee.search( search_results = await cognee.search(
SearchType.INSIGHTS, SearchType.INSIGHTS, query='Tell me about NLP'
{'query': 'Tell me about NLP'}
) )
# Display search results # Display search results

View file

@ -791,7 +791,7 @@
"node = (await vector_engine.search(\"Entity_name\", \"sarah.nguyen@example.com\"))[0]\n", "node = (await vector_engine.search(\"Entity_name\", \"sarah.nguyen@example.com\"))[0]\n",
"node_name = node.payload[\"text\"]\n", "node_name = node.payload[\"text\"]\n",
"\n", "\n",
"search_results = await cognee.search(SearchType.SUMMARIES, query = node_name)\n", "search_results = await cognee.search(SearchType.SUMMARIES, query_text = node_name)\n",
"print(\"\\n\\Extracted summaries are:\\n\")\n", "print(\"\\n\\Extracted summaries are:\\n\")\n",
"for result in search_results:\n", "for result in search_results:\n",
" print(f\"{result}\\n\")" " print(f\"{result}\\n\")"
@ -812,7 +812,7 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"search_results = await cognee.search(SearchType.CHUNKS, query = node_name)\n", "search_results = await cognee.search(SearchType.CHUNKS, query_text = node_name)\n",
"print(\"\\n\\nExtracted chunks are:\\n\")\n", "print(\"\\n\\nExtracted chunks are:\\n\")\n",
"for result in search_results:\n", "for result in search_results:\n",
" print(f\"{result}\\n\")" " print(f\"{result}\\n\")"
@ -833,7 +833,7 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"search_results = await cognee.search(SearchType.INSIGHTS, query = node_name)\n", "search_results = await cognee.search(SearchType.INSIGHTS, query_text = node_name)\n",
"print(\"\\n\\nExtracted sentences are:\\n\")\n", "print(\"\\n\\nExtracted sentences are:\\n\")\n",
"for result in search_results:\n", "for result in search_results:\n",
" print(f\"{result}\\n\")" " print(f\"{result}\\n\")"

View file

@ -0,0 +1,229 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Cognee GraphRAG with LlamaIndex Documents"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%pip install llama-index-core"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load Data\n",
"\n",
"We will use a sample news article dataset retrieved from Diffbot, which Tomaz has conveniently made available on GitHub for easy access.\n",
"\n",
"The dataset contains 2,500 samples; for ease of experimentation, we will use 5 of these samples, which include the `title` and `text` of news articles."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from llama_index.core import Document\n",
"\n",
"news = pd.read_csv(\n",
" \"https://raw.githubusercontent.com/tomasonjo/blog-datasets/main/news_articles.csv\"\n",
")[:5]\n",
"\n",
"news.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prepare documents as required by LlamaIndex"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"documents = [\n",
" Document(text=f\"{row['title']}: {row['text']}\")\n",
" for i, row in news.iterrows()\n",
"]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Set environment variables"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"# Setting environment variables\n",
"if \"GRAPHISTRY_USERNAME\" not in os.environ: \n",
" os.environ[\"GRAPHISTRY_USERNAME\"] = \"\"\n",
"\n",
"if \"GRAPHISTRY_PASSWORD\" not in os.environ: \n",
" os.environ[\"GRAPHISTRY_PASSWORD\"] = \"\"\n",
"\n",
"if \"LLM_API_KEY\" not in os.environ:\n",
" os.environ[\"LLM_API_KEY\"] = \"\"\n",
"\n",
"# \"neo4j\" or \"networkx\"\n",
"os.environ[\"GRAPH_DATABASE_PROVIDER\"]=\"networkx\" \n",
"# Not needed if using networkx\n",
"#GRAPH_DATABASE_URL=\"\"\n",
"#GRAPH_DATABASE_USERNAME=\"\"\n",
"#GRAPH_DATABASE_PASSWORD=\"\"\n",
"\n",
"# \"qdrant\", \"weaviate\" or \"lancedb\"\n",
"os.environ[\"VECTOR_DB_PROVIDER\"]=\"lancedb\" \n",
"# Not needed if using \"lancedb\"\n",
"# os.environ[\"VECTOR_DB_URL\"]=\"\"\n",
"# os.environ[\"VECTOR_DB_KEY\"]=\"\"\n",
"\n",
"# Database provider\n",
"os.environ[\"DB_PROVIDER\"]=\"sqlite\" # or \"postgres\"\n",
"\n",
"# Database name\n",
"os.environ[\"DB_NAME\"]=\"cognee_db\"\n",
"\n",
"# Postgres specific parameters (Only if Postgres is run)\n",
"# os.environ[\"DB_HOST\"]=\"127.0.0.1\"\n",
"# os.environ[\"DB_PORT\"]=\"5432\"\n",
"# os.environ[\"DB_USERNAME\"]=\"cognee\"\n",
"# os.environ[\"DB_PASSWORD\"]=\"cognee\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Run Cognee with LlamaIndex Documents"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from typing import Union, BinaryIO\n",
"\n",
"from cognee.infrastructure.databases.vector.pgvector import create_db_and_tables as create_pgvector_db_and_tables\n",
"from cognee.infrastructure.databases.relational import create_db_and_tables as create_relational_db_and_tables\n",
"from cognee.infrastructure.databases.graph import get_graph_engine\n",
"from cognee.shared.utils import render_graph\n",
"from cognee.modules.users.models import User\n",
"from cognee.modules.users.methods import get_default_user\n",
"from cognee.tasks.ingestion.ingest_data_with_metadata import ingest_data_with_metadata\n",
"import cognee\n",
"\n",
"# Create a clean slate for cognee -- reset data and system state\n",
"await cognee.prune.prune_data()\n",
"await cognee.prune.prune_system(metadata=True)\n",
"\n",
"# Add the LlamaIndex documents, and make it available for cognify\n",
"async def add(data: Union[BinaryIO, list[BinaryIO], str, list[str]], dataset_name: str = \"main_dataset\", user: User = None):\n",
" await create_relational_db_and_tables()\n",
" await create_pgvector_db_and_tables()\n",
"\n",
" if user is None:\n",
" user = await get_default_user()\n",
"\n",
" await ingest_data_with_metadata(data, dataset_name, user)\n",
"\n",
"await add(documents)\n",
"\n",
"# Use LLMs and cognee to create knowledge graph\n",
"await cognee.cognify()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Query Cognee for summaries related to data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from cognee import SearchType\n",
"\n",
"# Query cognee for summaries\n",
"search_results = await cognee.search(\n",
    " SearchType.SUMMARIES, query_text=\"What are the main news discussed in the document?\"\n",
")\n",
"# Display search results\n",
"print(\"\\n Summary of main news discussed:\\n\")\n",
"print(search_results[0][\"text\"])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Render Knowledge Graph generated from provided data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import graphistry\n",
"\n",
"# Get graph\n",
"graphistry.login(username=os.getenv(\"GRAPHISTRY_USERNAME\"), password=os.getenv(\"GRAPHISTRY_PASSWORD\"))\n",
"graph_engine = await get_graph_engine()\n",
"\n",
"graph_url = await render_graph(graph_engine.graph)\n",
"print(graph_url)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

138
poetry.lock generated
View file

@ -1125,6 +1125,21 @@ files = [
docs = ["ipython", "matplotlib", "numpydoc", "sphinx"] docs = ["ipython", "matplotlib", "numpydoc", "sphinx"]
tests = ["pytest", "pytest-cov", "pytest-xdist"] tests = ["pytest", "pytest-cov", "pytest-xdist"]
[[package]]
name = "dataclasses-json"
version = "0.6.7"
description = "Easily serialize dataclasses to and from JSON."
optional = true
python-versions = "<4.0,>=3.7"
files = [
{file = "dataclasses_json-0.6.7-py3-none-any.whl", hash = "sha256:0dbf33f26c8d5305befd61b39d2b3414e8a407bedc2834dea9b8d642666fb40a"},
{file = "dataclasses_json-0.6.7.tar.gz", hash = "sha256:b6b3e528266ea45b9535223bc53ca645f5208833c29229e847b3f26a1cc55fc0"},
]
[package.dependencies]
marshmallow = ">=3.18.0,<4.0.0"
typing-inspect = ">=0.4.0,<1"
[[package]] [[package]]
name = "datasets" name = "datasets"
version = "3.1.0" version = "3.1.0"
@ -1220,6 +1235,23 @@ files = [
{file = "defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69"}, {file = "defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69"},
] ]
[[package]]
name = "deprecated"
version = "1.2.14"
description = "Python @deprecated decorator to deprecate old python classes, functions or methods."
optional = true
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
files = [
{file = "Deprecated-1.2.14-py2.py3-none-any.whl", hash = "sha256:6fac8b097794a90302bdbb17b9b815e732d3c4720583ff1b198499d78470466c"},
{file = "Deprecated-1.2.14.tar.gz", hash = "sha256:e5323eb936458dccc2582dc6f9c322c852a775a27065ff2b0c4970b9d53d01b3"},
]
[package.dependencies]
wrapt = ">=1.10,<2"
[package.extras]
dev = ["PyTest", "PyTest-Cov", "bump2version (<1)", "sphinx (<2)", "tox"]
[[package]] [[package]]
name = "deprecation" name = "deprecation"
version = "2.1.0" version = "2.1.0"
@ -1275,6 +1307,17 @@ files = [
graph = ["objgraph (>=1.7.2)"] graph = ["objgraph (>=1.7.2)"]
profile = ["gprof2dot (>=2022.7.29)"] profile = ["gprof2dot (>=2022.7.29)"]
[[package]]
name = "dirtyjson"
version = "1.0.8"
description = "JSON decoder for Python that can extract data from the muck"
optional = true
python-versions = "*"
files = [
{file = "dirtyjson-1.0.8-py3-none-any.whl", hash = "sha256:125e27248435a58acace26d5c2c4c11a1c0de0a9c5124c5a94ba78e517d74f53"},
{file = "dirtyjson-1.0.8.tar.gz", hash = "sha256:90ca4a18f3ff30ce849d100dcf4a003953c79d3a2348ef056f1d9c22231a25fd"},
]
[[package]] [[package]]
name = "distro" name = "distro"
version = "1.9.0" version = "1.9.0"
@ -2396,37 +2439,35 @@ files = [
[[package]] [[package]]
name = "instructor" name = "instructor"
version = "1.6.3" version = "1.5.2"
description = "structured outputs for llm" description = "structured outputs for llm"
optional = false optional = false
python-versions = "<4.0,>=3.9" python-versions = "<4.0,>=3.9"
files = [ files = [
{file = "instructor-1.6.3-py3-none-any.whl", hash = "sha256:a8f973fea621c0188009b65a3429a526c24aeb249fc24100b605ea496e92d622"}, {file = "instructor-1.5.2-py3-none-any.whl", hash = "sha256:da25abbf1ab792fb094992f1d9ce593e26fe458cb1f9a8e7ebf9d68f3f2c757a"},
{file = "instructor-1.6.3.tar.gz", hash = "sha256:399cd90e30b5bc7cbd47acd7399c9c4e84926a96c20c8b5d00c5a04b41ed41ab"}, {file = "instructor-1.5.2.tar.gz", hash = "sha256:fdd5ccbca21b4c558a24e9ba12c84afd907e65153a39d035f47c25800011a977"},
] ]
[package.dependencies] [package.dependencies]
aiohttp = ">=3.9.1,<4.0.0" aiohttp = ">=3.9.1,<4.0.0"
docstring-parser = ">=0.16,<0.17" docstring-parser = ">=0.16,<0.17"
jinja2 = ">=3.1.4,<4.0.0"
jiter = ">=0.5.0,<0.6.0" jiter = ">=0.5.0,<0.6.0"
openai = ">=1.52.0,<2.0.0" openai = ">=1.45.0,<2.0.0"
pydantic = ">=2.8.0,<3.0.0" pydantic = ">=2.8.0,<3.0.0"
pydantic-core = ">=2.18.0,<3.0.0" pydantic-core = ">=2.18.0,<3.0.0"
rich = ">=13.7.0,<14.0.0" rich = ">=13.7.0,<14.0.0"
tenacity = ">=9.0.0,<10.0.0" tenacity = ">=8.4.1,<9.0.0"
typer = ">=0.9.0,<1.0.0" typer = ">=0.9.0,<1.0.0"
[package.extras] [package.extras]
anthropic = ["anthropic (>=0.36.2,<0.37.0)", "xmltodict (>=0.13.0,<0.14.0)"] anthropic = ["anthropic (>=0.34.0,<0.35.0)", "xmltodict (>=0.13.0,<0.14.0)"]
cerebras-cloud-sdk = ["cerebras_cloud_sdk (>=1.5.0,<2.0.0)"] cerebras-cloud-sdk = ["cerebras_cloud_sdk (>=1.5.0,<2.0.0)"]
cohere = ["cohere (>=5.1.8,<6.0.0)"] cohere = ["cohere (>=5.1.8,<6.0.0)"]
fireworks-ai = ["fireworks-ai (>=0.15.4,<0.16.0)"]
google-generativeai = ["google-generativeai (>=0.8.2,<0.9.0)"] google-generativeai = ["google-generativeai (>=0.8.2,<0.9.0)"]
groq = ["groq (>=0.4.2,<0.5.0)"] groq = ["groq (>=0.4.2,<0.5.0)"]
litellm = ["litellm (>=1.35.31,<2.0.0)"] litellm = ["litellm (>=1.35.31,<2.0.0)"]
mistralai = ["mistralai (>=1.0.3,<2.0.0)"] mistralai = ["mistralai (>=1.0.3,<2.0.0)"]
test-docs = ["anthropic (>=0.36.2,<0.37.0)", "cohere (>=5.1.8,<6.0.0)", "diskcache (>=5.6.3,<6.0.0)", "fastapi (>=0.109.2,<0.110.0)", "groq (>=0.4.2,<0.5.0)", "litellm (>=1.35.31,<2.0.0)", "mistralai (>=1.0.3,<2.0.0)", "pandas (>=2.2.0,<3.0.0)", "pydantic_extra_types (>=2.6.0,<3.0.0)", "redis (>=5.0.1,<6.0.0)", "tabulate (>=0.9.0,<0.10.0)"] test-docs = ["anthropic (>=0.34.0,<0.35.0)", "cohere (>=5.1.8,<6.0.0)", "diskcache (>=5.6.3,<6.0.0)", "fastapi (>=0.109.2,<0.110.0)", "groq (>=0.4.2,<0.5.0)", "litellm (>=1.35.31,<2.0.0)", "mistralai (>=1.0.3,<2.0.0)", "pandas (>=2.2.0,<3.0.0)", "pydantic_extra_types (>=2.6.0,<3.0.0)", "redis (>=5.0.1,<6.0.0)", "tabulate (>=0.9.0,<0.10.0)"]
vertexai = ["google-cloud-aiplatform (>=1.53.0,<2.0.0)", "jsonref (>=1.1.0,<2.0.0)"] vertexai = ["google-cloud-aiplatform (>=1.53.0,<2.0.0)", "jsonref (>=1.1.0,<2.0.0)"]
[[package]] [[package]]
@ -3246,6 +3287,40 @@ tokenizers = "*"
extra-proxy = ["azure-identity (>=1.15.0,<2.0.0)", "azure-keyvault-secrets (>=4.8.0,<5.0.0)", "google-cloud-kms (>=2.21.3,<3.0.0)", "prisma (==0.11.0)", "resend (>=0.8.0,<0.9.0)"] extra-proxy = ["azure-identity (>=1.15.0,<2.0.0)", "azure-keyvault-secrets (>=4.8.0,<5.0.0)", "google-cloud-kms (>=2.21.3,<3.0.0)", "prisma (==0.11.0)", "resend (>=0.8.0,<0.9.0)"]
proxy = ["PyJWT (>=2.8.0,<3.0.0)", "apscheduler (>=3.10.4,<4.0.0)", "backoff", "cryptography (>=42.0.5,<43.0.0)", "fastapi (>=0.111.0,<0.112.0)", "fastapi-sso (>=0.10.0,<0.11.0)", "gunicorn (>=22.0.0,<23.0.0)", "orjson (>=3.9.7,<4.0.0)", "pynacl (>=1.5.0,<2.0.0)", "python-multipart (>=0.0.9,<0.0.10)", "pyyaml (>=6.0.1,<7.0.0)", "rq", "uvicorn (>=0.22.0,<0.23.0)"] proxy = ["PyJWT (>=2.8.0,<3.0.0)", "apscheduler (>=3.10.4,<4.0.0)", "backoff", "cryptography (>=42.0.5,<43.0.0)", "fastapi (>=0.111.0,<0.112.0)", "fastapi-sso (>=0.10.0,<0.11.0)", "gunicorn (>=22.0.0,<23.0.0)", "orjson (>=3.9.7,<4.0.0)", "pynacl (>=1.5.0,<2.0.0)", "python-multipart (>=0.0.9,<0.0.10)", "pyyaml (>=6.0.1,<7.0.0)", "rq", "uvicorn (>=0.22.0,<0.23.0)"]
[[package]]
name = "llama-index-core"
version = "0.11.22"
description = "Interface between LLMs and your data"
optional = true
python-versions = "<4.0,>=3.8.1"
files = [
{file = "llama_index_core-0.11.22-py3-none-any.whl", hash = "sha256:5c59d95dec9bb0727f25b03de89392c69076b2e4aaa6acbd8773de1f07502e9e"},
{file = "llama_index_core-0.11.22.tar.gz", hash = "sha256:ddc30b9c873495de40ad8278d0c894ba09f32f6aa7fc638012b1b22b74c32553"},
]
[package.dependencies]
aiohttp = ">=3.8.6,<4.0.0"
dataclasses-json = "*"
deprecated = ">=1.2.9.3"
dirtyjson = ">=1.0.8,<2.0.0"
fsspec = ">=2023.5.0"
httpx = "*"
nest-asyncio = ">=1.5.8,<2.0.0"
networkx = ">=3.0"
nltk = ">3.8.1"
numpy = "<2.0.0"
pillow = ">=9.0.0"
pydantic = ">=2.7.0,<3.0.0"
PyYAML = ">=6.0.1"
requests = ">=2.31.0"
SQLAlchemy = {version = ">=1.4.49", extras = ["asyncio"]}
tenacity = ">=8.2.0,<8.4.0 || >8.4.0,<9.0.0"
tiktoken = ">=0.3.3"
tqdm = ">=4.66.1,<5.0.0"
typing-extensions = ">=4.5.0"
typing-inspect = ">=0.8.0"
wrapt = "*"
[[package]] [[package]]
name = "makefun" name = "makefun"
version = "1.15.6" version = "1.15.6"
@ -3388,6 +3463,25 @@ files = [
{file = "markupsafe-3.0.2.tar.gz", hash = "sha256:ee55d3edf80167e48ea11a923c7386f4669df67d7994554387f84e7d8b0a2bf0"}, {file = "markupsafe-3.0.2.tar.gz", hash = "sha256:ee55d3edf80167e48ea11a923c7386f4669df67d7994554387f84e7d8b0a2bf0"},
] ]
[[package]]
name = "marshmallow"
version = "3.23.1"
description = "A lightweight library for converting complex datatypes to and from native Python datatypes."
optional = true
python-versions = ">=3.9"
files = [
{file = "marshmallow-3.23.1-py3-none-any.whl", hash = "sha256:fece2eb2c941180ea1b7fcbd4a83c51bfdd50093fdd3ad2585ee5e1df2508491"},
{file = "marshmallow-3.23.1.tar.gz", hash = "sha256:3a8dfda6edd8dcdbf216c0ede1d1e78d230a6dc9c5a088f58c4083b974a0d468"},
]
[package.dependencies]
packaging = ">=17.0"
[package.extras]
dev = ["marshmallow[tests]", "pre-commit (>=3.5,<5.0)", "tox"]
docs = ["alabaster (==1.0.0)", "autodocsumm (==0.2.14)", "sphinx (==8.1.3)", "sphinx-issues (==5.0.0)", "sphinx-version-warning (==1.1.2)"]
tests = ["pytest", "simplejson"]
[[package]] [[package]]
name = "matplotlib" name = "matplotlib"
version = "3.9.2" version = "3.9.2"
@ -4885,7 +4979,7 @@ test = ["pytest", "pytest-xdist", "setuptools"]
name = "psycopg2" name = "psycopg2"
version = "2.9.10" version = "2.9.10"
description = "psycopg2 - Python-PostgreSQL Database Adapter" description = "psycopg2 - Python-PostgreSQL Database Adapter"
optional = false optional = true
python-versions = ">=3.8" python-versions = ">=3.8"
files = [ files = [
{file = "psycopg2-2.9.10-cp310-cp310-win32.whl", hash = "sha256:5df2b672140f95adb453af93a7d669d7a7bf0a56bcd26f1502329166f4a61716"}, {file = "psycopg2-2.9.10-cp310-cp310-win32.whl", hash = "sha256:5df2b672140f95adb453af93a7d669d7a7bf0a56bcd26f1502329166f4a61716"},
@ -6551,13 +6645,13 @@ full = ["httpx (>=0.22.0)", "itsdangerous", "jinja2", "python-multipart (>=0.0.7
[[package]] [[package]]
name = "tenacity" name = "tenacity"
version = "9.0.0" version = "8.5.0"
description = "Retry code until it succeeds" description = "Retry code until it succeeds"
optional = false optional = false
python-versions = ">=3.8" python-versions = ">=3.8"
files = [ files = [
{file = "tenacity-9.0.0-py3-none-any.whl", hash = "sha256:93de0c98785b27fcf659856aa9f54bfbd399e29969b0621bc7f762bd441b4539"}, {file = "tenacity-8.5.0-py3-none-any.whl", hash = "sha256:b594c2a5945830c267ce6b79a166228323ed52718f30302c1359836112346687"},
{file = "tenacity-9.0.0.tar.gz", hash = "sha256:807f37ca97d62aa361264d497b0e31e92b8027044942bfa756160d908320d73b"}, {file = "tenacity-8.5.0.tar.gz", hash = "sha256:8bc6c0c8a09b31e6cad13c47afbed1a567518250a9a171418582ed8d9c20ca78"},
] ]
[package.extras] [package.extras]
@ -6946,6 +7040,21 @@ files = [
{file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"}, {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"},
] ]
[[package]]
name = "typing-inspect"
version = "0.9.0"
description = "Runtime inspection utilities for typing module."
optional = true
python-versions = "*"
files = [
{file = "typing_inspect-0.9.0-py3-none-any.whl", hash = "sha256:9ee6fc59062311ef8547596ab6b955e1b8aa46242d854bfc78f4f6b0eff35f9f"},
{file = "typing_inspect-0.9.0.tar.gz", hash = "sha256:b23fc42ff6f6ef6954e4852c1fb512cdd18dbea03134f91f856a95ccc9461f78"},
]
[package.dependencies]
mypy-extensions = ">=0.3.0"
typing-extensions = ">=3.7.4"
[[package]] [[package]]
name = "tzdata" name = "tzdata"
version = "2024.2" version = "2024.2"
@ -7513,6 +7622,7 @@ type = ["pytest-mypy"]
[extras] [extras]
cli = [] cli = []
filesystem = ["botocore"] filesystem = ["botocore"]
llama-index = ["llama-index-core"]
neo4j = ["neo4j"] neo4j = ["neo4j"]
notebook = [] notebook = []
postgres = ["asyncpg", "pgvector", "psycopg2"] postgres = ["asyncpg", "pgvector", "psycopg2"]
@ -7522,4 +7632,4 @@ weaviate = ["weaviate-client"]
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = ">=3.9.0,<3.12" python-versions = ">=3.9.0,<3.12"
content-hash = "57a154a7bbdd990e0fbe2313fa24c412dad98e47b9cd05e41bf378a3f597713f" content-hash = "f5874af8364839dd2a362b6b3209c4aae108f30dcc27be43d0d07f7b28160eda"

View file

@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "cognee" name = "cognee"
version = "0.1.18" version = "0.1.19"
description = "Cognee - is a library for enriching LLM context with a semantic layer for better understanding and reasoning." description = "Cognee - is a library for enriching LLM context with a semantic layer for better understanding and reasoning."
authors = ["Vasilije Markovic", "Boris Arzentar"] authors = ["Vasilije Markovic", "Boris Arzentar"]
readme = "README.md" readme = "README.md"
@ -35,7 +35,7 @@ boto3 = "^1.26.125"
botocore="^1.35.54" botocore="^1.35.54"
gunicorn = "^20.1.0" gunicorn = "^20.1.0"
sqlalchemy = "2.0.35" sqlalchemy = "2.0.35"
instructor = "1.6.3" instructor = "1.5.2"
networkx = "^3.2.1" networkx = "^3.2.1"
aiosqlite = "^0.20.0" aiosqlite = "^0.20.0"
pandas = "2.0.3" pandas = "2.0.3"
@ -45,7 +45,7 @@ dlt = {extras = ["sqlalchemy"], version = "^1.3.0"}
aiofiles = "^23.2.1" aiofiles = "^23.2.1"
qdrant-client = "^1.9.0" qdrant-client = "^1.9.0"
graphistry = "^0.33.5" graphistry = "^0.33.5"
tenacity = "^9.0.0" tenacity = "^8.4.1"
weaviate-client = "4.6.7" weaviate-client = "4.6.7"
scikit-learn = "^1.5.0" scikit-learn = "^1.5.0"
pypdf = "^4.1.0" pypdf = "^4.1.0"
@ -68,7 +68,8 @@ fastapi-users = {version = "*", extras = ["sqlalchemy"]}
alembic = "^1.13.3" alembic = "^1.13.3"
asyncpg = "^0.29.0" asyncpg = "^0.29.0"
pgvector = "^0.3.5" pgvector = "^0.3.5"
psycopg2 = "^2.9.10" psycopg2 = {version = "^2.9.10", optional = true}
llama-index-core = {version = "^0.11.22", optional = true}
[tool.poetry.extras] [tool.poetry.extras]
filesystem = ["s3fs", "botocore"] filesystem = ["s3fs", "botocore"]
@ -78,6 +79,7 @@ qdrant = ["qdrant-client"]
neo4j = ["neo4j"] neo4j = ["neo4j"]
postgres = ["psycopg2", "pgvector", "asyncpg"] postgres = ["psycopg2", "pgvector", "asyncpg"]
notebook = ["ipykernel", "overrides", "ipywidgets", "jupyterlab", "jupyterlab_widgets", "jupyterlab-server", "jupyterlab-git"] notebook = ["ipykernel", "overrides", "ipywidgets", "jupyterlab", "jupyterlab_widgets", "jupyterlab-server", "jupyterlab-git"]
llama-index = ["llama-index-core"]
[tool.poetry.group.dev.dependencies] [tool.poetry.group.dev.dependencies]