Revert "Clean up core cognee repo"

This reverts commit c058219e42.
vasilije 2025-05-15 10:46:01 +02:00
parent c058219e42
commit 729cb9b829
34 changed files with 515 additions and 13 deletions

28
.data/code/example.txt Normal file

@@ -0,0 +1,28 @@
'''
Given a string, find the length of the longest substring without repeating characters.
Examples:
Given "abcabcbb", the answer is "abc", with a length of 3.
Given "bbbbb", the answer is "b", with a length of 1.
Given "pwwkew", the answer is "wke", with a length of 3. Note that the answer must be a substring; "pwke" is a subsequence, not a substring.
'''
class Solution(object):
    def lengthOfLongestSubstring(self, s):
        """
        :type s: str
        :rtype: int
        """
        mapSet = {}
        start, result = 0, 0
        for end in range(len(s)):
            if s[end] in mapSet:
                start = max(mapSet[s[end]], start)
            result = max(result, end-start+1)
            mapSet[s[end]] = end+1
        return result
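
As a quick sanity check (illustrative only, not part of the committed file), the sliding-window solution above produces the lengths stated in the docstring:

solver = Solution()
assert solver.lengthOfLongestSubstring("abcabcbb") == 3
assert solver.lengthOfLongestSubstring("bbbbb") == 1
assert solver.lengthOfLongestSubstring("pwwkew") == 3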

Binary file not shown (image, 10 KiB after).
Binary file not shown.
Binary file not shown.

6
.dlt/config.toml Normal file

@@ -0,0 +1,6 @@
# put your configuration values here
[runtime]
log_level = "WARNING" # the system log level of dlt
# use the dlthub_telemetry setting to enable/disable anonymous usage data reporting, see https://dlthub.com/docs/telemetry
dlthub_telemetry = false
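
For context, a minimal sketch (all names are illustrative, not from this commit) of a pipeline run that would pick up this .dlt/config.toml from the working directory: with log_level = "WARNING" dlt only emits warnings and errors, and dlthub_telemetry = false disables anonymous usage reporting.

import dlt

# Hypothetical pipeline; dlt reads .dlt/config.toml relative to the working directory.
pipeline = dlt.pipeline(
    pipeline_name="example_pipeline",
    destination="duckdb",
    dataset_name="example_data",
)
load_info = pipeline.run([{"id": 1, "value": "hello"}], table_name="items")
print(load_info)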


@@ -1,4 +1,4 @@
-tools/bin
+bin
dist
docs
evals

2
.gitignore vendored

@@ -1,4 +1,4 @@
-examples/.data
+.data
.env
.local.env
.prod.env


@@ -21,12 +21,12 @@ WORKDIR /app
ENV PYTHONPATH=/app
WORKDIR /app
-COPY ../pyproject.toml poetry.lock /app/
+COPY pyproject.toml poetry.lock /app/
RUN pip install poetry
RUN poetry install --all-extras --no-root --without dev
-COPY ../cognee /app/cognee
+COPY cognee/ /app/cognee
-COPY ../README.md /app/README.md
+COPY README.md /app/README.md


@@ -35,9 +35,9 @@ More on [use-cases](https://docs.cognee.ai/use-cases) and [evals](https://github
<p align="center">
🌐 Available Languages
:
-<a href="assets/community/README.pt.md">🇵🇹 Português</a>
+<a href="community/README.pt.md">🇵🇹 Português</a>
·
-<a href="assets/community/README.zh.md">🇨🇳 [中文]</a>
+<a href="community/README.zh.md">🇨🇳 [中文]</a>
</p>
<div style="text-align: center">

117
alembic.ini Normal file

@@ -0,0 +1,117 @@
# A generic, single database configuration.
[alembic]
# path to migration scripts
# Use forward slashes (/) also on windows to provide an os agnostic path
script_location = alembic
# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s
# Uncomment the line below if you want the files to be prepended with date and time
# see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file
# for all available tokens
# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s
# sys.path path, will be prepended to sys.path if present.
# defaults to the current working directory.
prepend_sys_path = .
# timezone to use when rendering the date within the migration file
# as well as the filename.
# If specified, requires the python>=3.9 or backports.zoneinfo library.
# Any required deps can be installed by adding `alembic[tz]` to the pip requirements
# string value is passed to ZoneInfo()
# leave blank for localtime
# timezone =
# max length of characters to apply to the "slug" field
# truncate_slug_length = 40
# set to 'true' to run the environment during
# the 'revision' command, regardless of autogenerate
# revision_environment = false
# set to 'true' to allow .pyc and .pyo files without
# a source .py file to be detected as revisions in the
# versions/ directory
# sourceless = false
# version location specification; This defaults
# to alembic/versions. When using multiple version
# directories, initial revisions must be specified with --version-path.
# The path separator used here should be the separator specified by "version_path_separator" below.
# version_locations = %(here)s/bar:%(here)s/bat:alembic/versions
# version path separator; As mentioned above, this is the character used to split
# version_locations. The default within new alembic.ini files is "os", which uses os.pathsep.
# If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas.
# Valid values for version_path_separator are:
#
# version_path_separator = :
# version_path_separator = ;
# version_path_separator = space
# version_path_separator = newline
version_path_separator = os # Use os.pathsep. Default configuration used for new projects.
# set to 'true' to search source files recursively
# in each "version_locations" directory
# new in Alembic version 1.10
# recursive_version_locations = false
# the output encoding used when revision files
# are written from script.py.mako
# output_encoding = utf-8
sqlalchemy.url = %(SQLALCHEMY_DATABASE_URI)s
[post_write_hooks]
# post_write_hooks defines scripts or Python functions that are run
# on newly generated revision scripts. See the documentation for further
# detail and examples
# format using "black" - use the console_scripts runner, against the "black" entrypoint
# hooks = black
# black.type = console_scripts
# black.entrypoint = black
# black.options = -l 79 REVISION_SCRIPT_FILENAME
# lint with attempts to fix using "ruff" - use the exec runner, execute a binary
# hooks = ruff
# ruff.type = exec
# ruff.executable = %(here)s/.venv/bin/ruff
# ruff.options = --fix REVISION_SCRIPT_FILENAME
# Logging configuration
[loggers]
keys = root,sqlalchemy,alembic
[handlers]
keys = console
[formatters]
keys = generic
[logger_root]
level = WARN
handlers = console
qualname =
[logger_sqlalchemy]
level = WARN
handlers =
qualname = sqlalchemy.engine
[logger_alembic]
level = INFO
handlers =
qualname = alembic
[handler_console]
class = StreamHandler
args = (sys.stderr,)
level = NOTSET
formatter = generic
[formatter_generic]
format = %(levelname)-5.5s [%(name)s] %(message)s
datefmt = %H:%M:%S
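
The sqlalchemy.url entry above is left as an %(SQLALCHEMY_DATABASE_URI)s placeholder. A minimal sketch of how that placeholder is typically filled in from the environment inside alembic/env.py (the variable name and fallback URL here are assumptions, not taken from this commit):

import os

from alembic import context

config = context.config
# Interpolate the placeholder used in alembic.ini from the environment.
config.set_section_option(
    config.config_ini_section,
    "SQLALCHEMY_DATABASE_URI",
    os.environ.get("SQLALCHEMY_DATABASE_URI", "sqlite:///cognee.db"),
)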

Binary file not shown (image: 2.3 MiB before, 353 KiB after).

153
cognee-gui.py Normal file

@@ -0,0 +1,153 @@
import sys
import asyncio

try:
    import cognee
    from PySide6.QtWidgets import (
        QApplication,
        QWidget,
        QPushButton,
        QLineEdit,
        QFileDialog,
        QVBoxLayout,
        QHBoxLayout,
        QLabel,
        QMessageBox,
        QTextEdit,
        QProgressDialog,
    )
    from PySide6.QtCore import Qt
    from qasync import QEventLoop  # Import QEventLoop from qasync
except ImportError as e:
    print(
        "\nPlease install Cognee with optional gui dependencies or manually install missing dependencies.\n"
    )
    print("\nTo install with poetry use:")
    print("\npoetry install -E gui\n")
    print("\nOr to install with poetry and all dependencies use:")
    print("\npoetry install --all-extras\n")
    print("\nTo install with pip use: ")
    print('\npip install ".[gui]"\n')
    raise e


class FileSearchApp(QWidget):
    def __init__(self):
        super().__init__()
        self.selected_file = None
        self.init_ui()

    def init_ui(self):
        # Horizontal layout for file upload and visualization buttons
        button_layout = QHBoxLayout()

        # Button to open file dialog
        self.file_button = QPushButton("Upload File to Cognee", parent=self)
        self.file_button.clicked.connect(self.open_file_dialog)
        button_layout.addWidget(self.file_button)

        # Button to visualize data
        self.visualize_button = QPushButton("Visualize Data", parent=self)
        self.visualize_button.clicked.connect(lambda: asyncio.ensure_future(self.visualize_data()))
        button_layout.addWidget(self.visualize_button)

        # Label to display selected file path
        self.file_label = QLabel("No file selected", parent=self)

        # Line edit for search input
        self.search_input = QLineEdit(parent=self)
        self.search_input.setPlaceholderText("Enter text to search...")

        # Button to perform search; schedule the async search on click
        self.search_button = QPushButton("Cognee Search", parent=self)
        self.search_button.clicked.connect(lambda: asyncio.ensure_future(self._cognee_search()))

        # Text output area for search results
        self.result_output = QTextEdit(parent=self)
        self.result_output.setReadOnly(True)
        self.result_output.setPlaceholderText("Search results will appear here...")

        # Progress dialog
        self.progress_dialog = QProgressDialog("Processing..", None, 0, 0, parent=self)
        self.progress_dialog.setWindowModality(Qt.WindowModal)
        self.progress_dialog.setCancelButton(None)  # Remove the cancel button
        self.progress_dialog.close()

        # Layout setup
        layout = QVBoxLayout()
        layout.addLayout(button_layout)
        layout.addWidget(self.file_label)
        layout.addWidget(self.search_input)
        layout.addWidget(self.search_button)
        layout.addWidget(self.result_output)

        self.setLayout(layout)
        self.setWindowTitle("Cognee")
        self.resize(500, 300)

    def open_file_dialog(self):
        file_path, _ = QFileDialog.getOpenFileName(
            self, "Select a File", "", "All Files (*.*);;Text Files (*.txt)"
        )
        if file_path:
            self.selected_file = file_path
            self.file_label.setText(f"Selected: {file_path}")
            asyncio.ensure_future(self.process_file_async())

    async def process_file_async(self):
        """Asynchronously add and process the selected file."""
        # Disable the entire window
        self.progress_dialog.show()
        self.setEnabled(False)
        try:
            await cognee.add(self.selected_file)
            await cognee.cognify()
        except Exception as e:
            QMessageBox.critical(self, "Error", f"File processing failed: {str(e)}")
        # Once finished, re-enable the window
        self.setEnabled(True)
        self.progress_dialog.close()

    async def _cognee_search(self):
        """Performs an async search and updates the result output."""
        # Disable the entire window
        self.setEnabled(False)
        self.progress_dialog.show()
        try:
            search_text = self.search_input.text().strip()
            result = await cognee.search(query_text=search_text)
            print(result)
            # Assuming result is a list-like object; adjust if necessary
            self.result_output.setText(result[0])
        except Exception as e:
            QMessageBox.critical(self, "Error", f"Search failed: {str(e)}")
        # Once finished, re-enable the window
        self.setEnabled(True)
        self.progress_dialog.close()

    async def visualize_data(self):
        """Async slot for handling visualize data button press."""
        import webbrowser
        from cognee.api.v1.visualize.visualize import visualize_graph
        import os
        import pathlib

        html_file = os.path.join(pathlib.Path(__file__).parent, ".data", "graph_visualization.html")
        await visualize_graph(html_file)
        webbrowser.open(f"file://{html_file}")


if __name__ == "__main__":
    app = QApplication(sys.argv)

    # Create a qasync event loop and set it as the current event loop
    loop = QEventLoop(app)
    asyncio.set_event_loop(loop)

    window = FileSearchApp()
    window.show()

    with loop:
        loop.run_forever()
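
The same add → cognify → search flow the GUI drives, as a headless sketch (the file path and query are illustrative; assumes cognee is installed and configured):

import asyncio

import cognee


async def main():
    await cognee.add("/path/to/document.txt")  # hypothetical input file
    await cognee.cognify()
    results = await cognee.search(query_text="What is this document about?")
    print(results[0] if results else "No results")


asyncio.run(main())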

Binary image file (262 KiB before and after).
Binary image file (181 KiB before and after).


@@ -4,8 +4,8 @@ services:
networks:
- cognee-network
build:
-context: ..
+context: .
-dockerfile: ../Dockerfile
+dockerfile: Dockerfile
volumes:
- ./cognee:/app/cognee
- .env:/app/.env
@@ -33,8 +33,8 @@ services:
profiles:
- ui
build:
-context: ../cognee-frontend
+context: ./cognee-frontend
-dockerfile: ../cognee-frontend/Dockerfile
+dockerfile: Dockerfile
volumes:
- ./cognee-frontend/src:/app/src
- ./cognee-frontend/public:/app/public

4
licenses/README.md Normal file

@@ -0,0 +1,4 @@
# Third party licenses
This folder contains the licenses of third-party open-source software that has been redistributed in this project.
Details of included files and modifications can be found in [NOTICE](/NOTICE.md).


@@ -12,8 +12,8 @@ app = modal.App("cognee-runner")
image = (
modal.Image.from_dockerfile(path="Dockerfile_modal", force_build=False)
-.copy_local_file("../pyproject.toml", "pyproject.toml")
+.copy_local_file("pyproject.toml", "pyproject.toml")
-.copy_local_file("../poetry.lock", "poetry.lock")
+.copy_local_file("poetry.lock", "poetry.lock")
.env({"ENV": os.getenv("ENV"), "LLM_API_KEY": os.getenv("LLM_API_KEY")})
.poetry_install_from_file(poetry_pyproject_toml="pyproject.toml")
.pip_install("protobuf", "h2")


@@ -0,0 +1,62 @@
import statistics
import time
import tracemalloc
from typing import Any, Callable, Dict

import psutil


def benchmark_function(func: Callable, *args, num_runs: int = 5) -> Dict[str, Any]:
    """
    Benchmark a function for memory usage and computational performance.

    Args:
        func: Function to benchmark
        *args: Arguments to pass to the function
        num_runs: Number of times to run the benchmark

    Returns:
        Dictionary containing benchmark metrics
    """
    execution_times = []
    peak_memory_usages = []
    cpu_percentages = []

    process = psutil.Process()

    for _ in range(num_runs):
        # Start memory tracking
        tracemalloc.start()

        # Measure execution time and CPU usage
        start_time = time.perf_counter()
        start_cpu_time = process.cpu_times()

        # Run the function being benchmarked
        func(*args)

        end_cpu_time = process.cpu_times()
        end_time = time.perf_counter()

        # Calculate metrics
        execution_time = end_time - start_time
        cpu_time = (end_cpu_time.user + end_cpu_time.system) - (
            start_cpu_time.user + start_cpu_time.system
        )
        current, peak = tracemalloc.get_traced_memory()

        # Store results
        execution_times.append(execution_time)
        peak_memory_usages.append(peak / 1024 / 1024)  # Convert to MB
        cpu_percentages.append((cpu_time / execution_time) * 100)

        tracemalloc.stop()

    analysis = {
        "mean_execution_time": statistics.mean(execution_times),
        "mean_peak_memory_mb": statistics.mean(peak_memory_usages),
        "mean_cpu_percent": statistics.mean(cpu_percentages),
        "num_runs": num_runs,
    }

    if num_runs > 1:
        analysis["std_execution_time"] = statistics.stdev(execution_times)

    return analysis
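
An illustrative call to benchmark_function on a toy workload (the workload is made up; only the signature comes from the file above):

def build_squares(n):
    return [i * i for i in range(n)]


stats = benchmark_function(build_squares, 1_000_000, num_runs=3)
print(f"mean time: {stats['mean_execution_time']:.4f} s")
print(f"mean peak memory: {stats['mean_peak_memory_mb']:.2f} MB")
print(f"mean cpu: {stats['mean_cpu_percent']:.1f}%")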


@@ -0,0 +1,63 @@
import argparse
import asyncio

from .benchmark_function import benchmark_function
from cognee.modules.graph.utils import get_graph_from_model
from cognee.tests.unit.interfaces.graph.util import (
    PERSON_NAMES,
    create_organization_recursive,
)

# Example usage:
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Benchmark graph model with configurable recursive depth"
    )
    parser.add_argument(
        "--recursive-depth",
        type=int,
        default=3,
        help="Recursive depth for graph generation (default: 3)",
    )
    parser.add_argument("--runs", type=int, default=5, help="Number of benchmark runs (default: 5)")
    args = parser.parse_args()

    society = create_organization_recursive(
        "society", "Society", PERSON_NAMES, args.recursive_depth
    )

    added_nodes = {}
    added_edges = {}
    visited_properties = {}

    nodes, edges = asyncio.run(
        get_graph_from_model(
            society,
            added_nodes=added_nodes,
            added_edges=added_edges,
            visited_properties=visited_properties,
        )
    )

    def get_graph_from_model_sync(model):
        added_nodes = {}
        added_edges = {}
        visited_properties = {}
        return asyncio.run(
            get_graph_from_model(
                model,
                added_nodes=added_nodes,
                added_edges=added_edges,
                visited_properties=visited_properties,
            )
        )

    results = benchmark_function(get_graph_from_model_sync, society, num_runs=args.runs)

    print("\nBenchmark Results:")
    print(f"N nodes: {len(nodes)}, N edges: {len(edges)}, Recursion depth: {args.recursive_depth}")
    print(f"Mean Peak Memory: {results['mean_peak_memory_mb']:.2f} MB")
    print(f"Mean CPU Usage: {results['mean_cpu_percent']:.2f}%")
    print(f"Mean Execution Time: {results['mean_execution_time']:.4f} seconds")

    if "std_execution_time" in results:
        print(f"Execution Time Std: {results['std_execution_time']:.4f} seconds")


@@ -0,0 +1,10 @@
import numpy as np

from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine


class DummyEmbeddingEngine(EmbeddingEngine):
    async def embed_text(self, text: list[str]) -> list[list[float]]:
        # Return one random 3072-dimensional vector per input string
        return [list(np.random.randn(3072)) for _ in text]

    def get_vector_size(self) -> int:
        return 3072
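
A quick illustrative check of the dummy engine (assumes numpy is installed, that the class can be instantiated directly, and that embed_text returns one vector per input string, as its type hint indicates):

import asyncio

engine = DummyEmbeddingEngine()
vectors = asyncio.run(engine.embed_text(["first sentence", "second sentence"]))
assert len(vectors) == 2
assert len(vectors[0]) == engine.get_vector_size()  # 3072 dimensions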


@@ -0,0 +1,59 @@
from typing import Type
from uuid import uuid4

import spacy
import textacy
from pydantic import BaseModel

from cognee.infrastructure.llm.llm_interface import LLMInterface
from cognee.shared.data_models import Edge, KnowledgeGraph, Node, SummarizedContent


class DummyLLMAdapter(LLMInterface):
    nlp = spacy.load("en_core_web_sm")

    async def acreate_structured_output(
        self, text_input: str, system_prompt: str, response_model: Type[BaseModel]
    ) -> BaseModel:
        if str(response_model) == "<class 'cognee.shared.data_models.SummarizedContent'>":
            return dummy_summarize_content(text_input)
        elif str(response_model) == "<class 'cognee.shared.data_models.KnowledgeGraph'>":
            return dummy_extract_knowledge_graph(text_input, self.nlp)
        else:
            raise Exception(
                "Currently dummy acreate_structured_input is only implemented for SummarizedContent and KnowledgeGraph"
            )


def dummy_extract_knowledge_graph(text, nlp):
    doc = nlp(text)
    triples = list(textacy.extract.subject_verb_object_triples(doc))

    nodes = {}
    edges = []

    for triple in triples:
        source = "_".join([str(e) for e in triple.subject])
        target = "_".join([str(e) for e in triple.object])

        nodes[source] = nodes.get(
            source, Node(id=str(uuid4()), name=source, type="object", description="")
        )
        nodes[target] = nodes.get(
            target, Node(id=str(uuid4()), name=target, type="object", description="")
        )

        edge_type = "_".join([str(e) for e in triple.verb])
        edges.append(
            Edge(
                source_node_id=nodes[source].id,
                target_node_id=nodes[target].id,
                relationship_name=edge_type,
            )
        )

    return KnowledgeGraph(nodes=list(nodes.values()), edges=edges)


def dummy_summarize_content(text):
    words = [(word, len(word)) for word in set(text.split(" "))]
    words = sorted(words, key=lambda x: x[1], reverse=True)

    summary = " ".join([word for word, _ in words[:50]])
    description = " ".join([word for word, _ in words[:10]])

    return SummarizedContent(summary=summary, description=description)
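
An illustrative call to the heuristic summarizer above (the sample text is made up; no spaCy model or LLM is needed for this function):

sample = "Cognee builds knowledge graphs and summaries from documents using modular pipelines"
summarized = dummy_summarize_content(sample)
print(summarized.summary)      # up to 50 longest unique words, joined
print(summarized.description)  # up to 10 longest unique words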