diff --git a/.data/code/example.txt b/.data/code/example.txt new file mode 100644 index 000000000..4596a08eb --- /dev/null +++ b/.data/code/example.txt @@ -0,0 +1,28 @@ +''' + Given a string, find the length of the longest substring without repeating characters. + + Examples: + + Given "abcabcbb", the answer is "abc", with a length of 3. + + Given "bbbbb", the answer is "b", with a length of 1. + + Given "pwwkew", the answer is "wke", with a length of 3. Note that the answer must be a substring; "pwke" is a subsequence, not a substring. +''' + +class Solution(object): + def lengthOfLongestSubstring(self, s): + """ + :type s: str + :rtype: int + """ + mapSet = {} + start, result = 0, 0 + + for end in range(len(s)): + if s[end] in mapSet: + start = max(mapSet[s[end]], start) + result = max(result, end-start+1) + mapSet[s[end]] = end+1 + + return result diff --git a/.data/multimedia/example.png b/.data/multimedia/example.png new file mode 100644 index 000000000..4d406cafd Binary files /dev/null and b/.data/multimedia/example.png differ diff --git a/.data/multimedia/text_to_speech.mp3 b/.data/multimedia/text_to_speech.mp3 new file mode 100644 index 000000000..e84aea505 Binary files /dev/null and b/.data/multimedia/text_to_speech.mp3 differ diff --git a/.data/short_stories/soldiers-home.pdf b/.data/short_stories/soldiers-home.pdf new file mode 100644 index 000000000..e453ca4bc Binary files /dev/null and b/.data/short_stories/soldiers-home.pdf differ diff --git a/.dlt/config.toml b/.dlt/config.toml new file mode 100644 index 000000000..c72c145b5 --- /dev/null +++ b/.dlt/config.toml @@ -0,0 +1,6 @@ +# put your configuration values here + +[runtime] +log_level = "WARNING" # the system log level of dlt +# use the dlthub_telemetry setting to enable/disable anonymous usage data reporting, see https://dlthub.com/docs/telemetry +dlthub_telemetry = false diff --git a/.dockerignore b/.dockerignore index 77a93d28a..d2d26277f 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,4 +1,4 @@ -tools/bin +bin dist docs evals diff --git a/.gitignore b/.gitignore index 1bfd41dd8..c99e3a58e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ -examples/.data +.data .env .local.env .prod.env diff --git a/deployment/Dockerfile_modal b/Dockerfile_modal similarity index 82% rename from deployment/Dockerfile_modal rename to Dockerfile_modal index 579dfd7b9..f8ca663a8 100644 --- a/deployment/Dockerfile_modal +++ b/Dockerfile_modal @@ -21,12 +21,12 @@ WORKDIR /app ENV PYTHONPATH=/app WORKDIR /app -COPY ../pyproject.toml poetry.lock /app/ +COPY pyproject.toml poetry.lock /app/ RUN pip install poetry RUN poetry install --all-extras --no-root --without dev -COPY ../cognee /app/cognee -COPY ../README.md /app/README.md +COPY cognee/ /app/cognee +COPY README.md /app/README.md diff --git a/README.md b/README.md index 32522e19c..8c1994b99 100644 --- a/README.md +++ b/README.md @@ -35,9 +35,9 @@ More on [use-cases](https://docs.cognee.ai/use-cases) and [evals](https://github

🌐 Available Languages : - 🇵🇹 Português + 🇵🇹 Português · - 🇨🇳 [中文] + 🇨🇳 [中文]
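The `.data/code/example.txt` fixture added earlier in this diff is the classic sliding-window solution: `mapSet` maps each character to the index just past its most recent occurrence, `start` marks the left edge of the window and only ever moves forward, and the answer is the widest window seen. A standalone sketch of the same technique, with the docstring's examples as assertions (the snake_case function name is ours, not part of the fixture):

```python
# Sliding-window scan: one pass over the string, O(n) time.
def length_of_longest_substring(s: str) -> int:
    last_seen = {}      # char -> index one past its most recent occurrence
    start = result = 0  # left window edge, best length so far
    for end, ch in enumerate(s):
        if ch in last_seen:
            # Jump the window past the previous occurrence (never backwards).
            start = max(last_seen[ch], start)
        result = max(result, end - start + 1)
        last_seen[ch] = end + 1
    return result

assert length_of_longest_substring("abcabcbb") == 3  # "abc"
assert length_of_longest_substring("bbbbb") == 1     # "b"
assert length_of_longest_substring("pwwkew") == 3    # "wke"
```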

diff --git a/alembic.ini b/alembic.ini new file mode 100644 index 000000000..e7cb55ee6 --- /dev/null +++ b/alembic.ini @@ -0,0 +1,117 @@ +# A generic, single database configuration. + +[alembic] +# path to migration scripts +# Use forward slashes (/) also on windows to provide an os agnostic path +script_location = alembic + +# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s +# Uncomment the line below if you want the files to be prepended with date and time +# see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file +# for all available tokens +# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s + +# sys.path path, will be prepended to sys.path if present. +# defaults to the current working directory. +prepend_sys_path = . + +# timezone to use when rendering the date within the migration file +# as well as the filename. +# If specified, requires python>=3.9 or the backports.zoneinfo library. +# Any required deps can be installed by adding `alembic[tz]` to the pip requirements +# string value is passed to ZoneInfo() +# leave blank for localtime +# timezone = + +# max length of characters to apply to the "slug" field +# truncate_slug_length = 40 + +# set to 'true' to run the environment during +# the 'revision' command, regardless of autogenerate +# revision_environment = false + +# set to 'true' to allow .pyc and .pyo files without +# a source .py file to be detected as revisions in the +# versions/ directory +# sourceless = false + +# version location specification; This defaults +# to alembic/versions. When using multiple version +# directories, initial revisions must be specified with --version-path. +# The path separator used here should be the separator specified by "version_path_separator" below. +# version_locations = %(here)s/bar:%(here)s/bat:alembic/versions + +# version path separator; As mentioned above, this is the character used to split +# version_locations. The default within new alembic.ini files is "os", which uses os.pathsep. +# If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas. +# Valid values for version_path_separator are: +# +# version_path_separator = : +# version_path_separator = ; +# version_path_separator = space +# version_path_separator = newline +version_path_separator = os # Use os.pathsep. Default configuration used for new projects. + +# set to 'true' to search source files recursively +# in each "version_locations" directory +# new in Alembic version 1.10 +# recursive_version_locations = false + +# the output encoding used when revision files +# are written from script.py.mako +# output_encoding = utf-8 + +sqlalchemy.url = %(SQLALCHEMY_DATABASE_URI)s + + +[post_write_hooks] +# post_write_hooks defines scripts or Python functions that are run +# on newly generated revision scripts. 
See the documentation for further +# detail and examples + +# format using "black" - use the console_scripts runner, against the "black" entrypoint +# hooks = black +# black.type = console_scripts +# black.entrypoint = black +# black.options = -l 79 REVISION_SCRIPT_FILENAME + +# lint with attempts to fix using "ruff" - use the exec runner, execute a binary +# hooks = ruff +# ruff.type = exec +# ruff.executable = %(here)s/.venv/bin/ruff +# ruff.options = --fix REVISION_SCRIPT_FILENAME + +# Logging configuration +[loggers] +keys = root,sqlalchemy,alembic + +[handlers] +keys = console + +[formatters] +keys = generic + +[logger_root] +level = WARN +handlers = console +qualname = + +[logger_sqlalchemy] +level = WARN +handlers = +qualname = sqlalchemy.engine + +[logger_alembic] +level = INFO +handlers = +qualname = alembic + +[handler_console] +class = StreamHandler +args = (sys.stderr,) +level = NOTSET +formatter = generic + +[formatter_generic] +format = %(levelname)-5.5s [%(name)s] %(message)s +datefmt = %H:%M:%S diff --git a/assets/cognee_benefits.png b/assets/cognee_benefits.png index db1e1cc42..d435bed05 100644 Binary files a/assets/cognee_benefits.png and b/assets/cognee_benefits.png differ diff --git a/tools/bin/dockerize b/bin/dockerize similarity index 100% rename from tools/bin/dockerize rename to bin/dockerize diff --git a/cognee-gui.py b/cognee-gui.py new file mode 100644 index 000000000..e62a08380 --- /dev/null +++ b/cognee-gui.py @@ -0,0 +1,153 @@ +import sys +import asyncio + +try: + import cognee + from PySide6.QtWidgets import ( + QApplication, + QWidget, + QPushButton, + QLineEdit, + QFileDialog, + QVBoxLayout, + QHBoxLayout, + QLabel, + QMessageBox, + QTextEdit, + QProgressDialog, + ) + from PySide6.QtCore import Qt + + from qasync import QEventLoop # Import QEventLoop from qasync +except ImportError as e: + print( + "\nPlease install Cognee with optional gui dependencies or manually install missing dependencies.\n" + ) + print("\nTo install with poetry use:") + print("\npoetry install -E gui\n") + print("\nOr to install with poetry and all dependencies use:") + print("\npoetry install --all-extras\n") + print("\nTo install with pip use: ") + print('\npip install ".[gui]"\n') + raise e + + +class FileSearchApp(QWidget): + def __init__(self): + super().__init__() + self.selected_file = None + self.init_ui() + + def init_ui(self): + # Horizontal layout for file upload and visualization buttons + button_layout = QHBoxLayout() + + # Button to open file dialog + self.file_button = QPushButton("Upload File to Cognee", parent=self) + self.file_button.clicked.connect(self.open_file_dialog) + button_layout.addWidget(self.file_button) + + # Button to visualize data + self.visualize_button = QPushButton("Visualize Data", parent=self) + self.visualize_button.clicked.connect(lambda: asyncio.ensure_future(self.visualize_data())) + button_layout.addWidget(self.visualize_button) + + # Label to display selected file path + self.file_label = QLabel("No file selected", parent=self) + + # Line edit for search input + self.search_input = QLineEdit(parent=self) + self.search_input.setPlaceholderText("Enter text to search...") + + # Button to perform search; schedule the async search on click + self.search_button = QPushButton("Cognee Search", parent=self) + self.search_button.clicked.connect(lambda: asyncio.ensure_future(self._cognee_search())) + + # Text output area for search results + self.result_output = QTextEdit(parent=self) + self.result_output.setReadOnly(True) + 
self.result_output.setPlaceholderText("Search results will appear here...") + + # Progress dialog + self.progress_dialog = QProgressDialog("Processing..", None, 0, 0, parent=self) + self.progress_dialog.setWindowModality(Qt.WindowModal) + self.progress_dialog.setCancelButton(None) # Remove the cancel button + self.progress_dialog.close() + + # Layout setup + layout = QVBoxLayout() + layout.addLayout(button_layout) + layout.addWidget(self.file_label) + layout.addWidget(self.search_input) + layout.addWidget(self.search_button) + layout.addWidget(self.result_output) + + self.setLayout(layout) + self.setWindowTitle("Cognee") + self.resize(500, 300) + + def open_file_dialog(self): + file_path, _ = QFileDialog.getOpenFileName( + self, "Select a File", "", "All Files (*.*);;Text Files (*.txt)" + ) + if file_path: + self.selected_file = file_path + self.file_label.setText(f"Selected: {file_path}") + asyncio.ensure_future(self.process_file_async()) + + async def process_file_async(self): + """Asynchronously add and process the selected file.""" + # Disable the entire window + self.progress_dialog.show() + self.setEnabled(False) + try: + await cognee.add(self.selected_file) + await cognee.cognify() + except Exception as e: + QMessageBox.critical(self, "Error", f"File processing failed: {str(e)}") + # Once finished, re-enable the window + self.setEnabled(True) + self.progress_dialog.close() + + async def _cognee_search(self): + """Performs an async search and updates the result output.""" + # Disable the entire window + self.setEnabled(False) + self.progress_dialog.show() + + try: + search_text = self.search_input.text().strip() + result = await cognee.search(query_text=search_text) + print(result) + # Assuming result is a list-like object; adjust if necessary + self.result_output.setText(result[0]) + except Exception as e: + QMessageBox.critical(self, "Error", f"Search failed: {str(e)}") + + # Once finished, re-enable the window + self.setEnabled(True) + self.progress_dialog.close() + + async def visualize_data(self): + """Async slot for handling visualize data button press.""" + import webbrowser + from cognee.api.v1.visualize.visualize import visualize_graph + import os + import pathlib + + html_file = os.path.join(pathlib.Path(__file__).parent, ".data", "graph_visualization.html") + await visualize_graph(html_file) + webbrowser.open(f"file://{html_file}") + + +if __name__ == "__main__": + app = QApplication(sys.argv) + # Create a qasync event loop and set it as the current event loop + loop = QEventLoop(app) + asyncio.set_event_loop(loop) + + window = FileSearchApp() + window.show() + + with loop: + loop.run_forever() diff --git a/assets/community/README.zh.md b/community/README.zh.md similarity index 100% rename from assets/community/README.zh.md rename to community/README.zh.md diff --git a/assets/community/cognee_benefits_zh.JPG b/community/cognee_benefits_zh.JPG similarity index 100% rename from assets/community/cognee_benefits_zh.JPG rename to community/cognee_benefits_zh.JPG diff --git a/assets/community/cognee_diagram_zh.JPG b/community/cognee_diagram_zh.JPG similarity index 100% rename from assets/community/cognee_diagram_zh.JPG rename to community/cognee_diagram_zh.JPG diff --git a/deployment/docker-compose.yml b/docker-compose.yml similarity index 95% rename from deployment/docker-compose.yml rename to docker-compose.yml index 81773eb28..91e3291b7 100644 --- a/deployment/docker-compose.yml +++ b/docker-compose.yml @@ -4,8 +4,8 @@ services: networks: - cognee-network build: - context: .. 
- dockerfile: ../Dockerfile + context: . + dockerfile: Dockerfile volumes: - ./cognee:/app/cognee - .env:/app/.env @@ -33,8 +33,8 @@ services: profiles: - ui build: - context: ../cognee-frontend - dockerfile: ../cognee-frontend/Dockerfile + context: ./cognee-frontend + dockerfile: Dockerfile volumes: - ./cognee-frontend/src:/app/src - ./cognee-frontend/public:/app/public diff --git a/deployment/entrypoint.sh b/entrypoint.sh similarity index 100% rename from deployment/entrypoint.sh rename to entrypoint.sh diff --git a/deployment/helm/Chart.yaml b/helm/Chart.yaml similarity index 100% rename from deployment/helm/Chart.yaml rename to helm/Chart.yaml diff --git a/deployment/helm/Dockerfile b/helm/Dockerfile similarity index 100% rename from deployment/helm/Dockerfile rename to helm/Dockerfile diff --git a/deployment/helm/README.md b/helm/README.md similarity index 100% rename from deployment/helm/README.md rename to helm/README.md diff --git a/deployment/helm/docker-compose-helm.yml b/helm/docker-compose-helm.yml similarity index 100% rename from deployment/helm/docker-compose-helm.yml rename to helm/docker-compose-helm.yml diff --git a/deployment/helm/templates/cognee_deployment.yaml b/helm/templates/cognee_deployment.yaml similarity index 100% rename from deployment/helm/templates/cognee_deployment.yaml rename to helm/templates/cognee_deployment.yaml diff --git a/deployment/helm/templates/cognee_service.yaml b/helm/templates/cognee_service.yaml similarity index 100% rename from deployment/helm/templates/cognee_service.yaml rename to helm/templates/cognee_service.yaml diff --git a/deployment/helm/templates/postgres_deployment.yaml b/helm/templates/postgres_deployment.yaml similarity index 100% rename from deployment/helm/templates/postgres_deployment.yaml rename to helm/templates/postgres_deployment.yaml diff --git a/deployment/helm/templates/postgres_pvc.yaml b/helm/templates/postgres_pvc.yaml similarity index 100% rename from deployment/helm/templates/postgres_pvc.yaml rename to helm/templates/postgres_pvc.yaml diff --git a/deployment/helm/templates/postgres_service.yaml b/helm/templates/postgres_service.yaml similarity index 100% rename from deployment/helm/templates/postgres_service.yaml rename to helm/templates/postgres_service.yaml diff --git a/deployment/helm/values.yaml b/helm/values.yaml similarity index 100% rename from deployment/helm/values.yaml rename to helm/values.yaml diff --git a/licenses/README.md b/licenses/README.md new file mode 100644 index 000000000..8b3d13963 --- /dev/null +++ b/licenses/README.md @@ -0,0 +1,4 @@ +# Third party licenses + +This folder contains the licenses of third-party open-source software that has been redistributed in this project. +Details of included files and modifications can be found in [NOTICE](/NOTICE.md). 
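One note on the `alembic.ini` introduced above: `sqlalchemy.url = %(SQLALCHEMY_DATABASE_URI)s` is only an interpolation placeholder, so something must define `SQLALCHEMY_DATABASE_URI` before Alembic connects. A minimal sketch of the usual `alembic/env.py` pattern, assuming the value comes from an environment variable of the same name (that name and the fallback are our assumptions, not confirmed by this diff):

```python
# alembic/env.py (fragment): resolve the %(SQLALCHEMY_DATABASE_URI)s
# placeholder from alembic.ini using an environment variable.
import os

from alembic import context

config = context.config
config.set_section_option(
    config.config_ini_section,  # the [alembic] section
    "SQLALCHEMY_DATABASE_URI",
    os.environ["SQLALCHEMY_DATABASE_URI"],  # assumed to be set by the deployment
)
```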
diff --git a/deployment/modal_deployment.py b/modal_deployment.py similarity index 97% rename from deployment/modal_deployment.py rename to modal_deployment.py index cf1cf32e3..4c2ff7d5d 100644 --- a/deployment/modal_deployment.py +++ b/modal_deployment.py @@ -12,8 +12,8 @@ app = modal.App("cognee-runner") image = ( modal.Image.from_dockerfile(path="Dockerfile_modal", force_build=False) - .copy_local_file("../pyproject.toml", "pyproject.toml") - .copy_local_file("../poetry.lock", "poetry.lock") + .copy_local_file("pyproject.toml", "pyproject.toml") + .copy_local_file("poetry.lock", "poetry.lock") .env({"ENV": os.getenv("ENV"), "LLM_API_KEY": os.getenv("LLM_API_KEY")}) .poetry_install_from_file(poetry_pyproject_toml="pyproject.toml") .pip_install("protobuf", "h2") diff --git a/profiling/graph_pydantic_conversion/benchmark_function.py b/profiling/graph_pydantic_conversion/benchmark_function.py new file mode 100644 index 000000000..a4f5c839b --- /dev/null +++ b/profiling/graph_pydantic_conversion/benchmark_function.py @@ -0,0 +1,65 @@ +import statistics +import time +import tracemalloc +from typing import Any, Callable, Dict + +import psutil + + +def benchmark_function(func: Callable, *args, num_runs: int = 5) -> Dict[str, Any]: + """ + Benchmark a function for memory usage and computational performance. + + Args: + func: Function to benchmark + *args: Arguments to pass to the function + num_runs: Number of times to run the benchmark + + Returns: + Dictionary containing benchmark metrics + """ + execution_times = [] + peak_memory_usages = [] + cpu_percentages = [] + + process = psutil.Process() + + for _ in range(num_runs): + # Start memory tracking + tracemalloc.start() + + # Measure execution time and CPU usage + start_time = time.perf_counter() + start_cpu_time = process.cpu_times() + + # Run the function under test + func(*args) + + end_cpu_time = process.cpu_times() + end_time = time.perf_counter() + + # Calculate metrics + execution_time = end_time - start_time + cpu_time = (end_cpu_time.user + end_cpu_time.system) - ( + start_cpu_time.user + start_cpu_time.system + ) + current, peak = tracemalloc.get_traced_memory() + + # Store results + execution_times.append(execution_time) + peak_memory_usages.append(peak / 1024 / 1024) # Convert to MB + cpu_percentages.append((cpu_time / execution_time) * 100) + + tracemalloc.stop() + + analysis = { + "mean_execution_time": statistics.mean(execution_times), + "mean_peak_memory_mb": statistics.mean(peak_memory_usages), + "mean_cpu_percent": statistics.mean(cpu_percentages), + "num_runs": num_runs, + } + + if num_runs > 1: + analysis["std_execution_time"] = statistics.stdev(execution_times) + + return analysis diff --git a/profiling/graph_pydantic_conversion/profile_graph_pydantic_conversion.py b/profiling/graph_pydantic_conversion/profile_graph_pydantic_conversion.py new file mode 100644 index 000000000..c1c0b6756 --- /dev/null +++ b/profiling/graph_pydantic_conversion/profile_graph_pydantic_conversion.py @@ -0,0 +1,63 @@ +import argparse +import asyncio + +from .benchmark_function import benchmark_function + +from cognee.modules.graph.utils import get_graph_from_model +from cognee.tests.unit.interfaces.graph.util import ( + PERSON_NAMES, + create_organization_recursive, +) + +# Example usage: +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Benchmark graph model with configurable recursive depth" + ) + parser.add_argument( + "--recursive-depth", + type=int, + default=3, + help="Recursive depth for graph generation (default: 3)", + ) + parser.add_argument("--runs", 
type=int, default=5, help="Number of benchmark runs (default: 5)") + args = parser.parse_args() + + society = create_organization_recursive( + "society", "Society", PERSON_NAMES, args.recursive_depth + ) + added_nodes = {} + added_edges = {} + visited_properties = {} + nodes, edges = asyncio.run( + get_graph_from_model( + society, + added_nodes=added_nodes, + added_edges=added_edges, + visited_properties=visited_properties, + ) + ) + + def get_graph_from_model_sync(model): + added_nodes = {} + added_edges = {} + visited_properties = {} + + return asyncio.run( + get_graph_from_model( + model, + added_nodes=added_nodes, + added_edges=added_edges, + visited_properties=visited_properties, + ) + ) + + results = benchmark_function(get_graph_from_model_sync, society, num_runs=args.runs) + print("\nBenchmark Results:") + print(f"N nodes: {len(nodes)}, N edges: {len(edges)}, Recursion depth: {args.recursive_depth}") + print(f"Mean Peak Memory: {results['mean_peak_memory_mb']:.2f} MB") + print(f"Mean CPU Usage: {results['mean_cpu_percent']:.2f}%") + print(f"Mean Execution Time: {results['mean_execution_time']:.4f} seconds") + + if "std_execution_time" in results: + print(f"Execution Time Std: {results['std_execution_time']:.4f} seconds") diff --git a/profiling/util/DummyEmbeddingEngine.py b/profiling/util/DummyEmbeddingEngine.py new file mode 100644 index 000000000..0ba742182 --- /dev/null +++ b/profiling/util/DummyEmbeddingEngine.py @@ -0,0 +1,10 @@ +import numpy as np +from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine + + +class DummyEmbeddingEngine(EmbeddingEngine): + async def embed_text(self, text: list[str]) -> list[list[float]]: + return [np.random.randn(3072).tolist() for _ in text] + + def get_vector_size(self) -> int: + return 3072 diff --git a/profiling/util/DummyLLMAdapter.py b/profiling/util/DummyLLMAdapter.py new file mode 100644 index 000000000..b28261665 --- /dev/null +++ b/profiling/util/DummyLLMAdapter.py @@ -0,0 +1,59 @@ +from typing import Type +from uuid import uuid4 + +import spacy +import textacy +from pydantic import BaseModel + +from cognee.infrastructure.llm.llm_interface import LLMInterface +from cognee.shared.data_models import Edge, KnowledgeGraph, Node, SummarizedContent + + +class DummyLLMAdapter(LLMInterface): + nlp = spacy.load("en_core_web_sm") + + async def acreate_structured_output( + self, text_input: str, system_prompt: str, response_model: Type[BaseModel] + ) -> BaseModel: + if str(response_model) == "<class 'cognee.shared.data_models.SummarizedContent'>": + return dummy_summarize_content(text_input) + elif str(response_model) == "<class 'cognee.shared.data_models.KnowledgeGraph'>": + return dummy_extract_knowledge_graph(text_input, self.nlp) + else: + raise Exception( + "Currently dummy acreate_structured_output is only implemented for SummarizedContent and KnowledgeGraph" + ) + + +def dummy_extract_knowledge_graph(text, nlp): + doc = nlp(text) + triples = list(textacy.extract.subject_verb_object_triples(doc)) + + nodes = {} + edges = [] + for triple in triples: + source = "_".join([str(e) for e in triple.subject]) + target = "_".join([str(e) for e in triple.object]) + nodes[source] = nodes.get( + source, Node(id=str(uuid4()), name=source, type="object", description="") + ) + nodes[target] = nodes.get( + target, Node(id=str(uuid4()), name=target, type="object", description="") + ) + edge_type = "_".join([str(e) for e in triple.verb]) + edges.append( + Edge( + source_node_id=nodes[source].id, + target_node_id=nodes[target].id, + relationship_name=edge_type, + ) + ) + return KnowledgeGraph(nodes=list(nodes.values()), edges=edges) + + 
+def dummy_summarize_content(text): + words = [(word, len(word)) for word in set(text.split(" "))] + words = sorted(words, key=lambda x: x[1], reverse=True) + summary = " ".join([word for word, _ in words[:50]]) + description = " ".join([word for word, _ in words[:10]]) + return SummarizedContent(summary=summary, description=description)
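Taken together, the profiling additions compose like this: `benchmark_function` repeatedly runs any callable and aggregates wall-clock time, peak traced memory, and CPU utilization, while the dummy embedding/LLM adapters stand in for real backends so the graph-conversion code can be profiled in isolation. A hypothetical driver for `benchmark_function` (the `workload` function is a stand-in, and the import assumes the `profiling` directories are importable as packages from the repo root):

```python
# Hypothetical benchmark driver; workload() is illustrative, not repo code.
from profiling.graph_pydantic_conversion.benchmark_function import benchmark_function


def workload(n: int) -> int:
    # CPU-bound on purpose, so the CPU-percentage metric is meaningful.
    return sum(i * i for i in range(n))


results = benchmark_function(workload, 1_000_000, num_runs=3)
print(f"Mean Execution Time: {results['mean_execution_time']:.4f} seconds")
print(f"Mean Peak Memory: {results['mean_peak_memory_mb']:.2f} MB")
print(f"Mean CPU Usage: {results['mean_cpu_percent']:.2f}%")
```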