Revert "Clean up core cognee repo"

This reverts commit c058219e42.
vasilije 2025-05-15 10:46:01 +02:00
parent c058219e42
commit 729cb9b829
34 changed files with 515 additions and 13 deletions

28
.data/code/example.txt Normal file

@@ -0,0 +1,28 @@
'''
Given a string, find the length of the longest substring without repeating characters.
Examples:
Given "abcabcbb", the answer is "abc", with a length of 3.
Given "bbbbb", the answer is "b", with a length of 1.
Given "pwwkew", the answer is "wke", with a length of 3. Note that the answer must be a substring; "pwke" is a subsequence, not a substring.
'''
class Solution(object):
    def lengthOfLongestSubstring(self, s):
        """
        :type s: str
        :rtype: int
        """
        mapSet = {}
        start, result = 0, 0
        for end in range(len(s)):
            if s[end] in mapSet:
                start = max(mapSet[s[end]], start)
            result = max(result, end-start+1)
            mapSet[s[end]] = end+1
        return result
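
As a quick sanity check (illustrative only, not part of the committed file), the sliding-window solution above produces the lengths stated in the docstring:

solver = Solution()
assert solver.lengthOfLongestSubstring("abcabcbb") == 3
assert solver.lengthOfLongestSubstring("bbbbb") == 1
assert solver.lengthOfLongestSubstring("pwwkew") == 3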

Binary file not shown (image, 10 KiB after).
Binary file not shown.
Binary file not shown.

6
.dlt/config.toml Normal file

@@ -0,0 +1,6 @@
# put your configuration values here
[runtime]
log_level = "WARNING" # the system log level of dlt
# use the dlthub_telemetry setting to enable/disable anonymous usage data reporting, see https://dlthub.com/docs/telemetry
dlthub_telemetry = false
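
For context, a minimal sketch (all names are illustrative, not from this commit) of a pipeline run that would pick up this .dlt/config.toml from the working directory: with log_level = "WARNING" dlt only emits warnings and errors, and dlthub_telemetry = false disables anonymous usage reporting.

import dlt

# Hypothetical pipeline; dlt reads .dlt/config.toml relative to the working directory.
pipeline = dlt.pipeline(
    pipeline_name="example_pipeline",
    destination="duckdb",
    dataset_name="example_data",
)
load_info = pipeline.run([{"id": 1, "value": "hello"}], table_name="items")
print(load_info)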


@@ -1,4 +1,4 @@
-tools/bin
+bin
dist
docs
evals

2
.gitignore vendored

@@ -1,4 +1,4 @@
-examples/.data
+.data
.env
.local.env
.prod.env


@@ -21,12 +21,12 @@ WORKDIR /app
ENV PYTHONPATH=/app
WORKDIR /app
-COPY ../pyproject.toml poetry.lock /app/
+COPY pyproject.toml poetry.lock /app/
RUN pip install poetry
RUN poetry install --all-extras --no-root --without dev
-COPY ../cognee /app/cognee
+COPY cognee/ /app/cognee
-COPY ../README.md /app/README.md
+COPY README.md /app/README.md


@@ -35,9 +35,9 @@ More on [use-cases](https://docs.cognee.ai/use-cases) and [evals](https://github
<p align="center">
🌐 Available Languages
:
-<a href="assets/community/README.pt.md">🇵🇹 Português</a>
+<a href="community/README.pt.md">🇵🇹 Português</a>
·
-<a href="assets/community/README.zh.md">🇨🇳 [中文]</a>
+<a href="community/README.zh.md">🇨🇳 [中文]</a>
</p>
<div style="text-align: center">

117
alembic.ini Normal file

@@ -0,0 +1,117 @@
# A generic, single database configuration.
[alembic]
# path to migration scripts
# Use forward slashes (/) also on windows to provide an os agnostic path
script_location = alembic
# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s
# Uncomment the line below if you want the files to be prepended with date and time
# see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file
# for all available tokens
# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s
# sys.path path, will be prepended to sys.path if present.
# defaults to the current working directory.
prepend_sys_path = .
# timezone to use when rendering the date within the migration file
# as well as the filename.
# If specified, requires the python>=3.9 or backports.zoneinfo library.
# Any required deps can be installed by adding `alembic[tz]` to the pip requirements
# string value is passed to ZoneInfo()
# leave blank for localtime
# timezone =
# max length of characters to apply to the "slug" field
# truncate_slug_length = 40
# set to 'true' to run the environment during
# the 'revision' command, regardless of autogenerate
# revision_environment = false
# set to 'true' to allow .pyc and .pyo files without
# a source .py file to be detected as revisions in the
# versions/ directory
# sourceless = false
# version location specification; This defaults
# to alembic/versions. When using multiple version
# directories, initial revisions must be specified with --version-path.
# The path separator used here should be the separator specified by "version_path_separator" below.
# version_locations = %(here)s/bar:%(here)s/bat:alembic/versions
# version path separator; As mentioned above, this is the character used to split
# version_locations. The default within new alembic.ini files is "os", which uses os.pathsep.
# If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas.
# Valid values for version_path_separator are:
#
# version_path_separator = :
# version_path_separator = ;
# version_path_separator = space
# version_path_separator = newline
version_path_separator = os # Use os.pathsep. Default configuration used for new projects.
# set to 'true' to search source files recursively
# in each "version_locations" directory
# new in Alembic version 1.10
# recursive_version_locations = false
# the output encoding used when revision files
# are written from script.py.mako
# output_encoding = utf-8
sqlalchemy.url = %(SQLALCHEMY_DATABASE_URI)s
[post_write_hooks]
# post_write_hooks defines scripts or Python functions that are run
# on newly generated revision scripts. See the documentation for further
# detail and examples
# format using "black" - use the console_scripts runner, against the "black" entrypoint
# hooks = black
# black.type = console_scripts
# black.entrypoint = black
# black.options = -l 79 REVISION_SCRIPT_FILENAME
# lint with attempts to fix using "ruff" - use the exec runner, execute a binary
# hooks = ruff
# ruff.type = exec
# ruff.executable = %(here)s/.venv/bin/ruff
# ruff.options = --fix REVISION_SCRIPT_FILENAME
# Logging configuration
[loggers]
keys = root,sqlalchemy,alembic
[handlers]
keys = console
[formatters]
keys = generic
[logger_root]
level = WARN
handlers = console
qualname =
[logger_sqlalchemy]
level = WARN
handlers =
qualname = sqlalchemy.engine
[logger_alembic]
level = INFO
handlers =
qualname = alembic
[handler_console]
class = StreamHandler
args = (sys.stderr,)
level = NOTSET
formatter = generic
[formatter_generic]
format = %(levelname)-5.5s [%(name)s] %(message)s
datefmt = %H:%M:%S
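
The sqlalchemy.url entry above is left as an %(SQLALCHEMY_DATABASE_URI)s placeholder. A minimal sketch of how that placeholder is typically filled in from the environment inside alembic/env.py (the variable name and fallback URL here are assumptions, not taken from this commit):

import os

from alembic import context

config = context.config
# Interpolate the placeholder used in alembic.ini from the environment.
config.set_section_option(
    config.config_ini_section,
    "SQLALCHEMY_DATABASE_URI",
    os.environ.get("SQLALCHEMY_DATABASE_URI", "sqlite:///cognee.db"),
)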

Binary file not shown (image: 2.3 MiB before, 353 KiB after).

153
cognee-gui.py Normal file

@@ -0,0 +1,153 @@
import sys
import asyncio

try:
    import cognee
    from PySide6.QtWidgets import (
        QApplication,
        QWidget,
        QPushButton,
        QLineEdit,
        QFileDialog,
        QVBoxLayout,
        QHBoxLayout,
        QLabel,
        QMessageBox,
        QTextEdit,
        QProgressDialog,
    )
    from PySide6.QtCore import Qt
    from qasync import QEventLoop  # Import QEventLoop from qasync
except ImportError as e:
    print(
        "\nPlease install Cognee with optional gui dependencies or manually install missing dependencies.\n"
    )
    print("\nTo install with poetry use:")
    print("\npoetry install -E gui\n")
    print("\nOr to install with poetry and all dependencies use:")
    print("\npoetry install --all-extras\n")
    print("\nTo install with pip use: ")
    print('\npip install ".[gui]"\n')
    raise e


class FileSearchApp(QWidget):
    def __init__(self):
        super().__init__()
        self.selected_file = None
        self.init_ui()

    def init_ui(self):
        # Horizontal layout for file upload and visualization buttons
        button_layout = QHBoxLayout()

        # Button to open file dialog
        self.file_button = QPushButton("Upload File to Cognee", parent=self)
        self.file_button.clicked.connect(self.open_file_dialog)
        button_layout.addWidget(self.file_button)

        # Button to visualize data
        self.visualize_button = QPushButton("Visualize Data", parent=self)
        self.visualize_button.clicked.connect(lambda: asyncio.ensure_future(self.visualize_data()))
        button_layout.addWidget(self.visualize_button)

        # Label to display selected file path
        self.file_label = QLabel("No file selected", parent=self)

        # Line edit for search input
        self.search_input = QLineEdit(parent=self)
        self.search_input.setPlaceholderText("Enter text to search...")

        # Button to perform search; schedule the async search on click
        self.search_button = QPushButton("Cognee Search", parent=self)
        self.search_button.clicked.connect(lambda: asyncio.ensure_future(self._cognee_search()))

        # Text output area for search results
        self.result_output = QTextEdit(parent=self)
        self.result_output.setReadOnly(True)
        self.result_output.setPlaceholderText("Search results will appear here...")

        # Progress dialog
        self.progress_dialog = QProgressDialog("Processing..", None, 0, 0, parent=self)
        self.progress_dialog.setWindowModality(Qt.WindowModal)
        self.progress_dialog.setCancelButton(None)  # Remove the cancel button
        self.progress_dialog.close()

        # Layout setup
        layout = QVBoxLayout()
        layout.addLayout(button_layout)
        layout.addWidget(self.file_label)
        layout.addWidget(self.search_input)
        layout.addWidget(self.search_button)
        layout.addWidget(self.result_output)

        self.setLayout(layout)
        self.setWindowTitle("Cognee")
        self.resize(500, 300)

    def open_file_dialog(self):
        file_path, _ = QFileDialog.getOpenFileName(
            self, "Select a File", "", "All Files (*.*);;Text Files (*.txt)"
        )
        if file_path:
            self.selected_file = file_path
            self.file_label.setText(f"Selected: {file_path}")
            asyncio.ensure_future(self.process_file_async())

    async def process_file_async(self):
        """Asynchronously add and process the selected file."""
        # Disable the entire window
        self.progress_dialog.show()
        self.setEnabled(False)
        try:
            await cognee.add(self.selected_file)
            await cognee.cognify()
        except Exception as e:
            QMessageBox.critical(self, "Error", f"File processing failed: {str(e)}")
        # Once finished, re-enable the window
        self.setEnabled(True)
        self.progress_dialog.close()

    async def _cognee_search(self):
        """Performs an async search and updates the result output."""
        # Disable the entire window
        self.setEnabled(False)
        self.progress_dialog.show()
        try:
            search_text = self.search_input.text().strip()
            result = await cognee.search(query_text=search_text)
            print(result)
            # Assuming result is a list-like object; adjust if necessary
            self.result_output.setText(result[0])
        except Exception as e:
            QMessageBox.critical(self, "Error", f"Search failed: {str(e)}")
        # Once finished, re-enable the window
        self.setEnabled(True)
        self.progress_dialog.close()

    async def visualize_data(self):
        """Async slot for handling visualize data button press."""
        import webbrowser
        from cognee.api.v1.visualize.visualize import visualize_graph
        import os
        import pathlib

        html_file = os.path.join(pathlib.Path(__file__).parent, ".data", "graph_visualization.html")
        await visualize_graph(html_file)
        webbrowser.open(f"file://{html_file}")


if __name__ == "__main__":
    app = QApplication(sys.argv)

    # Create a qasync event loop and set it as the current event loop
    loop = QEventLoop(app)
    asyncio.set_event_loop(loop)

    window = FileSearchApp()
    window.show()

    with loop:
        loop.run_forever()
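
The same add → cognify → search flow the GUI drives, as a headless sketch (the file path and query are illustrative; assumes cognee is installed and configured):

import asyncio

import cognee


async def main():
    await cognee.add("/path/to/document.txt")  # hypothetical input file
    await cognee.cognify()
    results = await cognee.search(query_text="What is this document about?")
    print(results[0] if results else "No results")


asyncio.run(main())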

Binary image file (262 KiB before and after).
Binary image file (181 KiB before and after).


@@ -4,8 +4,8 @@ services:
networks:
- cognee-network
build:
-context: ..
+context: .
-dockerfile: ../Dockerfile
+dockerfile: Dockerfile
volumes:
- ./cognee:/app/cognee
- .env:/app/.env
@@ -33,8 +33,8 @@ services:
profiles:
- ui
build:
-context: ../cognee-frontend
+context: ./cognee-frontend
-dockerfile: ../cognee-frontend/Dockerfile
+dockerfile: Dockerfile
volumes:
- ./cognee-frontend/src:/app/src
- ./cognee-frontend/public:/app/public

4
licenses/README.md Normal file

@@ -0,0 +1,4 @@
# Third party licenses
This folder contains the licenses of third-party open-source software that has been redistributed in this project.
Details of included files and modifications can be found in [NOTICE](/NOTICE.md).


@@ -12,8 +12,8 @@ app = modal.App("cognee-runner")
image = (
modal.Image.from_dockerfile(path="Dockerfile_modal", force_build=False)
-.copy_local_file("../pyproject.toml", "pyproject.toml")
+.copy_local_file("pyproject.toml", "pyproject.toml")
-.copy_local_file("../poetry.lock", "poetry.lock")
+.copy_local_file("poetry.lock", "poetry.lock")
.env({"ENV": os.getenv("ENV"), "LLM_API_KEY": os.getenv("LLM_API_KEY")})
.poetry_install_from_file(poetry_pyproject_toml="pyproject.toml")
.pip_install("protobuf", "h2")


@@ -0,0 +1,62 @@
import statistics
import time
import tracemalloc
from typing import Any, Callable, Dict

import psutil


def benchmark_function(func: Callable, *args, num_runs: int = 5) -> Dict[str, Any]:
    """
    Benchmark a function for memory usage and computational performance.

    Args:
        func: Function to benchmark
        *args: Arguments to pass to the function
        num_runs: Number of times to run the benchmark

    Returns:
        Dictionary containing benchmark metrics
    """
    execution_times = []
    peak_memory_usages = []
    cpu_percentages = []

    process = psutil.Process()

    for _ in range(num_runs):
        # Start memory tracking
        tracemalloc.start()

        # Measure execution time and CPU usage
        start_time = time.perf_counter()
        start_cpu_time = process.cpu_times()

        # Run the function being benchmarked
        func(*args)

        end_cpu_time = process.cpu_times()
        end_time = time.perf_counter()

        # Calculate metrics
        execution_time = end_time - start_time
        cpu_time = (end_cpu_time.user + end_cpu_time.system) - (
            start_cpu_time.user + start_cpu_time.system
        )
        current, peak = tracemalloc.get_traced_memory()

        # Store results
        execution_times.append(execution_time)
        peak_memory_usages.append(peak / 1024 / 1024)  # Convert to MB
        cpu_percentages.append((cpu_time / execution_time) * 100)

        tracemalloc.stop()

    analysis = {
        "mean_execution_time": statistics.mean(execution_times),
        "mean_peak_memory_mb": statistics.mean(peak_memory_usages),
        "mean_cpu_percent": statistics.mean(cpu_percentages),
        "num_runs": num_runs,
    }

    if num_runs > 1:
        analysis["std_execution_time"] = statistics.stdev(execution_times)

    return analysis
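
An illustrative call to benchmark_function on a toy workload (the workload is made up; only the signature comes from the file above):

def build_squares(n):
    return [i * i for i in range(n)]


stats = benchmark_function(build_squares, 1_000_000, num_runs=3)
print(f"mean time: {stats['mean_execution_time']:.4f} s")
print(f"mean peak memory: {stats['mean_peak_memory_mb']:.2f} MB")
print(f"mean cpu: {stats['mean_cpu_percent']:.1f}%")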


@@ -0,0 +1,63 @@
import argparse
import asyncio

from .benchmark_function import benchmark_function
from cognee.modules.graph.utils import get_graph_from_model
from cognee.tests.unit.interfaces.graph.util import (
    PERSON_NAMES,
    create_organization_recursive,
)

# Example usage:
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Benchmark graph model with configurable recursive depth"
    )
    parser.add_argument(
        "--recursive-depth",
        type=int,
        default=3,
        help="Recursive depth for graph generation (default: 3)",
    )
    parser.add_argument("--runs", type=int, default=5, help="Number of benchmark runs (default: 5)")
    args = parser.parse_args()

    society = create_organization_recursive(
        "society", "Society", PERSON_NAMES, args.recursive_depth
    )

    added_nodes = {}
    added_edges = {}
    visited_properties = {}

    nodes, edges = asyncio.run(
        get_graph_from_model(
            society,
            added_nodes=added_nodes,
            added_edges=added_edges,
            visited_properties=visited_properties,
        )
    )

    def get_graph_from_model_sync(model):
        added_nodes = {}
        added_edges = {}
        visited_properties = {}
        return asyncio.run(
            get_graph_from_model(
                model,
                added_nodes=added_nodes,
                added_edges=added_edges,
                visited_properties=visited_properties,
            )
        )

    results = benchmark_function(get_graph_from_model_sync, society, num_runs=args.runs)

    print("\nBenchmark Results:")
    print(f"N nodes: {len(nodes)}, N edges: {len(edges)}, Recursion depth: {args.recursive_depth}")
    print(f"Mean Peak Memory: {results['mean_peak_memory_mb']:.2f} MB")
    print(f"Mean CPU Usage: {results['mean_cpu_percent']:.2f}%")
    print(f"Mean Execution Time: {results['mean_execution_time']:.4f} seconds")

    if "std_execution_time" in results:
        print(f"Execution Time Std: {results['std_execution_time']:.4f} seconds")


@@ -0,0 +1,10 @@
import numpy as np

from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine


class DummyEmbeddingEngine(EmbeddingEngine):
    async def embed_text(self, text: list[str]) -> list[list[float]]:
        # Return one random 3072-dimensional vector per input string
        return [list(np.random.randn(3072)) for _ in text]

    def get_vector_size(self) -> int:
        return 3072
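
A quick illustrative check of the dummy engine (assumes numpy is installed, that the class can be instantiated directly, and that embed_text returns one vector per input string, as its type hint indicates):

import asyncio

engine = DummyEmbeddingEngine()
vectors = asyncio.run(engine.embed_text(["first sentence", "second sentence"]))
assert len(vectors) == 2
assert len(vectors[0]) == engine.get_vector_size()  # 3072 dimensions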


@@ -0,0 +1,59 @@
from typing import Type
from uuid import uuid4

import spacy
import textacy
from pydantic import BaseModel

from cognee.infrastructure.llm.llm_interface import LLMInterface
from cognee.shared.data_models import Edge, KnowledgeGraph, Node, SummarizedContent


class DummyLLMAdapter(LLMInterface):
    nlp = spacy.load("en_core_web_sm")

    async def acreate_structured_output(
        self, text_input: str, system_prompt: str, response_model: Type[BaseModel]
    ) -> BaseModel:
        if str(response_model) == "<class 'cognee.shared.data_models.SummarizedContent'>":
            return dummy_summarize_content(text_input)
        elif str(response_model) == "<class 'cognee.shared.data_models.KnowledgeGraph'>":
            return dummy_extract_knowledge_graph(text_input, self.nlp)
        else:
            raise Exception(
                "Currently dummy acreate_structured_input is only implemented for SummarizedContent and KnowledgeGraph"
            )


def dummy_extract_knowledge_graph(text, nlp):
    doc = nlp(text)
    triples = list(textacy.extract.subject_verb_object_triples(doc))

    nodes = {}
    edges = []

    for triple in triples:
        source = "_".join([str(e) for e in triple.subject])
        target = "_".join([str(e) for e in triple.object])

        nodes[source] = nodes.get(
            source, Node(id=str(uuid4()), name=source, type="object", description="")
        )
        nodes[target] = nodes.get(
            target, Node(id=str(uuid4()), name=target, type="object", description="")
        )

        edge_type = "_".join([str(e) for e in triple.verb])
        edges.append(
            Edge(
                source_node_id=nodes[source].id,
                target_node_id=nodes[target].id,
                relationship_name=edge_type,
            )
        )

    return KnowledgeGraph(nodes=list(nodes.values()), edges=edges)


def dummy_summarize_content(text):
    words = [(word, len(word)) for word in set(text.split(" "))]
    words = sorted(words, key=lambda x: x[1], reverse=True)

    summary = " ".join([word for word, _ in words[:50]])
    description = " ".join([word for word, _ in words[:10]])

    return SummarizedContent(summary=summary, description=description)
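
An illustrative call to the heuristic summarizer above (the sample text is made up; no spaCy model or LLM is needed for this function):

sample = "Cognee builds knowledge graphs and summaries from documents using modular pipelines"
summarized = dummy_summarize_content(sample)
print(summarized.summary)      # up to 50 longest unique words, joined
print(summarized.description)  # up to 10 longest unique words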