Merge branch 'dev' into cog-1069-update-notebooks-evals

2025-01-24 19:31:44 +01:00 · 2025-01-24 19:31:44 +01:00 · 39df73b811
commit 39df73b811
parent 52e5b5c6f4 7d23b32b5f
10 changed files with 247 additions and 33 deletions
--- a/.github/workflows/profiling.yaml
+++ b/.github/workflows/profiling.yaml
@ -68,32 +68,32 @@ jobs:
        echo "HEAD_SHA=${{ github.event.pull_request.head.sha }}" >> $GITHUB_ENV
    # Run profiler on the base branch
-    - name: Run profiler on base branch
+#    - name: Run profiler on base branch
      env:
        BASE_SHA: ${{ env.BASE_SHA }}
      run: |
        echo "Profiling the base branch for code_graph_pipeline.py"
        echo "Checking out base SHA: $BASE_SHA"
        git checkout $BASE_SHA
        echo "This is the working directory: $PWD"
        # Ensure the script is executable
        chmod +x cognee/api/v1/cognify/code_graph_pipeline.py
        # Run pyinstrument
        poetry run pyinstrument --renderer json -o base_results.json cognee/api/v1/cognify/code_graph_pipeline.py
    # Run profiler on head branch
 #    - name: Run profiler on head branch
 #      env:
-#        HEAD_SHA: ${{ env.HEAD_SHA }}
+#        BASE_SHA: ${{ env.BASE_SHA }}
 #      run: |
-#        echo "Profiling the head branch for code_graph_pipeline.py"
+#        echo "Profiling the base branch for code_graph_pipeline.py"
-#        echo "Checking out head SHA: $HEAD_SHA"
+#        echo "Checking out base SHA: $BASE_SHA"
-#        git checkout $HEAD_SHA
+#        git checkout $BASE_SHA
 #        echo "This is the working directory: $PWD"
 #        # Ensure the script is executable
 #        chmod +x cognee/api/v1/cognify/code_graph_pipeline.py
 #        # Run pyinstrument
-#        poetry run pyinstrument --renderer json -o head_results.json cognee/api/v1/cognify/code_graph_pipeline.py
+#        poetry run pyinstrument --renderer json -o base_results.json cognee/api/v1/cognify/code_graph_pipeline.py
    # Run profiler on head branch
    - name: Run profiler on head branch
      env:
        HEAD_SHA: ${{ env.HEAD_SHA }}
      run: |
        echo "Profiling the head branch for code_graph_pipeline.py"
        echo "Checking out head SHA: $HEAD_SHA"
        git checkout $HEAD_SHA
        echo "This is the working directory: $PWD"
        # Ensure the script is executable
        chmod +x cognee/api/v1/cognify/code_graph_pipeline.py
        # Run pyinstrument
        poetry run pyinstrument --renderer json -o head_results.json cognee/api/v1/cognify/code_graph_pipeline.py
 #    # Compare profiling results
 #    - name: Compare profiling results
--- a/.github/workflows/reusable_python_example.yml
+++ b/.github/workflows/reusable_python_example.yml
@ -7,6 +7,10 @@ on:
        description: "Location of example script to run"
        required: true
        type: string
      arguments:
        description: "Arguments for example script"
        required: false
        type: string
    secrets:
      GRAPHISTRY_USERNAME:
        required: true
@ -53,4 +57,4 @@ jobs:
          LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }}
          GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }}
-        run: poetry run python ${{ inputs.example-location }}
+        run: poetry run python ${{ inputs.example-location }} ${{ inputs.arguments }}
--- a/.github/workflows/test_code_graph_example.yml
+++ b/.github/workflows/test_code_graph_example.yml
@ -0,0 +1,22 @@
 name: test | code graph example
 on:
  workflow_dispatch:
  pull_request:
    types: [labeled, synchronize]
 concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true
 jobs:
  run_simple_example_test:
      uses: ./.github/workflows/reusable_python_example.yml
      with:
        example-location: ./examples/python/code_graph_example.py
        arguments: "--repo_path ./evals"
      secrets:
        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }}
        GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }}
--- a/32
+++ b/32
@ -0,0 +1,32 @@
 FROM python:3.11-slim
 # Environment variables baked into the image. PYTHONPATH points at /app,
 # the directory the cognee package is copied into below.
 ENV PIP_NO_CACHE_DIR=true
 ENV PATH="${PATH}:/root/.poetry/bin"
 ENV PYTHONPATH=/app
 ENV RUN_MODE=modal
 ENV SKIP_MIGRATIONS=true
 # System dependencies; the apt package lists are removed in the same layer
 # so they do not end up in the image.
 RUN apt-get update && apt-get install -y \
    gcc \
    libpq-dev \
    git \
    curl \
    build-essential \
    && rm -rf /var/lib/apt/lists/*
 # Declared once: every later instruction runs relative to /app.
 WORKDIR /app
 # Copy only the dependency manifests before installing, so the install
 # layers can be reused while pyproject.toml and poetry.lock are unchanged.
 COPY pyproject.toml poetry.lock /app/
 RUN pip install poetry
 RUN poetry install --all-extras --no-root --without dev
 # Application sources are copied last.
 COPY cognee/ /app/cognee
 COPY README.md /app/README.md
--- a/README.md
+++ b/README.md
@ -241,6 +241,28 @@ Please see the cognee [Development Guide](https://docs.cognee.ai/quickstart/) fo
 ```bash
 pip install cognee
 ```
 ### Deployment at Scale (Modal)
 Scale cognee in 4(+1) simple steps to handle enterprise workloads using [Modal](https://modal.com)'s GPU-powered infrastructure.
 **1. Install the modal python client**
 ```bash
 pip install modal
 ```
 **2. Create a free account on [Modal](https://modal.com)**
 **3. Set Up Modal API Key**
 ```bash
 modal token set --token-id TOKEN_ID --token-secret TOKEN_SECRET --profile=PROFILE
 modal profile activate PROFILE
 ```
 **4. Run cognee example**
 This simple example will deploy separate cognee instances building their own memory stores and answering a list of questions at scale.
 ```bash
 modal run -d modal_deployment.py
 ```
 **5. Modify the `modal_deployment.py` script to develop your own AI memory at scale 🚀**
 ## 💫 Contributors
--- a/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py
+++ b/cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py
@ -77,14 +77,51 @@ class SQLAlchemyAdapter:
                    text(f"DROP TABLE IF EXISTS {schema_name}.{table_name} CASCADE;")
                )
-    async def insert_data(self, schema_name: str, table_name: str, data: list[dict]):
+    async def insert_data(
-        columns = ", ".join(data[0].keys())
+        self,
-        values = ", ".join([f"({', '.join([f':{key}' for key in row.keys()])})" for row in data])
+        table_name: str,
-        insert_query = text(f"INSERT INTO {schema_name}.{table_name} ({columns}) VALUES {values};")
+        data: list[dict],
        schema_name: Optional[str] = "public",
    ) -> int:
        """
        Insert data into specified table using SQLAlchemy Core with batch optimization
        Returns number of inserted rows
-        async with self.engine.begin() as connection:
+        Usage Example:
-            await connection.execute(insert_query, data)
+            from cognee.infrastructure.databases.relational.get_relational_engine import get_relational_engine
-            await connection.close()
+            from uuid import UUID
            db = get_relational_engine()
            table_name = "groups"
            data = [{
                "id": UUID("c70a3cec-3309-44df-8ee6-eced820cf438"),
                "name": "test"
            }]
            await db.insert_data(table_name, data)
        """
        if not data:
            logger.info("No data provided for insertion")
            return 0
        try:
            # Use SQLAlchemy Core insert with execution options
            async with self.engine.begin() as conn:
                # Dialect-agnostic table reference
                if self.engine.dialect.name == "sqlite":
                    # Foreign key constraints are disabled by default in SQLite (for backwards compatibility),
                    # so must be enabled for each database connection/session separately.
                    await conn.execute(text("PRAGMA foreign_keys=ON"))
                    table = await self.get_table(table_name)  # SQLite ignores schemas
                else:
                    table = await self.get_table(table_name, schema_name)
                result = await conn.execute(table.insert().values(data))
                # Return rowcount for validation
                return result.rowcount
        except Exception as e:
            logger.error(f"Insert failed: {str(e)}")
            raise e  # Re-raise for error handling upstream
    async def get_schema_list(self) -> List[str]:
        """
--- a/cognee/tasks/repo_processor/get_non_code_files.py
+++ b/cognee/tasks/repo_processor/get_non_code_files.py
@ -8,7 +8,6 @@ from cognee.modules.data.methods import get_datasets
 from cognee.modules.data.methods.get_dataset_data import get_dataset_data
 from cognee.modules.data.methods.get_datasets_by_name import get_datasets_by_name
 from cognee.modules.data.models import Data
 from cognee.modules.data.operations.write_metadata import write_metadata
 from cognee.modules.ingestion.data_types import BinaryData
 from cognee.modules.users.methods import get_default_user
 from cognee.shared.CodeGraphEntities import Repository
--- a/modal_deployment.py
+++ b/modal_deployment.py
@ -0,0 +1,94 @@
 import modal
 import os
 import logging
 import asyncio
 import cognee
 import signal
 from cognee.api.v1.search import SearchType
 from cognee.shared.utils import setup_logging
 app = modal.App("cognee-runner")
 # Container image for the remote function: built from Dockerfile_modal, then
 # the Poetry manifests are copied in and the dependencies installed.
 # Only environment variables that are actually set locally are forwarded:
 # os.getenv() returns None for a missing variable, and Image.env expects
 # string values.
 image = (
    modal.Image.from_dockerfile(path="Dockerfile_modal", force_build=False)
    .copy_local_file("pyproject.toml", "pyproject.toml")
    .copy_local_file("poetry.lock", "poetry.lock")
    .env(
        {
            name: value
            for name in ("ENV", "LLM_API_KEY")
            if (value := os.getenv(name)) is not None
        }
    )
    .poetry_install_from_file(poetry_pyproject_toml="pyproject.toml")
    .pip_install("protobuf", "h2")
 )
@app.function(image=image, concurrency_limit=10)
async def entry(text: str, query: str):
    """Build a fresh cognee store from ``text`` and answer ``query`` against it.

    Existing cognee data and system state (including metadata) are pruned
    first, then ``text`` is added and cognified, and finally a
    graph-completion search is run for ``query``.

    Returns:
        A dict with the original ``text`` and ``query`` plus ``answer``: the
        first search result, or ``None`` when the search returned nothing.
    """
    setup_logging(logging.ERROR)

    # Start from a clean slate before ingesting anything.
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    await cognee.add(text)
    await cognee.cognify()

    hits = await cognee.search(SearchType.GRAPH_COMPLETION, query_text=query)
    if hits:
        top_answer = hits[0]
    else:
        top_answer = None

    return {"text": text, "query": query, "answer": top_answer}
@app.local_entrypoint()
async def main():
    """Run every sample text/query pair through a remote ``entry`` call.

    All remote calls are awaited together with ``asyncio.gather``; each
    result is printed, followed by a separator line. Afterwards the process
    sends SIGTERM to itself.
    """
    samples = [
        {
            "text": "NASA's Artemis program aims to return humans to the Moon by 2026, focusing on sustainable exploration and preparing for future Mars missions.",
            "query": "When does NASA plan to return humans to the Moon under the Artemis program?",
        },
        {
            "text": "According to a 2022 UN report, global food waste amounts to approximately 931 million tons annually, with households contributing 61% of the total.",
            "query": "How much food waste do households contribute annually according to the 2022 UN report?",
        },
        {
            "text": "The 2021 census data revealed that Tokyo's population reached 14 million, reflecting a 2.1% increase compared to the previous census conducted in 2015.",
            "query": "What was Tokyo's population according to the 2021 census data?",
        },
        {
            "text": "A recent study published in the Journal of Nutrition found that consuming 30 grams of almonds daily can lower LDL cholesterol levels by 7% over a 12-week period.",
            "query": "How much can daily almond consumption lower LDL cholesterol according to the study?",
        },
        {
            "text": "Amazon's Prime membership grew to 200 million subscribers in 2023, marking a 10% increase from the previous year, driven by exclusive content and faster delivery options.",
            "query": "How many Prime members did Amazon have in 2023?",
        },
        {
            "text": "A new report by the International Energy Agency states that global renewable energy capacity increased by 295 gigawatts in 2022, primarily driven by solar and wind power expansion.",
            "query": "By how much did global renewable energy capacity increase in 2022 according to the report?",
        },
        {
            "text": "The World Health Organization reported in 2023 that the global life expectancy has risen to 73.4 years, an increase of 5.5 years since the year 2000.",
            "query": "What is the current global life expectancy according to the WHO's 2023 report?",
        },
        {
            "text": "The FIFA World Cup 2022 held in Qatar attracted a record-breaking audience of 5 billion people across various digital and traditional broadcasting platforms.",
            "query": "How many people watched the FIFA World Cup 2022?",
        },
        {
            "text": "The European Space Agency's JUICE mission, launched in 2023, aims to explore Jupiter's icy moons, including Ganymede, Europa, and Callisto, over the next decade.",
            "query": "Which moons is the JUICE mission set to explore?",
        },
        {
            "text": "According to a report by the International Labour Organization, the global unemployment rate in 2023 was estimated at 5.4%, reflecting a slight decrease compared to the previous year.",
            "query": "What was the global unemployment rate in 2023 according to the ILO?",
        },
    ]

    # One remote invocation per sample; nothing is awaited until gather().
    pending = []
    for sample in samples:
        pending.append(entry.remote.aio(sample["text"], sample["query"]))
    outcomes = await asyncio.gather(*pending)

    print("\nFinal Results:")
    for outcome in outcomes:
        print(outcome)
        print("----")

    # NOTE(review): presumably forces the local process to exit once the
    # results are printed — confirm why a normal return is not sufficient.
    os.kill(os.getpid(), signal.SIGTERM)
--- a/poetry.lock
+++ b/poetry.lock
@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand.
 [[package]]
 name = "aiofiles"
@ -4998,8 +4998,8 @@ files = [
 [package.dependencies]
 numpy = [
    {version = ">=1.22.4", markers = "python_version < \"3.11\""},
    {version = ">=1.23.2", markers = "python_version == \"3.11\""},
    {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
    {version = ">=1.23.2", markers = "python_version == \"3.11\""},
 ]
 python-dateutil = ">=2.8.2"
 pytz = ">=2020.1"
@ -5927,8 +5927,8 @@ astroid = ">=3.3.8,<=3.4.0-dev0"
 colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""}
 dill = [
    {version = ">=0.2", markers = "python_version < \"3.11\""},
    {version = ">=0.3.6", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
    {version = ">=0.3.7", markers = "python_version >= \"3.12\""},
    {version = ">=0.3.6", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
 ]
 isort = ">=4.2.5,<5.13.0 || >5.13.0,<6"
 mccabe = ">=0.6,<0.8"
@ -8796,6 +8796,7 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools",
 type = ["pytest-mypy"]
 [extras]
 codegraph = ["jedi", "parso"]
 deepeval = ["deepeval"]
 docs = ["unstructured"]
 falkordb = ["falkordb"]
@ -8814,4 +8815,4 @@ weaviate = ["weaviate-client"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.10.0,<3.13"
-content-hash = "d40b127fc83e2f623276d7f001e726490a4ccad195350e8ff0b10c7e3b53775a"
+content-hash = "9b5d0162e4fdaaded920a2c8b448e07ec794c55914c1d6e18c6ab9b48c42df2d"
--- a/pyproject.toml
+++ b/pyproject.toml
@ -77,6 +77,8 @@ pre-commit = "^4.0.1"
 httpx = "0.27.0"
 bokeh="^3.6.2"
 nltk = "3.9.1"
 parso = {version = "^0.8.4", optional = true}
 jedi =  {version = "^0.19.2", optional = true}
 [tool.poetry.extras]
@ -94,6 +96,7 @@ falkordb = ["falkordb"]
 groq = ["groq"]
 milvus = ["pymilvus"]
 docs = ["unstructured"]
 codegraph = ["parso", "jedi"]
 [tool.poetry.group.dev.dependencies]
 pytest = "^7.4.0"