From 711ae8e675a70a1749b9852036aa4a77935e706d Mon Sep 17 00:00:00 2001 From: Boris Date: Wed, 26 Feb 2025 20:15:02 +0100 Subject: [PATCH] feat: codegraph improvements and new CODE search [COG-1351] (#581) ## Description ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin ## Summary by CodeRabbit - **New Features** - Introduced an automated deployment workflow to build and push container images. - Updated dependency management to include additional database support. - **Refactor** - Enhanced asynchronous operations and logging in the server for improved performance. - Optimized extraction and retrieval processes for code-related data. - **Chores** - Streamlined build configurations and startup scripts for greater reliability. --------- Co-authored-by: hajdul88 <52442977+hajdul88@users.noreply.github.com> Co-authored-by: Igor Ilic --- .github/workflows/dockerhub-mcp.yml | 48 +++++ .github/workflows/dockerhub.yml | 2 +- cognee-mcp/Dockerfile | 49 +++++ cognee-mcp/pyproject.toml | 7 +- cognee-mcp/src/server.py | 7 +- cognee-mcp/uv.lock | 137 +++++++++++++- cognee/api/v1/cognify/code_graph_pipeline.py | 16 +- .../prompts/codegraph_retriever_system.txt | 22 +++ .../modules/retrieval/code_graph_retrieval.py | 140 ++++++++++++--- cognee/shared/CodeGraphEntities.py | 9 +- .../repo_processor/get_local_dependencies.py | 170 +++++++++++++----- .../get_repo_file_dependencies.py | 9 +- entrypoint-old.sh | 35 ---- entrypoint.sh | 2 +- 14 files changed, 532 insertions(+), 121 deletions(-) create mode 100644 .github/workflows/dockerhub-mcp.yml create mode 100644 cognee-mcp/Dockerfile create mode 100644 cognee/infrastructure/llm/prompts/codegraph_retriever_system.txt delete mode 100755 entrypoint-old.sh diff --git a/.github/workflows/dockerhub-mcp.yml b/.github/workflows/dockerhub-mcp.yml new file mode 100644 index 000000000..f254a9261 --- /dev/null +++ b/.github/workflows/dockerhub-mcp.yml @@ -0,0 +1,48 @@ +name: build | Build and Push Cognee MCP Docker Image to dockerhub + +on: + push: + branches: + - main + +jobs: + docker-build-and-push: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_PASSWORD }} + + - name: Extract metadata + id: meta + uses: docker/metadata-action@v5 + with: + images: cognee/cognee-mcp + tags: | + type=ref,event=branch + type=sha,prefix={{branch}}- + type=raw,value=latest,enable={{is_default_branch}} + + - name: Build and push + id: build + uses: docker/build-push-action@v5 + with: + context: . + platforms: linux/amd64,linux/arm64 + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=registry,ref=cognee/cognee-mcp:buildcache + cache-to: type=registry,ref=cognee/cognee-mcp:buildcache,mode=max + + - name: Image digest + run: echo ${{ steps.build.outputs.digest }} diff --git a/.github/workflows/dockerhub.yml b/.github/workflows/dockerhub.yml index 1f5a6ca59..8ed7e802a 100644 --- a/.github/workflows/dockerhub.yml +++ b/.github/workflows/dockerhub.yml @@ -3,7 +3,6 @@ name: build | Build and Push Docker Image to dockerhub on: push: branches: - - dev - main jobs: @@ -34,6 +33,7 @@ jobs: type=raw,value=latest,enable={{is_default_branch}} - name: Build and push + id: build uses: docker/build-push-action@v5 with: context: . diff --git a/cognee-mcp/Dockerfile b/cognee-mcp/Dockerfile new file mode 100644 index 000000000..1aebbf255 --- /dev/null +++ b/cognee-mcp/Dockerfile @@ -0,0 +1,49 @@ +# Use a Python image with uv pre-installed +FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim AS uv + +# Set build argument +ARG DEBUG + +# Set environment variable based on the build argument +ENV DEBUG=${DEBUG} +ENV PIP_NO_CACHE_DIR=true +ENV PATH="${PATH}:/root/.poetry/bin" + +WORKDIR /app + +# Enable bytecode compilation +ENV UV_COMPILE_BYTECODE=1 + +# Copy from the cache instead of linking since it's a mounted volume +ENV UV_LINK_MODE=copy + + +# Install the project's dependencies using the lockfile and settings +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,source=uv.lock,target=uv.lock \ + --mount=type=bind,source=pyproject.toml,target=pyproject.toml \ + uv sync --frozen --no-install-project --no-dev --no-editable + +# Then, add the rest of the project source code and install it +# Installing separately from its dependencies allows optimal layer caching +ADD . /app +RUN --mount=type=cache,target=/root/.cache/uv \ + uv sync --frozen --no-dev --no-editable + +# RUN apt-get update && apt-get install + +# RUN apt-get install -y \ +# gcc \ +# libpq-dev + +FROM python:3.12-slim-bookworm + +WORKDIR /app + +COPY --from=uv /root/.local /root/.local +COPY --from=uv --chown=app:app /app/.venv /app/.venv + +# Place executables in the environment at the front of the path +ENV PATH="/app/.venv/bin:$PATH" + +ENTRYPOINT ["cognee"] diff --git a/cognee-mcp/pyproject.toml b/cognee-mcp/pyproject.toml index 7d561d0b1..87dea3cf5 100644 --- a/cognee-mcp/pyproject.toml +++ b/cognee-mcp/pyproject.toml @@ -6,7 +6,7 @@ readme = "README.md" requires-python = ">=3.10" dependencies = [ - "cognee[codegraph]", + "cognee[codegraph,postgres,neo4j]", "mcp==1.2.1", ] @@ -21,5 +21,10 @@ build-backend = "hatchling.build" [tool.hatch.build.targets.wheel] packages = ["src"] +[dependency-groups] +dev = [ + "debugpy>=1.8.12", +] + [project.scripts] cognee = "src:main" diff --git a/cognee-mcp/src/server.py b/cognee-mcp/src/server.py index c7dfdb0b3..ccea5caca 100755 --- a/cognee-mcp/src/server.py +++ b/cognee-mcp/src/server.py @@ -1,3 +1,4 @@ +import asyncio import json import os import cognee @@ -92,7 +93,7 @@ async def call_tools(name: str, arguments: dict) -> list[types.TextContent]: with open(os.devnull, "w") as fnull: with redirect_stdout(fnull), redirect_stderr(fnull): if name == "cognify": - cognify( + await cognify( text=arguments["text"], graph_model_file=arguments.get("graph_model_file", None), graph_model_name=arguments.get("graph_model_name", None), @@ -161,6 +162,8 @@ async def main(): try: from mcp.server.stdio import stdio_server + logger.info("Starting Cognee MCP server...") + async with stdio_server() as (read_stream, write_stream): await mcp.run( read_stream=read_stream, @@ -249,6 +252,4 @@ def load_class(model_file, model_name): if __name__ == "__main__": # Initialize and run the server - import asyncio - asyncio.run(main()) diff --git a/cognee-mcp/uv.lock b/cognee-mcp/uv.lock index db29e6d1c..34877d370 100644 --- a/cognee-mcp/uv.lock +++ b/cognee-mcp/uv.lock @@ -225,6 +225,49 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a7/fa/e01228c2938de91d47b307831c62ab9e4001e747789d0b05baf779a6488c/async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028", size = 5721 }, ] +[[package]] +name = "asyncpg" +version = "0.30.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "async-timeout", marker = "python_full_version < '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/2f/4c/7c991e080e106d854809030d8584e15b2e996e26f16aee6d757e387bc17d/asyncpg-0.30.0.tar.gz", hash = "sha256:c551e9928ab6707602f44811817f82ba3c446e018bfe1d3abecc8ba5f3eac851", size = 957746 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bb/07/1650a8c30e3a5c625478fa8aafd89a8dd7d85999bf7169b16f54973ebf2c/asyncpg-0.30.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bfb4dd5ae0699bad2b233672c8fc5ccbd9ad24b89afded02341786887e37927e", size = 673143 }, + { url = "https://files.pythonhosted.org/packages/a0/9a/568ff9b590d0954553c56806766914c149609b828c426c5118d4869111d3/asyncpg-0.30.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:dc1f62c792752a49f88b7e6f774c26077091b44caceb1983509edc18a2222ec0", size = 645035 }, + { url = "https://files.pythonhosted.org/packages/de/11/6f2fa6c902f341ca10403743701ea952bca896fc5b07cc1f4705d2bb0593/asyncpg-0.30.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3152fef2e265c9c24eec4ee3d22b4f4d2703d30614b0b6753e9ed4115c8a146f", size = 2912384 }, + { url = "https://files.pythonhosted.org/packages/83/83/44bd393919c504ffe4a82d0aed8ea0e55eb1571a1dea6a4922b723f0a03b/asyncpg-0.30.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c7255812ac85099a0e1ffb81b10dc477b9973345793776b128a23e60148dd1af", size = 2947526 }, + { url = "https://files.pythonhosted.org/packages/08/85/e23dd3a2b55536eb0ded80c457b0693352262dc70426ef4d4a6fc994fa51/asyncpg-0.30.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:578445f09f45d1ad7abddbff2a3c7f7c291738fdae0abffbeb737d3fc3ab8b75", size = 2895390 }, + { url = "https://files.pythonhosted.org/packages/9b/26/fa96c8f4877d47dc6c1864fef5500b446522365da3d3d0ee89a5cce71a3f/asyncpg-0.30.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:c42f6bb65a277ce4d93f3fba46b91a265631c8df7250592dd4f11f8b0152150f", size = 3015630 }, + { url = "https://files.pythonhosted.org/packages/34/00/814514eb9287614188a5179a8b6e588a3611ca47d41937af0f3a844b1b4b/asyncpg-0.30.0-cp310-cp310-win32.whl", hash = "sha256:aa403147d3e07a267ada2ae34dfc9324e67ccc4cdca35261c8c22792ba2b10cf", size = 568760 }, + { url = "https://files.pythonhosted.org/packages/f0/28/869a7a279400f8b06dd237266fdd7220bc5f7c975348fea5d1e6909588e9/asyncpg-0.30.0-cp310-cp310-win_amd64.whl", hash = "sha256:fb622c94db4e13137c4c7f98834185049cc50ee01d8f657ef898b6407c7b9c50", size = 625764 }, + { url = "https://files.pythonhosted.org/packages/4c/0e/f5d708add0d0b97446c402db7e8dd4c4183c13edaabe8a8500b411e7b495/asyncpg-0.30.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5e0511ad3dec5f6b4f7a9e063591d407eee66b88c14e2ea636f187da1dcfff6a", size = 674506 }, + { url = "https://files.pythonhosted.org/packages/6a/a0/67ec9a75cb24a1d99f97b8437c8d56da40e6f6bd23b04e2f4ea5d5ad82ac/asyncpg-0.30.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:915aeb9f79316b43c3207363af12d0e6fd10776641a7de8a01212afd95bdf0ed", size = 645922 }, + { url = "https://files.pythonhosted.org/packages/5c/d9/a7584f24174bd86ff1053b14bb841f9e714380c672f61c906eb01d8ec433/asyncpg-0.30.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1c198a00cce9506fcd0bf219a799f38ac7a237745e1d27f0e1f66d3707c84a5a", size = 3079565 }, + { url = "https://files.pythonhosted.org/packages/a0/d7/a4c0f9660e333114bdb04d1a9ac70db690dd4ae003f34f691139a5cbdae3/asyncpg-0.30.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3326e6d7381799e9735ca2ec9fd7be4d5fef5dcbc3cb555d8a463d8460607956", size = 3109962 }, + { url = "https://files.pythonhosted.org/packages/3c/21/199fd16b5a981b1575923cbb5d9cf916fdc936b377e0423099f209e7e73d/asyncpg-0.30.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:51da377487e249e35bd0859661f6ee2b81db11ad1f4fc036194bc9cb2ead5056", size = 3064791 }, + { url = "https://files.pythonhosted.org/packages/77/52/0004809b3427534a0c9139c08c87b515f1c77a8376a50ae29f001e53962f/asyncpg-0.30.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:bc6d84136f9c4d24d358f3b02be4b6ba358abd09f80737d1ac7c444f36108454", size = 3188696 }, + { url = "https://files.pythonhosted.org/packages/52/cb/fbad941cd466117be58b774a3f1cc9ecc659af625f028b163b1e646a55fe/asyncpg-0.30.0-cp311-cp311-win32.whl", hash = "sha256:574156480df14f64c2d76450a3f3aaaf26105869cad3865041156b38459e935d", size = 567358 }, + { url = "https://files.pythonhosted.org/packages/3c/0a/0a32307cf166d50e1ad120d9b81a33a948a1a5463ebfa5a96cc5606c0863/asyncpg-0.30.0-cp311-cp311-win_amd64.whl", hash = "sha256:3356637f0bd830407b5597317b3cb3571387ae52ddc3bca6233682be88bbbc1f", size = 629375 }, + { url = "https://files.pythonhosted.org/packages/4b/64/9d3e887bb7b01535fdbc45fbd5f0a8447539833b97ee69ecdbb7a79d0cb4/asyncpg-0.30.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c902a60b52e506d38d7e80e0dd5399f657220f24635fee368117b8b5fce1142e", size = 673162 }, + { url = "https://files.pythonhosted.org/packages/6e/eb/8b236663f06984f212a087b3e849731f917ab80f84450e943900e8ca4052/asyncpg-0.30.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:aca1548e43bbb9f0f627a04666fedaca23db0a31a84136ad1f868cb15deb6e3a", size = 637025 }, + { url = "https://files.pythonhosted.org/packages/cc/57/2dc240bb263d58786cfaa60920779af6e8d32da63ab9ffc09f8312bd7a14/asyncpg-0.30.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6c2a2ef565400234a633da0eafdce27e843836256d40705d83ab7ec42074efb3", size = 3496243 }, + { url = "https://files.pythonhosted.org/packages/f4/40/0ae9d061d278b10713ea9021ef6b703ec44698fe32178715a501ac696c6b/asyncpg-0.30.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1292b84ee06ac8a2ad8e51c7475aa309245874b61333d97411aab835c4a2f737", size = 3575059 }, + { url = "https://files.pythonhosted.org/packages/c3/75/d6b895a35a2c6506952247640178e5f768eeb28b2e20299b6a6f1d743ba0/asyncpg-0.30.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:0f5712350388d0cd0615caec629ad53c81e506b1abaaf8d14c93f54b35e3595a", size = 3473596 }, + { url = "https://files.pythonhosted.org/packages/c8/e7/3693392d3e168ab0aebb2d361431375bd22ffc7b4a586a0fc060d519fae7/asyncpg-0.30.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:db9891e2d76e6f425746c5d2da01921e9a16b5a71a1c905b13f30e12a257c4af", size = 3641632 }, + { url = "https://files.pythonhosted.org/packages/32/ea/15670cea95745bba3f0352341db55f506a820b21c619ee66b7d12ea7867d/asyncpg-0.30.0-cp312-cp312-win32.whl", hash = "sha256:68d71a1be3d83d0570049cd1654a9bdfe506e794ecc98ad0873304a9f35e411e", size = 560186 }, + { url = "https://files.pythonhosted.org/packages/7e/6b/fe1fad5cee79ca5f5c27aed7bd95baee529c1bf8a387435c8ba4fe53d5c1/asyncpg-0.30.0-cp312-cp312-win_amd64.whl", hash = "sha256:9a0292c6af5c500523949155ec17b7fe01a00ace33b68a476d6b5059f9630305", size = 621064 }, + { url = "https://files.pythonhosted.org/packages/3a/22/e20602e1218dc07692acf70d5b902be820168d6282e69ef0d3cb920dc36f/asyncpg-0.30.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:05b185ebb8083c8568ea8a40e896d5f7af4b8554b64d7719c0eaa1eb5a5c3a70", size = 670373 }, + { url = "https://files.pythonhosted.org/packages/3d/b3/0cf269a9d647852a95c06eb00b815d0b95a4eb4b55aa2d6ba680971733b9/asyncpg-0.30.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:c47806b1a8cbb0a0db896f4cd34d89942effe353a5035c62734ab13b9f938da3", size = 634745 }, + { url = "https://files.pythonhosted.org/packages/8e/6d/a4f31bf358ce8491d2a31bfe0d7bcf25269e80481e49de4d8616c4295a34/asyncpg-0.30.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9b6fde867a74e8c76c71e2f64f80c64c0f3163e687f1763cfaf21633ec24ec33", size = 3512103 }, + { url = "https://files.pythonhosted.org/packages/96/19/139227a6e67f407b9c386cb594d9628c6c78c9024f26df87c912fabd4368/asyncpg-0.30.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:46973045b567972128a27d40001124fbc821c87a6cade040cfcd4fa8a30bcdc4", size = 3592471 }, + { url = "https://files.pythonhosted.org/packages/67/e4/ab3ca38f628f53f0fd28d3ff20edff1c975dd1cb22482e0061916b4b9a74/asyncpg-0.30.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9110df111cabc2ed81aad2f35394a00cadf4f2e0635603db6ebbd0fc896f46a4", size = 3496253 }, + { url = "https://files.pythonhosted.org/packages/ef/5f/0bf65511d4eeac3a1f41c54034a492515a707c6edbc642174ae79034d3ba/asyncpg-0.30.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:04ff0785ae7eed6cc138e73fc67b8e51d54ee7a3ce9b63666ce55a0bf095f7ba", size = 3662720 }, + { url = "https://files.pythonhosted.org/packages/e7/31/1513d5a6412b98052c3ed9158d783b1e09d0910f51fbe0e05f56cc370bc4/asyncpg-0.30.0-cp313-cp313-win32.whl", hash = "sha256:ae374585f51c2b444510cdf3595b97ece4f233fde739aa14b50e0d64e8a7a590", size = 560404 }, + { url = "https://files.pythonhosted.org/packages/c8/a4/cec76b3389c4c5ff66301cd100fe88c318563ec8a520e0b2e792b5b84972/asyncpg-0.30.0-cp313-cp313-win_amd64.whl", hash = "sha256:f59b430b8e27557c3fb9869222559f7417ced18688375825f8f12302c34e915e", size = 621623 }, +] + [[package]] name = "attrs" version = "25.1.0" @@ -529,22 +572,38 @@ codegraph = [ { name = "tree-sitter" }, { name = "tree-sitter-python" }, ] +neo4j = [ + { name = "neo4j" }, +] +postgres = [ + { name = "asyncpg" }, + { name = "pgvector" }, + { name = "psycopg2" }, +] [[package]] name = "cognee-mcp" version = "0.1.0" source = { editable = "." } dependencies = [ - { name = "cognee", extra = ["codegraph"] }, + { name = "cognee", extra = ["codegraph", "neo4j", "postgres"] }, { name = "mcp" }, ] +[package.dev-dependencies] +dev = [ + { name = "debugpy" }, +] + [package.metadata] requires-dist = [ - { name = "cognee", extras = ["codegraph"] }, + { name = "cognee", extras = ["codegraph", "postgres", "neo4j"] }, { name = "mcp", specifier = "==1.2.1" }, ] +[package.metadata.requires-dev] +dev = [{ name = "debugpy", specifier = ">=1.8.12" }] + [[package]] name = "colorama" version = "0.4.6" @@ -705,6 +764,31 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ed/a5/33cf000137545a08b0a3a6ea76c8ccbd87917f78bb5d737f9f56f3b11ef6/datasets-3.1.0-py3-none-any.whl", hash = "sha256:dc8808a6d17838fe05e13b39aa7ac3ea0fd0806ed7004eaf4d4eb2c2a356bc61", size = 480554 }, ] +[[package]] +name = "debugpy" +version = "1.8.12" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/68/25/c74e337134edf55c4dfc9af579eccb45af2393c40960e2795a94351e8140/debugpy-1.8.12.tar.gz", hash = "sha256:646530b04f45c830ceae8e491ca1c9320a2d2f0efea3141487c82130aba70dce", size = 1641122 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/56/19/dd58334c0a1ec07babf80bf29fb8daf1a7ca4c1a3bbe61548e40616ac087/debugpy-1.8.12-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:a2ba7ffe58efeae5b8fad1165357edfe01464f9aef25e814e891ec690e7dd82a", size = 2076091 }, + { url = "https://files.pythonhosted.org/packages/4c/37/bde1737da15f9617d11ab7b8d5267165f1b7dae116b2585a6643e89e1fa2/debugpy-1.8.12-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cbbd4149c4fc5e7d508ece083e78c17442ee13b0e69bfa6bd63003e486770f45", size = 3560717 }, + { url = "https://files.pythonhosted.org/packages/d9/ca/bc67f5a36a7de072908bc9e1156c0f0b272a9a2224cf21540ab1ffd71a1f/debugpy-1.8.12-cp310-cp310-win32.whl", hash = "sha256:b202f591204023b3ce62ff9a47baa555dc00bb092219abf5caf0e3718ac20e7c", size = 5180672 }, + { url = "https://files.pythonhosted.org/packages/c1/b9/e899c0a80dfa674dbc992f36f2b1453cd1ee879143cdb455bc04fce999da/debugpy-1.8.12-cp310-cp310-win_amd64.whl", hash = "sha256:9649eced17a98ce816756ce50433b2dd85dfa7bc92ceb60579d68c053f98dff9", size = 5212702 }, + { url = "https://files.pythonhosted.org/packages/af/9f/5b8af282253615296264d4ef62d14a8686f0dcdebb31a669374e22fff0a4/debugpy-1.8.12-cp311-cp311-macosx_14_0_universal2.whl", hash = "sha256:36f4829839ef0afdfdd208bb54f4c3d0eea86106d719811681a8627ae2e53dd5", size = 2174643 }, + { url = "https://files.pythonhosted.org/packages/ef/31/f9274dcd3b0f9f7d1e60373c3fa4696a585c55acb30729d313bb9d3bcbd1/debugpy-1.8.12-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a28ed481d530e3138553be60991d2d61103ce6da254e51547b79549675f539b7", size = 3133457 }, + { url = "https://files.pythonhosted.org/packages/ab/ca/6ee59e9892e424477e0c76e3798046f1fd1288040b927319c7a7b0baa484/debugpy-1.8.12-cp311-cp311-win32.whl", hash = "sha256:4ad9a94d8f5c9b954e0e3b137cc64ef3f579d0df3c3698fe9c3734ee397e4abb", size = 5106220 }, + { url = "https://files.pythonhosted.org/packages/d5/1a/8ab508ab05ede8a4eae3b139bbc06ea3ca6234f9e8c02713a044f253be5e/debugpy-1.8.12-cp311-cp311-win_amd64.whl", hash = "sha256:4703575b78dd697b294f8c65588dc86874ed787b7348c65da70cfc885efdf1e1", size = 5130481 }, + { url = "https://files.pythonhosted.org/packages/ba/e6/0f876ecfe5831ebe4762b19214364753c8bc2b357d28c5d739a1e88325c7/debugpy-1.8.12-cp312-cp312-macosx_14_0_universal2.whl", hash = "sha256:7e94b643b19e8feb5215fa508aee531387494bf668b2eca27fa769ea11d9f498", size = 2500846 }, + { url = "https://files.pythonhosted.org/packages/19/64/33f41653a701f3cd2cbff8b41ebaad59885b3428b5afd0d93d16012ecf17/debugpy-1.8.12-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:086b32e233e89a2740c1615c2f775c34ae951508b28b308681dbbb87bba97d06", size = 4222181 }, + { url = "https://files.pythonhosted.org/packages/32/a6/02646cfe50bfacc9b71321c47dc19a46e35f4e0aceea227b6d205e900e34/debugpy-1.8.12-cp312-cp312-win32.whl", hash = "sha256:2ae5df899732a6051b49ea2632a9ea67f929604fd2b036613a9f12bc3163b92d", size = 5227017 }, + { url = "https://files.pythonhosted.org/packages/da/a6/10056431b5c47103474312cf4a2ec1001f73e0b63b1216706d5fef2531eb/debugpy-1.8.12-cp312-cp312-win_amd64.whl", hash = "sha256:39dfbb6fa09f12fae32639e3286112fc35ae976114f1f3d37375f3130a820969", size = 5267555 }, + { url = "https://files.pythonhosted.org/packages/cf/4d/7c3896619a8791effd5d8c31f0834471fc8f8fb3047ec4f5fc69dd1393dd/debugpy-1.8.12-cp313-cp313-macosx_14_0_universal2.whl", hash = "sha256:696d8ae4dff4cbd06bf6b10d671e088b66669f110c7c4e18a44c43cf75ce966f", size = 2485246 }, + { url = "https://files.pythonhosted.org/packages/99/46/bc6dcfd7eb8cc969a5716d858e32485eb40c72c6a8dc88d1e3a4d5e95813/debugpy-1.8.12-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:898fba72b81a654e74412a67c7e0a81e89723cfe2a3ea6fcd3feaa3395138ca9", size = 4218616 }, + { url = "https://files.pythonhosted.org/packages/03/dd/d7fcdf0381a9b8094da1f6a1c9f19fed493a4f8576a2682349b3a8b20ec7/debugpy-1.8.12-cp313-cp313-win32.whl", hash = "sha256:22a11c493c70413a01ed03f01c3c3a2fc4478fc6ee186e340487b2edcd6f4180", size = 5226540 }, + { url = "https://files.pythonhosted.org/packages/25/bd/ecb98f5b5fc7ea0bfbb3c355bc1dd57c198a28780beadd1e19915bf7b4d9/debugpy-1.8.12-cp313-cp313-win_amd64.whl", hash = "sha256:fdb3c6d342825ea10b90e43d7f20f01535a72b3a1997850c0c3cefa5c27a4a2c", size = 5267134 }, + { url = "https://files.pythonhosted.org/packages/38/c4/5120ad36405c3008f451f94b8f92ef1805b1e516f6ff870f331ccb3c4cc0/debugpy-1.8.12-py2.py3-none-any.whl", hash = "sha256:274b6a2040349b5c9864e475284bce5bb062e63dce368a394b8cc865ae3b00c6", size = 5229490 }, +] + [[package]] name = "deprecation" version = "2.1.0" @@ -1999,6 +2083,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/da/d9/f7f9379981e39b8c2511c9e0326d212accacb82f12fbfdc1aa2ce2a7b2b6/multiprocess-0.70.16-py39-none-any.whl", hash = "sha256:a0bafd3ae1b732eac64be2e72038231c1ba97724b60b09400d68f229fcc2fbf3", size = 133351 }, ] +[[package]] +name = "neo4j" +version = "5.28.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pytz" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/4b/20/733dac16f7cedc80b23093415822c9763302519cba0e7c8bcdb5c01fc512/neo4j-5.28.1.tar.gz", hash = "sha256:ae8e37a1d895099062c75bc359b2cce62099baac7be768d0eba7180c1298e214", size = 231094 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6a/57/94225fe5e9dabdc0ff60c88cbfcedf11277f4b34e7ab1373d3e62dbdd207/neo4j-5.28.1-py3-none-any.whl", hash = "sha256:6755ef9e5f4e14b403aef1138fb6315b120631a0075c138b5ddb2a06b87b09fd", size = 312258 }, +] + [[package]] name = "nest-asyncio" version = "1.6.0" @@ -2342,6 +2438,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7a/8a/166625d30f927e800e99f3f6556d8b3f4ad952c62d6a774844d73542b84b/pendulum-3.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:314c4038dc5e6a52991570f50edb2f08c339debdf8cea68ac355b32c4174e820", size = 293657 }, ] +[[package]] +name = "pgvector" +version = "0.3.6" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/7d/d8/fd6009cee3e03214667df488cdcf9609461d729968da94e4f95d6359d304/pgvector-0.3.6.tar.gz", hash = "sha256:31d01690e6ea26cea8a633cde5f0f55f5b246d9c8292d68efdef8c22ec994ade", size = 25421 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fb/81/f457d6d361e04d061bef413749a6e1ab04d98cfeec6d8abcfe40184750f3/pgvector-0.3.6-py3-none-any.whl", hash = "sha256:f6c269b3c110ccb7496bac87202148ed18f34b390a0189c783e351062400a75a", size = 24880 }, +] + [[package]] name = "pillow" version = "10.4.0" @@ -2531,6 +2639,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fd/b2/ab07b09e0f6d143dfb839693aa05765257bceaa13d03bf1a696b78323e7a/protobuf-5.29.3-py3-none-any.whl", hash = "sha256:0a18ed4a24198528f2333802eb075e59dea9d679ab7a6c5efb017a59004d849f", size = 172550 }, ] +[[package]] +name = "psycopg2" +version = "2.9.10" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/62/51/2007ea29e605957a17ac6357115d0c1a1b60c8c984951c19419b3474cdfd/psycopg2-2.9.10.tar.gz", hash = "sha256:12ec0b40b0273f95296233e8750441339298e6a572f7039da5b260e3c8b60e11", size = 385672 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0a/a9/146b6bdc0d33539a359f5e134ee6dda9173fb8121c5b96af33fa299e50c4/psycopg2-2.9.10-cp310-cp310-win32.whl", hash = "sha256:5df2b672140f95adb453af93a7d669d7a7bf0a56bcd26f1502329166f4a61716", size = 1024527 }, + { url = "https://files.pythonhosted.org/packages/47/50/c509e56f725fd2572b59b69bd964edaf064deebf1c896b2452f6b46fdfb3/psycopg2-2.9.10-cp310-cp310-win_amd64.whl", hash = "sha256:c6f7b8561225f9e711a9c47087388a97fdc948211c10a4bccbf0ba68ab7b3b5a", size = 1163735 }, + { url = "https://files.pythonhosted.org/packages/20/a2/c51ca3e667c34e7852157b665e3d49418e68182081060231d514dd823225/psycopg2-2.9.10-cp311-cp311-win32.whl", hash = "sha256:47c4f9875125344f4c2b870e41b6aad585901318068acd01de93f3677a6522c2", size = 1024538 }, + { url = "https://files.pythonhosted.org/packages/33/39/5a9a229bb5414abeb86e33b8fc8143ab0aecce5a7f698a53e31367d30caa/psycopg2-2.9.10-cp311-cp311-win_amd64.whl", hash = "sha256:0435034157049f6846e95103bd8f5a668788dd913a7c30162ca9503fdf542cb4", size = 1163736 }, + { url = "https://files.pythonhosted.org/packages/3d/16/4623fad6076448df21c1a870c93a9774ad8a7b4dd1660223b59082dd8fec/psycopg2-2.9.10-cp312-cp312-win32.whl", hash = "sha256:65a63d7ab0e067e2cdb3cf266de39663203d38d6a8ed97f5ca0cb315c73fe067", size = 1025113 }, + { url = "https://files.pythonhosted.org/packages/66/de/baed128ae0fc07460d9399d82e631ea31a1f171c0c4ae18f9808ac6759e3/psycopg2-2.9.10-cp312-cp312-win_amd64.whl", hash = "sha256:4a579d6243da40a7b3182e0430493dbd55950c493d8c68f4eec0b302f6bbf20e", size = 1163951 }, + { url = "https://files.pythonhosted.org/packages/ae/49/a6cfc94a9c483b1fa401fbcb23aca7892f60c7269c5ffa2ac408364f80dc/psycopg2-2.9.10-cp313-cp313-win_amd64.whl", hash = "sha256:91fd603a2155da8d0cfcdbf8ab24a2d54bca72795b90d2a3ed2b6da8d979dee2", size = 2569060 }, +] + [[package]] name = "pwdlib" version = "0.2.1" @@ -2584,6 +2707,16 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/bd/67/16d48e7f02b285b39028aa47f847b3a279c903bc5cd49c8012ea90255317/py_rust_stemmers-0.1.3-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:fbb9f7933239a57d1d9c0fcdfbe0c5283a081e9e64ddc48ed878783be3d52b2b", size = 567278 }, { url = "https://files.pythonhosted.org/packages/ad/1c/cb8cc9680f8aa04f96cb5c814887b3bb8d23a2e9abf460ef861ae16bfe50/py_rust_stemmers-0.1.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:921803a6f8259f10bf348ac0e32a767c28ab587c9ad5c3b1ee593a4bbbe98d39", size = 488907 }, { url = "https://files.pythonhosted.org/packages/cd/29/88217de06239e3e526fa6286a11e3662d94acb0be4216c1310301a252dab/py_rust_stemmers-0.1.3-cp312-none-win_amd64.whl", hash = "sha256:576206b540575e81bb84a0f620b7a8529f5e89b0b2ec7d4487f3183789dd5cfd", size = 208980 }, + { url = "https://files.pythonhosted.org/packages/74/62/ab1492a3d6b7c724443f1f964ff986c5d64a5d97a880dcc4de9475815c00/py_rust_stemmers-0.1.3-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:ab7b6cc01df4013bd2e766ea4c367922bff4612dd36ec4a8aa8125cb384c5dac", size = 286055 }, + { url = "https://files.pythonhosted.org/packages/f5/dd/35ec95708df96831382df12184ef51b2a3f4db7c5fbed4d0d88e9a83ea49/py_rust_stemmers-0.1.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:d39a18641cfa6ff6678ea538d64926c1612eb6ddce9a90a61694f383743c0257", size = 271966 }, + { url = "https://files.pythonhosted.org/packages/1d/3e/676726ab5fdd9d47ea6c8f0bbceebffec7a5d3837c71fc869ecce68faa2b/py_rust_stemmers-0.1.3-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9ca50cef25d31e6ea200791f28976ee9500ef61fc91101343877b3d38fe3207a", size = 310515 }, + { url = "https://files.pythonhosted.org/packages/d2/d6/1722299d74959267d6d77fdfde7fc13aeacd0845265694fa65f358ed4a68/py_rust_stemmers-0.1.3-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a5d1a885830c5d94d36f74c0a2017225401f10e64f011e37e7b171ea84c17eb8", size = 315183 }, + { url = "https://files.pythonhosted.org/packages/bf/5c/279d420618d6dea0b00d40805e08418146c8af3c53db74345abf77f32551/py_rust_stemmers-0.1.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5bb25a58552c058530d69d119fc310dfa27e585dd7a4be6b8f739bd209c29164", size = 324424 }, + { url = "https://files.pythonhosted.org/packages/7f/85/4e6e62c94c3cad7f2ef861300fb277c8b9cc89b1bcc2aeeb0c67db20a83a/py_rust_stemmers-0.1.3-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:8016d3e7c43b1a93ac06e9c4d68f77c4f8d6beec6984b4e86438406a0b589d48", size = 324779 }, + { url = "https://files.pythonhosted.org/packages/c9/b9/1079fd911d82f0ca1a34d613c5849ea33dcf373d3a0f18355a0f784420f4/py_rust_stemmers-0.1.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:846a16e43d8e12d3178d608f82dcbddc0fd03c4478cde9adc377de58a769b825", size = 487976 }, + { url = "https://files.pythonhosted.org/packages/80/17/5c52ad2b7cc3dbeb50aa1485372442989cb4e753e6c40476b174f38cb117/py_rust_stemmers-0.1.3-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:9931ef64c9f2ace96f533092f5161a97bbf867ec5f1a9cb139838a6cf52da4c4", size = 575572 }, + { url = "https://files.pythonhosted.org/packages/9d/13/b019d8c0e8006702d0845b6bc7f9f0d100051a7936bc35d7f982117852f7/py_rust_stemmers-0.1.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:aa1ee56ae903f126598f237b45f316b2704ec29a85ad1d27467bf6a5b27c71b9", size = 493259 }, + { url = "https://files.pythonhosted.org/packages/79/9f/36a004b3925dc4a61c44968c6be009ba03dd62827f8d1490cd91f9c3c506/py_rust_stemmers-0.1.3-cp313-none-win_amd64.whl", hash = "sha256:2837fc5a60eb0fa2cefc6e41f5fcfb9ff350cd3cdbed25d34a1bc36057d29397", size = 209418 }, { url = "https://files.pythonhosted.org/packages/f1/45/e1ec9e76b4462e70fa42f6ac8be9f1bfe6565c1c260b9e5824e772157edf/py_rust_stemmers-0.1.3-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:59eacf7687738b20886a7c0ceeae999d501902b4e6234cf11eecd2f45f2c26bb", size = 288041 }, { url = "https://files.pythonhosted.org/packages/4a/5b/eb594ca68715c23dd3b8f52dd700c10cbdd8133faaaf19886962c8f97c90/py_rust_stemmers-0.1.3-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:e39d5d273e13aec2f07a2c3ea0050b3bf3aaa7b6e9f6bef3d4e728ab49979ae8", size = 274089 }, { url = "https://files.pythonhosted.org/packages/79/55/b62b14cdeb7268a818f21e4c8cfd543261c563dc9bd89ba7116293ce3008/py_rust_stemmers-0.1.3-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f95b25138431c4a457d684c49c6de5ff0c1852cf1cb3657e187ea63610fc7c21", size = 310373 }, diff --git a/cognee/api/v1/cognify/code_graph_pipeline.py b/cognee/api/v1/cognify/code_graph_pipeline.py index 0dd5d8919..27a40628d 100644 --- a/cognee/api/v1/cognify/code_graph_pipeline.py +++ b/cognee/api/v1/cognify/code_graph_pipeline.py @@ -2,6 +2,8 @@ import asyncio import logging from uuid import NAMESPACE_OID, uuid5 +from cognee.api.v1.search.search_v2 import search +from cognee.api.v1.search import SearchType from cognee.base_config import get_base_config from cognee.modules.cognify.config import get_cognify_config from cognee.modules.pipelines import run_tasks @@ -42,7 +44,7 @@ async def run_code_graph_pipeline(repo_path, include_docs=False): cognee_config = get_cognify_config() user = await get_default_user() - detailed_extraction = False + detailed_extraction = True tasks = [ Task(get_repo_file_dependencies, detailed_extraction=detailed_extraction), @@ -50,7 +52,7 @@ async def run_code_graph_pipeline(repo_path, include_docs=False): # Task(expand_dependency_graph, task_config={"batch_size": 50}), # Task(get_source_code_chunks, task_config={"batch_size": 50}), # Task(summarize_code, task_config={"batch_size": 50}), - Task(add_data_points, task_config={"batch_size": 100 if detailed_extraction else 500}), + Task(add_data_points, task_config={"batch_size": 500}), ] if include_docs: @@ -84,9 +86,17 @@ async def run_code_graph_pipeline(repo_path, include_docs=False): if __name__ == "__main__": async def main(): - async for data_points in run_code_graph_pipeline("REPO_PATH"): + async for data_points in run_code_graph_pipeline("YOUR_REPO_PATH"): print(data_points) await render_graph() + search_results = await search( + query_type=SearchType.CODE, + query_text="How is Relationship weight calculated?", + ) + + for file in search_results: + print(file.filename) + asyncio.run(main()) diff --git a/cognee/infrastructure/llm/prompts/codegraph_retriever_system.txt b/cognee/infrastructure/llm/prompts/codegraph_retriever_system.txt new file mode 100644 index 000000000..0e162a380 --- /dev/null +++ b/cognee/infrastructure/llm/prompts/codegraph_retriever_system.txt @@ -0,0 +1,22 @@ +You are a professional file name and python code extracting expert. +Extract file names and corresponding code pieces from text while preserving formatting and structure. + +### Instructions: + +1. **Identify File Names:** Extract filenames from inline text, headers, or markdown formatting. Empty list of filenames is completely normal. +2. **Extract Code:** Extract code pieces that are in the text (do not add additional content) and maintain their indentation and formatting. Empty list of code pieces is completely normal +3. **Ensure Accuracy:** Avoid extraneous text, merge related snippets, and support multiple programming languages. +4. **Keep content:** Avoid additional files and code pieces that are not in the text make sure everything you extract as a code is actually a code and not a part of a sentence. +5. **Ensure relevancy:** Make sure that the extracted codepiece is not just one or two lines but a meaningful python code, extract classes and functions in one piece + +Examples: + +1. +query: 'I want to change the test1.py file and want to add a print statement at the end' +files: ['test1.py'] +codepieces: "" + +2. +query: 'print('Hello World') doesn't work in the test2.py file. What are the changes I have to do there? +files: ["test2.py"] +codepieces: "print(\'Hello World\')" diff --git a/cognee/modules/retrieval/code_graph_retrieval.py b/cognee/modules/retrieval/code_graph_retrieval.py index 8328aaf83..151a4f732 100644 --- a/cognee/modules/retrieval/code_graph_retrieval.py +++ b/cognee/modules/retrieval/code_graph_retrieval.py @@ -1,42 +1,128 @@ -from cognee.low_level import DataPoint -from cognee.modules.graph.utils.convert_node_to_data_point import get_all_subclasses -from .brute_force_triplet_search import brute_force_triplet_search +import asyncio +import aiofiles + +from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph +from typing import List, Dict, Any +from pydantic import BaseModel +from cognee.infrastructure.databases.graph import get_graph_engine +from cognee.infrastructure.databases.vector import get_vector_engine +from cognee.infrastructure.llm.get_llm_client import get_llm_client +from cognee.infrastructure.llm.prompts import read_query_prompt -async def code_graph_retrieval(query: str) -> dict[str, str]: - subclasses = get_all_subclasses(DataPoint) +class CodeQueryInfo(BaseModel): + """Response model for information extraction from the query""" - vector_index_collections = [] + filenames: List[str] = [] + sourcecode: str - for subclass in subclasses: - index_fields = subclass.model_fields["metadata"].default.get("index_fields", []) - for field_name in index_fields: - vector_index_collections.append(f"{subclass.__name__}_{field_name}") - found_triplets = await brute_force_triplet_search( - query, - top_k=5, - collections=vector_index_collections or None, - properties_to_project=["id", "file_path", "source_code"], +async def code_graph_retrieval(query: str) -> list[dict[str, Any]]: + if not query or not isinstance(query, str): + raise ValueError("The query must be a non-empty string.") + + file_name_collections = ["CodeFile_name"] + classes_and_functions_collections = [ + "ClassDefinition_source_code", + "FunctionDefinition_source_code", + ] + + try: + vector_engine = get_vector_engine() + graph_engine = await get_graph_engine() + except Exception as e: + raise RuntimeError("Database initialization error in code_graph_retriever, ") from e + + system_prompt = read_query_prompt("codegraph_retriever_system.txt") + + llm_client = get_llm_client() + try: + files_and_codeparts = await llm_client.acreate_structured_output( + text_input=query, + system_prompt=system_prompt, + response_model=CodeQueryInfo, + ) + except Exception as e: + raise RuntimeError("Failed to retrieve structured output from LLM") from e + + similar_filenames = [] + similar_codepieces = [] + + if not files_and_codeparts.filenames or not files_and_codeparts.sourcecode: + for collection in file_name_collections: + search_results_file = await vector_engine.search(collection, query, limit=3) + for res in search_results_file: + similar_filenames.append({"id": res.id, "score": res.score, "payload": res.payload}) + + for collection in classes_and_functions_collections: + search_results_code = await vector_engine.search(collection, query, limit=3) + for res in search_results_code: + similar_codepieces.append( + {"id": res.id, "score": res.score, "payload": res.payload} + ) + + else: + for collection in file_name_collections: + for file_from_query in files_and_codeparts.filenames: + search_results_file = await vector_engine.search( + collection, file_from_query, limit=3 + ) + for res in search_results_file: + similar_filenames.append( + {"id": res.id, "score": res.score, "payload": res.payload} + ) + + for collection in classes_and_functions_collections: + for code_from_query in files_and_codeparts.sourcecode: + search_results_code = await vector_engine.search( + collection, code_from_query, limit=3 + ) + for res in search_results_code: + similar_codepieces.append( + {"id": res.id, "score": res.score, "payload": res.payload} + ) + + file_ids = [str(item["id"]) for item in similar_filenames] + code_ids = [str(item["id"]) for item in similar_codepieces] + + relevant_triplets = await asyncio.gather( + *[graph_engine.get_connections(node_id) for node_id in code_ids + file_ids] ) + paths = set() + + for sublist in relevant_triplets: + for tpl in sublist: + if isinstance(tpl, tuple) and len(tpl) >= 3: + if "file_path" in tpl[0]: + paths.add(tpl[0]["file_path"]) + if "file_path" in tpl[2]: # Third tuple element + paths.add(tpl[2]["file_path"]) + retrieved_files = {} - for triplet in found_triplets: - if triplet.node1.attributes["source_code"]: - retrieved_files[triplet.node1.attributes["file_path"]] = triplet.node1.attributes[ - "source_code" - ] - if triplet.node2.attributes["source_code"]: - retrieved_files[triplet.node2.attributes["file_path"]] = triplet.node2.attributes[ - "source_code" - ] + read_tasks = [] + for file_path in paths: - return [ + async def read_file(fp): + try: + async with aiofiles.open(fp, "r", encoding="utf-8") as f: + retrieved_files[fp] = await f.read() + except Exception as e: + print(f"Error reading {fp}: {e}") + retrieved_files[fp] = "" + + read_tasks.append(read_file(file_path)) + + await asyncio.gather(*read_tasks) + + result = [ { "name": file_path, "description": file_path, - "content": source_code, + "content": retrieved_files[file_path], } - for file_path, source_code in retrieved_files.items() + for file_path in paths ] + + return result diff --git a/cognee/shared/CodeGraphEntities.py b/cognee/shared/CodeGraphEntities.py index 936b32c55..9d44c5604 100644 --- a/cognee/shared/CodeGraphEntities.py +++ b/cognee/shared/CodeGraphEntities.py @@ -8,11 +8,11 @@ class Repository(DataPoint): class ImportStatement(DataPoint): name: str + module: str start_point: tuple end_point: tuple source_code: str file_path: Optional[str] = None - metadata: dict = {"index_fields": ["name", "source_code"]} class FunctionDefinition(DataPoint): @@ -21,7 +21,7 @@ class FunctionDefinition(DataPoint): end_point: tuple source_code: str file_path: Optional[str] = None - metadata: dict = {"index_fields": ["name", "source_code"]} + metadata: dict = {"index_fields": ["source_code"]} class ClassDefinition(DataPoint): @@ -30,17 +30,18 @@ class ClassDefinition(DataPoint): end_point: tuple source_code: str file_path: Optional[str] = None - metadata: dict = {"index_fields": ["name", "source_code"]} + metadata: dict = {"index_fields": ["source_code"]} class CodeFile(DataPoint): + name: str file_path: str source_code: Optional[str] = None part_of: Optional[Repository] = None depends_on: Optional[List["ImportStatement"]] = [] provides_function_definition: Optional[List["FunctionDefinition"]] = [] provides_class_definition: Optional[List["ClassDefinition"]] = [] - metadata: dict = {"index_fields": ["source_code"]} + metadata: dict = {"index_fields": ["name"]} class CodePart(DataPoint): diff --git a/cognee/tasks/repo_processor/get_local_dependencies.py b/cognee/tasks/repo_processor/get_local_dependencies.py index 9d929638a..5dd058834 100644 --- a/cognee/tasks/repo_processor/get_local_dependencies.py +++ b/cognee/tasks/repo_processor/get_local_dependencies.py @@ -1,7 +1,9 @@ -from typing import AsyncGenerator +import os +import importlib +from typing import AsyncGenerator, Optional from uuid import NAMESPACE_OID, uuid5 import tree_sitter_python as tspython -from tree_sitter import Language, Node, Parser +from tree_sitter import Language, Node, Parser, Tree import aiofiles @@ -21,6 +23,19 @@ PY_LANGUAGE = Language(tspython.language()) source_code_parser = Parser(PY_LANGUAGE) +class FileParser: + def __init__(self): + self.parsed_files = {} + + async def parse_file(self, file_path: str) -> tuple[str, Tree]: + if file_path not in self.parsed_files: + source_code = await get_source_code(file_path) + source_code_tree = source_code_parser.parse(bytes(source_code, "utf-8")) + self.parsed_files[file_path] = (source_code, source_code_tree) + + return self.parsed_files[file_path] + + async def get_source_code(file_path: str): try: async with aiofiles.open(file_path, "r", encoding="utf-8") as f: @@ -31,31 +46,63 @@ async def get_source_code(file_path: str): return None +def resolve_module_path(module_name): + """Find the file path of a module.""" + try: + spec = importlib.util.find_spec(module_name) + if spec and spec.origin: + return spec.origin + except ModuleNotFoundError: + return None + return None + + +def find_function_location( + module_path: str, function_name: str, parser: FileParser +) -> Optional[tuple[str, str]]: + """Find the function definition in the module.""" + if not module_path or not os.path.exists(module_path): + return None + + source_code, tree = parser.parse_file(module_path) + root_node: Node = tree.root_node + + for node in root_node.children: + if node.type == "function_definition": + func_name_node = node.child_by_field_name("name") + + if func_name_node and func_name_node.text.decode() == function_name: + return (module_path, node.start_point) # (line, column) + + return None + + async def get_local_script_dependencies( repo_path: str, script_path: str, detailed_extraction: bool = False ) -> CodeFile: - source_code = await get_source_code(script_path) + code_file_parser = FileParser() + source_code, source_code_tree = await code_file_parser.parse_file(script_path) - relative_file_path = script_path[len(repo_path) + 1 :] + file_path_relative_to_repo = script_path[len(repo_path) + 1 :] if not detailed_extraction: code_file_node = CodeFile( id=uuid5(NAMESPACE_OID, script_path), + name=file_path_relative_to_repo, source_code=source_code, - file_path=relative_file_path, + file_path=script_path, ) return code_file_node code_file_node = CodeFile( id=uuid5(NAMESPACE_OID, script_path), + name=file_path_relative_to_repo, source_code=None, - file_path=relative_file_path, + file_path=script_path, ) - source_code_tree = source_code_parser.parse(bytes(source_code, "utf-8")) - - async for part in extract_code_parts(source_code_tree.root_node): - part.file_path = relative_file_path + async for part in extract_code_parts(source_code_tree.root_node, script_path=script_path): + part.file_path = script_path if isinstance(part, FunctionDefinition): code_file_node.provides_function_definition.append(part) @@ -75,42 +122,81 @@ def find_node(nodes: list[Node], condition: callable) -> Node: return None -async def extract_code_parts(tree_root: Node) -> AsyncGenerator[DataPoint, None]: +async def extract_code_parts( + tree_root: Node, script_path: str, existing_nodes: list[DataPoint] = {} +) -> AsyncGenerator[DataPoint, None]: for child_node in tree_root.children: - if child_node.type == "import_statement": - module_node = child_node.children[1] - yield ImportStatement( - name=module_node.text, - start_point=child_node.start_point, - end_point=child_node.end_point, - source_code=child_node.text, - ) + if child_node.type == "import_statement" or child_node.type == "import_from_statement": + parts = child_node.text.decode("utf-8").split() - if child_node.type == "import_from_statement": - module_node = child_node.children[1] - yield ImportStatement( - name=module_node.text, - start_point=child_node.start_point, - end_point=child_node.end_point, - source_code=child_node.text, - ) + if parts[0] == "import": + module_name = parts[1] + function_name = None + elif parts[0] == "from": + module_name = parts[1] + function_name = parts[3] + + if " as " in function_name: + function_name = function_name.split(" as ")[0] + + if " as " in module_name: + module_name = module_name.split(" as ")[0] + + if function_name and "import " + function_name not in existing_nodes: + import_statement_node = ImportStatement( + name=function_name, + module=module_name, + start_point=child_node.start_point, + end_point=child_node.end_point, + file_path=script_path, + source_code=child_node.text, + ) + existing_nodes["import " + function_name] = import_statement_node + + if function_name: + yield existing_nodes["import " + function_name] + + if module_name not in existing_nodes: + import_statement_node = ImportStatement( + name=module_name, + module=module_name, + start_point=child_node.start_point, + end_point=child_node.end_point, + file_path=script_path, + source_code=child_node.text, + ) + existing_nodes[module_name] = import_statement_node + + yield existing_nodes[module_name] if child_node.type == "function_definition": - function_name_node = find_node( - child_node.children, lambda node: node.type == "identifier" - ) - yield FunctionDefinition( - name=function_name_node.text, - start_point=child_node.start_point, - end_point=child_node.end_point, - source_code=child_node.text, - ) + function_node = find_node(child_node.children, lambda node: node.type == "identifier") + function_node_name = function_node.text + + if function_node_name not in existing_nodes: + function_definition_node = FunctionDefinition( + name=function_node_name, + start_point=child_node.start_point, + end_point=child_node.end_point, + file_path=script_path, + source_code=child_node.text, + ) + existing_nodes[function_node_name] = function_definition_node + + yield existing_nodes[function_node_name] if child_node.type == "class_definition": class_name_node = find_node(child_node.children, lambda node: node.type == "identifier") - yield ClassDefinition( - name=class_name_node.text, - start_point=child_node.start_point, - end_point=child_node.end_point, - source_code=child_node.text, - ) + class_name_node_name = class_name_node.text + + if class_name_node_name not in existing_nodes: + class_definition_node = ClassDefinition( + name=class_name_node_name, + start_point=child_node.start_point, + end_point=child_node.end_point, + file_path=script_path, + source_code=child_node.text, + ) + existing_nodes[class_name_node_name] = class_definition_node + + yield existing_nodes[class_name_node_name] diff --git a/cognee/tasks/repo_processor/get_repo_file_dependencies.py b/cognee/tasks/repo_processor/get_repo_file_dependencies.py index 38fd1fded..547dd5594 100644 --- a/cognee/tasks/repo_processor/get_repo_file_dependencies.py +++ b/cognee/tasks/repo_processor/get_repo_file_dependencies.py @@ -20,7 +20,12 @@ async def get_source_code_files(repo_path): os.path.join(root, file) for root, _, files in os.walk(repo_path) for file in files - if file.endswith(".py") + if ( + file.endswith(".py") + and not file.startswith("test_") + and not file.endswith("_test") + and ".venv" not in file + ) ) source_code_files = set() @@ -74,7 +79,7 @@ async def get_repo_file_dependencies( # with ProcessPoolExecutor(max_workers=12) as executor: tasks = [ get_local_script_dependencies(repo_path, file_path, detailed_extraction) - for file_path in source_code_files[start_range:end_range] + for file_path in source_code_files[start_range : end_range + 1] ] results: list[CodeFile] = await asyncio.gather(*tasks) diff --git a/entrypoint-old.sh b/entrypoint-old.sh deleted file mode 100755 index 8d8053759..000000000 --- a/entrypoint-old.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/bash -# export ENVIRONMENT - -echo "Debug mode: $DEBUG" -echo "Environment: $ENVIRONMENT" - -if [ "$ENVIRONMENT" != "local" ]; then - echo "Running fetch_secret.py" - - PYTHONPATH=. python cognee/fetch_secret.py - - if [ $? -ne 0 ]; then - echo "Error: fetch_secret.py failed" - exit 1 - fi -else - echo '"local" environment is active, skipping fetch_secret.py' -fi - -echo "Creating database..." -# -#PYTHONPATH=. python cognee/setup_database.py -#if [ $? -ne 0 ]; then -# echo "Error: setup_database.py failed" -# exit 1 -#fi - -echo "Starting Gunicorn" - -if [ "$DEBUG" = true ]; then - echo "Waiting for the debugger to attach..." - python -m debugpy --wait-for-client --listen 0.0.0.0:5678 -m gunicorn -w 3 -k uvicorn.workers.UvicornWorker -t 30000 --bind=0.0.0.0:8000 --bind=0.0.0.0:443 --log-level debug cognee.api.client:app -else - gunicorn -w 3 -k uvicorn.workers.UvicornWorker -t 30000 --bind=0.0.0.0:8000 --bind=0.0.0.0:443 --log-level debug cognee.api.client:app -fi diff --git a/entrypoint.sh b/entrypoint.sh index 83575d742..edb198443 100755 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -31,7 +31,7 @@ sleep 2 # Modified Gunicorn startup with error handling if [ "$ENVIRONMENT" = "dev" ]; then - if [ "$DEBUG" = true ]; then + if [ "$DEBUG" = "true" ]; then echo "Waiting for the debugger to attach..." exec python -m debugpy --wait-for-client --listen 0.0.0.0:5678 -m gunicorn -w 3 -k uvicorn.workers.UvicornWorker -t 30000 --bind=0.0.0.0:8000 --log-level debug --reload cognee.api.client:app else