From 8879f3fbbe2021762c9aee19b0a40bc3d8d321a1 Mon Sep 17 00:00:00 2001
From: Igor Ilic <30923996+dexters1@users.noreply.github.com>
Date: Fri, 31 Jan 2025 18:03:23 +0100
Subject: [PATCH] feat: Add gemini support [COG-1023] (#485)

## Description

PR to test and land the Gemini support PR from holchan. It does three things (a configuration sketch for reviewers is appended after the patch):

1. Add Gemini LLM and Gemini embedding support
2. Fix a CodeGraph issue where chunks could exceed the maximum token limit
3. Add tokenizer adapters to CodeGraph

## DCO Affirmation

I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.

## Summary by CodeRabbit

- **New Features**
  - Added support for the Gemini LLM provider.
  - Expanded LLM configuration options.
  - Introduced a new GitHub Actions workflow for multimetric QA evaluation.
  - Added new environment variables for LLM and embedding configurations across various workflows.
- **Bug Fixes**
  - Improved error handling in various components.
  - Updated tokenization and embedding processes.
  - Removed the warning about data items missing a `dict` method.
- **Refactor**
  - Simplified token extraction and decoding methods.
  - Updated tokenizer interfaces.
  - Removed deprecated dependencies.
  - Enhanced retry logic and error handling in embedding processes.
- **Documentation**
  - Updated configuration comments and settings.
- **Chores**
  - Updated GitHub Actions workflows to accommodate new secrets and environment variables.
  - Modified evaluation parameters.
  - Adjusted dependency management for optional libraries.

---------

Co-authored-by: holchan <61059652+holchan@users.noreply.github.com>
Co-authored-by: Boris
---
 .env.template                                 |   4 +-
 .github/workflows/reusable_notebook.yml       |  25 ++-
 .github/workflows/reusable_python_example.yml |  26 ++-
 .github/workflows/test_code_graph_example.yml |   9 +-
 .../test_cognee_llama_index_notebook.yml      |   8 +
 .../test_cognee_multimedia_notebook.yml       |   8 +
 .github/workflows/test_deduplication.yml      |   9 +-
 .../workflows/test_dynamic_steps_example.yml  |   9 +-
 .../test_dynamic_steps_example_windows.yml    |   9 +-
 ...lama_index_cognee_integration_notebook.yml |   8 +
 .github/workflows/test_milvus.yml             |   9 +-
 .../workflows/test_multimedia_example.yaml    |   9 +-
 .../test_multimetric_qa_eval_run.yaml         |  30 ++++
 .github/workflows/test_neo4j.yml              |   9 +-
 .github/workflows/test_notebook.yml           |   8 +
 .github/workflows/test_pgvector.yml           |   9 +-
 .github/workflows/test_python_3_10.yml        |   9 +-
 .github/workflows/test_python_3_11.yml        |   9 +-
 .github/workflows/test_python_3_12.yml        |   9 +-
 .github/workflows/test_qdrant.yml             |   9 +-
 .github/workflows/test_simple_example.yml     |   9 +-
 .github/workflows/test_weaviate.yml           |   9 +-
 .../settings/routers/get_settings_router.py   |   2 +-
 .../exceptions/EmbeddingException.py          |  14 +-
 .../embeddings/LiteLLMEmbeddingEngine.py      |  21 ++-
 cognee/infrastructure/llm/gemini/__init__.py  |   0
 cognee/infrastructure/llm/gemini/adapter.py   | 155 ++++++++++++++++++
 cognee/infrastructure/llm/get_llm_client.py   |  16 ++
 cognee/infrastructure/llm/openai/adapter.py   |  10 +-
 .../llm/tokenizer/Gemini/adapter.py           |   9 +-
 .../llm/tokenizer/HuggingFace/adapter.py      |   8 +-
 .../llm/tokenizer/TikToken/adapter.py         |  15 +-
 .../llm/tokenizer/tokenizer_interface.py      |   2 +-
 .../modules/graph/cognee_graph/CogneeGraph.py |   2 +
 cognee/modules/settings/get_settings.py       |  11 ++
 cognee/tasks/chunks/chunk_by_paragraph.py     |  15 +-
 cognee/tasks/ingestion/ingest_data.py         |   3 -
 .../repo_processor/get_source_code_chunks.py  |  50 +++---
 evals/eval_on_hotpot.py                       |   2 +-
 evals/qa_eval_parameters.json                 |   7 +-
 poetry.lock                                   |   7 +-
 pyproject.toml                                |   3 +-
 42
files changed, 494 insertions(+), 101 deletions(-) create mode 100644 .github/workflows/test_multimetric_qa_eval_run.yaml create mode 100644 cognee/infrastructure/llm/gemini/__init__.py create mode 100644 cognee/infrastructure/llm/gemini/adapter.py diff --git a/.env.template b/.env.template index df8408518..e9dc2a4c2 100644 --- a/.env.template +++ b/.env.template @@ -1,7 +1,7 @@ ENV="local" TOKENIZERS_PARALLELISM="false" -# LLM settings +# LLM Configuration LLM_API_KEY="" LLM_MODEL="openai/gpt-4o-mini" LLM_PROVIDER="openai" @@ -14,7 +14,7 @@ GRAPHISTRY_PASSWORD= SENTRY_REPORTING_URL= -# Embedding settings +# Embedding Configuration EMBEDDING_PROVIDER="openai" EMBEDDING_API_KEY="" EMBEDDING_MODEL="openai/text-embedding-3-large" diff --git a/.github/workflows/reusable_notebook.yml b/.github/workflows/reusable_notebook.yml index 9bc09c3a6..13c6eb43e 100644 --- a/.github/workflows/reusable_notebook.yml +++ b/.github/workflows/reusable_notebook.yml @@ -12,8 +12,24 @@ on: required: true GRAPHISTRY_PASSWORD: required: true + #LLM_MODEL: + # required: true + #LLM_ENDPOINT: + # required: true + LLM_API_KEY: + required: true OPENAI_API_KEY: required: true + #LLM_API_VERSION: + # required: true + EMBEDDING_MODEL: + required: true + EMBEDDING_ENDPOINT: + required: true + EMBEDDING_API_KEY: + required: true + EMBEDDING_API_VERSION: + required: true env: RUNTIME__LOG_LEVEL: ERROR @@ -50,8 +66,15 @@ jobs: - name: Execute Jupyter Notebook env: ENV: 'dev' + #LLM_MODEL: ${{ secrets.LLM_MODEL }} + #LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} # Use OpenAI Until a multimedia model is deployed and DeepEval support for other models is added + #LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }} GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }} run: | diff --git a/.github/workflows/reusable_python_example.yml b/.github/workflows/reusable_python_example.yml index d1f7ee16b..0897e639d 100644 --- a/.github/workflows/reusable_python_example.yml +++ b/.github/workflows/reusable_python_example.yml @@ -16,7 +16,23 @@ on: required: true GRAPHISTRY_PASSWORD: required: true + LLM_MODEL: + required: true + LLM_ENDPOINT: + required: true + LLM_API_KEY: + required: true OPENAI_API_KEY: + required: false + LLM_API_VERSION: + required: true + EMBEDDING_MODEL: + required: true + EMBEDDING_ENDPOINT: + required: true + EMBEDDING_API_KEY: + required: true + EMBEDDING_API_VERSION: required: true env: @@ -54,7 +70,15 @@ jobs: env: ENV: 'dev' PYTHONFAULTHANDLER: 1 - LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }} GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }} run: poetry run python ${{ inputs.example-location }} ${{ 
inputs.arguments }} diff --git a/.github/workflows/test_code_graph_example.yml b/.github/workflows/test_code_graph_example.yml index a1f8d4e2c..1200b5b11 100644 --- a/.github/workflows/test_code_graph_example.yml +++ b/.github/workflows/test_code_graph_example.yml @@ -17,6 +17,13 @@ jobs: example-location: ./examples/python/code_graph_example.py arguments: "--repo_path ./evals" secrets: - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }} GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }} diff --git a/.github/workflows/test_cognee_llama_index_notebook.yml b/.github/workflows/test_cognee_llama_index_notebook.yml index 2eadd125b..2e90cca6b 100644 --- a/.github/workflows/test_cognee_llama_index_notebook.yml +++ b/.github/workflows/test_cognee_llama_index_notebook.yml @@ -15,6 +15,14 @@ jobs: with: notebook-location: notebooks/cognee_llama_index.ipynb secrets: + #LLM_MODEL: ${{ secrets.LLM_MODEL }} + #LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + #LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }} GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }} diff --git a/.github/workflows/test_cognee_multimedia_notebook.yml b/.github/workflows/test_cognee_multimedia_notebook.yml index 1d6f1f468..dc0edf0e5 100644 --- a/.github/workflows/test_cognee_multimedia_notebook.yml +++ b/.github/workflows/test_cognee_multimedia_notebook.yml @@ -15,6 +15,14 @@ jobs: with: notebook-location: notebooks/cognee_multimedia_demo.ipynb secrets: + #LLM_MODEL: ${{ secrets.LLM_MODEL }} + #LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + #LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }} GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }} diff --git a/.github/workflows/test_deduplication.yml b/.github/workflows/test_deduplication.yml index 2f97e4ea6..01afd7b37 100644 --- a/.github/workflows/test_deduplication.yml +++ b/.github/workflows/test_deduplication.yml @@ -57,5 +57,12 @@ jobs: - name: Run deduplication test env: ENV: 'dev' - LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} 
run: poetry run python ./cognee/tests/test_deduplication.py diff --git a/.github/workflows/test_dynamic_steps_example.yml b/.github/workflows/test_dynamic_steps_example.yml index ed0688bef..0e22fa7ec 100644 --- a/.github/workflows/test_dynamic_steps_example.yml +++ b/.github/workflows/test_dynamic_steps_example.yml @@ -16,6 +16,13 @@ jobs: with: example-location: ./examples/python/dynamic_steps_example.py secrets: - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }} GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }} diff --git a/.github/workflows/test_dynamic_steps_example_windows.yml b/.github/workflows/test_dynamic_steps_example_windows.yml index d0a9c93e3..881c39f24 100644 --- a/.github/workflows/test_dynamic_steps_example_windows.yml +++ b/.github/workflows/test_dynamic_steps_example_windows.yml @@ -38,5 +38,12 @@ jobs: env: ENV: 'dev' PYTHONFAULTHANDLER: 1 - LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} run: poetry run python ./examples/python/dynamic_steps_example.py diff --git a/.github/workflows/test_llama_index_cognee_integration_notebook.yml b/.github/workflows/test_llama_index_cognee_integration_notebook.yml index aacc31eb5..5974009d6 100644 --- a/.github/workflows/test_llama_index_cognee_integration_notebook.yml +++ b/.github/workflows/test_llama_index_cognee_integration_notebook.yml @@ -15,6 +15,14 @@ jobs: with: notebook-location: notebooks/llama_index_cognee_integration.ipynb secrets: + #LLM_MODEL: ${{ secrets.LLM_MODEL }} + #LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + #LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }} GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }} diff --git a/.github/workflows/test_milvus.yml b/.github/workflows/test_milvus.yml index 51e5f0982..dec74a475 100644 --- a/.github/workflows/test_milvus.yml +++ b/.github/workflows/test_milvus.yml @@ -47,7 +47,14 @@ jobs: - name: Run default basic pipeline env: ENV: 'dev' - LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} run: poetry 
run python ./cognee/tests/test_milvus.py - name: Clean up disk space diff --git a/.github/workflows/test_multimedia_example.yaml b/.github/workflows/test_multimedia_example.yaml index 95c93c01b..c307002a8 100644 --- a/.github/workflows/test_multimedia_example.yaml +++ b/.github/workflows/test_multimedia_example.yaml @@ -16,6 +16,13 @@ jobs: with: example-location: ./examples/python/multimedia_example.py secrets: - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + #LLM_MODEL: ${{ secrets.LLM_MODEL }} + #LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} # Use OpenAI until we deploy models to handle multimedia + #LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }} GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }} diff --git a/.github/workflows/test_multimetric_qa_eval_run.yaml b/.github/workflows/test_multimetric_qa_eval_run.yaml new file mode 100644 index 000000000..44a33a48f --- /dev/null +++ b/.github/workflows/test_multimetric_qa_eval_run.yaml @@ -0,0 +1,30 @@ +name: test | multimetric qa eval run + +on: + workflow_dispatch: + pull_request: + types: [labeled, synchronize] + + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + run_multimetric_qa_eval_test: + uses: ./.github/workflows/reusable_python_example.yml + with: + example-location: ./evals/multimetric_qa_eval_run.py + arguments: "--params_file evals/qa_eval_parameters.json --out_dir dirname" + secrets: + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} # Until we add support for azure for DeepEval + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} + GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }} + GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }} diff --git a/.github/workflows/test_neo4j.yml b/.github/workflows/test_neo4j.yml index e1d71dcfd..7e8423628 100644 --- a/.github/workflows/test_neo4j.yml +++ b/.github/workflows/test_neo4j.yml @@ -43,7 +43,14 @@ jobs: - name: Run default Neo4j env: ENV: 'dev' - LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} GRAPH_DATABASE_URL: ${{ secrets.NEO4J_API_URL }} GRAPH_DATABASE_PASSWORD: ${{ secrets.NEO4J_API_KEY }} GRAPH_DATABASE_USERNAME: "neo4j" diff --git a/.github/workflows/test_notebook.yml b/.github/workflows/test_notebook.yml index e03779c70..d8ff5a8ba 100644 --- a/.github/workflows/test_notebook.yml +++ b/.github/workflows/test_notebook.yml @@ -16,6 +16,14 @@ jobs: with: notebook-location: notebooks/cognee_demo.ipynb secrets: + #LLM_MODEL: ${{ secrets.LLM_MODEL 
}} + #LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + #LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }} GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }} diff --git a/.github/workflows/test_pgvector.yml b/.github/workflows/test_pgvector.yml index d5356d603..0f9b5d369 100644 --- a/.github/workflows/test_pgvector.yml +++ b/.github/workflows/test_pgvector.yml @@ -58,5 +58,12 @@ jobs: - name: Run default PGVector env: ENV: 'dev' - LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} run: poetry run python ./cognee/tests/test_pgvector.py diff --git a/.github/workflows/test_python_3_10.yml b/.github/workflows/test_python_3_10.yml index 90c437eb8..6a9367451 100644 --- a/.github/workflows/test_python_3_10.yml +++ b/.github/workflows/test_python_3_10.yml @@ -56,7 +56,14 @@ jobs: - name: Run default basic pipeline env: ENV: 'dev' - LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} run: poetry run python ./cognee/tests/test_library.py - name: Clean up disk space diff --git a/.github/workflows/test_python_3_11.yml b/.github/workflows/test_python_3_11.yml index 83f98c6ed..96b275773 100644 --- a/.github/workflows/test_python_3_11.yml +++ b/.github/workflows/test_python_3_11.yml @@ -58,7 +58,14 @@ jobs: - name: Run default basic pipeline env: ENV: 'dev' - LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} run: poetry run python ./cognee/tests/test_library.py - name: Clean up disk space diff --git a/.github/workflows/test_python_3_12.yml b/.github/workflows/test_python_3_12.yml index 14e33551a..9b0e7d6b5 100644 --- a/.github/workflows/test_python_3_12.yml +++ b/.github/workflows/test_python_3_12.yml @@ -56,7 +56,14 @@ jobs: - name: Run default basic pipeline env: ENV: 'dev' - LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + 
EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} run: poetry run python ./cognee/tests/test_library.py - name: Clean up disk space diff --git a/.github/workflows/test_qdrant.yml b/.github/workflows/test_qdrant.yml index e2cf9abe8..5ee35058f 100644 --- a/.github/workflows/test_qdrant.yml +++ b/.github/workflows/test_qdrant.yml @@ -44,7 +44,14 @@ jobs: - name: Run default Qdrant env: ENV: 'dev' - LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} VECTOR_DB_URL: ${{ secrets.QDRANT_API_URL }} VECTOR_DB_KEY: ${{ secrets.QDRANT_API_KEY }} run: poetry run python ./cognee/tests/test_qdrant.py diff --git a/.github/workflows/test_simple_example.yml b/.github/workflows/test_simple_example.yml index 5378df891..21912414b 100644 --- a/.github/workflows/test_simple_example.yml +++ b/.github/workflows/test_simple_example.yml @@ -16,6 +16,13 @@ jobs: with: example-location: ./examples/python/simple_example.py secrets: - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }} GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }} diff --git a/.github/workflows/test_weaviate.yml b/.github/workflows/test_weaviate.yml index 81cc2603f..244c5f47a 100644 --- a/.github/workflows/test_weaviate.yml +++ b/.github/workflows/test_weaviate.yml @@ -44,7 +44,14 @@ jobs: - name: Run default Weaviate env: ENV: 'dev' - LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} + LLM_MODEL: ${{ secrets.LLM_MODEL }} + LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }} + EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }} + EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }} + EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }} + EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }} VECTOR_DB_URL: ${{ secrets.WEAVIATE_API_URL }} VECTOR_DB_KEY: ${{ secrets.WEAVIATE_API_KEY }} run: poetry run python ./cognee/tests/test_weaviate.py diff --git a/cognee/api/v1/settings/routers/get_settings_router.py b/cognee/api/v1/settings/routers/get_settings_router.py index 138bea661..e2842e3cd 100644 --- a/cognee/api/v1/settings/routers/get_settings_router.py +++ b/cognee/api/v1/settings/routers/get_settings_router.py @@ -21,7 +21,7 @@ class SettingsDTO(OutDTO): class LLMConfigInputDTO(InDTO): - provider: Union[Literal["openai"], Literal["ollama"], Literal["anthropic"]] + provider: Union[Literal["openai"], Literal["ollama"], Literal["anthropic"], Literal["gemini"]] model: str api_key: str diff --git a/cognee/infrastructure/databases/exceptions/EmbeddingException.py b/cognee/infrastructure/databases/exceptions/EmbeddingException.py index 130282857..df3ba93c0 100644 --- 
a/cognee/infrastructure/databases/exceptions/EmbeddingException.py +++ b/cognee/infrastructure/databases/exceptions/EmbeddingException.py @@ -1,4 +1,14 @@ -class EmbeddingException(Exception): +from cognee.exceptions import CogneeApiError +from fastapi import status + + +class EmbeddingException(CogneeApiError): """Custom exception for handling embedding-related errors.""" - pass + def __init__( + self, + message: str = "Embedding Exception.", + name: str = "EmbeddingException", + status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, + ): + super().__init__(message, name, status_code) diff --git a/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py b/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py index f81bc8515..650fe5adb 100644 --- a/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +++ b/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py @@ -23,10 +23,12 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine): dimensions: int mock: bool + MAX_RETRIES = 5 + def __init__( self, + model: Optional[str] = "openai/text-embedding-3-large", provider: str = "openai", - model: Optional[str] = "text-embedding-3-large", dimensions: Optional[int] = 3072, api_key: str = None, endpoint: str = None, @@ -41,15 +43,13 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine): self.dimensions = dimensions self.max_tokens = max_tokens self.tokenizer = self.get_tokenizer() + self.retry_count = 0 enable_mocking = os.getenv("MOCK_EMBEDDING", "false") if isinstance(enable_mocking, bool): enable_mocking = str(enable_mocking).lower() self.mock = enable_mocking in ("true", "1", "yes") - MAX_RETRIES = 5 - retry_count = 0 - async def embed_text(self, text: List[str]) -> List[List[float]]: async def exponential_backoff(attempt): wait_time = min(10 * (2**attempt), 60) # Max 60 seconds @@ -64,14 +64,14 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine): return [data["embedding"] for data in response["data"]] else: response = await litellm.aembedding( - self.model, + model=self.model, input=text, api_key=self.api_key, api_base=self.endpoint, api_version=self.api_version, ) - self.retry_count = 0 + self.retry_count = 0 # Reset retry count on successful call return [data["embedding"] for data in response.data] @@ -99,13 +99,16 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine): raise Exception("Rate limit exceeded and no more retries left.") await exponential_backoff(self.retry_count) - self.retry_count += 1 return await self.embed_text(text) - except litellm.exceptions.BadRequestError: - raise EmbeddingException("Failed to index data points.") + except ( + litellm.exceptions.BadRequestError, + litellm.exceptions.NotFoundError, + ) as e: + logger.error(f"Embedding error with model {self.model}: {str(e)}") + raise EmbeddingException(f"Failed to index data points using model {self.model}") except Exception as error: logger.error("Error embedding text: %s", str(error)) diff --git a/cognee/infrastructure/llm/gemini/__init__.py b/cognee/infrastructure/llm/gemini/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/cognee/infrastructure/llm/gemini/adapter.py b/cognee/infrastructure/llm/gemini/adapter.py new file mode 100644 index 000000000..f37fb1c80 --- /dev/null +++ b/cognee/infrastructure/llm/gemini/adapter.py @@ -0,0 +1,155 @@ +from typing import Type, Optional +from pydantic import BaseModel +import logging +import litellm +import asyncio +from litellm import acompletion, JSONSchemaValidationError +from 
cognee.shared.data_models import MonitoringTool
+from cognee.exceptions import InvalidValueError
+from cognee.infrastructure.llm.llm_interface import LLMInterface
+from cognee.infrastructure.llm.prompts import read_query_prompt
+from cognee.base_config import get_base_config
+
+logger = logging.getLogger(__name__)
+
+monitoring = get_base_config().monitoring_tool
+if monitoring == MonitoringTool.LANGFUSE:
+    from langfuse.decorators import observe
+
+
+class GeminiAdapter(LLMInterface):
+    MAX_RETRIES = 5
+
+    def __init__(
+        self,
+        api_key: str,
+        model: str,
+        max_tokens: int,
+        endpoint: Optional[str] = None,
+        api_version: Optional[str] = None,
+        streaming: bool = False,
+    ) -> None:
+        self.api_key = api_key
+        self.model = model
+        self.endpoint = endpoint
+        self.api_version = api_version
+        self.streaming = streaming
+        self.max_tokens = max_tokens
+
+    @observe(as_type="generation")
+    async def acreate_structured_output(
+        self, text_input: str, system_prompt: str, response_model: Type[BaseModel]
+    ) -> BaseModel:
+        try:
+            response_schema = {
+                "type": "object",
+                "properties": {
+                    "summary": {"type": "string"},
+                    "description": {"type": "string"},
+                    "nodes": {
+                        "type": "array",
+                        "items": {
+                            "type": "object",
+                            "properties": {
+                                "name": {"type": "string"},
+                                "type": {"type": "string"},
+                                "description": {"type": "string"},
+                                "id": {"type": "string"},
+                                "label": {"type": "string"},
+                            },
+                            "required": ["name", "type", "description", "id", "label"],
+                        },
+                    },
+                    "edges": {
+                        "type": "array",
+                        "items": {
+                            "type": "object",
+                            "properties": {
+                                "source_node_id": {"type": "string"},
+                                "target_node_id": {"type": "string"},
+                                "relationship_name": {"type": "string"},
+                            },
+                            "required": ["source_node_id", "target_node_id", "relationship_name"],
+                        },
+                    },
+                },
+                "required": ["summary", "description", "nodes", "edges"],
+            }
+
+            simplified_prompt = f"""
+{system_prompt}
+
+IMPORTANT: Your response must be a valid JSON object with these required fields:
+1. summary: A brief summary
+2. description: A detailed description
+3. nodes: Array of nodes with name, type, description, id, and label
+4. edges: Array of edges with source_node_id, target_node_id, and relationship_name
+
+Example structure:
+{{
+  "summary": "Brief summary",
+  "description": "Detailed description",
+  "nodes": [
+    {{
+      "name": "Example Node",
+      "type": "Concept",
+      "description": "Node description",
+      "id": "example-id",
+      "label": "Concept"
+    }}
+  ],
+  "edges": [
+    {{
+      "source_node_id": "source-id",
+      "target_node_id": "target-id",
+      "relationship_name": "relates_to"
+    }}
+  ]
+}}"""
+
+            messages = [
+                {"role": "system", "content": simplified_prompt},
+                {"role": "user", "content": text_input},
+            ]
+
+            try:
+                response = await acompletion(
+                    model=f"{self.model}",
+                    messages=messages,
+                    api_key=self.api_key,
+                    max_tokens=self.max_tokens,
+                    temperature=0.1,
+                    response_format={"type": "json_object", "schema": response_schema},
+                    timeout=10,
+                    num_retries=self.MAX_RETRIES,
+                )
+
+                if response.choices and response.choices[0].message.content:
+                    content = response.choices[0].message.content
+                    return response_model.model_validate_json(content)
+
+            except litellm.exceptions.BadRequestError as e:
+                logger.error(f"Bad request error: {str(e)}")
+                raise ValueError(f"Invalid request: {str(e)}")
+
+            raise ValueError("Failed to get valid response after retries")
+
+        except JSONSchemaValidationError as e:
+            logger.error(f"Schema validation failed: {str(e)}")
+            logger.debug(f"Raw response: {e.raw_response}")
+            raise ValueError(f"Response failed schema validation: {str(e)}")
+
+    def show_prompt(self, text_input: str, system_prompt: str) -> str:
+        """Format and display the prompt for a user query."""
+        if not text_input:
+            text_input = "No user input provided."
+        if not system_prompt:
+            raise InvalidValueError(message="No system prompt path provided.")
+        system_prompt = read_query_prompt(system_prompt)
+
+        formatted_prompt = (
+            f"""System Prompt:\n{system_prompt}\n\nUser Input:\n{text_input}\n"""
+            if system_prompt
+            else None
+        )
+        return formatted_prompt
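
For reviewers, a minimal sketch of how the new adapter is exercised, not part of the patch: the `MiniGraph` model, key, and model id are placeholders, and the adapter's fixed `response_schema` always asks Gemini for the full knowledge-graph shape, so extra fields are simply ignored during validation.

```python
import asyncio

from pydantic import BaseModel

from cognee.infrastructure.llm.gemini.adapter import GeminiAdapter


class MiniGraph(BaseModel):
    # Placeholder response model; nodes/edges returned by the fixed
    # response_schema are ignored by pydantic validation here.
    summary: str
    description: str


async def main():
    adapter = GeminiAdapter(
        api_key="<your-gemini-api-key>",      # placeholder
        model="gemini/gemini-2.0-flash-exp",  # LiteLLM provider/model id, illustrative
        max_tokens=8192,
    )
    result = await adapter.acreate_structured_output(
        text_input="Cognee turns documents into knowledge graphs.",
        system_prompt="Extract entities and relationships from the user's text.",
        response_model=MiniGraph,
    )
    print(result.summary)


asyncio.run(main())
```
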
diff --git a/cognee/infrastructure/llm/get_llm_client.py b/cognee/infrastructure/llm/get_llm_client.py
index ede8bd330..5e26345e8 100644
--- a/cognee/infrastructure/llm/get_llm_client.py
+++ b/cognee/infrastructure/llm/get_llm_client.py
@@ -12,6 +12,7 @@ class LLMProvider(Enum):
     OLLAMA = "ollama"
     ANTHROPIC = "anthropic"
     CUSTOM = "custom"
+    GEMINI = "gemini"
 
 
 def get_llm_client():
@@ -78,5 +79,20 @@ def get_llm_client():
             max_tokens=max_tokens,
         )
 
+    elif provider == LLMProvider.GEMINI:
+        if llm_config.llm_api_key is None:
+            raise InvalidValueError(message="LLM API key is not set.")
+
+        from .gemini.adapter import GeminiAdapter
+
+        return GeminiAdapter(
+            api_key=llm_config.llm_api_key,
+            model=llm_config.llm_model,
+            max_tokens=max_tokens,
+            endpoint=llm_config.llm_endpoint,
+            api_version=llm_config.llm_api_version,
+            streaming=llm_config.llm_streaming,
+        )
+
     else:
         raise InvalidValueError(message=f"Unsupported LLM provider: {provider}")
diff --git a/cognee/infrastructure/llm/openai/adapter.py b/cognee/infrastructure/llm/openai/adapter.py
index d6939e323..c9f87d211 100644
--- a/cognee/infrastructure/llm/openai/adapter.py
+++ b/cognee/infrastructure/llm/openai/adapter.py
@@ -23,6 +23,8 @@ class OpenAIAdapter(LLMInterface):
     api_key: str
     api_version: str
 
+    MAX_RETRIES = 5
+
     """Adapter for OpenAI's GPT-3, GPT-4 API"""
 
     def __init__(
@@ -68,7 +70,7 @@ class OpenAIAdapter(LLMInterface):
             api_base=self.endpoint,
             api_version=self.api_version,
             response_model=response_model,
-            max_retries=5,
+            max_retries=self.MAX_RETRIES,
         )
 
     @observe
@@ -94,7 +96,7 @@
             api_base=self.endpoint,
             api_version=self.api_version,
             response_model=response_model,
-            max_retries=5,
+            max_retries=self.MAX_RETRIES,
         )
 
     def create_transcript(self, input):
@@ -112,7 +114,7 @@ class OpenAIAdapter(LLMInterface):
             api_key=self.api_key,
             api_base=self.endpoint,
             api_version=self.api_version,
-            max_retries=5,
+            max_retries=self.MAX_RETRIES,
         )
 
         return transcription
@@ -144,7 +146,7 @@ class OpenAIAdapter(LLMInterface):
             api_base=self.endpoint,
             api_version=self.api_version,
             max_tokens=300,
-            max_retries=5,
+            max_retries=self.MAX_RETRIES,
         )
 
     def show_prompt(self, text_input: str, system_prompt: str) -> str:
diff --git a/cognee/infrastructure/llm/tokenizer/Gemini/adapter.py b/cognee/infrastructure/llm/tokenizer/Gemini/adapter.py
index e4cc4f145..61acdd7ab 100644
--- a/cognee/infrastructure/llm/tokenizer/Gemini/adapter.py
+++ b/cognee/infrastructure/llm/tokenizer/Gemini/adapter.py
@@ -1,4 +1,4 @@
-from typing import List, Any
+from typing import List, Any, Union
 
 from ..tokenizer_interface import TokenizerInterface
 
@@ -26,6 +26,10 @@ class GeminiTokenizer(TokenizerInterface):
     def extract_tokens(self, text: str) -> List[Any]:
         raise NotImplementedError
 
+    def decode_single_token(self, encoding: int):
+        # Gemini tokenizer doesn't have the option to decode tokens
+        raise NotImplementedError
+
     def count_tokens(self, text: str) -> int:
         """
         Returns the number of tokens in the given text.
@@ -39,6 +43,3 @@ class GeminiTokenizer(TokenizerInterface):
         import google.generativeai as genai
 
         return len(genai.embed_content(model=f"models/{self.model}", content=text))
-
-    def trim_text_to_max_tokens(self, text: str) -> str:
-        raise NotImplementedError
diff --git a/cognee/infrastructure/llm/tokenizer/HuggingFace/adapter.py b/cognee/infrastructure/llm/tokenizer/HuggingFace/adapter.py
index 878458414..e26a5b1ca 100644
--- a/cognee/infrastructure/llm/tokenizer/HuggingFace/adapter.py
+++ b/cognee/infrastructure/llm/tokenizer/HuggingFace/adapter.py
@@ -1,7 +1,5 @@
 from typing import List, Any
 
-from transformers import AutoTokenizer
-
 from ..tokenizer_interface import TokenizerInterface
 
 
@@ -14,6 +12,9 @@ class HuggingFaceTokenizer(TokenizerInterface):
         self.model = model
         self.max_tokens = max_tokens
 
+        # Import here to make it an optional dependency
+        from transformers import AutoTokenizer
+
         self.tokenizer = AutoTokenizer.from_pretrained(model)
 
     def extract_tokens(self, text: str) -> List[Any]:
@@ -32,5 +33,6 @@ class HuggingFaceTokenizer(TokenizerInterface):
         """
         return len(self.tokenizer.tokenize(text))
 
-    def trim_text_to_max_tokens(self, text: str) -> str:
+    def decode_single_token(self, encoding: int):
+        # Single-token decoding is not supported by this HuggingFace tokenizer adapter
         raise NotImplementedError
diff --git a/cognee/infrastructure/llm/tokenizer/TikToken/adapter.py b/cognee/infrastructure/llm/tokenizer/TikToken/adapter.py
index 3d649ef38..7a01fe511 100644
--- a/cognee/infrastructure/llm/tokenizer/TikToken/adapter.py
+++ b/cognee/infrastructure/llm/tokenizer/TikToken/adapter.py
@@ -21,14 +21,17 @@ class TikTokenTokenizer(TokenizerInterface):
         self.tokenizer = tiktoken.encoding_for_model(self.model)
 
     def extract_tokens(self, text: str) -> List[Any]:
-        tokens = []
         # Using TikToken's method to tokenize text
         token_ids = self.tokenizer.encode(text)
-        # Go through tokens and decode them to text value
-        for token_id in token_ids:
-            token = self.tokenizer.decode([token_id])
-            tokens.append(token)
-        return tokens
+        return token_ids
+
+    def decode_token_list(self, tokens: List[Any]) -> List[Any]:
+        if not isinstance(tokens, list):
+            tokens = [tokens]
+        return [self.tokenizer.decode(i) for i in tokens]
+
+    def decode_single_token(self, token: int):
+        return self.tokenizer.decode_single_token_bytes(token).decode("utf-8", errors="replace")
 
     def count_tokens(self, text: str) -> int:
         """
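
A small reviewer sketch of the reworked tokenizer surface: `extract_tokens` now returns raw token ids and `decode_single_token` replaces `trim_text_to_max_tokens`. The constructor argument names and values are assumptions for illustration, not taken from the patch.

```python
from cognee.infrastructure.llm.tokenizer.TikToken.adapter import TikTokenTokenizer

# Constructor arguments are illustrative; the adapter wraps
# tiktoken.encoding_for_model(model) internally.
tokenizer = TikTokenTokenizer(model="text-embedding-3-large", max_tokens=8191)

text = "knowledge graphs"
token_ids = tokenizer.extract_tokens(text)  # raw token ids, no longer decoded strings
assert tokenizer.count_tokens(text) == len(token_ids)

# Round-trip: decode_single_token turns each id back into its text piece,
# which is what the naive subchunker later in this patch relies on.
pieces = [tokenizer.decode_single_token(t) for t in token_ids]
assert "".join(pieces) == text
```
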
diff --git a/cognee/infrastructure/llm/tokenizer/tokenizer_interface.py b/cognee/infrastructure/llm/tokenizer/tokenizer_interface.py
index c533f0cf9..456c69f64 100644
--- a/cognee/infrastructure/llm/tokenizer/tokenizer_interface.py
+++ b/cognee/infrastructure/llm/tokenizer/tokenizer_interface.py
@@ -14,5 +14,5 @@ class TokenizerInterface(Protocol):
         raise NotImplementedError
 
     @abstractmethod
-    def trim_text_to_max_tokens(self, text: str) -> str:
+    def decode_single_token(self, token: int) -> str:
         raise NotImplementedError
diff --git a/cognee/modules/graph/cognee_graph/CogneeGraph.py b/cognee/modules/graph/cognee_graph/CogneeGraph.py
index 491f83b5a..f9d9a74ec 100644
--- a/cognee/modules/graph/cognee_graph/CogneeGraph.py
+++ b/cognee/modules/graph/cognee_graph/CogneeGraph.py
@@ -113,8 +113,10 @@ class CogneeGraph(CogneeAbstractGraph):
 
         except (ValueError, TypeError) as e:
             print(f"Error projecting graph: {e}")
+            raise e
         except Exception as ex:
             print(f"Unexpected error: {ex}")
+            raise ex
 
     async def map_vector_distances_to_graph_nodes(self, node_distances) -> None:
         for category, scored_results in node_distances.items():
diff --git a/cognee/modules/settings/get_settings.py b/cognee/modules/settings/get_settings.py
index 93fd67cff..063c18971 100644
--- a/cognee/modules/settings/get_settings.py
+++ b/cognee/modules/settings/get_settings.py
@@ -13,6 +13,7 @@ class ModelName(Enum):
     openai = "openai"
     ollama = "ollama"
     anthropic = "anthropic"
+    gemini = "gemini"
 
 
 class LLMConfig(BaseModel):
@@ -72,6 +73,10 @@ def get_settings() -> SettingsDict:
             "value": "anthropic",
             "label": "Anthropic",
         },
+        {
+            "value": "gemini",
+            "label": "Gemini",
+        },
     ]
 
     return SettingsDict.model_validate(
@@ -136,6 +141,12 @@ def get_settings() -> SettingsDict:
                         "label": "Claude 3 Haiku",
                     },
                 ],
+                "gemini": [
+                    {
+                        "value": "gemini-2.0-flash-exp",
+                        "label": "Gemini 2.0 Flash",
+                    },
+                ],
             },
         },
         vector_db={
diff --git a/cognee/tasks/chunks/chunk_by_paragraph.py b/cognee/tasks/chunks/chunk_by_paragraph.py
index 34205d9f6..52f1f6674 100644
--- a/cognee/tasks/chunks/chunk_by_paragraph.py
+++ b/cognee/tasks/chunks/chunk_by_paragraph.py
@@ -1,8 +1,6 @@
-from typing import Any, Dict, Iterator, Optional, Union
+from typing import Any, Dict, Iterator
 from uuid import NAMESPACE_OID, uuid5
 
-import tiktoken
-
 from cognee.infrastructure.databases.vector import get_vector_engine
 
 from .chunk_by_sentence import chunk_by_sentence
@@ -19,7 +17,7 @@ def chunk_by_paragraph(
     When chunks are joined with empty string "", they reproduce the original text exactly.
 
     Notes:
-    - Tokenization is handled using the `tiktoken` library, ensuring compatibility with the vector engine's embedding model.
+    - Tokenization is handled using our tokenization adapters, ensuring compatibility with the vector engine's embedding model.
     - If `batch_paragraphs` is False, each paragraph will be yielded as a separate chunk.
     - Handles cases where paragraphs exceed the specified token or word limits by splitting them as needed.
     - Remaining text at the end of the input will be yielded as a final chunk.
@@ -31,17 +29,12 @@ def chunk_by_paragraph( last_cut_type = None current_token_count = 0 - vector_engine = get_vector_engine() - embedding_model = vector_engine.embedding_engine.model - embedding_model = embedding_model.split("/")[-1] - for paragraph_id, sentence, word_count, end_type in chunk_by_sentence( data, maximum_length=paragraph_length ): # Check if this sentence would exceed length limit - - tokenizer = tiktoken.encoding_for_model(embedding_model) - token_count = len(tokenizer.encode(sentence)) + embedding_engine = get_vector_engine().embedding_engine + token_count = embedding_engine.tokenizer.count_tokens(sentence) if current_word_count > 0 and ( current_word_count + word_count > paragraph_length diff --git a/cognee/tasks/ingestion/ingest_data.py b/cognee/tasks/ingestion/ingest_data.py index b19786d4e..04ce41517 100644 --- a/cognee/tasks/ingestion/ingest_data.py +++ b/cognee/tasks/ingestion/ingest_data.py @@ -30,9 +30,6 @@ async def ingest_data(data: Any, dataset_name: str, user: User): if hasattr(data_item, "dict") and inspect.ismethod(getattr(data_item, "dict")): return {"metadata": data_item.dict(), "origin": str(type(data_item))} else: - warnings.warn( - f"Data of type {type(data_item)}... does not have dict method. Returning empty metadata." - ) return {} @dlt.resource(standalone=True, primary_key="id", merge_key="id") diff --git a/cognee/tasks/repo_processor/get_source_code_chunks.py b/cognee/tasks/repo_processor/get_source_code_chunks.py index ca1c76e46..5607c6c37 100644 --- a/cognee/tasks/repo_processor/get_source_code_chunks.py +++ b/cognee/tasks/repo_processor/get_source_code_chunks.py @@ -3,33 +3,32 @@ from typing import AsyncGenerator, Generator from uuid import NAMESPACE_OID, uuid5 import parso -import tiktoken from cognee.infrastructure.databases.vector import get_vector_engine from cognee.infrastructure.engine import DataPoint from cognee.shared.CodeGraphEntities import CodeFile, CodePart, SourceCodeChunk +from cognee.infrastructure.llm import get_max_chunk_tokens logger = logging.getLogger(__name__) -def _count_tokens(tokenizer: tiktoken.Encoding, source_code: str) -> int: - return len(tokenizer.encode(source_code)) - - def _get_naive_subchunk_token_counts( - tokenizer: tiktoken.Encoding, source_code: str, max_subchunk_tokens: int = 8000 + source_code: str, max_subchunk_tokens ) -> list[tuple[str, int]]: """Splits source code into subchunks of up to max_subchunk_tokens and counts tokens.""" - token_ids = tokenizer.encode(source_code) + tokenizer = get_vector_engine().embedding_engine.tokenizer + token_ids = tokenizer.extract_tokens(source_code) subchunk_token_counts = [] for start_idx in range(0, len(token_ids), max_subchunk_tokens): subchunk_token_ids = token_ids[start_idx : start_idx + max_subchunk_tokens] token_count = len(subchunk_token_ids) + # Note: This can't work with Gemini embeddings as they keep their method of encoding text + # to tokens hidden and don't offer a decoder + # TODO: Add support for different tokenizers for this function subchunk = "".join( - tokenizer.decode_single_token_bytes(token_id).decode("utf-8", errors="replace") - for token_id in subchunk_token_ids + tokenizer.decode_single_token(token_id) for token_id in subchunk_token_ids ) subchunk_token_counts.append((subchunk, token_count)) @@ -37,15 +36,14 @@ def _get_naive_subchunk_token_counts( def _get_subchunk_token_counts( - tokenizer: tiktoken.Encoding, source_code: str, - max_subchunk_tokens: int = 8000, + max_subchunk_tokens, depth: int = 0, max_depth: int = 100, ) -> list[tuple[str, 
int]]:
     """Splits source code into subchunks and counts tokens for each subchunk."""
     if depth > max_depth:
-        return _get_naive_subchunk_token_counts(tokenizer, source_code, max_subchunk_tokens)
+        return _get_naive_subchunk_token_counts(source_code, max_subchunk_tokens)
 
     try:
         module = parso.parse(source_code)
@@ -64,7 +62,8 @@ def _get_subchunk_token_counts(
     subchunk_token_counts = []
     for child in module.children:
         subchunk = child.get_code()
-        token_count = _count_tokens(tokenizer, subchunk)
+        tokenizer = get_vector_engine().embedding_engine.tokenizer
+        token_count = tokenizer.count_tokens(subchunk)
 
         if token_count == 0:
             continue
@@ -75,13 +74,13 @@ def _get_subchunk_token_counts(
 
         if child.type == "string":
             subchunk_token_counts.extend(
-                _get_naive_subchunk_token_counts(tokenizer, subchunk, max_subchunk_tokens)
+                _get_naive_subchunk_token_counts(subchunk, max_subchunk_tokens)
             )
             continue
 
         subchunk_token_counts.extend(
             _get_subchunk_token_counts(
-                tokenizer, subchunk, max_subchunk_tokens, depth=depth + 1, max_depth=max_depth
+                subchunk, max_subchunk_tokens, depth=depth + 1, max_depth=max_depth
             )
         )
 
@@ -96,22 +95,19 @@ def _get_chunk_source_code(
     cumulative_counts = []
     current_source_code = ""
 
-    # Get embedding engine used in vector database
-    embedding_engine = get_vector_engine().embedding_engine
-
     for i, (child_code, token_count) in enumerate(code_token_counts):
         current_count += token_count
         cumulative_counts.append(current_count)
-        if current_count > embedding_engine.max_tokens:
+        if current_count > get_max_chunk_tokens():
             break
         current_source_code += f"\n{child_code}"
 
-    if current_count <= embedding_engine.max_tokens:
+    if current_count <= get_max_chunk_tokens():
         return [], current_source_code.strip()
 
     cutoff = 1
     for i, cum_count in enumerate(cumulative_counts):
-        if cum_count > (1 - overlap) * embedding_engine.max_tokens:
+        if cum_count > (1 - overlap) * get_max_chunk_tokens():
             break
         cutoff = i
 
@@ -121,19 +117,16 @@
 def get_source_code_chunks_from_code_part(
     code_file_part: CodePart,
     overlap: float = 0.25,
-    granularity: float = 0.1,
+    granularity: float = 0.09,
 ) -> Generator[SourceCodeChunk, None, None]:
     """Yields source code chunks from a CodePart object, with configurable token limits and overlap."""
     if not code_file_part.source_code:
         logger.error(f"No source code in CodeFile {code_file_part.id}")
         return
 
-    embedding_engine = get_vector_engine().embedding_engine
-    tokenizer = embedding_engine.tokenizer
-
-    max_subchunk_tokens = max(1, int(granularity * embedding_engine.max_tokens))
+    max_subchunk_tokens = max(1, int(granularity * get_max_chunk_tokens()))
     subchunk_token_counts = _get_subchunk_token_counts(
-        tokenizer, code_file_part.source_code, max_subchunk_tokens
+        code_file_part.source_code, max_subchunk_tokens
    )
 
     previous_chunk = None
@@ -157,7 +150,6 @@ async def get_source_code_chunks(
     data_points: list[DataPoint],
 ) -> AsyncGenerator[list[DataPoint], None]:
     """Processes code graph data points and creates SourceCodeChunk data points."""
-    # TODO: Add support for other embedding models, with max_token mapping
     for data_point in data_points:
         try:
             yield data_point
@@ -173,5 +165,7 @@ async def get_source_code_chunks(
                     yield source_code_chunk
             except Exception as e:
                 logger.error(f"Error processing code part: {e}")
+                raise e
         except Exception as e:
             logger.error(f"Error processing data point: {e}")
+            raise e
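
A compact reviewer sketch of the token-budget logic above, not part of the patch: it assumes a configured vector engine and LLM config, and the `granularity` value and sample source string are illustrative. `get_max_chunk_tokens` is the helper this patch imports from `cognee.infrastructure.llm`.

```python
from cognee.infrastructure.databases.vector import get_vector_engine
from cognee.infrastructure.llm import get_max_chunk_tokens

# Mirrors get_source_code_chunks_from_code_part: the subchunk budget is a
# fraction (granularity) of the global maximum chunk token budget.
granularity = 0.09  # the new default in this patch
max_subchunk_tokens = max(1, int(granularity * get_max_chunk_tokens()))

tokenizer = get_vector_engine().embedding_engine.tokenizer
source = "def add(a, b):\n    return a + b\n"

if tokenizer.count_tokens(source) > max_subchunk_tokens:
    print("source would be split into subchunks")
else:
    print("source fits in a single subchunk")
```
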
diff --git a/evals/eval_on_hotpot.py b/evals/eval_on_hotpot.py
index 4f3198d87..75ad82954 100644
--- a/evals/eval_on_hotpot.py
+++ b/evals/eval_on_hotpot.py
@@ -114,7 +114,7 @@ async def eval_on_QA_dataset(
     if not out_path.exists():
         out_path.mkdir(parents=True, exist_ok=True)
 
-    random.seed(42)
+    random.seed(43)
     instances = dataset if not num_samples else random.sample(dataset, num_samples)
 
     contexts_filename = out_path / Path(
diff --git a/evals/qa_eval_parameters.json b/evals/qa_eval_parameters.json
index 6d60ab56f..8ae82b2e8 100644
--- a/evals/qa_eval_parameters.json
+++ b/evals/qa_eval_parameters.json
@@ -4,7 +4,6 @@
   ],
   "rag_option": [
     "cognee_incremental",
-    "cognee",
     "no_rag",
     "simple_rag",
     "brute_force"
@@ -14,10 +13,6 @@
   ],
   "metric_names": [
     "Correctness",
-    "Comprehensiveness",
-    "Directness",
-    "Diversity",
-    "Empowerment",
-    "promptfoo.directness"
+    "Comprehensiveness"
   ]
 }
diff --git a/poetry.lock b/poetry.lock
index 542e0377b..ee4e41039 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -7241,7 +7241,7 @@ crt = ["botocore[crt] (>=1.36.0,<2.0a.0)"]
 name = "safetensors"
 version = "0.5.2"
 description = ""
-optional = false
+optional = true
 python-versions = ">=3.7"
 files = [
     {file = "safetensors-0.5.2-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:45b6092997ceb8aa3801693781a71a99909ab9cc776fbc3fa9322d29b1d3bef2"},
@@ -8079,7 +8079,7 @@ test = ["argcomplete (>=3.0.3)", "mypy (>=1.7.0)", "pre-commit", "pytest (>=7.0,
 name = "transformers"
 version = "4.48.1"
 description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow"
-optional = false
+optional = true
 python-versions = ">=3.9.0"
 files = [
     {file = "transformers-4.48.1-py3-none-any.whl", hash = "sha256:24be0564b0a36d9e433d9a65de248f1545b6f6edce1737669605eb6a8141bbbb"},
@@ -9040,6 +9040,7 @@ falkordb = ["falkordb"]
 filesystem = ["botocore"]
 gemini = ["google-generativeai"]
 groq = ["groq"]
+huggingface = ["transformers"]
 langchain = ["langchain_text_splitters", "langsmith"]
 llama-index = ["llama-index-core"]
 milvus = ["pymilvus"]
@@ -9053,4 +9054,4 @@ weaviate = ["weaviate-client"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.10.0,<3.13"
-content-hash = "480675c274cd85a76a95bf03af865b1a0b462f25bbc21d7427b0a0b8e21c13db"
+content-hash = "e0752df2545fd5048c0969acc7282fce8e034ec0abfabfe07785e7d34c44fc8b"
diff --git a/pyproject.toml b/pyproject.toml
index 98497cded..cdca4e755 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -70,7 +70,7 @@ pgvector = {version = "^0.3.5", optional = true}
 psycopg2 = {version = "^2.9.10", optional = true}
 llama-index-core = {version = "^0.12.11", optional = true}
 deepeval = {version = "^2.0.1", optional = true}
-transformers = "^4.46.3"
+transformers = {version = "^4.46.3", optional = true}
 pymilvus = {version = "^2.5.0", optional = true}
 unstructured = { extras = ["csv", "doc", "docx", "epub", "md", "odt", "org", "ppt", "pptx", "rst", "rtf", "tsv", "xlsx"], version = "^0.16.13", optional = true }
 pre-commit = "^4.0.1"
@@ -92,6 +92,7 @@ notebook = ["notebook", "ipykernel", "overrides", "ipywidgets", "jupyterlab", "j
 langchain = ["langsmith", "langchain_text_splitters"]
 llama-index = ["llama-index-core"]
 gemini = ["google-generativeai"]
+huggingface = ["transformers"]
 deepeval = ["deepeval"]
 posthog = ["posthog"]
 falkordb = ["falkordb"]
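
Finally, the configuration sketch referenced in the description: an end-to-end way to exercise the new provider, not part of the patch. The LLM keys mirror the `.env.template` entries touched above; the embedding model id and the `gemini` embedding provider value are assumptions (any embedding model LiteLLM can route should work), and the API keys are placeholders.

```python
import os

# Mirrors the .env.template keys touched by this patch.
os.environ["LLM_PROVIDER"] = "gemini"
os.environ["LLM_MODEL"] = "gemini/gemini-2.0-flash-exp"  # the option added to get_settings
os.environ["LLM_API_KEY"] = "<your-gemini-api-key>"

# Embedding settings: the model id here is an assumption, not taken from the patch.
os.environ["EMBEDDING_PROVIDER"] = "gemini"
os.environ["EMBEDDING_MODEL"] = "gemini/text-embedding-004"
os.environ["EMBEDDING_API_KEY"] = "<your-gemini-api-key>"

from cognee.infrastructure.llm.get_llm_client import get_llm_client

client = get_llm_client()  # resolves LLMProvider.GEMINI -> GeminiAdapter
print(type(client).__name__)
```
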