feat: Add gemini support [COG-1023] (#485)
## Description
This PR integrates the Gemini support contributed by holchan:
1. Add Gemini LLM and Gemini embedding support
2. Fix a CodeGraph issue where chunks could exceed the maximum token limit
3. Add tokenizer adapters to CodeGraph
## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.
<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit
- **New Features**
  - Added support for the Gemini LLM provider.
  - Expanded LLM configuration options.
  - Introduced a new GitHub Actions workflow for multimetric QA evaluation.
  - Added new environment variables for LLM and embedding configurations across various workflows.
- **Bug Fixes**
  - Improved error handling in various components.
  - Updated tokenization and embedding processes.
  - Removed warning related to missing `dict` method in data items.
- **Refactor**
  - Simplified token extraction and decoding methods.
  - Updated tokenizer interfaces.
  - Removed deprecated dependencies.
  - Enhanced retry logic and error handling in embedding processes.
- **Documentation**
  - Updated configuration comments and settings.
- **Chores**
  - Updated GitHub Actions workflows to accommodate new secrets and environment variables.
  - Modified evaluation parameters.
  - Adjusted dependency management for optional libraries.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
---------
Co-authored-by: holchan <61059652+holchan@users.noreply.github.com>
Co-authored-by: Boris <boris@topoteretes.com>
Parent: f843c256e4
Commit: 8879f3fbbe

42 changed files with 494 additions and 101 deletions
Environment template update:

```diff
@@ -1,7 +1,7 @@
 ENV="local"
 TOKENIZERS_PARALLELISM="false"
 
-# LLM settings
+# LLM Configuration
 LLM_API_KEY=""
 LLM_MODEL="openai/gpt-4o-mini"
 LLM_PROVIDER="openai"
@@ -14,7 +14,7 @@ GRAPHISTRY_PASSWORD=
 
 SENTRY_REPORTING_URL=
 
-# Embedding settings
+# Embedding Configuration
 EMBEDDING_PROVIDER="openai"
 EMBEDDING_API_KEY=""
 EMBEDDING_MODEL="openai/text-embedding-3-large"
```
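For a Gemini-based setup, the template above might plausibly be filled in as follows; the provider and model identifiers and the key are illustrative assumptions, not defaults shipped by this PR.

```python
# Hypothetical Gemini-flavored settings mirroring the template above.
# The model id follows LiteLLM's "provider/model" convention; the key is a placeholder.
import os

os.environ["LLM_PROVIDER"] = "gemini"
os.environ["LLM_MODEL"] = "gemini/gemini-2.0-flash-exp"
os.environ["LLM_API_KEY"] = "<your-google-api-key>"
```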
`.github/workflows/reusable_notebook.yml` (25 lines changed):

```diff
@@ -12,8 +12,24 @@ on:
       required: true
     GRAPHISTRY_PASSWORD:
       required: true
+    #LLM_MODEL:
+    # required: true
+    #LLM_ENDPOINT:
+    # required: true
+    LLM_API_KEY:
+      required: true
+    OPENAI_API_KEY:
+      required: true
+    #LLM_API_VERSION:
+    # required: true
+    EMBEDDING_MODEL:
+      required: true
+    EMBEDDING_ENDPOINT:
+      required: true
+    EMBEDDING_API_KEY:
+      required: true
+    EMBEDDING_API_VERSION:
+      required: true
 
 env:
   RUNTIME__LOG_LEVEL: ERROR
@@ -50,8 +66,15 @@ jobs:
       - name: Execute Jupyter Notebook
         env:
           ENV: 'dev'
+          #LLM_MODEL: ${{ secrets.LLM_MODEL }}
+          #LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+          LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} # Use OpenAI Until a multimedia model is deployed and DeepEval support for other models is added
+          #LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+          EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+          EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+          EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+          EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
           GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }}
           GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }}
         run: |
```
`.github/workflows/reusable_python_example.yml` (26 lines changed):

```diff
@@ -16,7 +16,23 @@ on:
       required: true
     GRAPHISTRY_PASSWORD:
       required: true
+    LLM_MODEL:
+      required: true
+    LLM_ENDPOINT:
+      required: true
+    LLM_API_KEY:
+      required: true
+    OPENAI_API_KEY:
+      required: false
+    LLM_API_VERSION:
+      required: true
+    EMBEDDING_MODEL:
+      required: true
+    EMBEDDING_ENDPOINT:
+      required: true
+    EMBEDDING_API_KEY:
+      required: true
+    EMBEDDING_API_VERSION:
+      required: true
 
 env:
@@ -54,7 +70,15 @@ jobs:
         env:
           ENV: 'dev'
           PYTHONFAULTHANDLER: 1
-          LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          LLM_MODEL: ${{ secrets.LLM_MODEL }}
+          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+          EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+          EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+          EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
           GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }}
           GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }}
         run: poetry run python ${{ inputs.example-location }} ${{ inputs.arguments }}
```
Workflow running `./examples/python/code_graph_example.py`:

```diff
@@ -17,6 +17,13 @@ jobs:
       example-location: ./examples/python/code_graph_example.py
       arguments: "--repo_path ./evals"
     secrets:
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+      LLM_MODEL: ${{ secrets.LLM_MODEL }}
+      LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+      LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+      LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+      EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+      EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+      EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+      EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
       GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }}
       GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }}
```
Workflow running `notebooks/cognee_llama_index.ipynb`:

```diff
@@ -15,6 +15,14 @@ jobs:
     with:
       notebook-location: notebooks/cognee_llama_index.ipynb
     secrets:
+      #LLM_MODEL: ${{ secrets.LLM_MODEL }}
+      #LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+      LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+      #LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+      EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+      EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+      EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+      EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
       GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }}
       GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }}
```
Workflow running `notebooks/cognee_multimedia_demo.ipynb`:

```diff
@@ -15,6 +15,14 @@ jobs:
     with:
       notebook-location: notebooks/cognee_multimedia_demo.ipynb
     secrets:
+      #LLM_MODEL: ${{ secrets.LLM_MODEL }}
+      #LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+      LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+      #LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+      EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+      EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+      EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+      EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
       GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }}
       GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }}
```
`.github/workflows/test_deduplication.yml` (9 lines changed):

```diff
@@ -57,5 +57,12 @@ jobs:
       - name: Run deduplication test
         env:
           ENV: 'dev'
-          LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          LLM_MODEL: ${{ secrets.LLM_MODEL }}
+          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+          EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+          EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+          EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+          EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
         run: poetry run python ./cognee/tests/test_deduplication.py
```
Workflow running `./examples/python/dynamic_steps_example.py` through the reusable example:

```diff
@@ -16,6 +16,13 @@ jobs:
     with:
       example-location: ./examples/python/dynamic_steps_example.py
     secrets:
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+      LLM_MODEL: ${{ secrets.LLM_MODEL }}
+      LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+      LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+      LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+      EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+      EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+      EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+      EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
       GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }}
       GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }}
```
Workflow running `./examples/python/dynamic_steps_example.py` directly:

```diff
@@ -38,5 +38,12 @@ jobs:
         env:
           ENV: 'dev'
           PYTHONFAULTHANDLER: 1
-          LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          LLM_MODEL: ${{ secrets.LLM_MODEL }}
+          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+          EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+          EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+          EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+          EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
         run: poetry run python ./examples/python/dynamic_steps_example.py
```
Workflow running `notebooks/llama_index_cognee_integration.ipynb`:

```diff
@@ -15,6 +15,14 @@ jobs:
     with:
       notebook-location: notebooks/llama_index_cognee_integration.ipynb
     secrets:
+      #LLM_MODEL: ${{ secrets.LLM_MODEL }}
+      #LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+      LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+      #LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+      EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+      EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+      EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+      EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
       GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }}
       GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }}
```
`.github/workflows/test_milvus.yml` (9 lines changed):

```diff
@@ -47,7 +47,14 @@ jobs:
       - name: Run default basic pipeline
         env:
           ENV: 'dev'
-          LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          LLM_MODEL: ${{ secrets.LLM_MODEL }}
+          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+          EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+          EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+          EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+          EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
         run: poetry run python ./cognee/tests/test_milvus.py
 
       - name: Clean up disk space
```
Workflow running `./examples/python/multimedia_example.py`:

```diff
@@ -16,6 +16,13 @@ jobs:
     with:
       example-location: ./examples/python/multimedia_example.py
     secrets:
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+      #LLM_MODEL: ${{ secrets.LLM_MODEL }}
+      #LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+      LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} # Use OpenAI until we deploy models to handle multimedia
+      #LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+      EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+      EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+      EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+      EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
       GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }}
       GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }}
```
`.github/workflows/test_multimetric_qa_eval_run.yaml` (new file, 30 lines):

```yaml
name: test | multimetric qa eval run

on:
  workflow_dispatch:
  pull_request:
    types: [labeled, synchronize]

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
  run_multimetric_qa_eval_test:
    uses: ./.github/workflows/reusable_python_example.yml
    with:
      example-location: ./evals/multimetric_qa_eval_run.py
      arguments: "--params_file evals/qa_eval_parameters.json --out_dir dirname"
    secrets:
      LLM_MODEL: ${{ secrets.LLM_MODEL }}
      LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
      LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
      LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} # Until we add support for azure for DeepEval
      EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
      EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
      EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
      EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
      GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }}
      GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }}
```
`.github/workflows/test_neo4j.yml` (9 lines changed):

```diff
@@ -43,7 +43,14 @@ jobs:
       - name: Run default Neo4j
         env:
           ENV: 'dev'
-          LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          LLM_MODEL: ${{ secrets.LLM_MODEL }}
+          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+          EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+          EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+          EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+          EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
           GRAPH_DATABASE_URL: ${{ secrets.NEO4J_API_URL }}
           GRAPH_DATABASE_PASSWORD: ${{ secrets.NEO4J_API_KEY }}
           GRAPH_DATABASE_USERNAME: "neo4j"
```
`.github/workflows/test_notebook.yml` (8 lines changed):

```diff
@@ -16,6 +16,14 @@ jobs:
     with:
       notebook-location: notebooks/cognee_demo.ipynb
     secrets:
+      #LLM_MODEL: ${{ secrets.LLM_MODEL }}
+      #LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+      LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+      #LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+      EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+      EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+      EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+      EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
       GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }}
       GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }}
```
`.github/workflows/test_pgvector.yml` (9 lines changed):

```diff
@@ -58,5 +58,12 @@ jobs:
       - name: Run default PGVector
         env:
           ENV: 'dev'
-          LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          LLM_MODEL: ${{ secrets.LLM_MODEL }}
+          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+          EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+          EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+          EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+          EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
         run: poetry run python ./cognee/tests/test_pgvector.py
```
`.github/workflows/test_python_3_10.yml` (9 lines changed):

```diff
@@ -56,7 +56,14 @@ jobs:
       - name: Run default basic pipeline
         env:
           ENV: 'dev'
-          LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          LLM_MODEL: ${{ secrets.LLM_MODEL }}
+          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+          EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+          EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+          EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+          EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
         run: poetry run python ./cognee/tests/test_library.py
 
       - name: Clean up disk space
```
`.github/workflows/test_python_3_11.yml` (9 lines changed):

```diff
@@ -58,7 +58,14 @@ jobs:
       - name: Run default basic pipeline
         env:
           ENV: 'dev'
-          LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          LLM_MODEL: ${{ secrets.LLM_MODEL }}
+          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+          EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+          EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+          EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+          EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
         run: poetry run python ./cognee/tests/test_library.py
 
       - name: Clean up disk space
```
`.github/workflows/test_python_3_12.yml` (9 lines changed):

```diff
@@ -56,7 +56,14 @@ jobs:
       - name: Run default basic pipeline
         env:
           ENV: 'dev'
-          LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          LLM_MODEL: ${{ secrets.LLM_MODEL }}
+          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+          EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+          EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+          EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+          EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
         run: poetry run python ./cognee/tests/test_library.py
 
       - name: Clean up disk space
```
`.github/workflows/test_qdrant.yml` (9 lines changed):

```diff
@@ -44,7 +44,14 @@ jobs:
       - name: Run default Qdrant
         env:
           ENV: 'dev'
-          LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          LLM_MODEL: ${{ secrets.LLM_MODEL }}
+          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+          EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+          EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+          EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+          EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
           VECTOR_DB_URL: ${{ secrets.QDRANT_API_URL }}
           VECTOR_DB_KEY: ${{ secrets.QDRANT_API_KEY }}
         run: poetry run python ./cognee/tests/test_qdrant.py
```
`.github/workflows/test_simple_example.yml` (9 lines changed):

```diff
@@ -16,6 +16,13 @@ jobs:
     with:
       example-location: ./examples/python/simple_example.py
     secrets:
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+      LLM_MODEL: ${{ secrets.LLM_MODEL }}
+      LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+      LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+      LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+      EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+      EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+      EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+      EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
       GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }}
       GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }}
```
`.github/workflows/test_weaviate.yml` (9 lines changed):

```diff
@@ -44,7 +44,14 @@ jobs:
       - name: Run default Weaviate
         env:
           ENV: 'dev'
-          LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          LLM_MODEL: ${{ secrets.LLM_MODEL }}
+          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+          EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+          EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+          EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+          EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
           VECTOR_DB_URL: ${{ secrets.WEAVIATE_API_URL }}
           VECTOR_DB_KEY: ${{ secrets.WEAVIATE_API_KEY }}
         run: poetry run python ./cognee/tests/test_weaviate.py
```
Settings DTO update:

```diff
@@ -21,7 +21,7 @@ class SettingsDTO(OutDTO):
 
 
 class LLMConfigInputDTO(InDTO):
-    provider: Union[Literal["openai"], Literal["ollama"], Literal["anthropic"]]
+    provider: Union[Literal["openai"], Literal["ollama"], Literal["anthropic"], Literal["gemini"]]
     model: str
     api_key: str
```
`EmbeddingException` update:

```diff
@@ -1,4 +1,14 @@
-class EmbeddingException(Exception):
+from cognee.exceptions import CogneeApiError
+from fastapi import status
+
+
+class EmbeddingException(CogneeApiError):
     """Custom exception for handling embedding-related errors."""
 
-    pass
+    def __init__(
+        self,
+        message: str = "Embedding Exception.",
+        name: str = "EmbeddingException",
+        status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
+    ):
+        super().__init__(message, name, status_code)
```
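A minimal, self-contained sketch of what the richer exception enables; `ApiErrorStandIn` mimics the assumed `(message, name, status_code)` constructor of `CogneeApiError`, which is inferred from the `super().__init__` call above.

```python
from fastapi import status

# ApiErrorStandIn mimics CogneeApiError's assumed constructor so this
# example runs on its own, without the cognee package.
class ApiErrorStandIn(Exception):
    def __init__(self, message: str, name: str, status_code: int):
        super().__init__(message)
        self.name = name
        self.status_code = status_code


class EmbeddingExceptionSketch(ApiErrorStandIn):
    def __init__(self, message: str = "Embedding Exception."):
        super().__init__(message, "EmbeddingException", status.HTTP_422_UNPROCESSABLE_ENTITY)


try:
    raise EmbeddingExceptionSketch("Failed to index data points.")
except EmbeddingExceptionSketch as error:
    print(error.name, error.status_code)  # EmbeddingException 422
```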
`LiteLLMEmbeddingEngine` update:

```diff
@@ -23,10 +23,12 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine):
     dimensions: int
     mock: bool
 
+    MAX_RETRIES = 5
+
     def __init__(
         self,
-        model: Optional[str] = "openai/text-embedding-3-large",
+        provider: str = "openai",
+        model: Optional[str] = "text-embedding-3-large",
         dimensions: Optional[int] = 3072,
         api_key: str = None,
         endpoint: str = None,
@@ -41,15 +43,13 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine):
         self.dimensions = dimensions
         self.max_tokens = max_tokens
+        self.tokenizer = self.get_tokenizer()
+        self.retry_count = 0
 
         enable_mocking = os.getenv("MOCK_EMBEDDING", "false")
         if isinstance(enable_mocking, bool):
             enable_mocking = str(enable_mocking).lower()
         self.mock = enable_mocking in ("true", "1", "yes")
 
-    MAX_RETRIES = 5
-    retry_count = 0
-
     async def embed_text(self, text: List[str]) -> List[List[float]]:
         async def exponential_backoff(attempt):
             wait_time = min(10 * (2**attempt), 60)  # Max 60 seconds
@@ -64,14 +64,14 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine):
             return [data["embedding"] for data in response["data"]]
         else:
             response = await litellm.aembedding(
-                self.model,
+                model=self.model,
                 input=text,
                 api_key=self.api_key,
                 api_base=self.endpoint,
                 api_version=self.api_version,
             )
 
-            self.retry_count = 0
+            self.retry_count = 0  # Reset retry count on successful call
 
             return [data["embedding"] for data in response.data]
@@ -99,13 +99,16 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine):
                 raise Exception("Rate limit exceeded and no more retries left.")
 
             await exponential_backoff(self.retry_count)
 
             self.retry_count += 1
 
             return await self.embed_text(text)
 
-        except litellm.exceptions.BadRequestError:
-            raise EmbeddingException("Failed to index data points.")
+        except (
+            litellm.exceptions.BadRequestError,
+            litellm.exceptions.NotFoundError,
+        ) as e:
+            logger.error(f"Embedding error with model {self.model}: {str(e)}")
+            raise EmbeddingException(f"Failed to index data points using model {self.model}")
 
         except Exception as error:
             logger.error("Error embedding text: %s", str(error))
```
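The retry path relies on capped exponential backoff; below is a self-contained sketch of that pattern, with `RuntimeError` standing in for LiteLLM's rate-limit exception.

```python
import asyncio

# Sketch of the retry pattern above: wait 10 * 2**attempt seconds,
# capped at 60, between attempts (mirrors exponential_backoff).
async def exponential_backoff(attempt: int) -> None:
    wait_time = min(10 * (2 ** attempt), 60)  # max 60 seconds
    await asyncio.sleep(wait_time)


async def call_with_retries(make_call, max_retries: int = 5):
    for attempt in range(max_retries):
        try:
            return await make_call()
        except RuntimeError:  # stand-in for litellm's rate-limit error
            await exponential_backoff(attempt)
    raise Exception("Rate limit exceeded and no more retries left.")
```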
`cognee/infrastructure/llm/gemini/__init__.py` (new file, empty)

`cognee/infrastructure/llm/gemini/adapter.py` (new file, 155 lines):

```python
from typing import Type, Optional
from pydantic import BaseModel
import logging
import litellm
import asyncio
from litellm import acompletion, JSONSchemaValidationError
from cognee.shared.data_models import MonitoringTool
from cognee.exceptions import InvalidValueError
from cognee.infrastructure.llm.llm_interface import LLMInterface
from cognee.infrastructure.llm.prompts import read_query_prompt
from cognee.base_config import get_base_config

logger = logging.getLogger(__name__)

monitoring = get_base_config().monitoring_tool
if monitoring == MonitoringTool.LANGFUSE:
    from langfuse.decorators import observe


class GeminiAdapter(LLMInterface):
    MAX_RETRIES = 5

    def __init__(
        self,
        api_key: str,
        model: str,
        max_tokens: int,
        endpoint: Optional[str] = None,
        api_version: Optional[str] = None,
        streaming: bool = False,
    ) -> None:
        self.api_key = api_key
        self.model = model
        self.endpoint = endpoint
        self.api_version = api_version
        self.streaming = streaming
        self.max_tokens = max_tokens

    @observe(as_type="generation")
    async def acreate_structured_output(
        self, text_input: str, system_prompt: str, response_model: Type[BaseModel]
    ) -> BaseModel:
        try:
            response_schema = {
                "type": "object",
                "properties": {
                    "summary": {"type": "string"},
                    "description": {"type": "string"},
                    "nodes": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "name": {"type": "string"},
                                "type": {"type": "string"},
                                "description": {"type": "string"},
                                "id": {"type": "string"},
                                "label": {"type": "string"},
                            },
                            "required": ["name", "type", "description", "id", "label"],
                        },
                    },
                    "edges": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "source_node_id": {"type": "string"},
                                "target_node_id": {"type": "string"},
                                "relationship_name": {"type": "string"},
                            },
                            "required": ["source_node_id", "target_node_id", "relationship_name"],
                        },
                    },
                },
                "required": ["summary", "description", "nodes", "edges"],
            }

            simplified_prompt = f"""
            {system_prompt}

            IMPORTANT: Your response must be a valid JSON object with these required fields:
            1. summary: A brief summary
            2. description: A detailed description
            3. nodes: Array of nodes with name, type, description, id, and label
            4. edges: Array of edges with source_node_id, target_node_id, and relationship_name

            Example structure:
            {{
                "summary": "Brief summary",
                "description": "Detailed description",
                "nodes": [
                    {{
                        "name": "Example Node",
                        "type": "Concept",
                        "description": "Node description",
                        "id": "example-id",
                        "label": "Concept"
                    }}
                ],
                "edges": [
                    {{
                        "source_node_id": "source-id",
                        "target_node_id": "target-id",
                        "relationship_name": "relates_to"
                    }}
                ]
            }}"""

            messages = [
                {"role": "system", "content": simplified_prompt},
                {"role": "user", "content": text_input},
            ]

            try:
                response = await acompletion(
                    model=f"{self.model}",
                    messages=messages,
                    api_key=self.api_key,
                    max_tokens=self.max_tokens,
                    temperature=0.1,
                    response_format={"type": "json_object", "schema": response_schema},
                    timeout=10,
                    num_retries=self.MAX_RETRIES,
                )

                if response.choices and response.choices[0].message.content:
                    content = response.choices[0].message.content
                    return response_model.model_validate_json(content)

            except litellm.exceptions.BadRequestError as e:
                logger.error(f"Bad request error: {str(e)}")
                raise ValueError(f"Invalid request: {str(e)}")

            raise ValueError("Failed to get valid response after retries")

        except JSONSchemaValidationError as e:
            logger.error(f"Schema validation failed: {str(e)}")
            logger.debug(f"Raw response: {e.raw_response}")
            raise ValueError(f"Response failed schema validation: {str(e)}")

    def show_prompt(self, text_input: str, system_prompt: str) -> str:
        """Format and display the prompt for a user query."""
        if not text_input:
            text_input = "No user input provided."
        if not system_prompt:
            raise InvalidValueError(message="No system prompt path provided.")
        system_prompt = read_query_prompt(system_prompt)

        formatted_prompt = (
            f"""System Prompt:\n{system_prompt}\n\nUser Input:\n{text_input}\n"""
            if system_prompt
            else None
        )
        return formatted_prompt
```
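A hedged usage sketch for the new adapter follows; `KnowledgeGraph` and its node/edge models are stand-ins shaped after the response schema above, and the model id and API key are placeholders rather than values fixed by this PR.

```python
# Hypothetical driver for GeminiAdapter; assumes the adapter module imports
# cleanly in your monitoring configuration, and uses placeholder credentials.
import asyncio
from typing import List
from pydantic import BaseModel

from cognee.infrastructure.llm.gemini.adapter import GeminiAdapter


class Node(BaseModel):
    name: str
    type: str
    description: str
    id: str
    label: str


class Edge(BaseModel):
    source_node_id: str
    target_node_id: str
    relationship_name: str


class KnowledgeGraph(BaseModel):
    summary: str
    description: str
    nodes: List[Node]
    edges: List[Edge]


async def main() -> None:
    adapter = GeminiAdapter(
        api_key="<google-api-key>",  # placeholder
        model="gemini/gemini-2.0-flash-exp",  # assumed LiteLLM-style model id
        max_tokens=8192,
    )
    graph = await adapter.acreate_structured_output(
        text_input="Ada Lovelace collaborated with Charles Babbage.",
        system_prompt="Extract a small knowledge graph from the text.",
        response_model=KnowledgeGraph,
    )
    print(graph.summary)


if __name__ == "__main__":
    asyncio.run(main())
```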
LLM client factory update:

```diff
@@ -12,6 +12,7 @@ class LLMProvider(Enum):
     OLLAMA = "ollama"
     ANTHROPIC = "anthropic"
     CUSTOM = "custom"
+    GEMINI = "gemini"
 
 
 def get_llm_client():
@@ -78,5 +79,20 @@ def get_llm_client():
             max_tokens=max_tokens,
         )
 
+    elif provider == LLMProvider.GEMINI:
+        if llm_config.llm_api_key is None:
+            raise InvalidValueError(message="LLM API key is not set.")
+
+        from .gemini.adapter import GeminiAdapter
+
+        return GeminiAdapter(
+            api_key=llm_config.llm_api_key,
+            model=llm_config.llm_model,
+            max_tokens=max_tokens,
+            endpoint=llm_config.llm_endpoint,
+            api_version=llm_config.llm_api_version,
+            streaming=llm_config.llm_streaming,
+        )
+
     else:
         raise InvalidValueError(message=f"Unsupported LLM provider: {provider}")
```
`OpenAIAdapter` update:

```diff
@@ -23,6 +23,8 @@ class OpenAIAdapter(LLMInterface):
     api_key: str
     api_version: str
 
+    MAX_RETRIES = 5
+
     """Adapter for OpenAI's GPT-3, GPT=4 API"""
 
     def __init__(
@@ -68,7 +70,7 @@ class OpenAIAdapter(LLMInterface):
             api_base=self.endpoint,
             api_version=self.api_version,
             response_model=response_model,
-            max_retries=5,
+            max_retries=self.MAX_RETRIES,
         )
 
     @observe
@@ -94,7 +96,7 @@ class OpenAIAdapter(LLMInterface):
             api_base=self.endpoint,
             api_version=self.api_version,
             response_model=response_model,
-            max_retries=5,
+            max_retries=self.MAX_RETRIES,
         )
 
     def create_transcript(self, input):
@@ -112,7 +114,7 @@ class OpenAIAdapter(LLMInterface):
             api_key=self.api_key,
             api_base=self.endpoint,
             api_version=self.api_version,
-            max_retries=5,
+            max_retries=self.MAX_RETRIES,
         )
 
         return transcription
@@ -144,7 +146,7 @@ class OpenAIAdapter(LLMInterface):
             api_base=self.endpoint,
             api_version=self.api_version,
             max_tokens=300,
-            max_retries=5,
+            max_retries=self.MAX_RETRIES,
         )
 
     def show_prompt(self, text_input: str, system_prompt: str) -> str:
```
`GeminiTokenizer` update:

```diff
@@ -1,4 +1,4 @@
-from typing import List, Any
+from typing import List, Any, Union
 
 from ..tokenizer_interface import TokenizerInterface
 
@@ -26,6 +26,10 @@ class GeminiTokenizer(TokenizerInterface):
     def extract_tokens(self, text: str) -> List[Any]:
         raise NotImplementedError
 
+    def decode_single_token(self, encoding: int):
+        # Gemini tokenizer doesn't have the option to decode tokens
+        raise NotImplementedError
+
     def count_tokens(self, text: str) -> int:
         """
         Returns the number of tokens in the given text.
@@ -39,6 +43,3 @@ class GeminiTokenizer(TokenizerInterface):
         import google.generativeai as genai
 
         return len(genai.embed_content(model=f"models/{self.model}", content=text))
-
-    def trim_text_to_max_tokens(self, text: str) -> str:
-        raise NotImplementedError
```
`HuggingFaceTokenizer` update:

```diff
@@ -1,7 +1,5 @@
 from typing import List, Any
 
-from transformers import AutoTokenizer
-
 from ..tokenizer_interface import TokenizerInterface
 
 
@@ -14,6 +12,9 @@ class HuggingFaceTokenizer(TokenizerInterface):
         self.model = model
         self.max_tokens = max_tokens
 
+        # Import here to make it an optional dependency
+        from transformers import AutoTokenizer
+
         self.tokenizer = AutoTokenizer.from_pretrained(model)
 
     def extract_tokens(self, text: str) -> List[Any]:
@@ -32,5 +33,6 @@ class HuggingFaceTokenizer(TokenizerInterface):
         """
         return len(self.tokenizer.tokenize(text))
 
-    def trim_text_to_max_tokens(self, text: str) -> str:
+    def decode_single_token(self, encoding: int):
+        # Gemini tokenizer doesn't have the option to decode tokens
         raise NotImplementedError
```
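The moved import illustrates a common optional-dependency pattern; a minimal sketch follows (actually running it requires `transformers` to be installed, which is exactly the point of deferring the import).

```python
# Sketch of the lazy-import pattern adopted above: importing transformers
# inside __init__ keeps it optional, so merely importing this module never
# pulls in the heavy package.
class LazyHuggingFaceTokenizer:
    def __init__(self, model: str):
        from transformers import AutoTokenizer  # deferred, optional import

        self.tokenizer = AutoTokenizer.from_pretrained(model)

    def count_tokens(self, text: str) -> int:
        return len(self.tokenizer.tokenize(text))
```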
`TikTokenTokenizer` update:

```diff
@@ -21,14 +21,17 @@ class TikTokenTokenizer(TokenizerInterface):
         self.tokenizer = tiktoken.encoding_for_model(self.model)
 
     def extract_tokens(self, text: str) -> List[Any]:
-        tokens = []
         # Using TikToken's method to tokenize text
         token_ids = self.tokenizer.encode(text)
-        # Go through tokens and decode them to text value
-        for token_id in token_ids:
-            token = self.tokenizer.decode([token_id])
-            tokens.append(token)
-        return tokens
+        return token_ids
+
+    def decode_token_list(self, tokens: List[Any]) -> List[Any]:
+        if not isinstance(tokens, list):
+            tokens = [tokens]
+        return [self.tokenizer.decode(i) for i in tokens]
+
+    def decode_single_token(self, token: int):
+        return self.tokenizer.decode_single_token_bytes(token).decode("utf-8", errors="replace")
 
     def count_tokens(self, text: str) -> int:
         """
```
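To see the refactored behavior end to end, here is a small stand-alone snippet using `tiktoken` directly; the embedding model name is just an example.

```python
import tiktoken

# extract_tokens now returns raw token ids, and decode_single_token maps a
# single id back to readable text; this mirrors both paths with plain tiktoken.
tokenizer = tiktoken.encoding_for_model("text-embedding-3-large")
token_ids = tokenizer.encode("def add(a, b): return a + b")
first = tokenizer.decode_single_token_bytes(token_ids[0]).decode("utf-8", errors="replace")
print(token_ids[:5], repr(first))
```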
`TokenizerInterface` update:

```diff
@@ -14,5 +14,5 @@ class TokenizerInterface(Protocol):
         raise NotImplementedError
 
     @abstractmethod
-    def trim_text_to_max_tokens(self, text: str) -> str:
+    def decode_single_token(self, token: int) -> str:
         raise NotImplementedError
```
`CogneeGraph` update:

```diff
@@ -113,8 +113,10 @@ class CogneeGraph(CogneeAbstractGraph):
 
         except (ValueError, TypeError) as e:
             print(f"Error projecting graph: {e}")
+            raise e
         except Exception as ex:
             print(f"Unexpected error: {ex}")
+            raise ex
 
     async def map_vector_distances_to_graph_nodes(self, node_distances) -> None:
         for category, scored_results in node_distances.items():
```
LLM settings update:

```diff
@@ -13,6 +13,7 @@ class ModelName(Enum):
     openai = "openai"
     ollama = "ollama"
     anthropic = "anthropic"
+    gemini = "gemini"
 
 
 class LLMConfig(BaseModel):
@@ -72,6 +73,10 @@ def get_settings() -> SettingsDict:
                 "value": "anthropic",
                 "label": "Anthropic",
             },
+            {
+                "value": "gemini",
+                "label": "Gemini",
+            },
         ]
 
     return SettingsDict.model_validate(
@@ -136,6 +141,12 @@ def get_settings() -> SettingsDict:
                     "label": "Claude 3 Haiku",
                 },
             ],
+            "gemini": [
+                {
+                    "value": "gemini-2.0-flash-exp",
+                    "label": "Gemini 2.0 Flash",
+                },
+            ],
         },
     },
     vector_db={
```
`chunk_by_paragraph` update:

```diff
@@ -1,8 +1,6 @@
-from typing import Any, Dict, Iterator, Optional, Union
+from typing import Any, Dict, Iterator
 from uuid import NAMESPACE_OID, uuid5
 
-import tiktoken
-
 from cognee.infrastructure.databases.vector import get_vector_engine
 
 from .chunk_by_sentence import chunk_by_sentence
@@ -19,7 +17,7 @@ def chunk_by_paragraph(
     When chunks are joined with empty string "", they reproduce the original text exactly.
 
     Notes:
-    - Tokenization is handled using the `tiktoken` library, ensuring compatibility with the vector engine's embedding model.
+    - Tokenization is handled using our tokenization adapters, ensuring compatibility with the vector engine's embedding model.
     - If `batch_paragraphs` is False, each paragraph will be yielded as a separate chunk.
     - Handles cases where paragraphs exceed the specified token or word limits by splitting them as needed.
     - Remaining text at the end of the input will be yielded as a final chunk.
@@ -31,17 +29,12 @@ def chunk_by_paragraph(
     last_cut_type = None
     current_token_count = 0
 
-    vector_engine = get_vector_engine()
-    embedding_model = vector_engine.embedding_engine.model
-    embedding_model = embedding_model.split("/")[-1]
-
     for paragraph_id, sentence, word_count, end_type in chunk_by_sentence(
         data, maximum_length=paragraph_length
     ):
         # Check if this sentence would exceed length limit
-        tokenizer = tiktoken.encoding_for_model(embedding_model)
-        token_count = len(tokenizer.encode(sentence))
+        embedding_engine = get_vector_engine().embedding_engine
+        token_count = embedding_engine.tokenizer.count_tokens(sentence)
 
         if current_word_count > 0 and (
             current_word_count + word_count > paragraph_length
```
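After this change the chunker only needs an object exposing `count_tokens(text) -> int`; a toy sketch of that contract follows (the whitespace tokenizer is a deliberate stand-in, not the real adapter).

```python
# Any object with count_tokens(text) -> int satisfies the contract the
# chunker now relies on; here a trivial whitespace counter stands in.
class WhitespaceTokenizer:
    def count_tokens(self, text: str) -> int:
        return len(text.split())


token_count = WhitespaceTokenizer().count_tokens("Paragraphs are chunked by token budget.")
print(token_count)  # 6
```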
`ingest_data` update:

```diff
@@ -30,9 +30,6 @@ async def ingest_data(data: Any, dataset_name: str, user: User):
         if hasattr(data_item, "dict") and inspect.ismethod(getattr(data_item, "dict")):
             return {"metadata": data_item.dict(), "origin": str(type(data_item))}
         else:
-            warnings.warn(
-                f"Data of type {type(data_item)}... does not have dict method. Returning empty metadata."
-            )
             return {}
 
     @dlt.resource(standalone=True, primary_key="id", merge_key="id")
```
Source-code chunking update:

```diff
@@ -3,33 +3,32 @@ from typing import AsyncGenerator, Generator
 from uuid import NAMESPACE_OID, uuid5
 
 import parso
-import tiktoken
 
 from cognee.infrastructure.databases.vector import get_vector_engine
 from cognee.infrastructure.engine import DataPoint
 from cognee.shared.CodeGraphEntities import CodeFile, CodePart, SourceCodeChunk
+from cognee.infrastructure.llm import get_max_chunk_tokens
 
 logger = logging.getLogger(__name__)
 
 
-def _count_tokens(tokenizer: tiktoken.Encoding, source_code: str) -> int:
-    return len(tokenizer.encode(source_code))
-
-
 def _get_naive_subchunk_token_counts(
-    tokenizer: tiktoken.Encoding, source_code: str, max_subchunk_tokens: int = 8000
+    source_code: str, max_subchunk_tokens
 ) -> list[tuple[str, int]]:
     """Splits source code into subchunks of up to max_subchunk_tokens and counts tokens."""
 
-    token_ids = tokenizer.encode(source_code)
+    tokenizer = get_vector_engine().embedding_engine.tokenizer
+    token_ids = tokenizer.extract_tokens(source_code)
     subchunk_token_counts = []
 
     for start_idx in range(0, len(token_ids), max_subchunk_tokens):
         subchunk_token_ids = token_ids[start_idx : start_idx + max_subchunk_tokens]
         token_count = len(subchunk_token_ids)
+        # Note: This can't work with Gemini embeddings as they keep their method of encoding text
+        # to tokens hidden and don't offer a decoder
+        # TODO: Add support for different tokenizers for this function
         subchunk = "".join(
-            tokenizer.decode_single_token_bytes(token_id).decode("utf-8", errors="replace")
-            for token_id in subchunk_token_ids
+            tokenizer.decode_single_token(token_id) for token_id in subchunk_token_ids
         )
         subchunk_token_counts.append((subchunk, token_count))
 
@@ -37,15 +36,14 @@ def _get_naive_subchunk_token_counts(
 
 
 def _get_subchunk_token_counts(
-    tokenizer: tiktoken.Encoding,
     source_code: str,
-    max_subchunk_tokens: int = 8000,
+    max_subchunk_tokens,
     depth: int = 0,
     max_depth: int = 100,
 ) -> list[tuple[str, int]]:
     """Splits source code into subchunk and counts tokens for each subchunk."""
     if depth > max_depth:
-        return _get_naive_subchunk_token_counts(tokenizer, source_code, max_subchunk_tokens)
+        return _get_naive_subchunk_token_counts(source_code, max_subchunk_tokens)
 
     try:
         module = parso.parse(source_code)
@@ -64,7 +62,8 @@ def _get_subchunk_token_counts(
     subchunk_token_counts = []
     for child in module.children:
         subchunk = child.get_code()
-        token_count = _count_tokens(tokenizer, subchunk)
+        tokenizer = get_vector_engine().embedding_engine.tokenizer
+        token_count = tokenizer.count_tokens(subchunk)
 
         if token_count == 0:
             continue
@@ -75,13 +74,13 @@ def _get_subchunk_token_counts(
 
         if child.type == "string":
             subchunk_token_counts.extend(
-                _get_naive_subchunk_token_counts(tokenizer, subchunk, max_subchunk_tokens)
+                _get_naive_subchunk_token_counts(subchunk, max_subchunk_tokens)
             )
             continue
 
         subchunk_token_counts.extend(
             _get_subchunk_token_counts(
-                tokenizer, subchunk, max_subchunk_tokens, depth=depth + 1, max_depth=max_depth
+                subchunk, max_subchunk_tokens, depth=depth + 1, max_depth=max_depth
             )
         )
 
@@ -96,22 +95,19 @@ def _get_chunk_source_code(
     cumulative_counts = []
     current_source_code = ""
 
-    # Get embedding engine used in vector database
-    embedding_engine = get_vector_engine().embedding_engine
-
     for i, (child_code, token_count) in enumerate(code_token_counts):
         current_count += token_count
         cumulative_counts.append(current_count)
-        if current_count > embedding_engine.max_tokens:
+        if current_count > get_max_chunk_tokens():
             break
         current_source_code += f"\n{child_code}"
 
-    if current_count <= embedding_engine.max_tokens:
+    if current_count <= get_max_chunk_tokens():
        return [], current_source_code.strip()
 
     cutoff = 1
     for i, cum_count in enumerate(cumulative_counts):
-        if cum_count > (1 - overlap) * embedding_engine.max_tokens:
+        if cum_count > (1 - overlap) * get_max_chunk_tokens():
             break
         cutoff = i
 
@@ -121,19 +117,16 @@ def _get_chunk_source_code(
 def get_source_code_chunks_from_code_part(
     code_file_part: CodePart,
     overlap: float = 0.25,
-    granularity: float = 0.1,
+    granularity: float = 0.09,
 ) -> Generator[SourceCodeChunk, None, None]:
     """Yields source code chunks from a CodePart object, with configurable token limits and overlap."""
     if not code_file_part.source_code:
         logger.error(f"No source code in CodeFile {code_file_part.id}")
         return
 
-    embedding_engine = get_vector_engine().embedding_engine
-    tokenizer = embedding_engine.tokenizer
-
-    max_subchunk_tokens = max(1, int(granularity * embedding_engine.max_tokens))
+    max_subchunk_tokens = max(1, int(granularity * get_max_chunk_tokens()))
     subchunk_token_counts = _get_subchunk_token_counts(
-        tokenizer, code_file_part.source_code, max_subchunk_tokens
+        code_file_part.source_code, max_subchunk_tokens
     )
 
     previous_chunk = None
@@ -157,7 +150,6 @@ async def get_source_code_chunks(
     data_points: list[DataPoint],
 ) -> AsyncGenerator[list[DataPoint], None]:
     """Processes code graph datapoints, create SourceCodeChink datapoints."""
-    # TODO: Add support for other embedding models, with max_token mapping
     for data_point in data_points:
         try:
             yield data_point
@@ -173,5 +165,7 @@ async def get_source_code_chunks(
                 yield source_code_chunk
         except Exception as e:
             logger.error(f"Error processing code part: {e}")
+            raise e
         except Exception as e:
             logger.error(f"Error processing data point: {e}")
+            raise e
```
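The budget-and-overlap logic above is the heart of the fix for oversized chunks; here is a self-contained sketch of the same cumulative-count cutoff, with illustrative numbers.

```python
# Accumulate child token counts until the budget is exceeded, then pick an
# overlap cutoff at (1 - overlap) * budget so consecutive chunks share
# trailing context (mirrors _get_chunk_source_code above).
def chunk_cutoff(token_counts: list[int], budget: int, overlap: float = 0.25) -> int:
    cumulative, total = [], 0
    for count in token_counts:
        total += count
        cumulative.append(total)

    cutoff = 1
    for i, cum_count in enumerate(cumulative):
        if cum_count > (1 - overlap) * budget:
            break
        cutoff = i
    return cutoff


print(chunk_cutoff([100, 200, 300, 400], budget=600))  # 1
```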
QA evaluation update:

```diff
@@ -114,7 +114,7 @@ async def eval_on_QA_dataset(
     if not out_path.exists():
         out_path.mkdir(parents=True, exist_ok=True)
 
-    random.seed(42)
+    random.seed(43)
     instances = dataset if not num_samples else random.sample(dataset, num_samples)
 
     contexts_filename = out_path / Path(
```
QA eval parameters update:

```diff
@@ -4,7 +4,6 @@
   ],
   "rag_option": [
-    "cognee_incremental",
     "cognee",
     "no_rag",
     "simple_rag",
     "brute_force"
@@ -14,10 +13,6 @@
   ],
   "metric_names": [
     "Correctness",
-    "Comprehensiveness",
-    "Directness",
-    "Diversity",
-    "Empowerment",
-    "promptfoo.directness"
+    "Comprehensiveness"
   ]
 }
```
`poetry.lock` (7 lines changed, generated):

```diff
@@ -7241,7 +7241,7 @@ crt = ["botocore[crt] (>=1.36.0,<2.0a.0)"]
 name = "safetensors"
 version = "0.5.2"
 description = ""
-optional = false
+optional = true
 python-versions = ">=3.7"
 files = [
     {file = "safetensors-0.5.2-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:45b6092997ceb8aa3801693781a71a99909ab9cc776fbc3fa9322d29b1d3bef2"},
@@ -8079,7 +8079,7 @@ test = ["argcomplete (>=3.0.3)", "mypy (>=1.7.0)", "pre-commit", "pytest (>=7.0,
 name = "transformers"
 version = "4.48.1"
 description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow"
-optional = false
+optional = true
 python-versions = ">=3.9.0"
 files = [
     {file = "transformers-4.48.1-py3-none-any.whl", hash = "sha256:24be0564b0a36d9e433d9a65de248f1545b6f6edce1737669605eb6a8141bbbb"},
@@ -9040,6 +9040,7 @@ falkordb = ["falkordb"]
 filesystem = ["botocore"]
+gemini = ["google-generativeai"]
 groq = ["groq"]
 huggingface = ["transformers"]
 langchain = ["langchain_text_splitters", "langsmith"]
 llama-index = ["llama-index-core"]
 milvus = ["pymilvus"]
@@ -9053,4 +9054,4 @@ weaviate = ["weaviate-client"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.10.0,<3.13"
-content-hash = "480675c274cd85a76a95bf03af865b1a0b462f25bbc21d7427b0a0b8e21c13db"
+content-hash = "e0752df2545fd5048c0969acc7282fce8e034ec0abfabfe07785e7d34c44fc8b"
```
`pyproject.toml` update:

```diff
@@ -70,7 +70,7 @@ pgvector = {version = "^0.3.5", optional = true}
 psycopg2 = {version = "^2.9.10", optional = true}
 llama-index-core = {version = "^0.12.11", optional = true}
 deepeval = {version = "^2.0.1", optional = true}
-transformers = "^4.46.3"
+transformers = {version = "^4.46.3", optional = true}
 pymilvus = {version = "^2.5.0", optional = true}
 unstructured = { extras = ["csv", "doc", "docx", "epub", "md", "odt", "org", "ppt", "pptx", "rst", "rtf", "tsv", "xlsx"], version = "^0.16.13", optional = true }
 pre-commit = "^4.0.1"
@@ -92,6 +92,7 @@ notebook = ["notebook", "ipykernel", "overrides", "ipywidgets", "jupyterlab", "j
 langchain = ["langsmith", "langchain_text_splitters"]
 llama-index = ["llama-index-core"]
+gemini = ["google-generativeai"]
 huggingface = ["transformers"]
 deepeval = ["deepeval"]
 posthog = ["posthog"]
 falkordb = ["falkordb"]
```
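With these extras declared, Gemini (and the now-optional HuggingFace tooling) can presumably be installed through Poetry's standard extras flag, e.g. `poetry install -E gemini -E huggingface`; the exact invocation is not spelled out in this PR.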