diff --git a/.env.template b/.env.template
index df8408518..e9dc2a4c2 100644
--- a/.env.template
+++ b/.env.template
@@ -1,7 +1,7 @@
 ENV="local"
 TOKENIZERS_PARALLELISM="false"
 
-# LLM settings
+# LLM Configuration
 LLM_API_KEY=""
 LLM_MODEL="openai/gpt-4o-mini"
 LLM_PROVIDER="openai"
@@ -14,7 +14,7 @@ GRAPHISTRY_PASSWORD=
 
 SENTRY_REPORTING_URL=
 
-# Embedding settings
+# Embedding Configuration
 EMBEDDING_PROVIDER="openai"
 EMBEDDING_API_KEY=""
 EMBEDDING_MODEL="openai/text-embedding-3-large"
diff --git a/.github/workflows/reusable_notebook.yml b/.github/workflows/reusable_notebook.yml
index 9bc09c3a6..13c6eb43e 100644
--- a/.github/workflows/reusable_notebook.yml
+++ b/.github/workflows/reusable_notebook.yml
@@ -12,8 +12,24 @@ on:
         required: true
       GRAPHISTRY_PASSWORD:
         required: true
+      #LLM_MODEL:
+      #  required: true
+      #LLM_ENDPOINT:
+      #  required: true
+      LLM_API_KEY:
+        required: true
       OPENAI_API_KEY:
         required: true
+      #LLM_API_VERSION:
+      #  required: true
+      EMBEDDING_MODEL:
+        required: true
+      EMBEDDING_ENDPOINT:
+        required: true
+      EMBEDDING_API_KEY:
+        required: true
+      EMBEDDING_API_VERSION:
+        required: true
 
 env:
   RUNTIME__LOG_LEVEL: ERROR
@@ -50,8 +66,15 @@ jobs:
       - name: Execute Jupyter Notebook
        env:
          ENV: 'dev'
+          #LLM_MODEL: ${{ secrets.LLM_MODEL }}
+          #LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
          LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} # Use OpenAI until a multimedia model is deployed and DeepEval support for other models is added
+          #LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+          EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+          EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+          EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+          EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
          GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }}
          GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }}
        run: |
diff --git a/.github/workflows/reusable_python_example.yml b/.github/workflows/reusable_python_example.yml
index d1f7ee16b..0897e639d 100644
--- a/.github/workflows/reusable_python_example.yml
+++ b/.github/workflows/reusable_python_example.yml
@@ -16,7 +16,23 @@ on:
         required: true
       GRAPHISTRY_PASSWORD:
         required: true
+      LLM_MODEL:
+        required: true
+      LLM_ENDPOINT:
+        required: true
+      LLM_API_KEY:
+        required: true
       OPENAI_API_KEY:
+        required: false
+      LLM_API_VERSION:
+        required: true
+      EMBEDDING_MODEL:
+        required: true
+      EMBEDDING_ENDPOINT:
+        required: true
+      EMBEDDING_API_KEY:
+        required: true
+      EMBEDDING_API_VERSION:
         required: true
 
 env:
@@ -54,7 +70,15 @@ jobs:
        env:
          ENV: 'dev'
          PYTHONFAULTHANDLER: 1
-          LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          LLM_MODEL: ${{ secrets.LLM_MODEL }}
+          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+          EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+          EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+          EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
          GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }}
          GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }}
        run: poetry run python ${{ inputs.example-location }} ${{ inputs.arguments }}
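For reference, the new LLM_ENDPOINT / LLM_API_VERSION and EMBEDDING_* settings follow the LiteLLM-style provider configuration used elsewhere in this change set. A rough sketch of how a local .env might be filled in for an Azure-style deployment — every value below is an illustrative placeholder, not a real endpoint, key, or version:

LLM_PROVIDER="openai"
LLM_MODEL="azure/gpt-4o-mini"
LLM_ENDPOINT="https://example-resource.openai.azure.com/"
LLM_API_KEY="<llm-api-key>"
LLM_API_VERSION="2024-02-01"
EMBEDDING_PROVIDER="openai"
EMBEDDING_MODEL="azure/text-embedding-3-large"
EMBEDDING_ENDPOINT="https://example-resource.openai.azure.com/"
EMBEDDING_API_KEY="<embedding-api-key>"
EMBEDDING_API_VERSION="2024-02-01"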
diff --git a/.github/workflows/test_code_graph_example.yml b/.github/workflows/test_code_graph_example.yml
index a1f8d4e2c..1200b5b11 100644
--- a/.github/workflows/test_code_graph_example.yml
+++ b/.github/workflows/test_code_graph_example.yml
@@ -17,6 +17,13 @@ jobs:
       example-location: ./examples/python/code_graph_example.py
       arguments: "--repo_path ./evals"
     secrets:
-      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+      LLM_MODEL: ${{ secrets.LLM_MODEL }}
+      LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+      LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+      LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+      EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+      EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+      EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+      EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
       GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }}
       GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }}
diff --git a/.github/workflows/test_cognee_llama_index_notebook.yml b/.github/workflows/test_cognee_llama_index_notebook.yml
index 2eadd125b..2e90cca6b 100644
--- a/.github/workflows/test_cognee_llama_index_notebook.yml
+++ b/.github/workflows/test_cognee_llama_index_notebook.yml
@@ -15,6 +15,14 @@ jobs:
     with:
       notebook-location: notebooks/cognee_llama_index.ipynb
     secrets:
+      #LLM_MODEL: ${{ secrets.LLM_MODEL }}
+      #LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+      LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+      #LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+      EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+      EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+      EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+      EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
       GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }}
       GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }}
diff --git a/.github/workflows/test_cognee_multimedia_notebook.yml b/.github/workflows/test_cognee_multimedia_notebook.yml
index 1d6f1f468..dc0edf0e5 100644
--- a/.github/workflows/test_cognee_multimedia_notebook.yml
+++ b/.github/workflows/test_cognee_multimedia_notebook.yml
@@ -15,6 +15,14 @@ jobs:
     with:
       notebook-location: notebooks/cognee_multimedia_demo.ipynb
     secrets:
+      #LLM_MODEL: ${{ secrets.LLM_MODEL }}
+      #LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+      LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+      #LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+      EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+      EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+      EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+      EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
       GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }}
       GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }}
diff --git a/.github/workflows/test_deduplication.yml b/.github/workflows/test_deduplication.yml
index 2f97e4ea6..01afd7b37 100644
--- a/.github/workflows/test_deduplication.yml
+++ b/.github/workflows/test_deduplication.yml
@@ -57,5 +57,12 @@ jobs:
       - name: Run deduplication test
        env:
          ENV: 'dev'
-          LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          LLM_MODEL: ${{ secrets.LLM_MODEL }}
+          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+          EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+          EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+          EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+          EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
        run: poetry run python ./cognee/tests/test_deduplication.py
diff --git a/.github/workflows/test_dynamic_steps_example.yml b/.github/workflows/test_dynamic_steps_example.yml
index ed0688bef..0e22fa7ec 100644
--- a/.github/workflows/test_dynamic_steps_example.yml
+++ b/.github/workflows/test_dynamic_steps_example.yml
@@ -16,6 +16,13 @@ jobs:
     with:
       example-location: ./examples/python/dynamic_steps_example.py
     secrets:
-      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+      LLM_MODEL: ${{ secrets.LLM_MODEL }}
+      LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+      LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+      LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+      EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+      EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+      EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+      EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
       GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }}
       GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }}
diff --git a/.github/workflows/test_dynamic_steps_example_windows.yml b/.github/workflows/test_dynamic_steps_example_windows.yml
index d0a9c93e3..881c39f24 100644
--- a/.github/workflows/test_dynamic_steps_example_windows.yml
+++ b/.github/workflows/test_dynamic_steps_example_windows.yml
@@ -38,5 +38,12 @@ jobs:
        env:
          ENV: 'dev'
          PYTHONFAULTHANDLER: 1
-          LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          LLM_MODEL: ${{ secrets.LLM_MODEL }}
+          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+          EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+          EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+          EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+          EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
        run: poetry run python ./examples/python/dynamic_steps_example.py
diff --git a/.github/workflows/test_llama_index_cognee_integration_notebook.yml b/.github/workflows/test_llama_index_cognee_integration_notebook.yml
index aacc31eb5..5974009d6 100644
--- a/.github/workflows/test_llama_index_cognee_integration_notebook.yml
+++ b/.github/workflows/test_llama_index_cognee_integration_notebook.yml
@@ -15,6 +15,14 @@ jobs:
     with:
       notebook-location: notebooks/llama_index_cognee_integration.ipynb
     secrets:
+      #LLM_MODEL: ${{ secrets.LLM_MODEL }}
+      #LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+      LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+      #LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+      EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+      EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+      EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+      EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
       GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }}
       GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }}
diff --git a/.github/workflows/test_milvus.yml b/.github/workflows/test_milvus.yml
index 51e5f0982..dec74a475 100644
--- a/.github/workflows/test_milvus.yml
+++ b/.github/workflows/test_milvus.yml
@@ -47,7 +47,14 @@ jobs:
       - name: Run default basic pipeline
        env:
          ENV: 'dev'
-          LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          LLM_MODEL: ${{ secrets.LLM_MODEL }}
+          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+          EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+          EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+          EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+          EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
        run: poetry run python ./cognee/tests/test_milvus.py
 
       - name: Clean up disk space
diff --git a/.github/workflows/test_multimedia_example.yaml b/.github/workflows/test_multimedia_example.yaml
index 95c93c01b..c307002a8 100644
--- a/.github/workflows/test_multimedia_example.yaml
+++ b/.github/workflows/test_multimedia_example.yaml
@@ -16,6 +16,13 @@ jobs:
     with:
       example-location: ./examples/python/multimedia_example.py
     secrets:
-      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+      #LLM_MODEL: ${{ secrets.LLM_MODEL }}
+      #LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+      LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} # Use OpenAI until we deploy models to handle multimedia
+      #LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+      EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+      EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+      EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+      EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
       GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }}
       GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }}
diff --git a/.github/workflows/test_multimetric_qa_eval_run.yaml b/.github/workflows/test_multimetric_qa_eval_run.yaml
new file mode 100644
index 000000000..44a33a48f
--- /dev/null
+++ b/.github/workflows/test_multimetric_qa_eval_run.yaml
@@ -0,0 +1,30 @@
+name: test | multimetric qa eval run
+
+on:
+  workflow_dispatch:
+  pull_request:
+    types: [labeled, synchronize]
+
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  run_multimetric_qa_eval_test:
+    uses: ./.github/workflows/reusable_python_example.yml
+    with:
+      example-location: ./evals/multimetric_qa_eval_run.py
+      arguments: "--params_file evals/qa_eval_parameters.json --out_dir dirname"
+    secrets:
+      LLM_MODEL: ${{ secrets.LLM_MODEL }}
+      LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+      LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+      LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} # Until we add support for Azure for DeepEval
+      EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+      EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+      EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+      EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
+      GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }}
+      GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }}
diff --git a/.github/workflows/test_neo4j.yml b/.github/workflows/test_neo4j.yml
index e1d71dcfd..7e8423628 100644
--- a/.github/workflows/test_neo4j.yml
+++ b/.github/workflows/test_neo4j.yml
@@ -43,7 +43,14 @@ jobs:
       - name: Run default Neo4j
        env:
          ENV: 'dev'
-          LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          LLM_MODEL: ${{ secrets.LLM_MODEL }}
+          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+          EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+          EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+          EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+          EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
          GRAPH_DATABASE_URL: ${{ secrets.NEO4J_API_URL }}
          GRAPH_DATABASE_PASSWORD: ${{ secrets.NEO4J_API_KEY }}
          GRAPH_DATABASE_USERNAME: "neo4j"
diff --git a/.github/workflows/test_notebook.yml b/.github/workflows/test_notebook.yml
index e03779c70..d8ff5a8ba 100644
--- a/.github/workflows/test_notebook.yml
+++ b/.github/workflows/test_notebook.yml
@@ -16,6 +16,14 @@ jobs:
     with:
       notebook-location: notebooks/cognee_demo.ipynb
     secrets:
+      #LLM_MODEL: ${{ secrets.LLM_MODEL }}
+      #LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+      LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+      #LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+      EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+      EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+      EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+      EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
       GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }}
       GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }}
diff --git a/.github/workflows/test_pgvector.yml b/.github/workflows/test_pgvector.yml
index d5356d603..0f9b5d369 100644
--- a/.github/workflows/test_pgvector.yml
+++ b/.github/workflows/test_pgvector.yml
@@ -58,5 +58,12 @@ jobs:
       - name: Run default PGVector
        env:
          ENV: 'dev'
-          LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          LLM_MODEL: ${{ secrets.LLM_MODEL }}
+          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+          EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+          EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+          EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+          EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
        run: poetry run python ./cognee/tests/test_pgvector.py
diff --git a/.github/workflows/test_python_3_10.yml b/.github/workflows/test_python_3_10.yml
index 90c437eb8..6a9367451 100644
--- a/.github/workflows/test_python_3_10.yml
+++ b/.github/workflows/test_python_3_10.yml
@@ -56,7 +56,14 @@ jobs:
       - name: Run default basic pipeline
        env:
          ENV: 'dev'
-          LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          LLM_MODEL: ${{ secrets.LLM_MODEL }}
+          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+          EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+          EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+          EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+          EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
        run: poetry run python ./cognee/tests/test_library.py
 
       - name: Clean up disk space
diff --git a/.github/workflows/test_python_3_11.yml b/.github/workflows/test_python_3_11.yml
index 83f98c6ed..96b275773 100644
--- a/.github/workflows/test_python_3_11.yml
+++ b/.github/workflows/test_python_3_11.yml
@@ -58,7 +58,14 @@ jobs:
       - name: Run default basic pipeline
        env:
          ENV: 'dev'
-          LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          LLM_MODEL: ${{ secrets.LLM_MODEL }}
+          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+          EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+          EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+          EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+          EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
        run: poetry run python ./cognee/tests/test_library.py
 
       - name: Clean up disk space
diff --git a/.github/workflows/test_python_3_12.yml b/.github/workflows/test_python_3_12.yml
index 14e33551a..9b0e7d6b5 100644
--- a/.github/workflows/test_python_3_12.yml
+++ b/.github/workflows/test_python_3_12.yml
@@ -56,7 +56,14 @@ jobs:
       - name: Run default basic pipeline
        env:
          ENV: 'dev'
-          LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          LLM_MODEL: ${{ secrets.LLM_MODEL }}
+          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+          EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+          EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+          EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+          EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
        run: poetry run python ./cognee/tests/test_library.py
 
       - name: Clean up disk space
diff --git a/.github/workflows/test_qdrant.yml b/.github/workflows/test_qdrant.yml
index e2cf9abe8..5ee35058f 100644
--- a/.github/workflows/test_qdrant.yml
+++ b/.github/workflows/test_qdrant.yml
@@ -44,7 +44,14 @@ jobs:
       - name: Run default Qdrant
        env:
          ENV: 'dev'
-          LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          LLM_MODEL: ${{ secrets.LLM_MODEL }}
+          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+          EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+          EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+          EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+          EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
          VECTOR_DB_URL: ${{ secrets.QDRANT_API_URL }}
          VECTOR_DB_KEY: ${{ secrets.QDRANT_API_KEY }}
        run: poetry run python ./cognee/tests/test_qdrant.py
diff --git a/.github/workflows/test_simple_example.yml b/.github/workflows/test_simple_example.yml
index 5378df891..21912414b 100644
--- a/.github/workflows/test_simple_example.yml
+++ b/.github/workflows/test_simple_example.yml
@@ -16,6 +16,13 @@ jobs:
     with:
       example-location: ./examples/python/simple_example.py
     secrets:
-      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+      LLM_MODEL: ${{ secrets.LLM_MODEL }}
+      LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+      LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+      LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+      EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+      EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+      EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+      EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
       GRAPHISTRY_USERNAME: ${{ secrets.GRAPHISTRY_USERNAME }}
       GRAPHISTRY_PASSWORD: ${{ secrets.GRAPHISTRY_PASSWORD }}
diff --git a/.github/workflows/test_weaviate.yml b/.github/workflows/test_weaviate.yml
index 81cc2603f..244c5f47a 100644
--- a/.github/workflows/test_weaviate.yml
+++ b/.github/workflows/test_weaviate.yml
@@ -44,7 +44,14 @@ jobs:
       - name: Run default Weaviate
        env:
          ENV: 'dev'
-          LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          LLM_MODEL: ${{ secrets.LLM_MODEL }}
+          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+          EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+          EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+          EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+          EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
          VECTOR_DB_URL: ${{ secrets.WEAVIATE_API_URL }}
          VECTOR_DB_KEY: ${{ secrets.WEAVIATE_API_KEY }}
        run: poetry run python ./cognee/tests/test_weaviate.py
diff --git a/cognee/api/v1/settings/routers/get_settings_router.py b/cognee/api/v1/settings/routers/get_settings_router.py
index 138bea661..e2842e3cd 100644
--- a/cognee/api/v1/settings/routers/get_settings_router.py
+++ b/cognee/api/v1/settings/routers/get_settings_router.py
@@ -21,7 +21,7 @@ class SettingsDTO(OutDTO):
 
 
 class LLMConfigInputDTO(InDTO):
-    provider: Union[Literal["openai"], Literal["ollama"], Literal["anthropic"]]
+    provider: Union[Literal["openai"], Literal["ollama"], Literal["anthropic"], Literal["gemini"]]
     model: str
     api_key: str
diff --git a/cognee/infrastructure/databases/exceptions/EmbeddingException.py b/cognee/infrastructure/databases/exceptions/EmbeddingException.py
index 130282857..df3ba93c0 100644
--- a/cognee/infrastructure/databases/exceptions/EmbeddingException.py
+++ b/cognee/infrastructure/databases/exceptions/EmbeddingException.py
@@ -1,4 +1,14 @@
-class EmbeddingException(Exception):
+from cognee.exceptions import CogneeApiError
+from fastapi import status
+
+
+class EmbeddingException(CogneeApiError):
     """Custom exception for handling embedding-related errors."""
 
-    pass
+    def __init__(
+        self,
+        message: str = "Embedding Exception.",
+        name: str = "EmbeddingException",
+        status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
+    ):
+        super().__init__(message, name, status_code)
diff --git a/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py b/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py
index f81bc8515..650fe5adb 100644
--- a/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py
+++ b/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py
@@ -23,10 +23,12 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine):
     dimensions: int
     mock: bool
 
+    MAX_RETRIES = 5
+
     def __init__(
         self,
+        model: Optional[str] = "openai/text-embedding-3-large",
         provider: str = "openai",
-        model: Optional[str] = "text-embedding-3-large",
         dimensions: Optional[int] = 3072,
         api_key: str = None,
         endpoint: str = None,
@@ -41,15 +43,13 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine):
         self.dimensions = dimensions
         self.max_tokens = max_tokens
         self.tokenizer = self.get_tokenizer()
+        self.retry_count = 0
 
         enable_mocking = os.getenv("MOCK_EMBEDDING", "false")
         if isinstance(enable_mocking, bool):
             enable_mocking = str(enable_mocking).lower()
         self.mock = enable_mocking in ("true", "1", "yes")
 
-    MAX_RETRIES = 5
-    retry_count = 0
-
     async def embed_text(self, text: List[str]) -> List[List[float]]:
         async def exponential_backoff(attempt):
             wait_time = min(10 * (2**attempt), 60)  # Max 60 seconds
@@ -64,14 +64,14 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine):
                 return [data["embedding"] for data in response["data"]]
             else:
                 response = await litellm.aembedding(
-                    self.model,
+                    model=self.model,
                     input=text,
                     api_key=self.api_key,
                     api_base=self.endpoint,
                     api_version=self.api_version,
                 )
 
-            self.retry_count = 0
+            self.retry_count = 0  # Reset retry count on successful call
 
             return [data["embedding"] for data in response.data]
 
@@ -99,13 +99,16 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine):
                 raise Exception("Rate limit exceeded and no more retries left.")
 
             await exponential_backoff(self.retry_count)
-
             self.retry_count += 1
 
             return await self.embed_text(text)
-        except litellm.exceptions.BadRequestError:
-            raise EmbeddingException("Failed to index data points.")
+        except (
+            litellm.exceptions.BadRequestError,
+            litellm.exceptions.NotFoundError,
+        ) as e:
+            logger.error(f"Embedding error with model {self.model}: {str(e)}")
+            raise EmbeddingException(f"Failed to index data points using model {self.model}")
 
         except Exception as error:
             logger.error("Error embedding text: %s", str(error))
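The retry flow above is a standard capped exponential backoff, with the counter held as instance state so it resets after a successful call. A standalone sketch of the same pattern; the helper names here are illustrative, not the engine's public API:

import asyncio

MAX_RETRIES = 5

async def exponential_backoff(attempt: int) -> None:
    # Waits 10s, 20s, 40s, ... capped at 60 seconds, as in the engine above
    await asyncio.sleep(min(10 * (2**attempt), 60))

async def call_with_retries(make_request):
    # make_request is any zero-argument coroutine function
    retry_count = 0
    while True:
        try:
            # A success implicitly resets the budget: we return and the
            # next call starts again from retry_count = 0
            return await make_request()
        except RuntimeError:  # stand-in for litellm's rate-limit error
            if retry_count >= MAX_RETRIES:
                raise Exception("Rate limit exceeded and no more retries left.")
            await exponential_backoff(retry_count)
            retry_count += 1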
diff --git a/cognee/infrastructure/llm/gemini/__init__.py b/cognee/infrastructure/llm/gemini/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/cognee/infrastructure/llm/gemini/adapter.py b/cognee/infrastructure/llm/gemini/adapter.py
new file mode 100644
index 000000000..f37fb1c80
--- /dev/null
+++ b/cognee/infrastructure/llm/gemini/adapter.py
@@ -0,0 +1,155 @@
+from typing import Type, Optional
+from pydantic import BaseModel
+import logging
+import litellm
+import asyncio
+from litellm import acompletion, JSONSchemaValidationError
+from cognee.shared.data_models import MonitoringTool
+from cognee.exceptions import InvalidValueError
+from cognee.infrastructure.llm.llm_interface import LLMInterface
+from cognee.infrastructure.llm.prompts import read_query_prompt
+from cognee.base_config import get_base_config
+
+logger = logging.getLogger(__name__)
+
+monitoring = get_base_config().monitoring_tool
+if monitoring == MonitoringTool.LANGFUSE:
+    from langfuse.decorators import observe
+
+
+class GeminiAdapter(LLMInterface):
+    MAX_RETRIES = 5
+
+    def __init__(
+        self,
+        api_key: str,
+        model: str,
+        max_tokens: int,
+        endpoint: Optional[str] = None,
+        api_version: Optional[str] = None,
+        streaming: bool = False,
+    ) -> None:
+        self.api_key = api_key
+        self.model = model
+        self.endpoint = endpoint
+        self.api_version = api_version
+        self.streaming = streaming
+        self.max_tokens = max_tokens
+
+    @observe(as_type="generation")
+    async def acreate_structured_output(
+        self, text_input: str, system_prompt: str, response_model: Type[BaseModel]
+    ) -> BaseModel:
+        try:
+            response_schema = {
+                "type": "object",
+                "properties": {
+                    "summary": {"type": "string"},
+                    "description": {"type": "string"},
+                    "nodes": {
+                        "type": "array",
+                        "items": {
+                            "type": "object",
+                            "properties": {
+                                "name": {"type": "string"},
+                                "type": {"type": "string"},
+                                "description": {"type": "string"},
+                                "id": {"type": "string"},
+                                "label": {"type": "string"},
+                            },
+                            "required": ["name", "type", "description", "id", "label"],
+                        },
+                    },
+                    "edges": {
+                        "type": "array",
+                        "items": {
+                            "type": "object",
+                            "properties": {
+                                "source_node_id": {"type": "string"},
+                                "target_node_id": {"type": "string"},
+                                "relationship_name": {"type": "string"},
+                            },
+                            "required": ["source_node_id", "target_node_id", "relationship_name"],
+                        },
+                    },
+                },
+                "required": ["summary", "description", "nodes", "edges"],
+            }
+
+            simplified_prompt = f"""
+{system_prompt}
+
+IMPORTANT: Your response must be a valid JSON object with these required fields:
+1. summary: A brief summary
+2. description: A detailed description
+3. nodes: Array of nodes with name, type, description, id, and label
+4. edges: Array of edges with source_node_id, target_node_id, and relationship_name
+
+Example structure:
+{{
+    "summary": "Brief summary",
+    "description": "Detailed description",
+    "nodes": [
+        {{
+            "name": "Example Node",
+            "type": "Concept",
+            "description": "Node description",
+            "id": "example-id",
+            "label": "Concept"
+        }}
+    ],
+    "edges": [
+        {{
+            "source_node_id": "source-id",
+            "target_node_id": "target-id",
+            "relationship_name": "relates_to"
+        }}
+    ]
+}}"""
+
+            messages = [
+                {"role": "system", "content": simplified_prompt},
+                {"role": "user", "content": text_input},
+            ]
+
+            try:
+                response = await acompletion(
+                    model=f"{self.model}",
+                    messages=messages,
+                    api_key=self.api_key,
+                    max_tokens=self.max_tokens,
+                    temperature=0.1,
+                    response_format={"type": "json_object", "schema": response_schema},
+                    timeout=10,
+                    num_retries=self.MAX_RETRIES,
+                )
+
+                if response.choices and response.choices[0].message.content:
+                    content = response.choices[0].message.content
+                    return response_model.model_validate_json(content)
+
+            except litellm.exceptions.BadRequestError as e:
+                logger.error(f"Bad request error: {str(e)}")
+                raise ValueError(f"Invalid request: {str(e)}")
+
+            raise ValueError("Failed to get valid response after retries")
+
+        except JSONSchemaValidationError as e:
+            logger.error(f"Schema validation failed: {str(e)}")
+            logger.debug(f"Raw response: {e.raw_response}")
+            raise ValueError(f"Response failed schema validation: {str(e)}")
+
+    def show_prompt(self, text_input: str, system_prompt: str) -> str:
+        """Format and display the prompt for a user query."""
+        if not text_input:
+            text_input = "No user input provided."
+        if not system_prompt:
+            raise InvalidValueError(message="No system prompt path provided.")
+        system_prompt = read_query_prompt(system_prompt)
+
+        formatted_prompt = (
+            f"""System Prompt:\n{system_prompt}\n\nUser Input:\n{text_input}\n"""
+            if system_prompt
+            else None
+        )
+
+        return formatted_prompt
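Once LLM_PROVIDER is set to "gemini", the adapter is reached through the provider dispatch added to get_llm_client.py in the next diff. A hypothetical end-to-end call, assuming the environment is configured as in .env.template; MiniGraph is a made-up stand-in for cognee's real graph response models:

import asyncio
from pydantic import BaseModel

from cognee.infrastructure.llm.get_llm_client import get_llm_client

class MiniGraph(BaseModel):  # illustrative stand-in response model
    summary: str
    description: str

async def main():
    client = get_llm_client()  # returns GeminiAdapter when LLM_PROVIDER="gemini"
    graph = await client.acreate_structured_output(
        text_input="Alan Turing worked at Bletchley Park.",
        system_prompt="Extract a knowledge graph from the user's text.",
        response_model=MiniGraph,
    )
    print(graph)

asyncio.run(main())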
diff --git a/cognee/infrastructure/llm/get_llm_client.py b/cognee/infrastructure/llm/get_llm_client.py
index ede8bd330..5e26345e8 100644
--- a/cognee/infrastructure/llm/get_llm_client.py
+++ b/cognee/infrastructure/llm/get_llm_client.py
@@ -12,6 +12,7 @@ class LLMProvider(Enum):
     OLLAMA = "ollama"
     ANTHROPIC = "anthropic"
     CUSTOM = "custom"
+    GEMINI = "gemini"
 
 
 def get_llm_client():
@@ -78,5 +79,20 @@ def get_llm_client():
             max_tokens=max_tokens,
         )
 
+    elif provider == LLMProvider.GEMINI:
+        if llm_config.llm_api_key is None:
+            raise InvalidValueError(message="LLM API key is not set.")
+
+        from .gemini.adapter import GeminiAdapter
+
+        return GeminiAdapter(
+            api_key=llm_config.llm_api_key,
+            model=llm_config.llm_model,
+            max_tokens=max_tokens,
+            endpoint=llm_config.llm_endpoint,
+            api_version=llm_config.llm_api_version,
+            streaming=llm_config.llm_streaming,
+        )
+
     else:
         raise InvalidValueError(message=f"Unsupported LLM provider: {provider}")
diff --git a/cognee/infrastructure/llm/openai/adapter.py b/cognee/infrastructure/llm/openai/adapter.py
index d6939e323..c9f87d211 100644
--- a/cognee/infrastructure/llm/openai/adapter.py
+++ b/cognee/infrastructure/llm/openai/adapter.py
@@ -23,6 +23,8 @@ class OpenAIAdapter(LLMInterface):
     api_key: str
     api_version: str
 
+    MAX_RETRIES = 5
+
     """Adapter for OpenAI's GPT-3, GPT-4 API"""
 
     def __init__(
@@ -68,7 +70,7 @@ class OpenAIAdapter(LLMInterface):
             api_base=self.endpoint,
             api_version=self.api_version,
             response_model=response_model,
-            max_retries=5,
+            max_retries=self.MAX_RETRIES,
         )
 
     @observe
@@ -94,7 +96,7 @@ class OpenAIAdapter(LLMInterface):
             api_base=self.endpoint,
             api_version=self.api_version,
             response_model=response_model,
-            max_retries=5,
+            max_retries=self.MAX_RETRIES,
         )
 
     def create_transcript(self, input):
@@ -112,7 +114,7 @@ class OpenAIAdapter(LLMInterface):
             api_key=self.api_key,
             api_base=self.endpoint,
             api_version=self.api_version,
-            max_retries=5,
+            max_retries=self.MAX_RETRIES,
         )
 
         return transcription
@@ -144,7 +146,7 @@ class OpenAIAdapter(LLMInterface):
             api_base=self.endpoint,
             api_version=self.api_version,
             max_tokens=300,
-            max_retries=5,
+            max_retries=self.MAX_RETRIES,
         )
 
     def show_prompt(self, text_input: str, system_prompt: str) -> str:
diff --git a/cognee/infrastructure/llm/tokenizer/Gemini/adapter.py b/cognee/infrastructure/llm/tokenizer/Gemini/adapter.py
index e4cc4f145..61acdd7ab 100644
--- a/cognee/infrastructure/llm/tokenizer/Gemini/adapter.py
+++ b/cognee/infrastructure/llm/tokenizer/Gemini/adapter.py
@@ -1,4 +1,4 @@
-from typing import List, Any
+from typing import List, Any, Union
 
 from ..tokenizer_interface import TokenizerInterface
 
@@ -26,6 +26,10 @@ class GeminiTokenizer(TokenizerInterface):
     def extract_tokens(self, text: str) -> List[Any]:
         raise NotImplementedError
 
+    def decode_single_token(self, encoding: int):
+        # Gemini tokenizer doesn't have the option to decode tokens
+        raise NotImplementedError
+
     def count_tokens(self, text: str) -> int:
         """
         Returns the number of tokens in the given text.
@@ -39,6 +43,3 @@ class GeminiTokenizer(TokenizerInterface):
         import google.generativeai as genai
 
         return len(genai.embed_content(model=f"models/{self.model}", content=text))
-
-    def trim_text_to_max_tokens(self, text: str) -> str:
-        raise NotImplementedError
diff --git a/cognee/infrastructure/llm/tokenizer/HuggingFace/adapter.py b/cognee/infrastructure/llm/tokenizer/HuggingFace/adapter.py
index 878458414..e26a5b1ca 100644
--- a/cognee/infrastructure/llm/tokenizer/HuggingFace/adapter.py
+++ b/cognee/infrastructure/llm/tokenizer/HuggingFace/adapter.py
@@ -1,7 +1,5 @@
 from typing import List, Any
 
-from transformers import AutoTokenizer
-
 from ..tokenizer_interface import TokenizerInterface
 
 
@@ -14,6 +12,9 @@ class HuggingFaceTokenizer(TokenizerInterface):
         self.model = model
         self.max_tokens = max_tokens
 
+        # Import here to make it an optional dependency
+        from transformers import AutoTokenizer
+
         self.tokenizer = AutoTokenizer.from_pretrained(model)
 
     def extract_tokens(self, text: str) -> List[Any]:
@@ -32,5 +33,6 @@ class HuggingFaceTokenizer(TokenizerInterface):
         """
         return len(self.tokenizer.tokenize(text))
 
-    def trim_text_to_max_tokens(self, text: str) -> str:
+    def decode_single_token(self, encoding: int):
+        # Decoding single tokens is not implemented for the HuggingFace tokenizer
         raise NotImplementedError
diff --git a/cognee/infrastructure/llm/tokenizer/TikToken/adapter.py b/cognee/infrastructure/llm/tokenizer/TikToken/adapter.py
index 3d649ef38..7a01fe511 100644
--- a/cognee/infrastructure/llm/tokenizer/TikToken/adapter.py
+++ b/cognee/infrastructure/llm/tokenizer/TikToken/adapter.py
@@ -21,14 +21,17 @@ class TikTokenTokenizer(TokenizerInterface):
         self.tokenizer = tiktoken.encoding_for_model(self.model)
 
     def extract_tokens(self, text: str) -> List[Any]:
-        tokens = []
         # Using TikToken's method to tokenize text
         token_ids = self.tokenizer.encode(text)
-
-        # Go through tokens and decode them to text value
-        for token_id in token_ids:
-            token = self.tokenizer.decode([token_id])
-            tokens.append(token)
-
-        return tokens
+        return token_ids
+
+    def decode_token_list(self, tokens: List[Any]) -> List[Any]:
+        if not isinstance(tokens, list):
+            tokens = [tokens]
+        return [self.tokenizer.decode(i) for i in tokens]
+
+    def decode_single_token(self, token: int):
+        return self.tokenizer.decode_single_token_bytes(token).decode("utf-8", errors="replace")
 
     def count_tokens(self, text: str) -> int:
         """
diff --git a/cognee/infrastructure/llm/tokenizer/tokenizer_interface.py b/cognee/infrastructure/llm/tokenizer/tokenizer_interface.py
index c533f0cf9..456c69f64 100644
--- a/cognee/infrastructure/llm/tokenizer/tokenizer_interface.py
+++ b/cognee/infrastructure/llm/tokenizer/tokenizer_interface.py
@@ -14,5 +14,5 @@ class TokenizerInterface(Protocol):
         raise NotImplementedError
 
     @abstractmethod
-    def trim_text_to_max_tokens(self, text: str) -> str:
+    def decode_single_token(self, token: int) -> str:
         raise NotImplementedError
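The interface now standardizes on decode_single_token in place of trim_text_to_max_tokens, so extract_tokens returns raw token ids and decoding is a separate concern. A quick sketch of how the TikToken implementation above round-trips tokens under that interface; this assumes tiktoken is installed and the class name is illustrative:

from typing import List

import tiktoken

class TikTokenSketch:
    def __init__(self, model: str = "text-embedding-3-large"):
        self.tokenizer = tiktoken.encoding_for_model(model)

    def extract_tokens(self, text: str) -> List[int]:
        # Token ids only; no eager decoding as in the old adapter
        return self.tokenizer.encode(text)

    def decode_single_token(self, token: int) -> str:
        return self.tokenizer.decode_single_token_bytes(token).decode("utf-8", errors="replace")

sketch = TikTokenSketch()
token_ids = sketch.extract_tokens("hello world")
print([sketch.decode_single_token(t) for t in token_ids])  # e.g. ['hello', ' world']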
diff --git a/cognee/modules/graph/cognee_graph/CogneeGraph.py b/cognee/modules/graph/cognee_graph/CogneeGraph.py
index 491f83b5a..f9d9a74ec 100644
--- a/cognee/modules/graph/cognee_graph/CogneeGraph.py
+++ b/cognee/modules/graph/cognee_graph/CogneeGraph.py
@@ -113,8 +113,10 @@ class CogneeGraph(CogneeAbstractGraph):
 
         except (ValueError, TypeError) as e:
             print(f"Error projecting graph: {e}")
+            raise e
         except Exception as ex:
             print(f"Unexpected error: {ex}")
+            raise ex
 
     async def map_vector_distances_to_graph_nodes(self, node_distances) -> None:
         for category, scored_results in node_distances.items():
diff --git a/cognee/modules/settings/get_settings.py b/cognee/modules/settings/get_settings.py
index 93fd67cff..063c18971 100644
--- a/cognee/modules/settings/get_settings.py
+++ b/cognee/modules/settings/get_settings.py
@@ -13,6 +13,7 @@ class ModelName(Enum):
     openai = "openai"
     ollama = "ollama"
     anthropic = "anthropic"
+    gemini = "gemini"
 
 
 class LLMConfig(BaseModel):
@@ -72,6 +73,10 @@ def get_settings() -> SettingsDict:
             "value": "anthropic",
             "label": "Anthropic",
         },
+        {
+            "value": "gemini",
+            "label": "Gemini",
+        },
     ]
 
     return SettingsDict.model_validate(
@@ -136,6 +141,12 @@ def get_settings() -> SettingsDict:
                     "label": "Claude 3 Haiku",
                 },
             ],
+            "gemini": [
+                {
+                    "value": "gemini-2.0-flash-exp",
+                    "label": "Gemini 2.0 Flash",
+                },
+            ],
         },
     },
     vector_db={
diff --git a/cognee/tasks/chunks/chunk_by_paragraph.py b/cognee/tasks/chunks/chunk_by_paragraph.py
index 34205d9f6..52f1f6674 100644
--- a/cognee/tasks/chunks/chunk_by_paragraph.py
+++ b/cognee/tasks/chunks/chunk_by_paragraph.py
@@ -1,8 +1,6 @@
-from typing import Any, Dict, Iterator, Optional, Union
+from typing import Any, Dict, Iterator
 from uuid import NAMESPACE_OID, uuid5
 
-import tiktoken
-
 from cognee.infrastructure.databases.vector import get_vector_engine
 
 from .chunk_by_sentence import chunk_by_sentence
@@ -19,7 +17,7 @@ def chunk_by_paragraph(
     When chunks are joined with empty string "", they reproduce the original text exactly.
 
     Notes:
-    - Tokenization is handled using the `tiktoken` library, ensuring compatibility with the vector engine's embedding model.
+    - Tokenization is handled using our tokenization adapters, ensuring compatibility with the vector engine's embedding model.
     - If `batch_paragraphs` is False, each paragraph will be yielded as a separate chunk.
     - Handles cases where paragraphs exceed the specified token or word limits by splitting them as needed.
     - Remaining text at the end of the input will be yielded as a final chunk.
@@ -31,17 +29,12 @@ def chunk_by_paragraph(
     last_cut_type = None
     current_token_count = 0
 
-    vector_engine = get_vector_engine()
-    embedding_model = vector_engine.embedding_engine.model
-    embedding_model = embedding_model.split("/")[-1]
-
     for paragraph_id, sentence, word_count, end_type in chunk_by_sentence(
         data, maximum_length=paragraph_length
     ):
         # Check if this sentence would exceed length limit
-
-        tokenizer = tiktoken.encoding_for_model(embedding_model)
-        token_count = len(tokenizer.encode(sentence))
+        embedding_engine = get_vector_engine().embedding_engine
+        token_count = embedding_engine.tokenizer.count_tokens(sentence)
 
         if current_word_count > 0 and (
             current_word_count + word_count > paragraph_length
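With this change the chunker defers to whichever tokenizer the embedding engine was built with, instead of instantiating tiktoken per sentence. A hedged sketch of the counting path, assuming a vector engine is configured as in the diffs above:

from cognee.infrastructure.databases.vector import get_vector_engine

def count_sentence_tokens(sentence: str) -> int:
    # The engine's tokenizer may be TikToken-, HuggingFace-, or Gemini-backed,
    # but all of them expose count_tokens through the shared interface
    embedding_engine = get_vector_engine().embedding_engine
    return embedding_engine.tokenizer.count_tokens(sentence)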
diff --git a/cognee/tasks/ingestion/ingest_data.py b/cognee/tasks/ingestion/ingest_data.py
index b19786d4e..04ce41517 100644
--- a/cognee/tasks/ingestion/ingest_data.py
+++ b/cognee/tasks/ingestion/ingest_data.py
@@ -30,9 +30,6 @@ async def ingest_data(data: Any, dataset_name: str, user: User):
         if hasattr(data_item, "dict") and inspect.ismethod(getattr(data_item, "dict")):
             return {"metadata": data_item.dict(), "origin": str(type(data_item))}
         else:
-            warnings.warn(
-                f"Data of type {type(data_item)}... does not have dict method. Returning empty metadata."
-            )
             return {}
 
     @dlt.resource(standalone=True, primary_key="id", merge_key="id")
diff --git a/cognee/tasks/repo_processor/get_source_code_chunks.py b/cognee/tasks/repo_processor/get_source_code_chunks.py
index ca1c76e46..5607c6c37 100644
--- a/cognee/tasks/repo_processor/get_source_code_chunks.py
+++ b/cognee/tasks/repo_processor/get_source_code_chunks.py
@@ -3,33 +3,32 @@
 from typing import AsyncGenerator, Generator
 from uuid import NAMESPACE_OID, uuid5
 
 import parso
-import tiktoken
 
 from cognee.infrastructure.databases.vector import get_vector_engine
 from cognee.infrastructure.engine import DataPoint
 from cognee.shared.CodeGraphEntities import CodeFile, CodePart, SourceCodeChunk
+from cognee.infrastructure.llm import get_max_chunk_tokens
 
 logger = logging.getLogger(__name__)
 
 
-def _count_tokens(tokenizer: tiktoken.Encoding, source_code: str) -> int:
-    return len(tokenizer.encode(source_code))
-
-
 def _get_naive_subchunk_token_counts(
-    tokenizer: tiktoken.Encoding, source_code: str, max_subchunk_tokens: int = 8000
+    source_code: str, max_subchunk_tokens
 ) -> list[tuple[str, int]]:
     """Splits source code into subchunks of up to max_subchunk_tokens and counts tokens."""
-    token_ids = tokenizer.encode(source_code)
+    tokenizer = get_vector_engine().embedding_engine.tokenizer
+    token_ids = tokenizer.extract_tokens(source_code)
 
     subchunk_token_counts = []
     for start_idx in range(0, len(token_ids), max_subchunk_tokens):
         subchunk_token_ids = token_ids[start_idx : start_idx + max_subchunk_tokens]
         token_count = len(subchunk_token_ids)
+        # Note: This can't work with Gemini embeddings as they keep their method of encoding text
+        # to tokens hidden and don't offer a decoder
+        # TODO: Add support for different tokenizers for this function
         subchunk = "".join(
-            tokenizer.decode_single_token_bytes(token_id).decode("utf-8", errors="replace")
-            for token_id in subchunk_token_ids
+            tokenizer.decode_single_token(token_id) for token_id in subchunk_token_ids
         )
         subchunk_token_counts.append((subchunk, token_count))
 
@@ -37,15 +36,14 @@ def _get_naive_subchunk_token_counts(
     return subchunk_token_counts
 
 
 def _get_subchunk_token_counts(
-    tokenizer: tiktoken.Encoding,
     source_code: str,
-    max_subchunk_tokens: int = 8000,
+    max_subchunk_tokens,
     depth: int = 0,
     max_depth: int = 100,
 ) -> list[tuple[str, int]]:
     """Splits source code into subchunks and counts tokens for each subchunk."""
     if depth > max_depth:
-        return _get_naive_subchunk_token_counts(tokenizer, source_code, max_subchunk_tokens)
+        return _get_naive_subchunk_token_counts(source_code, max_subchunk_tokens)
 
     try:
         module = parso.parse(source_code)
@@ -64,7 +62,8 @@ def _get_subchunk_token_counts(
     subchunk_token_counts = []
     for child in module.children:
         subchunk = child.get_code()
-        token_count = _count_tokens(tokenizer, subchunk)
+        tokenizer = get_vector_engine().embedding_engine.tokenizer
+        token_count = tokenizer.count_tokens(subchunk)
 
         if token_count == 0:
             continue
@@ -75,13 +74,13 @@ def _get_subchunk_token_counts(
 
         if child.type == "string":
             subchunk_token_counts.extend(
-                _get_naive_subchunk_token_counts(tokenizer, subchunk, max_subchunk_tokens)
+                _get_naive_subchunk_token_counts(subchunk, max_subchunk_tokens)
             )
             continue
 
         subchunk_token_counts.extend(
             _get_subchunk_token_counts(
-                tokenizer, subchunk, max_subchunk_tokens, depth=depth + 1, max_depth=max_depth
+                subchunk, max_subchunk_tokens, depth=depth + 1, max_depth=max_depth
             )
         )
 
@@ -96,22 +95,19 @@ def _get_chunk_source_code(
     cumulative_counts = []
     current_source_code = ""
 
-    # Get embedding engine used in vector database
-    embedding_engine = get_vector_engine().embedding_engine
-
     for i, (child_code, token_count) in enumerate(code_token_counts):
         current_count += token_count
         cumulative_counts.append(current_count)
-        if current_count > embedding_engine.max_tokens:
+        if current_count > get_max_chunk_tokens():
             break
         current_source_code += f"\n{child_code}"
 
-    if current_count <= embedding_engine.max_tokens:
+    if current_count <= get_max_chunk_tokens():
         return [], current_source_code.strip()
 
     cutoff = 1
     for i, cum_count in enumerate(cumulative_counts):
-        if cum_count > (1 - overlap) * embedding_engine.max_tokens:
+        if cum_count > (1 - overlap) * get_max_chunk_tokens():
             break
         cutoff = i
 
@@ -121,19 +117,16 @@ def _get_chunk_source_code(
 def get_source_code_chunks_from_code_part(
     code_file_part: CodePart,
     overlap: float = 0.25,
-    granularity: float = 0.1,
+    granularity: float = 0.09,
 ) -> Generator[SourceCodeChunk, None, None]:
     """Yields source code chunks from a CodePart object, with configurable token limits and overlap."""
     if not code_file_part.source_code:
         logger.error(f"No source code in CodeFile {code_file_part.id}")
         return
 
-    embedding_engine = get_vector_engine().embedding_engine
-    tokenizer = embedding_engine.tokenizer
-
-    max_subchunk_tokens = max(1, int(granularity * embedding_engine.max_tokens))
+    max_subchunk_tokens = max(1, int(granularity * get_max_chunk_tokens()))
     subchunk_token_counts = _get_subchunk_token_counts(
-        tokenizer, code_file_part.source_code, max_subchunk_tokens
+        code_file_part.source_code, max_subchunk_tokens
     )
 
     previous_chunk = None
@@ -157,7 +150,6 @@ async def get_source_code_chunks(
     data_points: list[DataPoint],
 ) -> AsyncGenerator[list[DataPoint], None]:
     """Processes code graph datapoints, creates SourceCodeChunk datapoints."""
-    # TODO: Add support for other embedding models, with max_token mapping
     for data_point in data_points:
         try:
             yield data_point
@@ -173,5 +165,7 @@ async def get_source_code_chunks(
                     yield source_code_chunk
                 except Exception as e:
                     logger.error(f"Error processing code part: {e}")
+                    raise e
         except Exception as e:
             logger.error(f"Error processing data point: {e}")
+            raise e
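In _get_chunk_source_code above, the cutoff loop walks cumulative token counts and keeps the last child that still fits under (1 - overlap) of the token budget, so roughly `overlap` of the budget carries over into the next chunk. The arithmetic in isolation, with made-up counts:

def find_cutoff(cumulative_counts, max_tokens, overlap=0.25):
    # Index of the last child whose cumulative count fits under (1 - overlap) of the budget
    cutoff = 1
    for i, cum_count in enumerate(cumulative_counts):
        if cum_count > (1 - overlap) * max_tokens:
            break
        cutoff = i
    return cutoff

# Budget of 100 tokens and four children of 30 tokens each:
# cumulative counts are [30, 60, 90, 120] and (1 - 0.25) * 100 = 75,
# so the cutoff lands on index 1 and the later children carry over as overlap.
print(find_cutoff([30, 60, 90, 120], max_tokens=100))  # -> 1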
diff --git a/evals/eval_on_hotpot.py b/evals/eval_on_hotpot.py
index 4f3198d87..75ad82954 100644
--- a/evals/eval_on_hotpot.py
+++ b/evals/eval_on_hotpot.py
@@ -114,7 +114,7 @@ async def eval_on_QA_dataset(
     if not out_path.exists():
         out_path.mkdir(parents=True, exist_ok=True)
 
-    random.seed(42)
+    random.seed(43)
     instances = dataset if not num_samples else random.sample(dataset, num_samples)
 
     contexts_filename = out_path / Path(
diff --git a/evals/qa_eval_parameters.json b/evals/qa_eval_parameters.json
index 6d60ab56f..8ae82b2e8 100644
--- a/evals/qa_eval_parameters.json
+++ b/evals/qa_eval_parameters.json
@@ -4,7 +4,6 @@
   ],
   "rag_option": [
     "cognee_incremental",
-    "cognee",
     "no_rag",
     "simple_rag",
     "brute_force"
@@ -14,10 +13,6 @@
   ],
   "metric_names": [
     "Correctness",
-    "Comprehensiveness",
-    "Directness",
-    "Diversity",
-    "Empowerment",
-    "promptfoo.directness"
+    "Comprehensiveness"
   ]
 }
diff --git a/poetry.lock b/poetry.lock
index 542e0377b..ee4e41039 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -7241,7 +7241,7 @@ crt = ["botocore[crt] (>=1.36.0,<2.0a.0)"]
 name = "safetensors"
 version = "0.5.2"
 description = ""
-optional = false
+optional = true
 python-versions = ">=3.7"
 files = [
     {file = "safetensors-0.5.2-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:45b6092997ceb8aa3801693781a71a99909ab9cc776fbc3fa9322d29b1d3bef2"},
@@ -8079,7 +8079,7 @@ test = ["argcomplete (>=3.0.3)", "mypy (>=1.7.0)", "pre-commit", "pytest (>=7.0,
 name = "transformers"
 version = "4.48.1"
 description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow"
-optional = false
+optional = true
 python-versions = ">=3.9.0"
 files = [
     {file = "transformers-4.48.1-py3-none-any.whl", hash = "sha256:24be0564b0a36d9e433d9a65de248f1545b6f6edce1737669605eb6a8141bbbb"},
@@ -9040,6 +9040,7 @@ falkordb = ["falkordb"]
 filesystem = ["botocore"]
 gemini = ["google-generativeai"]
 groq = ["groq"]
+huggingface = ["transformers"]
 langchain = ["langchain_text_splitters", "langsmith"]
 llama-index = ["llama-index-core"]
 milvus = ["pymilvus"]
@@ -9053,4 +9054,4 @@ weaviate = ["weaviate-client"]
 
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.10.0,<3.13"
-content-hash = "480675c274cd85a76a95bf03af865b1a0b462f25bbc21d7427b0a0b8e21c13db"
+content-hash = "e0752df2545fd5048c0969acc7282fce8e034ec0abfabfe07785e7d34c44fc8b"
diff --git a/pyproject.toml b/pyproject.toml
index 98497cded..cdca4e755 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -70,7 +70,7 @@ pgvector = {version = "^0.3.5", optional = true}
 psycopg2 = {version = "^2.9.10", optional = true}
 llama-index-core = {version = "^0.12.11", optional = true}
 deepeval = {version = "^2.0.1", optional = true}
-transformers = "^4.46.3"
+transformers = {version = "^4.46.3", optional = true}
 pymilvus = {version = "^2.5.0", optional = true}
 unstructured = { extras = ["csv", "doc", "docx", "epub", "md", "odt", "org", "ppt", "pptx", "rst", "rtf", "tsv", "xlsx"], version = "^0.16.13", optional = true }
 pre-commit = "^4.0.1"
@@ -92,6 +92,7 @@ notebook = ["notebook", "ipykernel", "overrides", "ipywidgets", "jupyterlab", "j
 langchain = ["langsmith", "langchain_text_splitters"]
 llama-index = ["llama-index-core"]
 gemini = ["google-generativeai"]
+huggingface = ["transformers"]
 deepeval = ["deepeval"]
 posthog = ["posthog"]
 falkordb = ["falkordb"]