diff --git a/.env.template b/.env.template index 89ac06830..7dcd4f346 100644 --- a/.env.template +++ b/.env.template @@ -242,13 +242,14 @@ LITELLM_LOG="ERROR" ########## Local LLM via Ollama ############################################### + #LLM_API_KEY ="ollama" #LLM_MODEL="llama3.1:8b" #LLM_PROVIDER="ollama" #LLM_ENDPOINT="http://localhost:11434/v1" #EMBEDDING_PROVIDER="ollama" #EMBEDDING_MODEL="nomic-embed-text:latest" -#EMBEDDING_ENDPOINT="http://localhost:11434/api/embeddings" +#EMBEDDING_ENDPOINT="http://localhost:11434/api/embed" #EMBEDDING_DIMENSIONS=768 #HUGGINGFACE_TOKENIZER="nomic-ai/nomic-embed-text-v1.5" diff --git a/.github/workflows/dockerhub-mcp.yml b/.github/workflows/dockerhub-mcp.yml index 3d4ef4fe1..5a1f28296 100644 --- a/.github/workflows/dockerhub-mcp.yml +++ b/.github/workflows/dockerhub-mcp.yml @@ -7,14 +7,29 @@ on: jobs: docker-build-and-push: - runs-on: ubuntu-latest + runs-on: + group: Default + labels: + - docker_build_runner steps: + - name: Check and free disk space before build + run: | + echo "=== Before cleanup ===" + df -h + echo "Removing unused preinstalled SDKs to free space..." + sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc || true + docker system prune -af || true + echo "=== After cleanup ===" + df -h + - name: Checkout repository uses: actions/checkout@v4 - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 + with: + buildkitd-flags: --root /tmp/buildkit - name: Log in to Docker Hub uses: docker/login-action@v3 @@ -34,7 +49,7 @@ jobs: - name: Build and push id: build - uses: docker/build-push-action@v5 + uses: docker/build-push-action@v6 with: context: . 
platforms: linux/amd64,linux/arm64 @@ -45,5 +60,6 @@ jobs: cache-from: type=registry,ref=cognee/cognee-mcp:buildcache cache-to: type=registry,ref=cognee/cognee-mcp:buildcache,mode=max + - name: Image digest run: echo ${{ steps.build.outputs.digest }} diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 000000000..4cadfbdc1 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,132 @@ +## Repository Guidelines + +This document summarizes how to work with the cognee repository: how it's organized, how to build, test, lint, and contribute. It mirrors our actual tooling and CI while providing quick commands for local development. + +## Project Structure & Module Organization + +- `cognee/`: Core Python library and API. + - `api/`: FastAPI application and versioned routers (add, cognify, memify, search, delete, users, datasets, responses, visualize, settings, sync, update, checks). + - `cli/`: CLI entry points and subcommands invoked via `cognee` / `cognee-cli`. + - `infrastructure/`: Databases, LLM providers, embeddings, loaders, and storage adapters. + - `modules/`: Domain logic (graph, retrieval, ontology, users, processing, observability, etc.). + - `tasks/`: Reusable tasks (e.g., code graph, web scraping, storage). Extend with new tasks here. + - `eval_framework/`: Evaluation utilities and adapters. + - `shared/`: Cross-cutting helpers (logging, settings, utils). + - `tests/`: Unit, integration, CLI, and end-to-end tests organized by feature. + - `__main__.py`: Entrypoint to route to CLI. +- `cognee-mcp/`: Model Context Protocol server exposing cognee as MCP tools (SSE/HTTP/stdio). Contains its own README and Dockerfile. +- `cognee-frontend/`: Next.js UI for local development and demos. +- `distributed/`: Utilities for distributed execution (Modal, workers, queues). +- `examples/`: Example scripts demonstrating the public APIs and features (graph, code graph, multimodal, permissions, etc.). +- `notebooks/`: Jupyter notebooks for demos and tutorials. 
+- `alembic/`: Database migrations for relational backends. + +Notes: +- Co-locate feature-specific helpers under their respective package (`modules/`, `infrastructure/`, or `tasks/`). +- Extend the system by adding new tasks, loaders, or retrievers rather than modifying core pipeline mechanisms. + +## Build, Test, and Development Commands + +Python (root) — requires Python >= 3.10 and < 3.14. We recommend `uv` for speed and reproducibility. + +- Create/refresh env and install dev deps: +```bash +uv sync --dev --all-extras --reinstall +``` + +- Run the CLI (examples): +```bash +uv run cognee-cli add "Cognee turns documents into AI memory." +uv run cognee-cli cognify +uv run cognee-cli search "What does cognee do?" +uv run cognee-cli -ui # Launches UI, backend API, and MCP server together +``` + +- Start the FastAPI server directly: +```bash +uv run python -m cognee.api.client +``` + +- Run tests (CI mirrors these commands): +```bash +uv run pytest cognee/tests/unit/ -v +uv run pytest cognee/tests/integration/ -v +``` + +- Lint and format (ruff): +```bash +uv run ruff check . +uv run ruff format . 
+``` + +- Optional static type checks (mypy): +```bash +uv run mypy cognee/ +``` + +MCP Server (`cognee-mcp/`): + +- Install and run locally: +```bash +cd cognee-mcp +uv sync --dev --all-extras --reinstall +uv run python src/server.py # stdio (default) +uv run python src/server.py --transport sse +uv run python src/server.py --transport http --host 127.0.0.1 --port 8000 --path /mcp +``` + +- API Mode (connect to a running Cognee API): +```bash +uv run python src/server.py --transport sse --api-url http://localhost:8000 --api-token YOUR_TOKEN +``` + +- Docker quickstart (examples): see `cognee-mcp/README.md` for full details +```bash +docker run -e TRANSPORT_MODE=http --env-file ./.env -p 8000:8000 --rm -it cognee/cognee-mcp:main +``` + +Frontend (`cognee-frontend/`): +```bash +cd cognee-frontend +npm install +npm run dev # Next.js dev server +npm run lint # ESLint +npm run build && npm start +``` + +## Coding Style & Naming Conventions + +Python: +- 4-space indentation, modules and functions in `snake_case`, classes in `PascalCase`. +- Public APIs should be type-annotated where practical. +- Use `ruff format` before committing; `ruff check` enforces import hygiene and style (line-length 100 configured in `pyproject.toml`). +- Prefer explicit, structured error handling. Use shared logging utilities in `cognee.shared.logging_utils`. + +MCP server and Frontend: +- Follow the local `README.md` and ESLint/TypeScript configuration in `cognee-frontend/`. + +## Testing Guidelines + +- Place Python tests under `cognee/tests/`. + - Unit tests: `cognee/tests/unit/` + - Integration tests: `cognee/tests/integration/` + - CLI tests: `cognee/tests/cli_tests/` +- Name test files `test_*.py`. Use `pytest.mark.asyncio` for async tests. +- Avoid external state; rely on test fixtures and the CI-provided env vars when LLM/embedding providers are required. See CI workflows under `.github/workflows/` for expected environment variables. 
+- When adding public APIs, provide/update targeted examples under `examples/python/`. + +## Commit & Pull Request Guidelines + +- Use clear, imperative subjects (≤ 72 chars) and conventional commit styling in PR titles. Our CI validates semantic PR titles (see `.github/workflows/pr_lint`). Examples: + - `feat(graph): add temporal edge weighting` + - `fix(api): handle missing auth cookie` + - `docs: update installation instructions` +- Reference related issues/discussions in the PR body and provide brief context. +- PRs should describe scope, list local test commands run, and mention any impacts on MCP server or UI if applicable. +- Sign commits and affirm the DCO (see `CONTRIBUTING.md`). + +## CI Mirrors Local Commands + +Our GitHub Actions run the same ruff checks and pytest suites shown above (`.github/workflows/basic_tests.yml` and related workflows). Use the commands in this document locally to minimize CI surprises. + + diff --git a/README.md b/README.md index 305bffdfe..d51a380b1 100644 --- a/README.md +++ b/README.md @@ -97,7 +97,7 @@ Hosted platform: ### π¦ Installation -You can install Cognee using either **pip**, **poetry**, **uv** or any other python package manager. +You can install Cognee using either **pip**, **poetry**, **uv** or any other python package manager. 
Cognee supports Python 3.10 to 3.12 diff --git a/cognee-mcp/README.md b/cognee-mcp/README.md index d14bc9fa1..9ac8b4973 100644 --- a/cognee-mcp/README.md +++ b/cognee-mcp/README.md @@ -110,6 +110,47 @@ If you'd rather run cognee-mcp in a container, you have two options: # For stdio transport (default) docker run -e TRANSPORT_MODE=stdio --env-file ./.env --rm -it cognee/cognee-mcp:main ``` + + **Installing optional dependencies at runtime:** + + You can install optional dependencies when running the container by setting the `EXTRAS` environment variable: + ```bash + # Install a single optional dependency group at runtime + docker run \ + -e TRANSPORT_MODE=http \ + -e EXTRAS=aws \ + --env-file ./.env \ + -p 8000:8000 \ + --rm -it cognee/cognee-mcp:main + + # Install multiple optional dependency groups at runtime (comma-separated) + docker run \ + -e TRANSPORT_MODE=sse \ + -e EXTRAS=aws,postgres,neo4j \ + --env-file ./.env \ + -p 8000:8000 \ + --rm -it cognee/cognee-mcp:main + ``` + + **Available optional dependency groups:** + - `aws` - S3 storage support + - `postgres` / `postgres-binary` - PostgreSQL database support + - `neo4j` - Neo4j graph database support + - `neptune` - AWS Neptune support + - `chromadb` - ChromaDB vector store support + - `scraping` - Web scraping capabilities + - `distributed` - Modal distributed execution + - `langchain` - LangChain integration + - `llama-index` - LlamaIndex integration + - `anthropic` - Anthropic models + - `groq` - Groq models + - `mistral` - Mistral models + - `ollama` / `huggingface` - Local model support + - `docs` - Document processing + - `codegraph` - Code analysis + - `monitoring` - Sentry & Langfuse monitoring + - `redis` - Redis support + - And more (see [pyproject.toml](https://github.com/topoteretes/cognee/blob/main/pyproject.toml) for full list) 2. 
**Pull from Docker Hub** (no build required): ```bash # With HTTP transport (recommended for web deployments) @@ -119,6 +160,17 @@ If you'd rather run cognee-mcp in a container, you have two options: # With stdio transport (default) docker run -e TRANSPORT_MODE=stdio --env-file ./.env --rm -it cognee/cognee-mcp:main ``` + + **With runtime installation of optional dependencies:** + ```bash + # Install optional dependencies from Docker Hub image + docker run \ + -e TRANSPORT_MODE=http \ + -e EXTRAS=aws,postgres \ + --env-file ./.env \ + -p 8000:8000 \ + --rm -it cognee/cognee-mcp:main + ``` ### **Important: Docker vs Direct Usage** **Docker uses environment variables**, not command line arguments: diff --git a/cognee-mcp/entrypoint.sh b/cognee-mcp/entrypoint.sh index 2f122bbfd..cf7d19f0a 100644 --- a/cognee-mcp/entrypoint.sh +++ b/cognee-mcp/entrypoint.sh @@ -4,6 +4,42 @@ set -e # Exit on error echo "Debug mode: $DEBUG" echo "Environment: $ENVIRONMENT" +# Install optional dependencies if EXTRAS is set +if [ -n "$EXTRAS" ]; then + echo "Installing optional dependencies: $EXTRAS" + + # Get the cognee version that's currently installed + COGNEE_VERSION=$(uv pip show cognee | grep "Version:" | awk '{print $2}') + echo "Current cognee version: $COGNEE_VERSION" + + # Build the extras list for cognee + IFS=',' read -ra EXTRA_ARRAY <<< "$EXTRAS" + # Combine base extras from pyproject.toml with requested extras + ALL_EXTRAS="" + for extra in "${EXTRA_ARRAY[@]}"; do + # Trim whitespace + extra=$(echo "$extra" | xargs) + # Add to extras list if not already present + if [[ ! 
"$ALL_EXTRAS" =~ (^|,)"$extra"(,|$) ]]; then + if [ -z "$ALL_EXTRAS" ]; then + ALL_EXTRAS="$extra" + else + ALL_EXTRAS="$ALL_EXTRAS,$extra" + fi + fi + done + + echo "Installing cognee with extras: $ALL_EXTRAS" + echo "Running: uv pip install 'cognee[$ALL_EXTRAS]==$COGNEE_VERSION'" + uv pip install "cognee[$ALL_EXTRAS]==$COGNEE_VERSION" + + # Verify installation + echo "" + echo "✅ Optional dependencies installation completed" +else + echo "No optional dependencies specified" +fi + # Set default transport mode if not specified TRANSPORT_MODE=${TRANSPORT_MODE:-"stdio"} echo "Transport mode: $TRANSPORT_MODE" diff --git a/cognee-mcp/pyproject.toml b/cognee-mcp/pyproject.toml index c3327b67f..f37bf337c 100644 --- a/cognee-mcp/pyproject.toml +++ b/cognee-mcp/pyproject.toml @@ -9,7 +9,7 @@ dependencies = [ # For local cognee repo usage remove comment bellow and add absolute path to cognee. Then run `uv sync --reinstall` in the mcp folder on local cognee changes. #"cognee[postgres,codegraph,gemini,huggingface,docs,neo4j] @ file:/Users/igorilic/Desktop/cognee", # TODO: Remove gemini from optional dependecnies for new Cognee version after 0.3.4 - "cognee[postgres,codegraph,huggingface,docs,neo4j]==0.3.7", + "cognee[postgres,docs,neo4j]==0.3.7", "fastmcp>=2.10.0,<3.0.0", "mcp>=1.12.0,<2.0.0", "uv>=0.6.3,<1.0.0", diff --git a/cognee-mcp/src/client.py b/cognee-mcp/src/client.py index 2d6bdfe18..952503ee7 100755 --- a/cognee-mcp/src/client.py +++ b/cognee-mcp/src/client.py @@ -37,12 +37,10 @@ async def run(): toolResult = await session.call_tool("prune", arguments={}) - toolResult = await session.call_tool( - "codify", arguments={"repo_path": "SOME_REPO_PATH"} - ) + toolResult = await session.call_tool("cognify", arguments={}) toolResult = await session.call_tool( - "search", arguments={"search_type": "CODE", "search_query": "exceptions"} + "search", arguments={"search_type": "GRAPH_COMPLETION"} ) print(f"Cognify result: {toolResult.content}") diff --git a/cognee-mcp/uv.lock 
b/cognee-mcp/uv.lock index daa88edef..a6a6fb511 100644 --- a/cognee-mcp/uv.lock +++ b/cognee-mcp/uv.lock @@ -718,19 +718,10 @@ wheels = [ ] [package.optional-dependencies] -codegraph = [ - { name = "fastembed", marker = "python_full_version < '3.13'" }, - { name = "transformers" }, - { name = "tree-sitter" }, - { name = "tree-sitter-python" }, -] docs = [ { name = "lxml" }, { name = "unstructured", extra = ["csv", "doc", "docx", "epub", "md", "odt", "org", "pdf", "ppt", "pptx", "rst", "rtf", "tsv", "xlsx"] }, ] -huggingface = [ - { name = "transformers" }, -] neo4j = [ { name = "neo4j" }, ] @@ -745,7 +736,7 @@ name = "cognee-mcp" version = "0.4.0" source = { editable = "." } dependencies = [ - { name = "cognee", extra = ["codegraph", "docs", "huggingface", "neo4j", "postgres"] }, + { name = "cognee", extra = ["docs", "neo4j", "postgres"] }, { name = "fastmcp" }, { name = "httpx" }, { name = "mcp" }, @@ -759,7 +750,7 @@ dev = [ [package.metadata] requires-dist = [ - { name = "cognee", extras = ["postgres", "codegraph", "huggingface", "docs", "neo4j"], specifier = "==0.3.7" }, + { name = "cognee", extras = ["postgres", "docs", "neo4j"], specifier = "==0.3.7" }, { name = "fastmcp", specifier = ">=2.10.0,<3.0.0" }, { name = "httpx", specifier = ">=0.27.0,<1.0.0" }, { name = "mcp", specifier = ">=1.12.0,<2.0.0" }, @@ -6038,57 +6029,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e5/2b/4d2708ac1ff5cd708b6548f4c5812d0ae40d1c28591c4c1c762b6dbdef2d/transformers-4.57.0-py3-none-any.whl", hash = "sha256:9d7c6d098c026e40d897e017ed1f481ab803cbac041021dbc6ae6100e4949b55", size = 11990588 }, ] -[[package]] -name = "tree-sitter" -version = "0.24.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/a7/a2/698b9d31d08ad5558f8bfbfe3a0781bd4b1f284e89bde3ad18e05101a892/tree-sitter-0.24.0.tar.gz", hash = "sha256:abd95af65ca2f4f7eca356343391ed669e764f37748b5352946f00f7fc78e734", size = 168304 } -wheels = [ - { url 
= "https://files.pythonhosted.org/packages/08/9a/bd627a02e41671af73222316e1fcf87772c7804dc2fba99405275eb1f3eb/tree_sitter-0.24.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f3f00feff1fc47a8e4863561b8da8f5e023d382dd31ed3e43cd11d4cae445445", size = 140890 }, - { url = "https://files.pythonhosted.org/packages/5b/9b/b1ccfb187f8be78e2116176a091a2f2abfd043a06d78f80c97c97f315b37/tree_sitter-0.24.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f9691be48d98c49ef8f498460278884c666b44129222ed6217477dffad5d4831", size = 134413 }, - { url = "https://files.pythonhosted.org/packages/01/39/e25b0042a049eb27e991133a7aa7c49bb8e49a8a7b44ca34e7e6353ba7ac/tree_sitter-0.24.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:098a81df9f89cf254d92c1cd0660a838593f85d7505b28249216661d87adde4a", size = 560427 }, - { url = "https://files.pythonhosted.org/packages/1c/59/4d132f1388da5242151b90acf32cc56af779bfba063923699ab28b276b62/tree_sitter-0.24.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b26bf9e958da6eb7e74a081aab9d9c7d05f9baeaa830dbb67481898fd16f1f5", size = 574327 }, - { url = "https://files.pythonhosted.org/packages/ec/97/3914e45ab9e0ff0f157e493caa91791372508488b97ff0961a0640a37d25/tree_sitter-0.24.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:2a84ff87a2f2a008867a1064aba510ab3bd608e3e0cd6e8fef0379efee266c73", size = 577171 }, - { url = "https://files.pythonhosted.org/packages/c5/b0/266a529c3eef171137b73cde8ad7aa282734354609a8b2f5564428e8f12d/tree_sitter-0.24.0-cp310-cp310-win_amd64.whl", hash = "sha256:c012e4c345c57a95d92ab5a890c637aaa51ab3b7ff25ed7069834b1087361c95", size = 120260 }, - { url = "https://files.pythonhosted.org/packages/c1/c3/07bfaa345e0037ff75d98b7a643cf940146e4092a1fd54eed0359836be03/tree_sitter-0.24.0-cp310-cp310-win_arm64.whl", hash = "sha256:033506c1bc2ba7bd559b23a6bdbeaf1127cee3c68a094b82396718596dfe98bc", size = 108416 }, - { url = 
"https://files.pythonhosted.org/packages/66/08/82aaf7cbea7286ee2a0b43e9b75cb93ac6ac132991b7d3c26ebe5e5235a3/tree_sitter-0.24.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:de0fb7c18c6068cacff46250c0a0473e8fc74d673e3e86555f131c2c1346fb13", size = 140733 }, - { url = "https://files.pythonhosted.org/packages/8c/bd/1a84574911c40734d80327495e6e218e8f17ef318dd62bb66b55c1e969f5/tree_sitter-0.24.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a7c9c89666dea2ce2b2bf98e75f429d2876c569fab966afefdcd71974c6d8538", size = 134243 }, - { url = "https://files.pythonhosted.org/packages/46/c1/c2037af2c44996d7bde84eb1c9e42308cc84b547dd6da7f8a8bea33007e1/tree_sitter-0.24.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4ddb113e6b8b3e3b199695b1492a47d87d06c538e63050823d90ef13cac585fd", size = 562030 }, - { url = "https://files.pythonhosted.org/packages/4c/aa/2fb4d81886df958e6ec7e370895f7106d46d0bbdcc531768326124dc8972/tree_sitter-0.24.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:01ea01a7003b88b92f7f875da6ba9d5d741e0c84bb1bd92c503c0eecd0ee6409", size = 575585 }, - { url = "https://files.pythonhosted.org/packages/e3/3c/5f997ce34c0d1b744e0f0c0757113bdfc173a2e3dadda92c751685cfcbd1/tree_sitter-0.24.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:464fa5b2cac63608915a9de8a6efd67a4da1929e603ea86abaeae2cb1fe89921", size = 578203 }, - { url = "https://files.pythonhosted.org/packages/d5/1f/f2bc7fa7c3081653ea4f2639e06ff0af4616c47105dbcc0746137da7620d/tree_sitter-0.24.0-cp311-cp311-win_amd64.whl", hash = "sha256:3b1f3cbd9700e1fba0be2e7d801527e37c49fc02dc140714669144ef6ab58dce", size = 120147 }, - { url = "https://files.pythonhosted.org/packages/c0/4c/9add771772c4d72a328e656367ca948e389432548696a3819b69cdd6f41e/tree_sitter-0.24.0-cp311-cp311-win_arm64.whl", hash = "sha256:f3f08a2ca9f600b3758792ba2406971665ffbad810847398d180c48cee174ee2", size = 108302 }, - { url = 
"https://files.pythonhosted.org/packages/e9/57/3a590f287b5aa60c07d5545953912be3d252481bf5e178f750db75572bff/tree_sitter-0.24.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:14beeff5f11e223c37be7d5d119819880601a80d0399abe8c738ae2288804afc", size = 140788 }, - { url = "https://files.pythonhosted.org/packages/61/0b/fc289e0cba7dbe77c6655a4dd949cd23c663fd62a8b4d8f02f97e28d7fe5/tree_sitter-0.24.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:26a5b130f70d5925d67b47db314da209063664585a2fd36fa69e0717738efaf4", size = 133945 }, - { url = "https://files.pythonhosted.org/packages/86/d7/80767238308a137e0b5b5c947aa243e3c1e3e430e6d0d5ae94b9a9ffd1a2/tree_sitter-0.24.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5fc5c3c26d83c9d0ecb4fc4304fba35f034b7761d35286b936c1db1217558b4e", size = 564819 }, - { url = "https://files.pythonhosted.org/packages/bf/b3/6c5574f4b937b836601f5fb556b24804b0a6341f2eb42f40c0e6464339f4/tree_sitter-0.24.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:772e1bd8c0931c866b848d0369b32218ac97c24b04790ec4b0e409901945dd8e", size = 579303 }, - { url = "https://files.pythonhosted.org/packages/0a/f4/bd0ddf9abe242ea67cca18a64810f8af230fc1ea74b28bb702e838ccd874/tree_sitter-0.24.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:24a8dd03b0d6b8812425f3b84d2f4763322684e38baf74e5bb766128b5633dc7", size = 581054 }, - { url = "https://files.pythonhosted.org/packages/8c/1c/ff23fa4931b6ef1bbeac461b904ca7e49eaec7e7e5398584e3eef836ec96/tree_sitter-0.24.0-cp312-cp312-win_amd64.whl", hash = "sha256:f9e8b1605ab60ed43803100f067eed71b0b0e6c1fb9860a262727dbfbbb74751", size = 120221 }, - { url = "https://files.pythonhosted.org/packages/b2/2a/9979c626f303177b7612a802237d0533155bf1e425ff6f73cc40f25453e2/tree_sitter-0.24.0-cp312-cp312-win_arm64.whl", hash = "sha256:f733a83d8355fc95561582b66bbea92ffd365c5d7a665bc9ebd25e049c2b2abb", size = 108234 }, - { url = 
"https://files.pythonhosted.org/packages/61/cd/2348339c85803330ce38cee1c6cbbfa78a656b34ff58606ebaf5c9e83bd0/tree_sitter-0.24.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0d4a6416ed421c4210f0ca405a4834d5ccfbb8ad6692d4d74f7773ef68f92071", size = 140781 }, - { url = "https://files.pythonhosted.org/packages/8b/a3/1ea9d8b64e8dcfcc0051028a9c84a630301290995cd6e947bf88267ef7b1/tree_sitter-0.24.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:e0992d483677e71d5c5d37f30dfb2e3afec2f932a9c53eec4fca13869b788c6c", size = 133928 }, - { url = "https://files.pythonhosted.org/packages/fe/ae/55c1055609c9428a4aedf4b164400ab9adb0b1bf1538b51f4b3748a6c983/tree_sitter-0.24.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:57277a12fbcefb1c8b206186068d456c600dbfbc3fd6c76968ee22614c5cd5ad", size = 564497 }, - { url = "https://files.pythonhosted.org/packages/ce/d0/f2ffcd04882c5aa28d205a787353130cbf84b2b8a977fd211bdc3b399ae3/tree_sitter-0.24.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d25fa22766d63f73716c6fec1a31ee5cf904aa429484256bd5fdf5259051ed74", size = 578917 }, - { url = "https://files.pythonhosted.org/packages/af/82/aebe78ea23a2b3a79324993d4915f3093ad1af43d7c2208ee90be9273273/tree_sitter-0.24.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:7d5d9537507e1c8c5fa9935b34f320bfec4114d675e028f3ad94f11cf9db37b9", size = 581148 }, - { url = "https://files.pythonhosted.org/packages/a1/b4/6b0291a590c2b0417cfdb64ccb8ea242f270a46ed429c641fbc2bfab77e0/tree_sitter-0.24.0-cp313-cp313-win_amd64.whl", hash = "sha256:f58bb4956917715ec4d5a28681829a8dad5c342cafd4aea269f9132a83ca9b34", size = 120207 }, - { url = "https://files.pythonhosted.org/packages/a8/18/542fd844b75272630229c9939b03f7db232c71a9d82aadc59c596319ea6a/tree_sitter-0.24.0-cp313-cp313-win_arm64.whl", hash = "sha256:23641bd25dcd4bb0b6fa91b8fb3f46cc9f1c9f475efe4d536d3f1f688d1b84c8", size = 108232 }, -] - -[[package]] -name = "tree-sitter-python" -version = 
"0.23.6" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/1c/30/6766433b31be476fda6569a3a374c2220e45ffee0bff75460038a57bf23b/tree_sitter_python-0.23.6.tar.gz", hash = "sha256:354bfa0a2f9217431764a631516f85173e9711af2c13dbd796a8815acfe505d9", size = 155868 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ab/67/577a02acae5f776007c924ca86ef14c19c12e71de0aa9d2a036f3c248e7b/tree_sitter_python-0.23.6-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:28fbec8f74eeb2b30292d97715e60fac9ccf8a8091ce19b9d93e9b580ed280fb", size = 74361 }, - { url = "https://files.pythonhosted.org/packages/d2/a6/194b3625a7245c532ad418130d63077ce6cd241152524152f533e4d6edb0/tree_sitter_python-0.23.6-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:680b710051b144fedf61c95197db0094f2245e82551bf7f0c501356333571f7a", size = 76436 }, - { url = "https://files.pythonhosted.org/packages/d0/62/1da112689d6d282920e62c40e67ab39ea56463b0e7167bfc5e81818a770e/tree_sitter_python-0.23.6-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8a9dcef55507b6567207e8ee0a6b053d0688019b47ff7f26edc1764b7f4dc0a4", size = 112060 }, - { url = "https://files.pythonhosted.org/packages/5d/62/c9358584c96e38318d69b6704653684fd8467601f7b74e88aa44f4e6903f/tree_sitter_python-0.23.6-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:29dacdc0cd2f64e55e61d96c6906533ebb2791972bec988450c46cce60092f5d", size = 112338 }, - { url = "https://files.pythonhosted.org/packages/1a/58/c5e61add45e34fb8ecbf057c500bae9d96ed7c9ca36edb7985da8ae45526/tree_sitter_python-0.23.6-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:7e048733c36f564b379831689006801feb267d8194f9e793fbb395ef1723335d", size = 109382 }, - { url = "https://files.pythonhosted.org/packages/e9/f3/9b30893cae9b3811fe652dc6f90aaadfda12ae0b2757f5722fc7266f423c/tree_sitter_python-0.23.6-cp39-abi3-win_amd64.whl", hash = 
"sha256:a24027248399fb41594b696f929f9956828ae7cc85596d9f775e6c239cd0c2be", size = 75904 }, - { url = "https://files.pythonhosted.org/packages/87/cb/ce35a65f83a47b510d8a2f1eddf3bdbb0d57aabc87351c8788caf3309f76/tree_sitter_python-0.23.6-cp39-abi3-win_arm64.whl", hash = "sha256:71334371bd73d5fe080aed39fbff49ed8efb9506edebe16795b0c7567ed6a272", size = 73649 }, -] - [[package]] name = "triton" version = "3.5.0" diff --git a/cognee/base_config.py b/cognee/base_config.py index a2ad06249..a4c88e0da 100644 --- a/cognee/base_config.py +++ b/cognee/base_config.py @@ -1,4 +1,5 @@ import os +from pathlib import Path from typing import Optional from functools import lru_cache from cognee.root_dir import get_absolute_path, ensure_absolute_path @@ -11,6 +12,9 @@ class BaseConfig(BaseSettings): data_root_directory: str = get_absolute_path(".data_storage") system_root_directory: str = get_absolute_path(".cognee_system") cache_root_directory: str = get_absolute_path(".cognee_cache") + logs_root_directory: str = os.getenv( + "COGNEE_LOGS_DIR", str(os.path.join(os.path.dirname(os.path.dirname(__file__)), "logs")) + ) monitoring_tool: object = Observer.NONE @pydantic.model_validator(mode="after") @@ -30,6 +34,8 @@ class BaseConfig(BaseSettings): # Require absolute paths for root directories self.data_root_directory = ensure_absolute_path(self.data_root_directory) self.system_root_directory = ensure_absolute_path(self.system_root_directory) + self.logs_root_directory = ensure_absolute_path(self.logs_root_directory) + # Set monitoring tool based on available keys if self.langfuse_public_key and self.langfuse_secret_key: self.monitoring_tool = Observer.LANGFUSE @@ -49,6 +55,7 @@ class BaseConfig(BaseSettings): "system_root_directory": self.system_root_directory, "monitoring_tool": self.monitoring_tool, "cache_root_directory": self.cache_root_directory, + "logs_root_directory": self.logs_root_directory, } diff --git a/cognee/infrastructure/databases/vector/create_vector_engine.py 
b/cognee/infrastructure/databases/vector/create_vector_engine.py index 639bbb9f6..c54d94f6c 100644 --- a/cognee/infrastructure/databases/vector/create_vector_engine.py +++ b/cognee/infrastructure/databases/vector/create_vector_engine.py @@ -47,7 +47,7 @@ def create_vector_engine( embedding_engine=embedding_engine, ) - if vector_db_provider == "pgvector": + if vector_db_provider.lower() == "pgvector": from cognee.infrastructure.databases.relational import get_relational_config # Get configuration for postgres database @@ -78,7 +78,7 @@ def create_vector_engine( embedding_engine, ) - elif vector_db_provider == "chromadb": + elif vector_db_provider.lower() == "chromadb": try: import chromadb except ImportError: @@ -94,7 +94,7 @@ def create_vector_engine( embedding_engine=embedding_engine, ) - elif vector_db_provider == "neptune_analytics": + elif vector_db_provider.lower() == "neptune_analytics": try: from langchain_aws import NeptuneAnalyticsGraph except ImportError: @@ -122,7 +122,7 @@ def create_vector_engine( embedding_engine=embedding_engine, ) - else: + elif vector_db_provider.lower() == "lancedb": from .lancedb.LanceDBAdapter import LanceDBAdapter return LanceDBAdapter( @@ -130,3 +130,9 @@ def create_vector_engine( api_key=vector_db_key, embedding_engine=embedding_engine, ) + + else: + raise EnvironmentError( + f"Unsupported vector database provider: {vector_db_provider}. 
" + f"Supported providers are: {', '.join(list(supported_databases.keys()) + ['LanceDB', 'PGVector', 'neptune_analytics', 'ChromaDB'])}" + ) diff --git a/cognee/infrastructure/engine/models/Edge.py b/cognee/infrastructure/engine/models/Edge.py index 5ad9c84dd..59f01a9ab 100644 --- a/cognee/infrastructure/engine/models/Edge.py +++ b/cognee/infrastructure/engine/models/Edge.py @@ -1,4 +1,4 @@ -from pydantic import BaseModel +from pydantic import BaseModel, field_validator from typing import Optional, Any, Dict @@ -18,9 +18,21 @@ class Edge(BaseModel): # Mixed usage has_items: (Edge(weight=0.5, weights={"confidence": 0.9}), list[Item]) + + # With edge_text for rich embedding representation + contains: (Edge(relationship_type="contains", edge_text="relationship_name: contains; entity_description: Alice"), Entity) """ weight: Optional[float] = None weights: Optional[Dict[str, float]] = None relationship_type: Optional[str] = None properties: Optional[Dict[str, Any]] = None + edge_text: Optional[str] = None + + @field_validator("edge_text", mode="before") + @classmethod + def ensure_edge_text(cls, v, info): + """Auto-populate edge_text from relationship_type if not explicitly provided.""" + if v is None and info.data.get("relationship_type"): + return info.data["relationship_type"] + return v diff --git a/cognee/infrastructure/files/utils/get_file_metadata.py b/cognee/infrastructure/files/utils/get_file_metadata.py index 23b10a6df..3b6c5a364 100644 --- a/cognee/infrastructure/files/utils/get_file_metadata.py +++ b/cognee/infrastructure/files/utils/get_file_metadata.py @@ -1,6 +1,6 @@ import io import os.path -from typing import BinaryIO, TypedDict +from typing import BinaryIO, TypedDict, Optional from pathlib import Path from cognee.shared.logging_utils import get_logger @@ -27,7 +27,7 @@ class FileMetadata(TypedDict): file_size: int -async def get_file_metadata(file: BinaryIO) -> FileMetadata: +async def get_file_metadata(file: BinaryIO, name: Optional[str] = None) -> 
FileMetadata: """ Retrieve metadata from a file object. @@ -53,7 +53,7 @@ async def get_file_metadata(file: BinaryIO) -> FileMetadata: except io.UnsupportedOperation as error: logger.error(f"Error retrieving content hash for file: {file.name} \n{str(error)}\n\n") - file_type = guess_file_type(file) + file_type = guess_file_type(file, name) file_path = getattr(file, "name", None) or getattr(file, "full_name", None) diff --git a/cognee/infrastructure/files/utils/guess_file_type.py b/cognee/infrastructure/files/utils/guess_file_type.py index dcdd68cad..78b20c93d 100644 --- a/cognee/infrastructure/files/utils/guess_file_type.py +++ b/cognee/infrastructure/files/utils/guess_file_type.py @@ -1,6 +1,9 @@ -from typing import BinaryIO +import io +from pathlib import Path +from typing import BinaryIO, Optional, Any import filetype -from .is_text_content import is_text_content +from tempfile import SpooledTemporaryFile +from filetype.types.base import Type class FileTypeException(Exception): @@ -22,90 +25,7 @@ class FileTypeException(Exception): self.message = message -class TxtFileType(filetype.Type): - """ - Represents a text file type with specific MIME and extension properties. - - Public methods: - - match: Determines whether a given buffer matches the text file type. - """ - - MIME = "text/plain" - EXTENSION = "txt" - - def __init__(self): - super(TxtFileType, self).__init__(mime=TxtFileType.MIME, extension=TxtFileType.EXTENSION) - - def match(self, buf): - """ - Determine if the given buffer contains text content. - - Parameters: - ----------- - - - buf: The buffer to check for text content. - - Returns: - -------- - - Returns True if the buffer is identified as text content, otherwise False. - """ - return is_text_content(buf) - - -txt_file_type = TxtFileType() - -filetype.add_type(txt_file_type) - - -class CustomPdfMatcher(filetype.Type): - """ - Match PDF file types based on MIME type and extension. 
- - Public methods: - - match - - Instance variables: - - MIME: The MIME type of the PDF. - - EXTENSION: The file extension of the PDF. - """ - - MIME = "application/pdf" - EXTENSION = "pdf" - - def __init__(self): - super(CustomPdfMatcher, self).__init__( - mime=CustomPdfMatcher.MIME, extension=CustomPdfMatcher.EXTENSION - ) - - def match(self, buf): - """ - Determine if the provided buffer is a PDF file. - - This method checks for the presence of the PDF signature in the buffer. - - Raises: - - TypeError: If the buffer is not of bytes type. - - Parameters: - ----------- - - - buf: The buffer containing the data to be checked. - - Returns: - -------- - - Returns True if the buffer contains a PDF signature, otherwise returns False. - """ - return b"PDF-" in buf - - -custom_pdf_matcher = CustomPdfMatcher() - -filetype.add_type(custom_pdf_matcher) - - -def guess_file_type(file: BinaryIO) -> filetype.Type: +def guess_file_type(file: BinaryIO, name: Optional[str] = None) -> filetype.Type: """ Guess the file type from the given binary file stream. @@ -122,12 +42,23 @@ def guess_file_type(file: BinaryIO) -> filetype.Type: - filetype.Type: The guessed file type, represented as filetype.Type. 
""" + + # Note: If file has .txt or .text extension, consider it a plain text file as filetype.guess may not detect it properly + # as it contains no magic number encoding + ext = None + if isinstance(file, str): + ext = Path(file).suffix + elif name is not None: + ext = Path(name).suffix + + if ext in [".txt", ".text"]: + file_type = Type("text/plain", "txt") + return file_type + file_type = filetype.guess(file) # If file type could not be determined consider it a plain text file as they don't have magic number encoding if file_type is None: - from filetype.types.base import Type - file_type = Type("text/plain", "txt") if file_type is None: diff --git a/cognee/infrastructure/llm/prompts/extract_query_time.txt b/cognee/infrastructure/llm/prompts/extract_query_time.txt index 763d0e1c4..ce78c3471 100644 --- a/cognee/infrastructure/llm/prompts/extract_query_time.txt +++ b/cognee/infrastructure/llm/prompts/extract_query_time.txt @@ -1,15 +1,13 @@ -For the purposes of identifying timestamps in a query, you are tasked with extracting relevant timestamps from the query. 
-## Timestamp requirements -- If the query contains interval extrack both starts_at and ends_at properties -- If the query contains an instantaneous timestamp, starts_at and ends_at should be the same -- If the query its open-ended (before 2009 or after 2009), the corresponding non defined end of the time should be none - -For example: "before 2009" -- starts_at: None, ends_at: 2009 or "after 2009" -- starts_at: 2009, ends_at: None -- Put always the data that comes first in time as starts_at and the timestamps that comes second in time as ends_at -- If starts_at or ends_at cannot be extracted both of them has to be None -## Output Format -Your reply should be a JSON: list of dictionaries with the following structure: -```python -class QueryInterval(BaseModel): - starts_at: Optional[Timestamp] = None - ends_at: Optional[Timestamp] = None -``` \ No newline at end of file +You are tasked with identifying relevant time periods where the answer to a given query should be searched. +Current date is: `{{ time_now }}`. Determine relevant period(s) and return structured intervals. + +Extraction rules: + +1. Query without specific timestamp: use the time period with starts_at set to None and ends_at set to now. +2. Explicit time intervals: If the query specifies a range (e.g., from 2010 to 2020, between January and March 2023), extract both start and end dates. Always assign the earlier date to starts_at and the later date to ends_at. +3. Single timestamp: If the query refers to one specific moment (e.g., in 2015, on March 5, 2022), set starts_at and ends_at to that same timestamp. +4. Open-ended time references: For phrases such as "before X" or "after X", represent the unspecified side as None. For example: before 2009 → starts_at: None, ends_at: 2009; after 2009 → starts_at: 2009, ends_at: None. +5. Current-time references ("now", "current", "today"): If the query explicitly refers to the present, set both starts_at and ends_at to now (the ingestion timestamp). +6. 
"Who is" and "Who was" questions: These imply a general identity or biographical inquiry without a specific temporal scope. Set both starts_at and ends_at to None. +7. Ordering rule: Always ensure the earlier date is assigned to starts_at and the later date to ends_at. +8. No temporal information: If no valid or inferable time reference is found, set both starts_at and ends_at to None. \ No newline at end of file diff --git a/cognee/infrastructure/loaders/LoaderEngine.py b/cognee/infrastructure/loaders/LoaderEngine.py index 725f37b14..f9511e7c5 100644 --- a/cognee/infrastructure/loaders/LoaderEngine.py +++ b/cognee/infrastructure/loaders/LoaderEngine.py @@ -1,6 +1,7 @@ import filetype from typing import Dict, List, Optional, Any from .LoaderInterface import LoaderInterface +from cognee.infrastructure.files.utils.guess_file_type import guess_file_type from cognee.shared.logging_utils import get_logger logger = get_logger(__name__) @@ -80,7 +81,7 @@ class LoaderEngine: """ from pathlib import Path - file_info = filetype.guess(file_path) + file_info = guess_file_type(file_path) path_extension = Path(file_path).suffix.lstrip(".") diff --git a/cognee/infrastructure/loaders/core/audio_loader.py b/cognee/infrastructure/loaders/core/audio_loader.py index 17294bd94..f04d9a0e0 100644 --- a/cognee/infrastructure/loaders/core/audio_loader.py +++ b/cognee/infrastructure/loaders/core/audio_loader.py @@ -42,6 +42,7 @@ class AudioLoader(LoaderInterface): "audio/wav", "audio/amr", "audio/aiff", + "audio/x-wav", ] @property diff --git a/cognee/modules/chunking/models/DocumentChunk.py b/cognee/modules/chunking/models/DocumentChunk.py index 9f8c57486..e024bf00b 100644 --- a/cognee/modules/chunking/models/DocumentChunk.py +++ b/cognee/modules/chunking/models/DocumentChunk.py @@ -1,6 +1,7 @@ from typing import List, Union from cognee.infrastructure.engine import DataPoint +from cognee.infrastructure.engine.models.Edge import Edge from cognee.modules.data.processing.document_types import 
Document from cognee.modules.engine.models import Entity from cognee.tasks.temporal_graph.models import Event @@ -31,6 +32,6 @@ class DocumentChunk(DataPoint): chunk_index: int cut_type: str is_part_of: Document - contains: List[Union[Entity, Event]] = None + contains: List[Union[Entity, Event, tuple[Edge, Entity]]] = None metadata: dict = {"index_fields": ["text"]} diff --git a/cognee/modules/graph/cognee_graph/CogneeGraph.py b/cognee/modules/graph/cognee_graph/CogneeGraph.py index 9703928f0..cb7562422 100644 --- a/cognee/modules/graph/cognee_graph/CogneeGraph.py +++ b/cognee/modules/graph/cognee_graph/CogneeGraph.py @@ -171,8 +171,10 @@ class CogneeGraph(CogneeAbstractGraph): embedding_map = {result.payload["text"]: result.score for result in edge_distances} for edge in self.edges: - relationship_type = edge.attributes.get("relationship_type") - distance = embedding_map.get(relationship_type, None) + edge_key = edge.attributes.get("edge_text") or edge.attributes.get( + "relationship_type" + ) + distance = embedding_map.get(edge_key, None) if distance is not None: edge.attributes["vector_distance"] = distance diff --git a/cognee/modules/graph/utils/expand_with_nodes_and_edges.py b/cognee/modules/graph/utils/expand_with_nodes_and_edges.py index 3b01f5af4..c68eb494d 100644 --- a/cognee/modules/graph/utils/expand_with_nodes_and_edges.py +++ b/cognee/modules/graph/utils/expand_with_nodes_and_edges.py @@ -1,5 +1,6 @@ from typing import Optional +from cognee.infrastructure.engine.models.Edge import Edge from cognee.modules.chunking.models import DocumentChunk from cognee.modules.engine.models import Entity, EntityType from cognee.modules.engine.utils import ( @@ -243,10 +244,26 @@ def _process_graph_nodes( ontology_relationships, ) - # Add entity to data chunk if data_chunk.contains is None: data_chunk.contains = [] - data_chunk.contains.append(entity_node) + + edge_text = "; ".join( + [ + "relationship_name: contains", + f"entity_name: {entity_node.name}", + 
f"entity_description: {entity_node.description}", + ] + ) + + data_chunk.contains.append( + ( + Edge( + relationship_type="contains", + edge_text=edge_text, + ), + entity_node, + ) + ) def _process_graph_edges( diff --git a/cognee/modules/graph/utils/resolve_edges_to_text.py b/cognee/modules/graph/utils/resolve_edges_to_text.py index eb5bedd2c..5deb13ba8 100644 --- a/cognee/modules/graph/utils/resolve_edges_to_text.py +++ b/cognee/modules/graph/utils/resolve_edges_to_text.py @@ -1,71 +1,70 @@ +import string from typing import List +from collections import Counter + from cognee.modules.graph.cognee_graph.CogneeGraphElements import Edge +from cognee.modules.retrieval.utils.stop_words import DEFAULT_STOP_WORDS + + +def _get_top_n_frequent_words( + text: str, stop_words: set = None, top_n: int = 3, separator: str = ", " +) -> str: + """Concatenates the top N frequent words in text.""" + if stop_words is None: + stop_words = DEFAULT_STOP_WORDS + + words = [word.lower().strip(string.punctuation) for word in text.split()] + words = [word for word in words if word and word not in stop_words] + + top_words = [word for word, freq in Counter(words).most_common(top_n)] + return separator.join(top_words) + + +def _create_title_from_text(text: str, first_n_words: int = 7, top_n_words: int = 3) -> str: + """Creates a title by combining first words with most frequent words from the text.""" + first_words = text.split()[:first_n_words] + top_words = _get_top_n_frequent_words(text, top_n=top_n_words) + return f"{' '.join(first_words)}... 
[{top_words}]" + + +def _extract_nodes_from_edges(retrieved_edges: List[Edge]) -> dict: + """Creates a dictionary of nodes with their names and content.""" + nodes = {} + + for edge in retrieved_edges: + for node in (edge.node1, edge.node2): + if node.id in nodes: + continue + + text = node.attributes.get("text") + if text: + name = _create_title_from_text(text) + content = text + else: + name = node.attributes.get("name", "Unnamed Node") + content = node.attributes.get("description", name) + + nodes[node.id] = {"node": node, "name": name, "content": content} + + return nodes async def resolve_edges_to_text(retrieved_edges: List[Edge]) -> str: - """ - Converts retrieved graph edges into a human-readable string format. + """Converts retrieved graph edges into a human-readable string format.""" + nodes = _extract_nodes_from_edges(retrieved_edges) - Parameters: - ----------- - - - retrieved_edges (list): A list of edges retrieved from the graph. - - Returns: - -------- - - - str: A formatted string representation of the nodes and their connections. 
- """ - - def _get_nodes(retrieved_edges: List[Edge]) -> dict: - def _get_title(text: str, first_n_words: int = 7, top_n_words: int = 3) -> str: - def _top_n_words(text, stop_words=None, top_n=3, separator=", "): - """Concatenates the top N frequent words in text.""" - if stop_words is None: - from cognee.modules.retrieval.utils.stop_words import DEFAULT_STOP_WORDS - - stop_words = DEFAULT_STOP_WORDS - - import string - - words = [word.lower().strip(string.punctuation) for word in text.split()] - - if stop_words: - words = [word for word in words if word and word not in stop_words] - - from collections import Counter - - top_words = [word for word, freq in Counter(words).most_common(top_n)] - - return separator.join(top_words) - - """Creates a title, by combining first words with most frequent words from the text.""" - first_words = text.split()[:first_n_words] - top_words = _top_n_words(text, top_n=first_n_words) - return f"{' '.join(first_words)}... [{top_words}]" - - """Creates a dictionary of nodes with their names and content.""" - nodes = {} - for edge in retrieved_edges: - for node in (edge.node1, edge.node2): - if node.id not in nodes: - text = node.attributes.get("text") - if text: - name = _get_title(text) - content = text - else: - name = node.attributes.get("name", "Unnamed Node") - content = node.attributes.get("description", name) - nodes[node.id] = {"node": node, "name": name, "content": content} - return nodes - - nodes = _get_nodes(retrieved_edges) node_section = "\n".join( f"Node: {info['name']}\n__node_content_start__\n{info['content']}\n__node_content_end__\n" for info in nodes.values() ) - connection_section = "\n".join( - f"{nodes[edge.node1.id]['name']} --[{edge.attributes['relationship_type']}]--> {nodes[edge.node2.id]['name']}" - for edge in retrieved_edges - ) + + connections = [] + for edge in retrieved_edges: + source_name = nodes[edge.node1.id]["name"] + target_name = nodes[edge.node2.id]["name"] + edge_label = 
edge.attributes.get("edge_text") or edge.attributes.get("relationship_type") + connections.append(f"{source_name} --[{edge_label}]--> {target_name}") + + connection_section = "\n".join(connections) + return f"Nodes:\n{node_section}\n\nConnections:\n{connection_section}" diff --git a/cognee/modules/ingestion/data_types/BinaryData.py b/cognee/modules/ingestion/data_types/BinaryData.py index f96e0d65c..9448dddcf 100644 --- a/cognee/modules/ingestion/data_types/BinaryData.py +++ b/cognee/modules/ingestion/data_types/BinaryData.py @@ -30,7 +30,7 @@ class BinaryData(IngestionData): async def ensure_metadata(self): if self.metadata is None: - self.metadata = await get_file_metadata(self.data) + self.metadata = await get_file_metadata(self.data, name=self.name) if self.metadata["name"] is None: self.metadata["name"] = self.name diff --git a/cognee/modules/ontology/get_default_ontology_resolver.py b/cognee/modules/ontology/get_default_ontology_resolver.py index f9aebe59a..7d87c10a6 100644 --- a/cognee/modules/ontology/get_default_ontology_resolver.py +++ b/cognee/modules/ontology/get_default_ontology_resolver.py @@ -21,7 +21,8 @@ def get_ontology_resolver_from_env( Supported value: "rdflib". matching_strategy (str): The matching strategy to apply. Supported value: "fuzzy". - ontology_file_path (str): Path to the ontology file required for the resolver. + ontology_file_path (str): Path to the ontology file(s) required for the resolver. + Can be a single path or comma-separated paths for multiple files. Returns: BaseOntologyResolver: An instance of the requested ontology resolver. @@ -31,8 +32,13 @@ def get_ontology_resolver_from_env( or if required parameters are missing. 
""" if ontology_resolver == "rdflib" and matching_strategy == "fuzzy" and ontology_file_path: + if "," in ontology_file_path: + file_paths = [path.strip() for path in ontology_file_path.split(",")] + else: + file_paths = ontology_file_path + return RDFLibOntologyResolver( - matching_strategy=FuzzyMatchingStrategy(), ontology_file=ontology_file_path + matching_strategy=FuzzyMatchingStrategy(), ontology_file=file_paths ) else: raise EnvironmentError( diff --git a/cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py b/cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py index 2a7a03751..45e32936a 100644 --- a/cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py +++ b/cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py @@ -2,7 +2,7 @@ import os import difflib from cognee.shared.logging_utils import get_logger from collections import deque -from typing import List, Tuple, Dict, Optional, Any +from typing import List, Tuple, Dict, Optional, Any, Union from rdflib import Graph, URIRef, RDF, RDFS, OWL from cognee.modules.ontology.exceptions import ( @@ -26,22 +26,50 @@ class RDFLibOntologyResolver(BaseOntologyResolver): def __init__( self, - ontology_file: Optional[str] = None, + ontology_file: Optional[Union[str, List[str]]] = None, matching_strategy: Optional[MatchingStrategy] = None, ) -> None: super().__init__(matching_strategy) self.ontology_file = ontology_file try: - if ontology_file and os.path.exists(ontology_file): + files_to_load = [] + if ontology_file is not None: + if isinstance(ontology_file, str): + files_to_load = [ontology_file] + elif isinstance(ontology_file, list): + files_to_load = ontology_file + else: + raise ValueError( + f"ontology_file must be a string, list of strings, or None. 
Got: {type(ontology_file)}" + ) + + if files_to_load: self.graph = Graph() - self.graph.parse(ontology_file) - logger.info("Ontology loaded successfully from file: %s", ontology_file) + loaded_files = [] + for file_path in files_to_load: + if os.path.exists(file_path): + self.graph.parse(file_path) + loaded_files.append(file_path) + logger.info("Ontology loaded successfully from file: %s", file_path) + else: + logger.warning( + "Ontology file '%s' not found. Skipping this file.", + file_path, + ) + + if not loaded_files: + logger.info( + "No valid ontology files found. No owl ontology will be attached to the graph." + ) + self.graph = None + else: + logger.info("Total ontology files loaded: %d", len(loaded_files)) else: logger.info( - "Ontology file '%s' not found. No owl ontology will be attached to the graph.", - ontology_file, + "No ontology file provided. No owl ontology will be attached to the graph." ) self.graph = None + self.build_lookup() except Exception as e: logger.error("Failed to load ontology", exc_info=e) diff --git a/cognee/modules/pipelines/operations/run_tasks_base.py b/cognee/modules/pipelines/operations/run_tasks_base.py index ee2ccfd8c..79d37a451 100644 --- a/cognee/modules/pipelines/operations/run_tasks_base.py +++ b/cognee/modules/pipelines/operations/run_tasks_base.py @@ -27,6 +27,7 @@ async def handle_task( additional_properties={ "task_name": running_task.executable.__name__, "cognee_version": cognee_version, + "tenant_id": str(user.tenant_id) if user.tenant_id else "Single User Tenant", }, ) @@ -49,6 +50,7 @@ async def handle_task( additional_properties={ "task_name": running_task.executable.__name__, "cognee_version": cognee_version, + "tenant_id": str(user.tenant_id) if user.tenant_id else "Single User Tenant", }, ) except Exception as error: @@ -62,6 +64,7 @@ async def handle_task( additional_properties={ "task_name": running_task.executable.__name__, "cognee_version": cognee_version, + "tenant_id": str(user.tenant_id) if 
user.tenant_id else "Single User Tenant", }, ) raise error diff --git a/cognee/modules/pipelines/operations/run_tasks_with_telemetry.py b/cognee/modules/pipelines/operations/run_tasks_with_telemetry.py index 9a52bf854..ae968c7a5 100644 --- a/cognee/modules/pipelines/operations/run_tasks_with_telemetry.py +++ b/cognee/modules/pipelines/operations/run_tasks_with_telemetry.py @@ -28,6 +28,7 @@ async def run_tasks_with_telemetry( additional_properties={ "pipeline_name": str(pipeline_name), "cognee_version": cognee_version, + "tenant_id": str(user.tenant_id) if user.tenant_id else "Single User Tenant", } | config, ) @@ -42,6 +43,7 @@ async def run_tasks_with_telemetry( additional_properties={ "pipeline_name": str(pipeline_name), "cognee_version": cognee_version, + "tenant_id": str(user.tenant_id) if user.tenant_id else "Single User Tenant", } | config, ) @@ -58,6 +60,7 @@ async def run_tasks_with_telemetry( additional_properties={ "pipeline_name": str(pipeline_name), "cognee_version": cognee_version, + "tenant_id": str(user.tenant_id) if user.tenant_id else "Single User Tenant", } | config, ) diff --git a/cognee/modules/retrieval/temporal_retriever.py b/cognee/modules/retrieval/temporal_retriever.py index 8ef5eed69..ec68d37bb 100644 --- a/cognee/modules/retrieval/temporal_retriever.py +++ b/cognee/modules/retrieval/temporal_retriever.py @@ -1,7 +1,7 @@ import os import asyncio from typing import Any, Optional, List, Type - +from datetime import datetime from operator import itemgetter from cognee.infrastructure.databases.vector import get_vector_engine @@ -79,7 +79,11 @@ class TemporalRetriever(GraphCompletionRetriever): else: base_directory = None - system_prompt = render_prompt(prompt_path, {}, base_directory=base_directory) + time_now = datetime.now().strftime("%d-%m-%Y") + + system_prompt = render_prompt( + prompt_path, {"time_now": time_now}, base_directory=base_directory + ) interval = await LLMGateway.acreate_structured_output(query, system_prompt, QueryInterval) 
@@ -108,8 +112,6 @@ class TemporalRetriever(GraphCompletionRetriever): graph_engine = await get_graph_engine() - triplets = [] - if time_from and time_to: ids = await graph_engine.collect_time_ids(time_from=time_from, time_to=time_to) elif time_from: diff --git a/cognee/modules/retrieval/utils/brute_force_triplet_search.py b/cognee/modules/retrieval/utils/brute_force_triplet_search.py index 1ef7545c2..f8bdbb97d 100644 --- a/cognee/modules/retrieval/utils/brute_force_triplet_search.py +++ b/cognee/modules/retrieval/utils/brute_force_triplet_search.py @@ -71,7 +71,7 @@ async def get_memory_fragment( await memory_fragment.project_graph_from_db( graph_engine, node_properties_to_project=properties_to_project, - edge_properties_to_project=["relationship_name"], + edge_properties_to_project=["relationship_name", "edge_text"], node_type=node_type, node_name=node_name, ) diff --git a/cognee/modules/search/methods/search.py b/cognee/modules/search/methods/search.py index 93c0ef5c8..aab004924 100644 --- a/cognee/modules/search/methods/search.py +++ b/cognee/modules/search/methods/search.py @@ -67,7 +67,10 @@ async def search( send_telemetry( "cognee.search EXECUTION STARTED", user.id, - additional_properties={"cognee_version": cognee_version}, + additional_properties={ + "cognee_version": cognee_version, + "tenant_id": str(user.tenant_id) if user.tenant_id else "Single User Tenant", + }, ) # Use search function filtered by permissions if access control is enabled @@ -108,7 +111,10 @@ async def search( send_telemetry( "cognee.search EXECUTION COMPLETED", user.id, - additional_properties={"cognee_version": cognee_version}, + additional_properties={ + "cognee_version": cognee_version, + "tenant_id": str(user.tenant_id) if user.tenant_id else "Single User Tenant", + }, ) await log_result( diff --git a/cognee/modules/visualization/cognee_network_visualization.py b/cognee/modules/visualization/cognee_network_visualization.py index c735e70f1..3bf5ea8e8 100644 --- 
a/cognee/modules/visualization/cognee_network_visualization.py +++ b/cognee/modules/visualization/cognee_network_visualization.py @@ -16,17 +16,17 @@ async def cognee_network_visualization(graph_data, destination_file_path: str = nodes_list = [] color_map = { - "Entity": "#f47710", - "EntityType": "#6510f4", - "DocumentChunk": "#801212", - "TextSummary": "#1077f4", - "TableRow": "#f47710", - "TableType": "#6510f4", - "ColumnValue": "#13613a", - "SchemaTable": "#f47710", - "DatabaseSchema": "#6510f4", - "SchemaRelationship": "#13613a", - "default": "#D3D3D3", + "Entity": "#5C10F4", + "EntityType": "#A550FF", + "DocumentChunk": "#0DFF00", + "TextSummary": "#5C10F4", + "TableRow": "#A550FF", + "TableType": "#5C10F4", + "ColumnValue": "#757470", + "SchemaTable": "#A550FF", + "DatabaseSchema": "#5C10F4", + "SchemaRelationship": "#323332", + "default": "#D8D8D8", } for node_id, node_info in nodes_data: @@ -98,16 +98,19 @@ async def cognee_network_visualization(graph_data, destination_file_path: str =
+ +