Merge branch 'dev' into enable-multi-user-mode-default

commit 8d4eed6101
24 changed files with 486 additions and 272 deletions

@@ -243,13 +243,14 @@ LITELLM_LOG="ERROR"
########## Local LLM via Ollama ###############################################

#LLM_API_KEY ="ollama"
#LLM_MODEL="llama3.1:8b"
#LLM_PROVIDER="ollama"
#LLM_ENDPOINT="http://localhost:11434/v1"
#EMBEDDING_PROVIDER="ollama"
#EMBEDDING_MODEL="nomic-embed-text:latest"
#EMBEDDING_ENDPOINT="http://localhost:11434/api/embeddings"
#EMBEDDING_ENDPOINT="http://localhost:11434/api/embed"
#EMBEDDING_DIMENSIONS=768
#HUGGINGFACE_TOKENIZER="nomic-ai/nomic-embed-text-v1.5"
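For orientation only (this is not part of the diff): a minimal sketch of how the Ollama settings above might be consumed at runtime. The helper function is hypothetical; it only illustrates that the provider, endpoints, and models are driven entirely by environment variables.

```python
import os


def load_ollama_settings() -> dict:
    """Hypothetical helper: read the Ollama-related variables shown in the .env block above."""
    return {
        "llm_provider": os.getenv("LLM_PROVIDER", "ollama"),
        "llm_model": os.getenv("LLM_MODEL", "llama3.1:8b"),
        "llm_endpoint": os.getenv("LLM_ENDPOINT", "http://localhost:11434/v1"),
        "embedding_model": os.getenv("EMBEDDING_MODEL", "nomic-embed-text:latest"),
        "embedding_endpoint": os.getenv("EMBEDDING_ENDPOINT", "http://localhost:11434/api/embed"),
        "embedding_dimensions": int(os.getenv("EMBEDDING_DIMENSIONS", "768")),
    }
```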
20 .github/workflows/dockerhub-mcp.yml vendored

@@ -7,14 +7,29 @@ on:

jobs:
  docker-build-and-push:
    runs-on: ubuntu-latest
    runs-on:
      group: Default
      labels:
        - docker_build_runner

    steps:
      - name: Check and free disk space before build
        run: |
          echo "=== Before cleanup ==="
          df -h
          echo "Removing unused preinstalled SDKs to free space..."
          sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc || true
          docker system prune -af || true
          echo "=== After cleanup ==="
          df -h

      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
        with:
          buildkitd-flags: --root /tmp/buildkit

      - name: Log in to Docker Hub
        uses: docker/login-action@v3

@@ -34,7 +49,7 @@ jobs:

      - name: Build and push
        id: build
        uses: docker/build-push-action@v5
        uses: docker/build-push-action@v6
        with:
          context: .
          platforms: linux/amd64,linux/arm64

@@ -45,5 +60,6 @@ jobs:

          cache-from: type=registry,ref=cognee/cognee-mcp:buildcache
          cache-to: type=registry,ref=cognee/cognee-mcp:buildcache,mode=max

      - name: Image digest
        run: echo ${{ steps.build.outputs.digest }}
132 AGENTS.md Normal file

@@ -0,0 +1,132 @@

## Repository Guidelines

This document summarizes how to work with the cognee repository: how it’s organized, how to build, test, lint, and contribute. It mirrors our actual tooling and CI while providing quick commands for local development.

## Project Structure & Module Organization

- `cognee/`: Core Python library and API.
  - `api/`: FastAPI application and versioned routers (add, cognify, memify, search, delete, users, datasets, responses, visualize, settings, sync, update, checks).
  - `cli/`: CLI entry points and subcommands invoked via `cognee` / `cognee-cli`.
  - `infrastructure/`: Databases, LLM providers, embeddings, loaders, and storage adapters.
  - `modules/`: Domain logic (graph, retrieval, ontology, users, processing, observability, etc.).
  - `tasks/`: Reusable tasks (e.g., code graph, web scraping, storage). Extend with new tasks here.
  - `eval_framework/`: Evaluation utilities and adapters.
  - `shared/`: Cross-cutting helpers (logging, settings, utils).
  - `tests/`: Unit, integration, CLI, and end-to-end tests organized by feature.
  - `__main__.py`: Entrypoint to route to CLI.
- `cognee-mcp/`: Model Context Protocol server exposing cognee as MCP tools (SSE/HTTP/stdio). Contains its own README and Dockerfile.
- `cognee-frontend/`: Next.js UI for local development and demos.
- `distributed/`: Utilities for distributed execution (Modal, workers, queues).
- `examples/`: Example scripts demonstrating the public APIs and features (graph, code graph, multimodal, permissions, etc.).
- `notebooks/`: Jupyter notebooks for demos and tutorials.
- `alembic/`: Database migrations for relational backends.

Notes:
- Co-locate feature-specific helpers under their respective package (`modules/`, `infrastructure/`, or `tasks/`).
- Extend the system by adding new tasks, loaders, or retrievers rather than modifying core pipeline mechanisms.

## Build, Test, and Development Commands

Python (root) – requires Python >= 3.10 and < 3.14. We recommend `uv` for speed and reproducibility.

- Create/refresh env and install dev deps:
```bash
uv sync --dev --all-extras --reinstall
```

- Run the CLI (examples):
```bash
uv run cognee-cli add "Cognee turns documents into AI memory."
uv run cognee-cli cognify
uv run cognee-cli search "What does cognee do?"
uv run cognee-cli -ui  # Launches UI, backend API, and MCP server together
```

- Start the FastAPI server directly:
```bash
uv run python -m cognee.api.client
```

- Run tests (CI mirrors these commands):
```bash
uv run pytest cognee/tests/unit/ -v
uv run pytest cognee/tests/integration/ -v
```

- Lint and format (ruff):
```bash
uv run ruff check .
uv run ruff format .
```

- Optional static type checks (mypy):
```bash
uv run mypy cognee/
```

MCP Server (`cognee-mcp/`):

- Install and run locally:
```bash
cd cognee-mcp
uv sync --dev --all-extras --reinstall
uv run python src/server.py                       # stdio (default)
uv run python src/server.py --transport sse
uv run python src/server.py --transport http --host 127.0.0.1 --port 8000 --path /mcp
```

- API Mode (connect to a running Cognee API):
```bash
uv run python src/server.py --transport sse --api-url http://localhost:8000 --api-token YOUR_TOKEN
```

- Docker quickstart (example); see `cognee-mcp/README.md` for full details:
```bash
docker run -e TRANSPORT_MODE=http --env-file ./.env -p 8000:8000 --rm -it cognee/cognee-mcp:main
```

Frontend (`cognee-frontend/`):
```bash
cd cognee-frontend
npm install
npm run dev    # Next.js dev server
npm run lint   # ESLint
npm run build && npm start
```
## Coding Style & Naming Conventions

Python:
- 4-space indentation, modules and functions in `snake_case`, classes in `PascalCase`.
- Public APIs should be type-annotated where practical.
- Use `ruff format` before committing; `ruff check` enforces import hygiene and style (line-length 100, configured in `pyproject.toml`).
- Prefer explicit, structured error handling. Use the shared logging utilities in `cognee.shared.logging_utils` (see the sketch after this list).
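A minimal sketch of these conventions, illustrative only; `get_logger` from `cognee.shared.logging_utils` appears elsewhere in this changeset, but the class and function below are hypothetical examples:

```python
from typing import Optional

from cognee.shared.logging_utils import get_logger

logger = get_logger(__name__)


class DatasetSummary:  # classes in PascalCase
    def __init__(self, name: str, document_count: int) -> None:
        self.name = name
        self.document_count = document_count


def summarize_dataset(name: str, documents: Optional[list[str]] = None) -> DatasetSummary:
    """Functions in snake_case, public APIs type-annotated, errors handled explicitly and logged."""
    if documents is None:
        documents = []
    try:
        return DatasetSummary(name=name, document_count=len(documents))
    except Exception as error:
        logger.error("Failed to summarize dataset %s: %s", name, error)
        raise
```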
MCP server and Frontend:
- Follow the local `README.md` and the ESLint/TypeScript configuration in `cognee-frontend/`.

## Testing Guidelines

- Place Python tests under `cognee/tests/`.
  - Unit tests: `cognee/tests/unit/`
  - Integration tests: `cognee/tests/integration/`
  - CLI tests: `cognee/tests/cli_tests/`
- Name test files `test_*.py`. Use `pytest.mark.asyncio` for async tests (see the sketch after this list).
- Avoid external state; rely on test fixtures and the CI-provided env vars when LLM/embedding providers are required. See CI workflows under `.github/workflows/` for expected environment variables.
- When adding public APIs, provide/update targeted examples under `examples/python/`.
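A minimal async test skeleton following the conventions above, illustrative only (the coroutine under test is hypothetical):

```python
# cognee/tests/unit/test_example_feature.py
import pytest


async def build_greeting(name: str) -> str:
    """Stand-in for the coroutine under test; replace with a real cognee API."""
    return f"Hello, {name}!"


@pytest.mark.asyncio
async def test_build_greeting_returns_expected_text():
    result = await build_greeting("cognee")
    assert result == "Hello, cognee!"
```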
## Commit & Pull Request Guidelines

- Use clear, imperative subjects (≤ 72 chars) and conventional-commit styling in PR titles. Our CI validates semantic PR titles (see `.github/workflows/pr_lint`). Examples:
  - `feat(graph): add temporal edge weighting`
  - `fix(api): handle missing auth cookie`
  - `docs: update installation instructions`
- Reference related issues/discussions in the PR body and provide brief context.
- PRs should describe scope, list the local test commands run, and mention any impact on the MCP server or UI if applicable.
- Sign commits and affirm the DCO (see `CONTRIBUTING.md`).

## CI Mirrors Local Commands

Our GitHub Actions run the same ruff checks and pytest suites shown above (`.github/workflows/basic_tests.yml` and related workflows). Use the commands in this document locally to minimize CI surprises.
@@ -97,7 +97,7 @@ Hosted platform:

### 📦 Installation

You can install Cognee using either **pip**, **poetry**, **uv** or any other python package manager.
You can install Cognee using either **pip**, **poetry**, **uv** or any other python package manager..

Cognee supports Python 3.10 to 3.12
@@ -110,6 +110,47 @@ If you'd rather run cognee-mcp in a container, you have two options:

# For stdio transport (default)
docker run -e TRANSPORT_MODE=stdio --env-file ./.env --rm -it cognee/cognee-mcp:main
```

**Installing optional dependencies at runtime:**

You can install optional dependencies when running the container by setting the `EXTRAS` environment variable:
```bash
# Install a single optional dependency group at runtime
docker run \
  -e TRANSPORT_MODE=http \
  -e EXTRAS=aws \
  --env-file ./.env \
  -p 8000:8000 \
  --rm -it cognee/cognee-mcp:main

# Install multiple optional dependency groups at runtime (comma-separated)
docker run \
  -e TRANSPORT_MODE=sse \
  -e EXTRAS=aws,postgres,neo4j \
  --env-file ./.env \
  -p 8000:8000 \
  --rm -it cognee/cognee-mcp:main
```

**Available optional dependency groups:**
- `aws` - S3 storage support
- `postgres` / `postgres-binary` - PostgreSQL database support
- `neo4j` - Neo4j graph database support
- `neptune` - AWS Neptune support
- `chromadb` - ChromaDB vector store support
- `scraping` - Web scraping capabilities
- `distributed` - Modal distributed execution
- `langchain` - LangChain integration
- `llama-index` - LlamaIndex integration
- `anthropic` - Anthropic models
- `groq` - Groq models
- `mistral` - Mistral models
- `ollama` / `huggingface` - Local model support
- `docs` - Document processing
- `codegraph` - Code analysis
- `monitoring` - Sentry & Langfuse monitoring
- `redis` - Redis support
- And more (see [pyproject.toml](https://github.com/topoteretes/cognee/blob/main/pyproject.toml) for the full list)

2. **Pull from Docker Hub** (no build required):
```bash
# With HTTP transport (recommended for web deployments)

@@ -119,6 +160,17 @@ If you'd rather run cognee-mcp in a container, you have two options:

# With stdio transport (default)
docker run -e TRANSPORT_MODE=stdio --env-file ./.env --rm -it cognee/cognee-mcp:main
```

**With runtime installation of optional dependencies:**
```bash
# Install optional dependencies from Docker Hub image
docker run \
  -e TRANSPORT_MODE=http \
  -e EXTRAS=aws,postgres \
  --env-file ./.env \
  -p 8000:8000 \
  --rm -it cognee/cognee-mcp:main
```

### **Important: Docker vs Direct Usage**
**Docker uses environment variables**, not command line arguments:
@@ -4,6 +4,42 @@ set -e # Exit on error

echo "Debug mode: $DEBUG"
echo "Environment: $ENVIRONMENT"

# Install optional dependencies if EXTRAS is set
if [ -n "$EXTRAS" ]; then
  echo "Installing optional dependencies: $EXTRAS"

  # Get the cognee version that's currently installed
  COGNEE_VERSION=$(uv pip show cognee | grep "Version:" | awk '{print $2}')
  echo "Current cognee version: $COGNEE_VERSION"

  # Build the extras list for cognee
  IFS=',' read -ra EXTRA_ARRAY <<< "$EXTRAS"
  # Combine base extras from pyproject.toml with requested extras
  ALL_EXTRAS=""
  for extra in "${EXTRA_ARRAY[@]}"; do
    # Trim whitespace
    extra=$(echo "$extra" | xargs)
    # Add to extras list if not already present
    if [[ ! "$ALL_EXTRAS" =~ (^|,)"$extra"(,|$) ]]; then
      if [ -z "$ALL_EXTRAS" ]; then
        ALL_EXTRAS="$extra"
      else
        ALL_EXTRAS="$ALL_EXTRAS,$extra"
      fi
    fi
  done

  echo "Installing cognee with extras: $ALL_EXTRAS"
  echo "Running: uv pip install 'cognee[$ALL_EXTRAS]==$COGNEE_VERSION'"
  uv pip install "cognee[$ALL_EXTRAS]==$COGNEE_VERSION"

  # Verify installation
  echo ""
  echo "✓ Optional dependencies installation completed"
else
  echo "No optional dependencies specified"
fi

# Set default transport mode if not specified
TRANSPORT_MODE=${TRANSPORT_MODE:-"stdio"}
echo "Transport mode: $TRANSPORT_MODE"
@@ -9,7 +9,7 @@ dependencies = [

    # For local cognee repo usage remove comment below and add absolute path to cognee. Then run `uv sync --reinstall` in the mcp folder on local cognee changes.
    #"cognee[postgres,codegraph,gemini,huggingface,docs,neo4j] @ file:/Users/igorilic/Desktop/cognee",
    # TODO: Remove gemini from optional dependencies for new Cognee version after 0.3.4
    "cognee[postgres,codegraph,huggingface,docs,neo4j]==0.3.7",
    "cognee[postgres,docs,neo4j]==0.3.7",
    "fastmcp>=2.10.0,<3.0.0",
    "mcp>=1.12.0,<2.0.0",
    "uv>=0.6.3,<1.0.0",
@@ -37,12 +37,10 @@ async def run():

        toolResult = await session.call_tool("prune", arguments={})

        toolResult = await session.call_tool(
            "codify", arguments={"repo_path": "SOME_REPO_PATH"}
        )
        toolResult = await session.call_tool("cognify", arguments={})

        toolResult = await session.call_tool(
            "search", arguments={"search_type": "CODE", "search_query": "exceptions"}
            "search", arguments={"search_type": "GRAPH_COMPLETION"}
        )

        print(f"Cognify result: {toolResult.content}")
64 cognee-mcp/uv.lock generated

@@ -718,19 +718,10 @@ wheels = [

]

[package.optional-dependencies]
codegraph = [
    { name = "fastembed", marker = "python_full_version < '3.13'" },
    { name = "transformers" },
    { name = "tree-sitter" },
    { name = "tree-sitter-python" },
]
docs = [
    { name = "lxml" },
    { name = "unstructured", extra = ["csv", "doc", "docx", "epub", "md", "odt", "org", "pdf", "ppt", "pptx", "rst", "rtf", "tsv", "xlsx"] },
]
huggingface = [
    { name = "transformers" },
]
neo4j = [
    { name = "neo4j" },
]

@@ -745,7 +736,7 @@ name = "cognee-mcp"

version = "0.4.0"
source = { editable = "." }
dependencies = [
    { name = "cognee", extra = ["codegraph", "docs", "huggingface", "neo4j", "postgres"] },
    { name = "cognee", extra = ["docs", "neo4j", "postgres"] },
    { name = "fastmcp" },
    { name = "httpx" },
    { name = "mcp" },

@@ -759,7 +750,7 @@ dev = [

[package.metadata]
requires-dist = [
    { name = "cognee", extras = ["postgres", "codegraph", "huggingface", "docs", "neo4j"], specifier = "==0.3.7" },
    { name = "cognee", extras = ["postgres", "docs", "neo4j"], specifier = "==0.3.7" },
    { name = "fastmcp", specifier = ">=2.10.0,<3.0.0" },
    { name = "httpx", specifier = ">=0.27.0,<1.0.0" },
    { name = "mcp", specifier = ">=1.12.0,<2.0.0" },
@ -6038,57 +6029,6 @@ wheels = [
|
|||
{ url = "https://files.pythonhosted.org/packages/e5/2b/4d2708ac1ff5cd708b6548f4c5812d0ae40d1c28591c4c1c762b6dbdef2d/transformers-4.57.0-py3-none-any.whl", hash = "sha256:9d7c6d098c026e40d897e017ed1f481ab803cbac041021dbc6ae6100e4949b55", size = 11990588 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tree-sitter"
|
||||
version = "0.24.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/a7/a2/698b9d31d08ad5558f8bfbfe3a0781bd4b1f284e89bde3ad18e05101a892/tree-sitter-0.24.0.tar.gz", hash = "sha256:abd95af65ca2f4f7eca356343391ed669e764f37748b5352946f00f7fc78e734", size = 168304 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/08/9a/bd627a02e41671af73222316e1fcf87772c7804dc2fba99405275eb1f3eb/tree_sitter-0.24.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f3f00feff1fc47a8e4863561b8da8f5e023d382dd31ed3e43cd11d4cae445445", size = 140890 },
|
||||
{ url = "https://files.pythonhosted.org/packages/5b/9b/b1ccfb187f8be78e2116176a091a2f2abfd043a06d78f80c97c97f315b37/tree_sitter-0.24.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f9691be48d98c49ef8f498460278884c666b44129222ed6217477dffad5d4831", size = 134413 },
|
||||
{ url = "https://files.pythonhosted.org/packages/01/39/e25b0042a049eb27e991133a7aa7c49bb8e49a8a7b44ca34e7e6353ba7ac/tree_sitter-0.24.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:098a81df9f89cf254d92c1cd0660a838593f85d7505b28249216661d87adde4a", size = 560427 },
|
||||
{ url = "https://files.pythonhosted.org/packages/1c/59/4d132f1388da5242151b90acf32cc56af779bfba063923699ab28b276b62/tree_sitter-0.24.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b26bf9e958da6eb7e74a081aab9d9c7d05f9baeaa830dbb67481898fd16f1f5", size = 574327 },
|
||||
{ url = "https://files.pythonhosted.org/packages/ec/97/3914e45ab9e0ff0f157e493caa91791372508488b97ff0961a0640a37d25/tree_sitter-0.24.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:2a84ff87a2f2a008867a1064aba510ab3bd608e3e0cd6e8fef0379efee266c73", size = 577171 },
|
||||
{ url = "https://files.pythonhosted.org/packages/c5/b0/266a529c3eef171137b73cde8ad7aa282734354609a8b2f5564428e8f12d/tree_sitter-0.24.0-cp310-cp310-win_amd64.whl", hash = "sha256:c012e4c345c57a95d92ab5a890c637aaa51ab3b7ff25ed7069834b1087361c95", size = 120260 },
|
||||
{ url = "https://files.pythonhosted.org/packages/c1/c3/07bfaa345e0037ff75d98b7a643cf940146e4092a1fd54eed0359836be03/tree_sitter-0.24.0-cp310-cp310-win_arm64.whl", hash = "sha256:033506c1bc2ba7bd559b23a6bdbeaf1127cee3c68a094b82396718596dfe98bc", size = 108416 },
|
||||
{ url = "https://files.pythonhosted.org/packages/66/08/82aaf7cbea7286ee2a0b43e9b75cb93ac6ac132991b7d3c26ebe5e5235a3/tree_sitter-0.24.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:de0fb7c18c6068cacff46250c0a0473e8fc74d673e3e86555f131c2c1346fb13", size = 140733 },
|
||||
{ url = "https://files.pythonhosted.org/packages/8c/bd/1a84574911c40734d80327495e6e218e8f17ef318dd62bb66b55c1e969f5/tree_sitter-0.24.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a7c9c89666dea2ce2b2bf98e75f429d2876c569fab966afefdcd71974c6d8538", size = 134243 },
|
||||
{ url = "https://files.pythonhosted.org/packages/46/c1/c2037af2c44996d7bde84eb1c9e42308cc84b547dd6da7f8a8bea33007e1/tree_sitter-0.24.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4ddb113e6b8b3e3b199695b1492a47d87d06c538e63050823d90ef13cac585fd", size = 562030 },
|
||||
{ url = "https://files.pythonhosted.org/packages/4c/aa/2fb4d81886df958e6ec7e370895f7106d46d0bbdcc531768326124dc8972/tree_sitter-0.24.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:01ea01a7003b88b92f7f875da6ba9d5d741e0c84bb1bd92c503c0eecd0ee6409", size = 575585 },
|
||||
{ url = "https://files.pythonhosted.org/packages/e3/3c/5f997ce34c0d1b744e0f0c0757113bdfc173a2e3dadda92c751685cfcbd1/tree_sitter-0.24.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:464fa5b2cac63608915a9de8a6efd67a4da1929e603ea86abaeae2cb1fe89921", size = 578203 },
|
||||
{ url = "https://files.pythonhosted.org/packages/d5/1f/f2bc7fa7c3081653ea4f2639e06ff0af4616c47105dbcc0746137da7620d/tree_sitter-0.24.0-cp311-cp311-win_amd64.whl", hash = "sha256:3b1f3cbd9700e1fba0be2e7d801527e37c49fc02dc140714669144ef6ab58dce", size = 120147 },
|
||||
{ url = "https://files.pythonhosted.org/packages/c0/4c/9add771772c4d72a328e656367ca948e389432548696a3819b69cdd6f41e/tree_sitter-0.24.0-cp311-cp311-win_arm64.whl", hash = "sha256:f3f08a2ca9f600b3758792ba2406971665ffbad810847398d180c48cee174ee2", size = 108302 },
|
||||
{ url = "https://files.pythonhosted.org/packages/e9/57/3a590f287b5aa60c07d5545953912be3d252481bf5e178f750db75572bff/tree_sitter-0.24.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:14beeff5f11e223c37be7d5d119819880601a80d0399abe8c738ae2288804afc", size = 140788 },
|
||||
{ url = "https://files.pythonhosted.org/packages/61/0b/fc289e0cba7dbe77c6655a4dd949cd23c663fd62a8b4d8f02f97e28d7fe5/tree_sitter-0.24.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:26a5b130f70d5925d67b47db314da209063664585a2fd36fa69e0717738efaf4", size = 133945 },
|
||||
{ url = "https://files.pythonhosted.org/packages/86/d7/80767238308a137e0b5b5c947aa243e3c1e3e430e6d0d5ae94b9a9ffd1a2/tree_sitter-0.24.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5fc5c3c26d83c9d0ecb4fc4304fba35f034b7761d35286b936c1db1217558b4e", size = 564819 },
|
||||
{ url = "https://files.pythonhosted.org/packages/bf/b3/6c5574f4b937b836601f5fb556b24804b0a6341f2eb42f40c0e6464339f4/tree_sitter-0.24.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:772e1bd8c0931c866b848d0369b32218ac97c24b04790ec4b0e409901945dd8e", size = 579303 },
|
||||
{ url = "https://files.pythonhosted.org/packages/0a/f4/bd0ddf9abe242ea67cca18a64810f8af230fc1ea74b28bb702e838ccd874/tree_sitter-0.24.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:24a8dd03b0d6b8812425f3b84d2f4763322684e38baf74e5bb766128b5633dc7", size = 581054 },
|
||||
{ url = "https://files.pythonhosted.org/packages/8c/1c/ff23fa4931b6ef1bbeac461b904ca7e49eaec7e7e5398584e3eef836ec96/tree_sitter-0.24.0-cp312-cp312-win_amd64.whl", hash = "sha256:f9e8b1605ab60ed43803100f067eed71b0b0e6c1fb9860a262727dbfbbb74751", size = 120221 },
|
||||
{ url = "https://files.pythonhosted.org/packages/b2/2a/9979c626f303177b7612a802237d0533155bf1e425ff6f73cc40f25453e2/tree_sitter-0.24.0-cp312-cp312-win_arm64.whl", hash = "sha256:f733a83d8355fc95561582b66bbea92ffd365c5d7a665bc9ebd25e049c2b2abb", size = 108234 },
|
||||
{ url = "https://files.pythonhosted.org/packages/61/cd/2348339c85803330ce38cee1c6cbbfa78a656b34ff58606ebaf5c9e83bd0/tree_sitter-0.24.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0d4a6416ed421c4210f0ca405a4834d5ccfbb8ad6692d4d74f7773ef68f92071", size = 140781 },
|
||||
{ url = "https://files.pythonhosted.org/packages/8b/a3/1ea9d8b64e8dcfcc0051028a9c84a630301290995cd6e947bf88267ef7b1/tree_sitter-0.24.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:e0992d483677e71d5c5d37f30dfb2e3afec2f932a9c53eec4fca13869b788c6c", size = 133928 },
|
||||
{ url = "https://files.pythonhosted.org/packages/fe/ae/55c1055609c9428a4aedf4b164400ab9adb0b1bf1538b51f4b3748a6c983/tree_sitter-0.24.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:57277a12fbcefb1c8b206186068d456c600dbfbc3fd6c76968ee22614c5cd5ad", size = 564497 },
|
||||
{ url = "https://files.pythonhosted.org/packages/ce/d0/f2ffcd04882c5aa28d205a787353130cbf84b2b8a977fd211bdc3b399ae3/tree_sitter-0.24.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d25fa22766d63f73716c6fec1a31ee5cf904aa429484256bd5fdf5259051ed74", size = 578917 },
|
||||
{ url = "https://files.pythonhosted.org/packages/af/82/aebe78ea23a2b3a79324993d4915f3093ad1af43d7c2208ee90be9273273/tree_sitter-0.24.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:7d5d9537507e1c8c5fa9935b34f320bfec4114d675e028f3ad94f11cf9db37b9", size = 581148 },
|
||||
{ url = "https://files.pythonhosted.org/packages/a1/b4/6b0291a590c2b0417cfdb64ccb8ea242f270a46ed429c641fbc2bfab77e0/tree_sitter-0.24.0-cp313-cp313-win_amd64.whl", hash = "sha256:f58bb4956917715ec4d5a28681829a8dad5c342cafd4aea269f9132a83ca9b34", size = 120207 },
|
||||
{ url = "https://files.pythonhosted.org/packages/a8/18/542fd844b75272630229c9939b03f7db232c71a9d82aadc59c596319ea6a/tree_sitter-0.24.0-cp313-cp313-win_arm64.whl", hash = "sha256:23641bd25dcd4bb0b6fa91b8fb3f46cc9f1c9f475efe4d536d3f1f688d1b84c8", size = 108232 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tree-sitter-python"
|
||||
version = "0.23.6"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/1c/30/6766433b31be476fda6569a3a374c2220e45ffee0bff75460038a57bf23b/tree_sitter_python-0.23.6.tar.gz", hash = "sha256:354bfa0a2f9217431764a631516f85173e9711af2c13dbd796a8815acfe505d9", size = 155868 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/ab/67/577a02acae5f776007c924ca86ef14c19c12e71de0aa9d2a036f3c248e7b/tree_sitter_python-0.23.6-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:28fbec8f74eeb2b30292d97715e60fac9ccf8a8091ce19b9d93e9b580ed280fb", size = 74361 },
|
||||
{ url = "https://files.pythonhosted.org/packages/d2/a6/194b3625a7245c532ad418130d63077ce6cd241152524152f533e4d6edb0/tree_sitter_python-0.23.6-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:680b710051b144fedf61c95197db0094f2245e82551bf7f0c501356333571f7a", size = 76436 },
|
||||
{ url = "https://files.pythonhosted.org/packages/d0/62/1da112689d6d282920e62c40e67ab39ea56463b0e7167bfc5e81818a770e/tree_sitter_python-0.23.6-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8a9dcef55507b6567207e8ee0a6b053d0688019b47ff7f26edc1764b7f4dc0a4", size = 112060 },
|
||||
{ url = "https://files.pythonhosted.org/packages/5d/62/c9358584c96e38318d69b6704653684fd8467601f7b74e88aa44f4e6903f/tree_sitter_python-0.23.6-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:29dacdc0cd2f64e55e61d96c6906533ebb2791972bec988450c46cce60092f5d", size = 112338 },
|
||||
{ url = "https://files.pythonhosted.org/packages/1a/58/c5e61add45e34fb8ecbf057c500bae9d96ed7c9ca36edb7985da8ae45526/tree_sitter_python-0.23.6-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:7e048733c36f564b379831689006801feb267d8194f9e793fbb395ef1723335d", size = 109382 },
|
||||
{ url = "https://files.pythonhosted.org/packages/e9/f3/9b30893cae9b3811fe652dc6f90aaadfda12ae0b2757f5722fc7266f423c/tree_sitter_python-0.23.6-cp39-abi3-win_amd64.whl", hash = "sha256:a24027248399fb41594b696f929f9956828ae7cc85596d9f775e6c239cd0c2be", size = 75904 },
|
||||
{ url = "https://files.pythonhosted.org/packages/87/cb/ce35a65f83a47b510d8a2f1eddf3bdbb0d57aabc87351c8788caf3309f76/tree_sitter_python-0.23.6-cp39-abi3-win_arm64.whl", hash = "sha256:71334371bd73d5fe080aed39fbff49ed8efb9506edebe16795b0c7567ed6a272", size = 73649 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "triton"
|
||||
version = "3.5.0"
|
||||
|
|
|
|||
@@ -1,4 +1,4 @@

from pydantic import BaseModel
from pydantic import BaseModel, field_validator
from typing import Optional, Any, Dict

@@ -18,9 +18,21 @@ class Edge(BaseModel):

    # Mixed usage
    has_items: (Edge(weight=0.5, weights={"confidence": 0.9}), list[Item])

    # With edge_text for rich embedding representation
    contains: (Edge(relationship_type="contains", edge_text="relationship_name: contains; entity_description: Alice"), Entity)
    """

    weight: Optional[float] = None
    weights: Optional[Dict[str, float]] = None
    relationship_type: Optional[str] = None
    properties: Optional[Dict[str, Any]] = None
    edge_text: Optional[str] = None

    @field_validator("edge_text", mode="before")
    @classmethod
    def ensure_edge_text(cls, v, info):
        """Auto-populate edge_text from relationship_type if not explicitly provided."""
        if v is None and info.data.get("relationship_type"):
            return info.data["relationship_type"]
        return v
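A small usage sketch (not part of the diff) of how the validator above behaves; it assumes `Edge` is importable from `cognee.infrastructure.engine.models.Edge`, as referenced elsewhere in this changeset:

```python
from cognee.infrastructure.engine.models.Edge import Edge

# When edge_text is passed as None, the "before" validator falls back to relationship_type.
edge = Edge(relationship_type="contains", edge_text=None)
print(edge.edge_text)  # expected: "contains"

# An explicit edge_text is kept as-is and is what gets embedded downstream.
rich_edge = Edge(
    relationship_type="contains",
    edge_text="relationship_name: contains; entity_name: Alice; entity_description: A person",
)
print(rich_edge.edge_text)
```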
@@ -22,42 +22,6 @@ class FileTypeException(Exception):

        self.message = message


class TxtFileType(filetype.Type):
    """
    Represents a text file type with specific MIME and extension properties.

    Public methods:
    - match: Determines whether a given buffer matches the text file type.
    """

    MIME = "text/plain"
    EXTENSION = "txt"

    def __init__(self):
        super(TxtFileType, self).__init__(mime=TxtFileType.MIME, extension=TxtFileType.EXTENSION)

    def match(self, buf):
        """
        Determine if the given buffer contains text content.

        Parameters:
        -----------
        - buf: The buffer to check for text content.

        Returns:
        --------
        Returns True if the buffer is identified as text content, otherwise False.
        """
        return is_text_content(buf)


txt_file_type = TxtFileType()

filetype.add_type(txt_file_type)


def guess_file_type(file: BinaryIO) -> filetype.Type:
    """
    Guess the file type from the given binary file stream.
@@ -1,6 +1,7 @@

import filetype
from typing import Dict, List, Optional, Any
from .LoaderInterface import LoaderInterface
from cognee.infrastructure.files.utils.guess_file_type import guess_file_type
from cognee.shared.logging_utils import get_logger

logger = get_logger(__name__)

@@ -80,7 +81,7 @@ class LoaderEngine:

        """
        from pathlib import Path

        file_info = filetype.guess(file_path)
        file_info = guess_file_type(file_path)

        path_extension = Path(file_path).suffix.lstrip(".")
@@ -1,6 +1,7 @@

from typing import List, Union

from cognee.infrastructure.engine import DataPoint
from cognee.infrastructure.engine.models.Edge import Edge
from cognee.modules.data.processing.document_types import Document
from cognee.modules.engine.models import Entity
from cognee.tasks.temporal_graph.models import Event

@@ -31,6 +32,6 @@ class DocumentChunk(DataPoint):

    chunk_index: int
    cut_type: str
    is_part_of: Document
    contains: List[Union[Entity, Event]] = None
    contains: List[Union[Entity, Event, tuple[Edge, Entity]]] = None

    metadata: dict = {"index_fields": ["text"]}
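For illustration only: with the widened annotation above, a chunk's `contains` list may now mix plain entities with `(Edge, Entity)` tuples. A hedged sketch of what that looks like (the entity placeholders are hypothetical; in cognee they would be `Entity` instances):

```python
from cognee.infrastructure.engine.models.Edge import Edge

alice = ...   # Entity(name="Alice", ...), hypothetical
report = ...  # Entity(name="Q3 report", ...), hypothetical

# A chunk's `contains` can now hold both forms side by side.
contains = [
    alice,  # plain Entity, as before
    (
        Edge(
            relationship_type="contains",
            edge_text="relationship_name: contains; entity_name: Q3 report; entity_description: A document",
        ),
        report,  # the edge plus its target entity
    ),
]
```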
@@ -171,8 +171,10 @@ class CogneeGraph(CogneeAbstractGraph):

        embedding_map = {result.payload["text"]: result.score for result in edge_distances}

        for edge in self.edges:
            relationship_type = edge.attributes.get("relationship_type")
            distance = embedding_map.get(relationship_type, None)
            edge_key = edge.attributes.get("edge_text") or edge.attributes.get(
                "relationship_type"
            )
            distance = embedding_map.get(edge_key, None)
            if distance is not None:
                edge.attributes["vector_distance"] = distance
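A minimal sketch (not from the diff) of the lookup change above: scores are now keyed by the embedded `edge_text` when it exists, with `relationship_type` as the fallback key. The sample values are made up:

```python
embedding_map = {
    "relationship_name: contains; entity_name: Alice; entity_description: A person": 0.12,
    "mentions": 0.48,
}

edge_attributes = {
    "relationship_type": "contains",
    "edge_text": "relationship_name: contains; entity_name: Alice; entity_description: A person",
}

# Prefer the embedded edge_text; fall back to the bare relationship type.
edge_key = edge_attributes.get("edge_text") or edge_attributes.get("relationship_type")
print(embedding_map.get(edge_key))  # 0.12; an edge without edge_text would be looked up by "contains"
```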
@@ -1,5 +1,6 @@

from typing import Optional

from cognee.infrastructure.engine.models.Edge import Edge
from cognee.modules.chunking.models import DocumentChunk
from cognee.modules.engine.models import Entity, EntityType
from cognee.modules.engine.utils import (

@@ -243,10 +244,26 @@ def _process_graph_nodes(

            ontology_relationships,
        )

        # Add entity to data chunk
        if data_chunk.contains is None:
            data_chunk.contains = []
        data_chunk.contains.append(entity_node)

        edge_text = "; ".join(
            [
                "relationship_name: contains",
                f"entity_name: {entity_node.name}",
                f"entity_description: {entity_node.description}",
            ]
        )

        data_chunk.contains.append(
            (
                Edge(
                    relationship_type="contains",
                    edge_text=edge_text,
                ),
                entity_node,
            )
        )


def _process_graph_edges(
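To make the format concrete, here is an illustrative value of the `edge_text` built in `_process_graph_nodes` above, using a hypothetical entity:

```python
entity_name = "Alice"
entity_description = "A person mentioned in the quarterly report"

# Mirrors the "; ".join(...) construction in the hunk above.
edge_text = "; ".join(
    [
        "relationship_name: contains",
        f"entity_name: {entity_name}",
        f"entity_description: {entity_description}",
    ]
)
# -> "relationship_name: contains; entity_name: Alice; entity_description: A person mentioned in the quarterly report"
```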
@@ -1,71 +1,70 @@

import string
from typing import List
from collections import Counter

from cognee.modules.graph.cognee_graph.CogneeGraphElements import Edge
from cognee.modules.retrieval.utils.stop_words import DEFAULT_STOP_WORDS


def _get_top_n_frequent_words(
    text: str, stop_words: set = None, top_n: int = 3, separator: str = ", "
) -> str:
    """Concatenates the top N frequent words in text."""
    if stop_words is None:
        stop_words = DEFAULT_STOP_WORDS

    words = [word.lower().strip(string.punctuation) for word in text.split()]
    words = [word for word in words if word and word not in stop_words]

    top_words = [word for word, freq in Counter(words).most_common(top_n)]
    return separator.join(top_words)


def _create_title_from_text(text: str, first_n_words: int = 7, top_n_words: int = 3) -> str:
    """Creates a title by combining first words with most frequent words from the text."""
    first_words = text.split()[:first_n_words]
    top_words = _get_top_n_frequent_words(text, top_n=top_n_words)
    return f"{' '.join(first_words)}... [{top_words}]"


def _extract_nodes_from_edges(retrieved_edges: List[Edge]) -> dict:
    """Creates a dictionary of nodes with their names and content."""
    nodes = {}

    for edge in retrieved_edges:
        for node in (edge.node1, edge.node2):
            if node.id in nodes:
                continue

            text = node.attributes.get("text")
            if text:
                name = _create_title_from_text(text)
                content = text
            else:
                name = node.attributes.get("name", "Unnamed Node")
                content = node.attributes.get("description", name)

            nodes[node.id] = {"node": node, "name": name, "content": content}

    return nodes


async def resolve_edges_to_text(retrieved_edges: List[Edge]) -> str:
    """
    Converts retrieved graph edges into a human-readable string format.
    """Converts retrieved graph edges into a human-readable string format."""
    nodes = _extract_nodes_from_edges(retrieved_edges)

    Parameters:
    -----------

    - retrieved_edges (list): A list of edges retrieved from the graph.

    Returns:
    --------

    - str: A formatted string representation of the nodes and their connections.
    """

    def _get_nodes(retrieved_edges: List[Edge]) -> dict:
        def _get_title(text: str, first_n_words: int = 7, top_n_words: int = 3) -> str:
            def _top_n_words(text, stop_words=None, top_n=3, separator=", "):
                """Concatenates the top N frequent words in text."""
                if stop_words is None:
                    from cognee.modules.retrieval.utils.stop_words import DEFAULT_STOP_WORDS

                    stop_words = DEFAULT_STOP_WORDS

                import string

                words = [word.lower().strip(string.punctuation) for word in text.split()]

                if stop_words:
                    words = [word for word in words if word and word not in stop_words]

                from collections import Counter

                top_words = [word for word, freq in Counter(words).most_common(top_n)]

                return separator.join(top_words)

            """Creates a title, by combining first words with most frequent words from the text."""
            first_words = text.split()[:first_n_words]
            top_words = _top_n_words(text, top_n=first_n_words)
            return f"{' '.join(first_words)}... [{top_words}]"

        """Creates a dictionary of nodes with their names and content."""
        nodes = {}
        for edge in retrieved_edges:
            for node in (edge.node1, edge.node2):
                if node.id not in nodes:
                    text = node.attributes.get("text")
                    if text:
                        name = _get_title(text)
                        content = text
                    else:
                        name = node.attributes.get("name", "Unnamed Node")
                        content = node.attributes.get("description", name)
                    nodes[node.id] = {"node": node, "name": name, "content": content}
        return nodes

    nodes = _get_nodes(retrieved_edges)
    node_section = "\n".join(
        f"Node: {info['name']}\n__node_content_start__\n{info['content']}\n__node_content_end__\n"
        for info in nodes.values()
    )
    connection_section = "\n".join(
        f"{nodes[edge.node1.id]['name']} --[{edge.attributes['relationship_type']}]--> {nodes[edge.node2.id]['name']}"
        for edge in retrieved_edges
    )

    connections = []
    for edge in retrieved_edges:
        source_name = nodes[edge.node1.id]["name"]
        target_name = nodes[edge.node2.id]["name"]
        edge_label = edge.attributes.get("edge_text") or edge.attributes.get("relationship_type")
        connections.append(f"{source_name} --[{edge_label}]--> {target_name}")

    connection_section = "\n".join(connections)

    return f"Nodes:\n{node_section}\n\nConnections:\n{connection_section}"
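For orientation (not part of the diff), the string returned by `resolve_edges_to_text` looks roughly like this for a single retrieved edge; the node names and content below are made up, and the connection label uses `edge_text` because it is present:

```python
example_output = """Nodes:
Node: Alice joined the research team in 2021... [alice, research, team]
__node_content_start__
Alice joined the research team in 2021 and now leads the retrieval work.
__node_content_end__

Node: research team
__node_content_start__
A small group working on graph retrieval.
__node_content_end__

Connections:
Alice joined the research team in 2021... [alice, research, team] --[relationship_name: contains; entity_name: research team; entity_description: A small group working on graph retrieval.]--> research team"""
```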
@@ -71,7 +71,7 @@ async def get_memory_fragment(

    await memory_fragment.project_graph_from_db(
        graph_engine,
        node_properties_to_project=properties_to_project,
        edge_properties_to_project=["relationship_name"],
        edge_properties_to_project=["relationship_name", "edge_text"],
        node_type=node_type,
        node_name=node_name,
    )
@@ -8,47 +8,58 @@ logger = get_logger("index_data_points")


async def index_data_points(data_points: list[DataPoint]):
    created_indexes = {}
    index_points = {}
    """Index data points in the vector engine by creating embeddings for specified fields.

    Process:
    1. Groups data points into a nested dict: {type_name: {field_name: [points]}}
    2. Creates vector indexes for each (type, field) combination on first encounter
    3. Batches points per (type, field) and creates async indexing tasks
    4. Executes all indexing tasks in parallel for efficient embedding generation

    Args:
        data_points: List of DataPoint objects to index. Each DataPoint's metadata must
            contain an 'index_fields' list specifying which fields to embed.

    Returns:
        The original data_points list.
    """
    data_points_by_type = {}

    vector_engine = get_vector_engine()

    for data_point in data_points:
        data_point_type = type(data_point)
        type_name = data_point_type.__name__

        for field_name in data_point.metadata["index_fields"]:
            if getattr(data_point, field_name, None) is None:
                continue

            index_name = f"{data_point_type.__name__}_{field_name}"
            if type_name not in data_points_by_type:
                data_points_by_type[type_name] = {}

            if index_name not in created_indexes:
                await vector_engine.create_vector_index(data_point_type.__name__, field_name)
                created_indexes[index_name] = True

            if index_name not in index_points:
                index_points[index_name] = []
            if field_name not in data_points_by_type[type_name]:
                await vector_engine.create_vector_index(type_name, field_name)
                data_points_by_type[type_name][field_name] = []

            indexed_data_point = data_point.model_copy()
            indexed_data_point.metadata["index_fields"] = [field_name]
            index_points[index_name].append(indexed_data_point)
            data_points_by_type[type_name][field_name].append(indexed_data_point)

    tasks: list[asyncio.Task] = []
    batch_size = vector_engine.embedding_engine.get_batch_size()

    for index_name_and_field, points in index_points.items():
        first = index_name_and_field.index("_")
        index_name = index_name_and_field[:first]
        field_name = index_name_and_field[first + 1 :]
    batches = (
        (type_name, field_name, points[i : i + batch_size])
        for type_name, fields in data_points_by_type.items()
        for field_name, points in fields.items()
        for i in range(0, len(points), batch_size)
    )

        # Create embedding requests per batch to run in parallel later
        for i in range(0, len(points), batch_size):
            batch = points[i : i + batch_size]
            tasks.append(
                asyncio.create_task(vector_engine.index_data_points(index_name, field_name, batch))
            )
    tasks = [
        asyncio.create_task(vector_engine.index_data_points(type_name, field_name, batch_points))
        for type_name, field_name, batch_points in batches
    ]

    # Run all embedding requests in parallel
    await asyncio.gather(*tasks)

    return data_points
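An illustrative sketch (not from the diff) of the nested grouping the refactored `index_data_points` builds before batching; the type and field names here are hypothetical placeholders:

```python
# Shape of data_points_by_type after the grouping loop, assuming two DataPoint
# subclasses whose metadata lists "text" and "name" in index_fields:
data_points_by_type = {
    "DocumentChunk": {
        "text": ["<DocumentChunk copy 1>", "<DocumentChunk copy 2>"],
    },
    "EntityType": {
        "name": ["<EntityType copy 1>"],
    },
}

# One vector index is created per (type, field) pair, e.g. ("DocumentChunk", "text"),
# and each list is then split into batches of embedding_engine.get_batch_size() items
# before the parallel indexing tasks are gathered.
```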
@@ -1,17 +1,44 @@

import asyncio
from collections import Counter
from typing import Optional, Dict, Any, List, Tuple, Union

from cognee.modules.engine.utils.generate_edge_id import generate_edge_id
from cognee.shared.logging_utils import get_logger
from collections import Counter
from typing import Optional, Dict, Any, List, Tuple, Union
from cognee.infrastructure.databases.vector import get_vector_engine
from cognee.infrastructure.databases.graph import get_graph_engine
from cognee.modules.graph.models.EdgeType import EdgeType
from cognee.infrastructure.databases.graph.graph_db_interface import EdgeData
from cognee.tasks.storage.index_data_points import index_data_points

logger = get_logger()


def _get_edge_text(item: dict) -> str:
    """Extract edge text for embedding - prefers edge_text field with fallback."""
    if "edge_text" in item:
        return item["edge_text"]

    if "relationship_name" in item:
        return item["relationship_name"]

    return ""


def create_edge_type_datapoints(edges_data) -> list[EdgeType]:
    """Transform raw edge data into EdgeType datapoints."""
    edge_texts = [
        _get_edge_text(item)
        for edge in edges_data
        for item in edge
        if isinstance(item, dict) and "relationship_name" in item
    ]

    edge_types = Counter(edge_texts)

    return [
        EdgeType(id=generate_edge_id(edge_id=text), relationship_name=text, number_of_edges=count)
        for text, count in edge_types.items()
    ]


async def index_graph_edges(
    edges_data: Union[List[EdgeData], List[Tuple[str, str, str, Optional[Dict[str, Any]]]]] = None,
):

@@ -23,24 +50,17 @@ async def index_graph_edges(

    the `relationship_name` field.

    Steps:
    1. Initialize the vector engine and graph engine.
    2. Retrieve graph edge data and count relationship types (`relationship_name`).
    3. Create vector indexes for `relationship_name` if they don't exist.
    4. Transform the counted relationships into `EdgeType` objects.
    5. Index the transformed data points in the vector engine.
    1. Initialize the graph engine if needed and retrieve edge data.
    2. Transform edge data into EdgeType datapoints.
    3. Index the EdgeType datapoints using the standard indexing function.

    Raises:
        RuntimeError: If initialization of the vector engine or graph engine fails.
        RuntimeError: If initialization of the graph engine fails.

    Returns:
        None
    """
    try:
        created_indexes = {}
        index_points = {}

        vector_engine = get_vector_engine()

        if edges_data is None:
            graph_engine = await get_graph_engine()
            _, edges_data = await graph_engine.get_graph_data()

@@ -51,47 +71,7 @@

        logger.error("Failed to initialize engines: %s", e)
        raise RuntimeError("Initialization error") from e

    edge_types = Counter(
        item.get("relationship_name")
        for edge in edges_data
        for item in edge
        if isinstance(item, dict) and "relationship_name" in item
    )

    for text, count in edge_types.items():
        edge = EdgeType(
            id=generate_edge_id(edge_id=text), relationship_name=text, number_of_edges=count
        )
        data_point_type = type(edge)

        for field_name in edge.metadata["index_fields"]:
            index_name = f"{data_point_type.__name__}.{field_name}"

            if index_name not in created_indexes:
                await vector_engine.create_vector_index(data_point_type.__name__, field_name)
                created_indexes[index_name] = True

            if index_name not in index_points:
                index_points[index_name] = []

            indexed_data_point = edge.model_copy()
            indexed_data_point.metadata["index_fields"] = [field_name]
            index_points[index_name].append(indexed_data_point)

    # Get maximum batch size for embedding model
    batch_size = vector_engine.embedding_engine.get_batch_size()
    tasks: list[asyncio.Task] = []

    for index_name, indexable_points in index_points.items():
        index_name, field_name = index_name.split(".")

        # Create embedding tasks to run in parallel later
        for start in range(0, len(indexable_points), batch_size):
            batch = indexable_points[start : start + batch_size]

            tasks.append(vector_engine.index_data_points(index_name, field_name, batch))

    # Start all embedding tasks and wait for completion
    await asyncio.gather(*tasks)
    edge_type_datapoints = create_edge_type_datapoints(edges_data)
    await index_data_points(edge_type_datapoints)

    return None
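A small illustration (not from the diff) of what `create_edge_type_datapoints` produces for a handful of raw edges. The tuples follow the `(source, target, relationship, properties)` shape permitted by the function signature above; the node ids and counts are made up:

```python
edges_data = [
    ("node_a", "node_b", "contains",
     {"relationship_name": "contains",
      "edge_text": "relationship_name: contains; entity_name: Alice; entity_description: A person"}),
    ("node_a", "node_c", "contains", {"relationship_name": "contains"}),
    ("node_b", "node_c", "mentions", {"relationship_name": "mentions"}),
]

# _get_edge_text picks edge_text when present, otherwise relationship_name, so the
# Counter groups these into three distinct texts, each becoming one EdgeType datapoint:
#   "relationship_name: contains; entity_name: Alice; ..." -> number_of_edges=1
#   "contains"                                             -> number_of_edges=1
#   "mentions"                                             -> number_of_edges=1
```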
@@ -52,6 +52,33 @@ async def test_edge_ingestion():

    edge_type_counts = Counter(edge_type[2] for edge_type in graph[1])

    "Tests edge_text presence and format"
    contains_edges = [edge for edge in graph[1] if edge[2] == "contains"]
    assert len(contains_edges) > 0, "Expected at least one contains edge for edge_text verification"

    edge_properties = contains_edges[0][3]
    assert "edge_text" in edge_properties, "Expected edge_text in edge properties"

    edge_text = edge_properties["edge_text"]
    assert "relationship_name: contains" in edge_text, (
        f"Expected 'relationship_name: contains' in edge_text, got: {edge_text}"
    )
    assert "entity_name:" in edge_text, f"Expected 'entity_name:' in edge_text, got: {edge_text}"
    assert "entity_description:" in edge_text, (
        f"Expected 'entity_description:' in edge_text, got: {edge_text}"
    )

    all_edge_texts = [
        edge[3].get("edge_text", "") for edge in contains_edges if "edge_text" in edge[3]
    ]
    expected_entities = ["dave", "ana", "bob", "dexter", "apples", "cognee"]
    found_entity = any(
        any(entity in text.lower() for entity in expected_entities) for text in all_edge_texts
    )
    assert found_entity, (
        f"Expected to find at least one entity name in edge_text: {all_edge_texts[:3]}"
    )

    "Tests the presence of basic nested edges"
    for basic_nested_edge in basic_nested_edges:
        assert edge_type_counts.get(basic_nested_edge, 0) >= 1, (
@@ -0,0 +1,27 @@

import pytest
from unittest.mock import AsyncMock, patch, MagicMock
from cognee.tasks.storage.index_data_points import index_data_points
from cognee.infrastructure.engine import DataPoint


class TestDataPoint(DataPoint):
    name: str
    metadata: dict = {"index_fields": ["name"]}


@pytest.mark.asyncio
async def test_index_data_points_calls_vector_engine():
    """Test that index_data_points creates vector index and indexes data."""
    data_points = [TestDataPoint(name="test1")]

    mock_vector_engine = AsyncMock()
    mock_vector_engine.embedding_engine.get_batch_size = MagicMock(return_value=100)

    with patch.dict(
        index_data_points.__globals__,
        {"get_vector_engine": lambda: mock_vector_engine},
    ):
        await index_data_points(data_points)

    assert mock_vector_engine.create_vector_index.await_count >= 1
    assert mock_vector_engine.index_data_points.await_count >= 1
@@ -5,8 +5,7 @@ from cognee.tasks.storage.index_graph_edges import index_graph_edges

@pytest.mark.asyncio
async def test_index_graph_edges_success():
    """Test that index_graph_edges uses the index datapoints and creates vector index."""
    # Create the mocks for the graph and vector engines.
    """Test that index_graph_edges retrieves edges and delegates to index_data_points."""
    mock_graph_engine = AsyncMock()
    mock_graph_engine.get_graph_data.return_value = (
        None,

@@ -15,26 +14,23 @@ async def test_index_graph_edges_success():

            [{"relationship_name": "rel2"}],
        ],
    )
    mock_vector_engine = AsyncMock()
    mock_vector_engine.embedding_engine.get_batch_size = MagicMock(return_value=100)
    mock_index_data_points = AsyncMock()

    # Patch the globals of the function so that when it does:
    # vector_engine = get_vector_engine()
    # graph_engine = await get_graph_engine()
    # it uses the mocked versions.
    with patch.dict(
        index_graph_edges.__globals__,
        {
            "get_graph_engine": AsyncMock(return_value=mock_graph_engine),
            "get_vector_engine": lambda: mock_vector_engine,
            "index_data_points": mock_index_data_points,
        },
    ):
        await index_graph_edges()

    # Assertions on the mock calls.
    mock_graph_engine.get_graph_data.assert_awaited_once()
    assert mock_vector_engine.create_vector_index.await_count == 1
    assert mock_vector_engine.index_data_points.await_count == 1
    mock_index_data_points.assert_awaited_once()

    call_args = mock_index_data_points.call_args[0][0]
    assert len(call_args) == 2
    assert all(hasattr(item, "relationship_name") for item in call_args)


@pytest.mark.asyncio

@@ -42,20 +38,22 @@ async def test_index_graph_edges_no_relationships():

    """Test that index_graph_edges handles empty relationships correctly."""
    mock_graph_engine = AsyncMock()
    mock_graph_engine.get_graph_data.return_value = (None, [])
    mock_vector_engine = AsyncMock()
    mock_index_data_points = AsyncMock()

    with patch.dict(
        index_graph_edges.__globals__,
        {
            "get_graph_engine": AsyncMock(return_value=mock_graph_engine),
            "get_vector_engine": lambda: mock_vector_engine,
            "index_data_points": mock_index_data_points,
        },
    ):
        await index_graph_edges()

    mock_graph_engine.get_graph_data.assert_awaited_once()
    mock_vector_engine.create_vector_index.assert_not_awaited()
    mock_vector_engine.index_data_points.assert_not_awaited()
    mock_index_data_points.assert_awaited_once()

    call_args = mock_index_data_points.call_args[0][0]
    assert len(call_args) == 0


@pytest.mark.asyncio
@@ -1,7 +1,7 @@

[project]
name = "cognee"

version = "0.3.7.dev1"
version = "0.3.8"
description = "Cognee - is a library for enriching LLM context with a semantic layer for better understanding and reasoning."
authors = [
    { name = "Vasilije Markovic" },
2 uv.lock generated

@@ -929,7 +929,7 @@ wheels = [

[[package]]
name = "cognee"
version = "0.3.7.dev1"
version = "0.3.8"
source = { editable = "." }
dependencies = [
    { name = "aiofiles" },