diff --git a/mcp_server/.env.example b/mcp_server/.env.example
index 1e70ee56..dd4677b2 100644
--- a/mcp_server/.env.example
+++ b/mcp_server/.env.example
@@ -17,6 +17,20 @@ MODEL_NAME=gpt-4.1-mini
 # Optional: Group ID for namespacing graph data
 # GROUP_ID=my_project
 
+# Concurrency Control
+# Controls how many episodes can be processed simultaneously
+# Default: 10 (suitable for OpenAI Tier 3, mid-tier Anthropic)
+# Adjust based on your LLM provider's rate limits:
+# - OpenAI Tier 1 (free): 1-2
+# - OpenAI Tier 2: 5-8
+# - OpenAI Tier 3: 10-15
+# - OpenAI Tier 4: 20-50
+# - Anthropic default: 5-8
+# - Anthropic high tier: 15-30
+# - Ollama (local): 1-5
+# See README.md "Concurrency and LLM Provider 429 Rate Limit Errors" for details
+SEMAPHORE_LIMIT=10
+
 # Optional: Path configuration for Docker
 # PATH=/root/.local/bin:${PATH}
 
diff --git a/mcp_server/README.md b/mcp_server/README.md
index 59288edf..b1bca95b 100644
--- a/mcp_server/README.md
+++ b/mcp_server/README.md
@@ -328,10 +328,46 @@ uv run graphiti_mcp_server.py --config config/config-docker-falkordb.yaml
 
 ### Concurrency and LLM Provider 429 Rate Limit Errors
 
-Graphiti's ingestion pipelines are designed for high concurrency, controlled by the `SEMAPHORE_LIMIT` environment variable.
-By default, `SEMAPHORE_LIMIT` is set to `10` concurrent operations to help prevent `429` rate limit errors from your LLM provider. If you encounter such errors, try lowering this value.
+Graphiti's ingestion pipelines are designed for high concurrency, controlled by the `SEMAPHORE_LIMIT` environment variable. This setting determines how many episodes can be processed simultaneously. Since each episode involves multiple LLM calls (entity extraction, deduplication, summarization), the actual number of concurrent LLM requests will be several times higher.
 
-If your LLM provider allows higher throughput, you can increase `SEMAPHORE_LIMIT` to boost episode ingestion performance.
+**Default:** `SEMAPHORE_LIMIT=10` (suitable for OpenAI Tier 3, mid-tier Anthropic)
+
+#### Tuning Guidelines by LLM Provider
+
+**OpenAI:**
+- Tier 1 (free): 3 RPM → `SEMAPHORE_LIMIT=1-2`
+- Tier 2: 60 RPM → `SEMAPHORE_LIMIT=5-8`
+- Tier 3: 500 RPM → `SEMAPHORE_LIMIT=10-15`
+- Tier 4: 5,000 RPM → `SEMAPHORE_LIMIT=20-50`
+
+**Anthropic:**
+- Default tier: 50 RPM → `SEMAPHORE_LIMIT=5-8`
+- High tier: 1,000 RPM → `SEMAPHORE_LIMIT=15-30`
+
+**Azure OpenAI:**
+- Consult your quota in Azure Portal and adjust accordingly
+- Start conservative and increase gradually
+
+**Ollama (local):**
+- Hardware dependent → `SEMAPHORE_LIMIT=1-5`
+- Monitor CPU/GPU usage and adjust
+
+#### Symptoms
+
+- **Too high**: 429 rate limit errors, increased API costs from parallel processing
+- **Too low**: Slow episode throughput, underutilized API quota
+
+#### Monitoring
+
+- Watch logs for `429` rate limit errors
+- Monitor episode processing times in server logs
+- Check your LLM provider's dashboard for actual request rates
+- Track token usage and costs
+
+Set this in your `.env` file:
+```bash
+SEMAPHORE_LIMIT=10 # Adjust based on your LLM provider tier
+```
 
 ### Docker Deployment
 
diff --git a/mcp_server/config/config.yaml b/mcp_server/config/config.yaml
index 53d14cbf..91f72377 100644
--- a/mcp_server/config/config.yaml
+++ b/mcp_server/config/config.yaml
@@ -1,5 +1,9 @@
 # Graphiti MCP Server Configuration
 # This file supports environment variable expansion using ${VAR_NAME} or ${VAR_NAME:default_value}
+#
+# IMPORTANT: Set SEMAPHORE_LIMIT environment variable to control episode processing concurrency
+# Default: 10 (suitable for OpenAI Tier 3, mid-tier Anthropic)
+# See README.md "Concurrency and LLM Provider 429 Rate Limit Errors" section for tuning guidance
 
 server:
   transport: "http" # Options: stdio, sse (deprecated), http
diff --git a/mcp_server/src/graphiti_mcp_server.py b/mcp_server/src/graphiti_mcp_server.py
index 6f4306bb..d5b6826c 100644
--- a/mcp_server/src/graphiti_mcp_server.py
+++ b/mcp_server/src/graphiti_mcp_server.py
@@ -45,8 +45,32 @@ else:
 
 
 # Semaphore limit for concurrent Graphiti operations.
-# Decrease this if you're experiencing 429 rate limit errors from your LLM provider.
-# Increase if you have high rate limits.
+#
+# This controls how many episodes can be processed simultaneously. Each episode
+# processing involves multiple LLM calls (entity extraction, deduplication, etc.),
+# so the actual number of concurrent LLM requests will be higher.
+#
+# TUNING GUIDELINES:
+#
+# LLM Provider Rate Limits (requests per minute):
+# - OpenAI Tier 1 (free): 3 RPM -> SEMAPHORE_LIMIT=1-2
+# - OpenAI Tier 2: 60 RPM -> SEMAPHORE_LIMIT=5-8
+# - OpenAI Tier 3: 500 RPM -> SEMAPHORE_LIMIT=10-15
+# - OpenAI Tier 4: 5,000 RPM -> SEMAPHORE_LIMIT=20-50
+# - Anthropic (default): 50 RPM -> SEMAPHORE_LIMIT=5-8
+# - Anthropic (high tier): 1,000 RPM -> SEMAPHORE_LIMIT=15-30
+# - Azure OpenAI (varies): Consult your quota -> adjust accordingly
+#
+# SYMPTOMS:
+# - Too high: 429 rate limit errors, increased costs from parallel processing
+# - Too low: Slow throughput, underutilized API quota
+#
+# MONITORING:
+# - Watch logs for rate limit errors (429)
+# - Monitor episode processing times
+# - Check LLM provider dashboard for actual request rates
+#
+# DEFAULT: 10 (suitable for OpenAI Tier 3, mid-tier Anthropic)
 SEMAPHORE_LIMIT = int(os.getenv('SEMAPHORE_LIMIT', 10))
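
For reference, a minimal sketch of the bounded-concurrency pattern these comments describe: an `asyncio.Semaphore` sized by `SEMAPHORE_LIMIT` caps how many episodes are in flight at once. The `process_episode` and `ingest_episodes` helpers below are hypothetical stand-ins for illustration, not the server's actual API, and the real wiring in `graphiti_mcp_server.py` may differ.

```python
import asyncio
import os

# Mirrors the diff's default: read SEMAPHORE_LIMIT from the environment, falling back to 10.
SEMAPHORE_LIMIT = int(os.getenv('SEMAPHORE_LIMIT', 10))


async def process_episode(episode_id: int, semaphore: asyncio.Semaphore) -> None:
    """Hypothetical stand-in for one episode's ingestion (several LLM calls in the real pipeline)."""
    async with semaphore:
        # At most SEMAPHORE_LIMIT of these blocks run concurrently; the rest wait
        # here until a slot frees up, which keeps the request burst seen by the
        # LLM provider small enough to avoid 429s.
        await asyncio.sleep(0.1)  # placeholder for entity extraction, dedup, etc.


async def ingest_episodes(count: int) -> None:
    semaphore = asyncio.Semaphore(SEMAPHORE_LIMIT)
    # Tasks are created eagerly, but the shared semaphore limits how many
    # episodes are actually processed at the same time.
    await asyncio.gather(*(process_episode(i, semaphore) for i in range(count)))


if __name__ == '__main__':
    asyncio.run(ingest_episodes(25))
```

Because each episode issues several LLM calls while it holds a slot, the effective concurrent request count is a small multiple of `SEMAPHORE_LIMIT`, which is why the tables above stay well below each tier's raw RPM limit.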