From d0f494039a7438136480a3f319bd7fa200bd5da1 Mon Sep 17 00:00:00 2001
From: Daniel Chalef <131175+danielchalef@users.noreply.github.com>
Date: Thu, 30 Oct 2025 16:35:29 -0700
Subject: [PATCH] Add comprehensive SEMAPHORE_LIMIT documentation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Added detailed documentation for SEMAPHORE_LIMIT configuration to help users optimize episode processing concurrency based on their LLM provider's rate limits.

Changes:

1. **graphiti_mcp_server.py**
   - Expanded inline comments from 3 lines to 26 lines
   - Added provider-specific tuning guidelines (OpenAI, Anthropic, Azure, Ollama)
   - Documented symptoms of too-high/too-low settings
   - Added monitoring recommendations

2. **README.md**
   - Expanded "Concurrency and LLM Provider 429 Rate Limit Errors" section
   - Added tier-specific recommendations for each provider
   - Explained relationship between episode concurrency and LLM request rates
   - Added troubleshooting symptoms and monitoring guidance
   - Included example .env configuration

3. **config.yaml**
   - Added header comment referencing detailed documentation
   - Noted default value and suitable use case

4. **.env.example**
   - Added SEMAPHORE_LIMIT with inline tuning guidelines
   - Quick reference for all major LLM provider tiers
   - Cross-reference to README for full details

Benefits:
- Users can now make informed decisions about concurrency settings
- Reduces likelihood of 429 rate limit errors from misconfiguration
- Helps users maximize throughput within their rate limits
- Provides clear troubleshooting guidance

Addresses PR #1024 review comment about magic number documentation.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude
---
 mcp_server/.env.example               | 14 +++++++++
 mcp_server/README.md                  | 42 +++++++++++++++++++++++++--
 mcp_server/config/config.yaml         |  4 +++
 mcp_server/src/graphiti_mcp_server.py | 28 ++++++++++++++++--
 4 files changed, 83 insertions(+), 5 deletions(-)

diff --git a/mcp_server/.env.example b/mcp_server/.env.example
index 1e70ee56..dd4677b2 100644
--- a/mcp_server/.env.example
+++ b/mcp_server/.env.example
@@ -17,6 +17,20 @@ MODEL_NAME=gpt-4.1-mini
 # Optional: Group ID for namespacing graph data
 # GROUP_ID=my_project
 
+# Concurrency Control
+# Controls how many episodes can be processed simultaneously
+# Default: 10 (suitable for OpenAI Tier 3, mid-tier Anthropic)
+# Adjust based on your LLM provider's rate limits:
+# - OpenAI Tier 1 (free): 1-2
+# - OpenAI Tier 2: 5-8
+# - OpenAI Tier 3: 10-15
+# - OpenAI Tier 4: 20-50
+# - Anthropic default: 5-8
+# - Anthropic high tier: 15-30
+# - Ollama (local): 1-5
+# See README.md "Concurrency and LLM Provider 429 Rate Limit Errors" for details
+SEMAPHORE_LIMIT=10
+
 # Optional: Path configuration for Docker
 # PATH=/root/.local/bin:${PATH}
 
diff --git a/mcp_server/README.md b/mcp_server/README.md
index 59288edf..b1bca95b 100644
--- a/mcp_server/README.md
+++ b/mcp_server/README.md
@@ -328,10 +328,46 @@ uv run graphiti_mcp_server.py --config config/config-docker-falkordb.yaml
 
 ### Concurrency and LLM Provider 429 Rate Limit Errors
 
-Graphiti's ingestion pipelines are designed for high concurrency, controlled by the `SEMAPHORE_LIMIT` environment variable.
-By default, `SEMAPHORE_LIMIT` is set to `10` concurrent operations to help prevent `429` rate limit errors from your LLM provider. If you encounter such errors, try lowering this value.
+Graphiti's ingestion pipelines are designed for high concurrency, controlled by the `SEMAPHORE_LIMIT` environment variable. This setting determines how many episodes can be processed simultaneously. Since each episode involves multiple LLM calls (entity extraction, deduplication, summarization), the actual number of concurrent LLM requests will be several times higher.
 
-If your LLM provider allows higher throughput, you can increase `SEMAPHORE_LIMIT` to boost episode ingestion performance.
+**Default:** `SEMAPHORE_LIMIT=10` (suitable for OpenAI Tier 3, mid-tier Anthropic)
+
+#### Tuning Guidelines by LLM Provider
+
+**OpenAI:**
+- Tier 1 (free): 3 RPM → `SEMAPHORE_LIMIT=1-2`
+- Tier 2: 60 RPM → `SEMAPHORE_LIMIT=5-8`
+- Tier 3: 500 RPM → `SEMAPHORE_LIMIT=10-15`
+- Tier 4: 5,000 RPM → `SEMAPHORE_LIMIT=20-50`
+
+**Anthropic:**
+- Default tier: 50 RPM → `SEMAPHORE_LIMIT=5-8`
+- High tier: 1,000 RPM → `SEMAPHORE_LIMIT=15-30`
+
+**Azure OpenAI:**
+- Consult your quota in the Azure Portal and adjust accordingly
+- Start conservative and increase gradually
+
+**Ollama (local):**
+- Hardware dependent → `SEMAPHORE_LIMIT=1-5`
+- Monitor CPU/GPU usage and adjust
+
+#### Symptoms
+
+- **Too high**: 429 rate limit errors, increased API costs from parallel processing
+- **Too low**: Slow episode throughput, underutilized API quota
+
+#### Monitoring
+
+- Watch logs for `429` rate limit errors
+- Monitor episode processing times in server logs
+- Check your LLM provider's dashboard for actual request rates
+- Track token usage and costs
+
+Set this in your `.env` file:
+```bash
+SEMAPHORE_LIMIT=10 # Adjust based on your LLM provider tier
+```
 
 ### Docker Deployment
 
diff --git a/mcp_server/config/config.yaml b/mcp_server/config/config.yaml
index 53d14cbf..91f72377 100644
--- a/mcp_server/config/config.yaml
+++ b/mcp_server/config/config.yaml
@@ -1,5 +1,9 @@
 # Graphiti MCP Server Configuration
 # This file supports environment variable expansion using ${VAR_NAME} or ${VAR_NAME:default_value}
+#
+# IMPORTANT: Set SEMAPHORE_LIMIT environment variable to control episode processing concurrency
+# Default: 10 (suitable for OpenAI Tier 3, mid-tier Anthropic)
+# See README.md "Concurrency and LLM Provider 429 Rate Limit Errors" section for tuning guidance
 
 server:
   transport: "http"  # Options: stdio, sse (deprecated), http

diff --git a/mcp_server/src/graphiti_mcp_server.py b/mcp_server/src/graphiti_mcp_server.py
index 6f4306bb..d5b6826c 100644
--- a/mcp_server/src/graphiti_mcp_server.py
+++ b/mcp_server/src/graphiti_mcp_server.py
@@ -45,8 +45,32 @@ else:
 
 # Semaphore limit for concurrent Graphiti operations.
-# Decrease this if you're experiencing 429 rate limit errors from your LLM provider.
-# Increase if you have high rate limits.
+#
+# This controls how many episodes can be processed simultaneously. Each episode
+# processing involves multiple LLM calls (entity extraction, deduplication, etc.),
+# so the actual number of concurrent LLM requests will be higher.
+#
+# TUNING GUIDELINES:
+#
+# LLM Provider Rate Limits (requests per minute):
+# - OpenAI Tier 1 (free): 3 RPM -> SEMAPHORE_LIMIT=1-2
+# - OpenAI Tier 2: 60 RPM -> SEMAPHORE_LIMIT=5-8
+# - OpenAI Tier 3: 500 RPM -> SEMAPHORE_LIMIT=10-15
+# - OpenAI Tier 4: 5,000 RPM -> SEMAPHORE_LIMIT=20-50
+# - Anthropic (default): 50 RPM -> SEMAPHORE_LIMIT=5-8
+# - Anthropic (high tier): 1,000 RPM -> SEMAPHORE_LIMIT=15-30
+# - Azure OpenAI (varies): Consult your quota -> adjust accordingly
+#
+# SYMPTOMS:
+# - Too high: 429 rate limit errors, increased costs from parallel processing
+# - Too low: Slow throughput, underutilized API quota
+#
+# MONITORING:
+# - Watch logs for rate limit errors (429)
+# - Monitor episode processing times
+# - Check LLM provider dashboard for actual request rates
+#
+# DEFAULT: 10 (suitable for OpenAI Tier 3, mid-tier Anthropic)
 SEMAPHORE_LIMIT = int(os.getenv('SEMAPHORE_LIMIT', 10))
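
Note for reviewers: the sketch below illustrates the pattern the new comments describe, an `asyncio.Semaphore` sized by `SEMAPHORE_LIMIT` that bounds how many episodes are in flight at once. It is a minimal, self-contained illustration rather than the actual graphiti_mcp_server.py code; `process_episode` and `ingest` are hypothetical stand-ins for the real ingestion path.

```python
# Minimal sketch, not the actual graphiti_mcp_server.py implementation.
# Shows how a SEMAPHORE_LIMIT-style setting typically bounds concurrent
# episode processing with asyncio. process_episode() and ingest() are
# hypothetical stand-ins for the real ingestion path.
import asyncio
import os

SEMAPHORE_LIMIT = int(os.getenv('SEMAPHORE_LIMIT', 10))


async def process_episode(episode: str) -> str:
    """Hypothetical placeholder for one episode's LLM-backed work."""
    await asyncio.sleep(0.1)  # stands in for entity extraction, dedup, etc.
    return f'processed: {episode}'


async def ingest(episodes: list[str]) -> list[str]:
    # At most SEMAPHORE_LIMIT episodes run at once; each episode may still
    # fan out into several LLM requests, so the provider sees a higher
    # request rate than this number alone suggests.
    semaphore = asyncio.Semaphore(SEMAPHORE_LIMIT)

    async def bounded(episode: str) -> str:
        async with semaphore:
            return await process_episode(episode)

    return await asyncio.gather(*(bounded(e) for e in episodes))


if __name__ == '__main__':
    results = asyncio.run(ingest([f'episode-{i}' for i in range(25)]))
    print(len(results), 'episodes processed')
```

With `SEMAPHORE_LIMIT=10`, only ten of the twenty-five episodes above are in flight at any moment; the rest queue on the semaphore, which is why lowering the value is the first remedy for provider 429 errors.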