Improve Langfuse integration and stream response cleanup handling

• Check env vars before enabling Langfuse
• Move imports after env check logic
• Handle wrapper client aclose() issues
• Add debug logs for cleanup failures

(cherry picked from commit 10f6e6955f)
This commit is contained in:
yangdx 2025-11-03 13:09:45 +08:00 committed by Raphaël MANSUY
parent ccdd3c2786
commit cc33728c10

View file

@@ -1,29 +1,15 @@
from ..utils import verbose_debug, VERBOSE_DEBUG from ..utils import verbose_debug, VERBOSE_DEBUG
import sys
import os import os
import logging import logging
if sys.version_info < (3, 9): from collections.abc import AsyncIterator
from typing import AsyncIterator
else: import pipmaster as pm
from collections.abc import AsyncIterator
import pipmaster as pm # Pipmaster for dynamic library install
# install specific modules # install specific modules
if not pm.is_installed("openai"): if not pm.is_installed("openai"):
pm.install("openai") pm.install("openai")
# Try to import Langfuse for LLM observability (optional)
# Falls back to standard OpenAI client if not available
try:
from langfuse.openai import AsyncOpenAI
LANGFUSE_ENABLED = True
logger.info("Langfuse observability enabled for OpenAI client")
except ImportError:
from openai import AsyncOpenAI
LANGFUSE_ENABLED = False
logger.debug("Langfuse not available, using standard OpenAI client")
from openai import ( from openai import (
APIConnectionError, APIConnectionError,
RateLimitError, RateLimitError,
@@ -40,6 +26,7 @@ from lightrag.utils import (
safe_unicode_decode, safe_unicode_decode,
logger, logger,
) )
from lightrag.types import GPTKeywordExtractionFormat from lightrag.types import GPTKeywordExtractionFormat
from lightrag.api import __api_version__ from lightrag.api import __api_version__
@@ -49,6 +36,32 @@ from typing import Any, Union
from dotenv import load_dotenv from dotenv import load_dotenv
# Try to import Langfuse for LLM observability (optional)
# Falls back to standard OpenAI client if not available
# Langfuse requires proper configuration to work correctly
LANGFUSE_ENABLED = False
try:
# Check if required Langfuse environment variables are set
langfuse_public_key = os.environ.get("LANGFUSE_PUBLIC_KEY")
langfuse_secret_key = os.environ.get("LANGFUSE_SECRET_KEY")
# Only enable Langfuse if both keys are configured
if langfuse_public_key and langfuse_secret_key:
from langfuse.openai import AsyncOpenAI
LANGFUSE_ENABLED = True
logger.info("Langfuse observability enabled for OpenAI client")
else:
from openai import AsyncOpenAI
logger.debug(
"Langfuse environment variables not configured, using standard OpenAI client"
)
except ImportError:
from openai import AsyncOpenAI
logger.debug("Langfuse not available, using standard OpenAI client")
# use the .env that is inside the current folder # use the .env that is inside the current folder
# allows to use different .env file for each lightrag instance # allows to use different .env file for each lightrag instance
# the OS environment variables take precedence over the .env file # the OS environment variables take precedence over the .env file
@@ -64,7 +77,7 @@ class InvalidResponseError(Exception):
def create_openai_async_client( def create_openai_async_client(
api_key: str | None = None, api_key: str | None = None,
base_url: str | None = None, base_url: str | None = None,
client_configs: dict[str, Any] = None, client_configs: dict[str, Any] | None = None,
) -> AsyncOpenAI: ) -> AsyncOpenAI:
"""Create an AsyncOpenAI client with the given configuration. """Create an AsyncOpenAI client with the given configuration.
@@ -106,57 +119,6 @@ def create_openai_async_client(
return AsyncOpenAI(**merged_configs) return AsyncOpenAI(**merged_configs)
def _normalize_openai_kwargs_for_model(model: str, kwargs: dict[str, Any]) -> None:
"""
Normalize OpenAI API parameters based on the model being used.
This function handles model-specific parameter requirements:
- gpt-5-nano uses 'max_completion_tokens' instead of 'max_tokens'
- gpt-5-nano uses reasoning tokens which consume from the token budget
- gpt-5-nano doesn't support custom temperature values
- Other models support both parameters
Args:
model: The model name (e.g., 'gpt-5-nano', 'gpt-4o', 'gpt-4o-mini')
kwargs: The API parameters dict to normalize (modified in-place)
"""
# Handle max_tokens vs max_completion_tokens conversion for gpt-5 models
if model.startswith("gpt-5"):
# gpt-5-nano and variants use max_completion_tokens
if "max_tokens" in kwargs and "max_completion_tokens" not in kwargs:
# If only max_tokens is set, move it to max_completion_tokens
max_tokens = kwargs.pop("max_tokens")
# For gpt-5-nano, we need to account for reasoning tokens
# Increase buffer to ensure actual content is generated
# Reasoning typically uses 1.5-2x the actual content tokens needed
kwargs["max_completion_tokens"] = int(max(max_tokens * 2.5, 300))
else:
# If both are set, remove max_tokens (it's not supported)
max_tokens = kwargs.pop("max_tokens", None)
if max_tokens and "max_completion_tokens" in kwargs:
# If max_completion_tokens is already set and seems too small, increase it
if kwargs["max_completion_tokens"] < 300:
kwargs["max_completion_tokens"] = int(max(kwargs["max_completion_tokens"] * 2.5, 300))
# Ensure a minimum token budget for gpt-5-nano due to reasoning overhead
if "max_completion_tokens" in kwargs:
if kwargs["max_completion_tokens"] < 300:
# Minimum 300 tokens to account for reasoning (reasoning can be expensive)
original = kwargs["max_completion_tokens"]
kwargs["max_completion_tokens"] = 300
logger.debug(f"Increased max_completion_tokens from {original} to 300 for {model} (reasoning overhead)")
# Handle temperature constraint for gpt-5 models
if model.startswith("gpt-5"):
# gpt-5-nano requires default temperature (doesn't support custom values)
# Remove any custom temperature setting
if "temperature" in kwargs:
kwargs.pop("temperature")
logger.debug(f"Removed custom temperature for {model}: uses default")
logger.debug(f"Normalized parameters for {model}: {kwargs}")
@retry( @retry(
stop=stop_after_attempt(3), stop=stop_after_attempt(3),
wait=wait_exponential(multiplier=1, min=4, max=10), wait=wait_exponential(multiplier=1, min=4, max=10),
@@ -180,7 +142,7 @@ async def openai_complete_if_cache(
) -> str: ) -> str:
"""Complete a prompt using OpenAI's API with caching support and Chain of Thought (COT) integration. """Complete a prompt using OpenAI's API with caching support and Chain of Thought (COT) integration.
This function supports automatic integration of reasoning content (思维链) from models that provide This function supports automatic integration of reasoning content from models that provide
Chain of Thought capabilities. The reasoning content is seamlessly integrated into the response Chain of Thought capabilities. The reasoning content is seamlessly integrated into the response
using <think>...</think> tags. using <think>...</think> tags.
@@ -262,9 +224,6 @@ async def openai_complete_if_cache(
messages = kwargs.pop("messages", messages) messages = kwargs.pop("messages", messages)
# Normalize API parameters based on model requirements
_normalize_openai_kwargs_for_model(model, kwargs)
try: try:
# Don't use async with context manager, use client directly # Don't use async with context manager, use client directly
if "response_format" in kwargs: if "response_format" in kwargs:
@@ -328,19 +287,16 @@ async def openai_complete_if_cache(
delta = chunk.choices[0].delta delta = chunk.choices[0].delta
content = getattr(delta, "content", None) content = getattr(delta, "content", None)
reasoning_content = getattr(delta, "reasoning_content", None) reasoning_content = getattr(delta, "reasoning_content", "")
# Handle COT logic for streaming (only if enabled) # Handle COT logic for streaming (only if enabled)
if enable_cot: if enable_cot:
if content is not None and content != "": if content:
# Regular content is present # Regular content is present
if not initial_content_seen: if not initial_content_seen:
initial_content_seen = True initial_content_seen = True
# If both content and reasoning_content are present initially, don't start COT # If both content and reasoning_content are present initially, don't start COT
if ( if reasoning_content:
reasoning_content is not None
and reasoning_content != ""
):
cot_active = False cot_active = False
cot_started = False cot_started = False
@@ -354,7 +310,7 @@ async def openai_complete_if_cache(
content = safe_unicode_decode(content.encode("utf-8")) content = safe_unicode_decode(content.encode("utf-8"))
yield content yield content
elif reasoning_content is not None and reasoning_content != "": elif reasoning_content:
# Only reasoning content is present # Only reasoning content is present
if not initial_content_seen and not cot_started: if not initial_content_seen and not cot_started:
# Start COT if we haven't seen initial content yet # Start COT if we haven't seen initial content yet
@@ -372,7 +328,7 @@ async def openai_complete_if_cache(
yield reasoning_content yield reasoning_content
else: else:
# COT disabled, only process regular content # COT disabled, only process regular content
if content is not None and content != "": if content:
if r"\u" in content: if r"\u" in content:
content = safe_unicode_decode(content.encode("utf-8")) content = safe_unicode_decode(content.encode("utf-8"))
yield content yield content
@@ -440,18 +396,23 @@ async def openai_complete_if_cache(
) )
# Ensure resources are released even if no exception occurs # Ensure resources are released even if no exception occurs
if ( # Note: Some wrapped clients (e.g., Langfuse) may not implement aclose() properly
iteration_started if iteration_started and hasattr(response, "aclose"):
and hasattr(response, "aclose") aclose_method = getattr(response, "aclose", None)
and callable(getattr(response, "aclose", None)) if callable(aclose_method):
): try:
try: await response.aclose()
await response.aclose() logger.debug("Successfully closed stream response")
logger.debug("Successfully closed stream response") except (AttributeError, TypeError) as close_error:
except Exception as close_error: # Some wrapper objects may report hasattr(aclose) but fail when called
logger.warning( # This is expected behavior for certain client wrappers
f"Failed to close stream response in finally block: {close_error}" logger.debug(
) f"Stream response cleanup not supported by client wrapper: {close_error}"
)
except Exception as close_error:
logger.warning(
f"Unexpected error during stream response cleanup: {close_error}"
)
# This prevents resource leaks since the caller doesn't handle closing # This prevents resource leaks since the caller doesn't handle closing
try: try:
@@ -479,7 +440,7 @@ async def openai_complete_if_cache(
message = response.choices[0].message message = response.choices[0].message
content = getattr(message, "content", None) content = getattr(message, "content", None)
reasoning_content = getattr(message, "reasoning_content", None) reasoning_content = getattr(message, "reasoning_content", "")
# Handle COT logic for non-streaming responses (only if enabled) # Handle COT logic for non-streaming responses (only if enabled)
final_content = "" final_content = ""
@@ -646,9 +607,10 @@ async def nvidia_openai_complete(
async def openai_embed( async def openai_embed(
texts: list[str], texts: list[str],
model: str = "text-embedding-3-small", model: str = "text-embedding-3-small",
base_url: str = None, base_url: str | None = None,
api_key: str = None, api_key: str | None = None,
client_configs: dict[str, Any] = None, client_configs: dict[str, Any] | None = None,
token_tracker: Any | None = None,
) -> np.ndarray: ) -> np.ndarray:
"""Generate embeddings for a list of texts using OpenAI's API. """Generate embeddings for a list of texts using OpenAI's API.
@@ -660,6 +622,7 @@ async def openai_embed(
client_configs: Additional configuration options for the AsyncOpenAI client. client_configs: Additional configuration options for the AsyncOpenAI client.
These will override any default configurations but will be overridden by These will override any default configurations but will be overridden by
explicit parameters (api_key, base_url). explicit parameters (api_key, base_url).
token_tracker: Optional token usage tracker for monitoring API usage.
Returns: Returns:
A numpy array of embeddings, one per input text. A numpy array of embeddings, one per input text.
@@ -678,6 +641,14 @@ async def openai_embed(
response = await openai_async_client.embeddings.create( response = await openai_async_client.embeddings.create(
model=model, input=texts, encoding_format="base64" model=model, input=texts, encoding_format="base64"
) )
if token_tracker and hasattr(response, "usage"):
token_counts = {
"prompt_tokens": getattr(response.usage, "prompt_tokens", 0),
"total_tokens": getattr(response.usage, "total_tokens", 0),
}
token_tracker.add_usage(token_counts)
return np.array( return np.array(
[ [
np.array(dp.embedding, dtype=np.float32) np.array(dp.embedding, dtype=np.float32)