Merge pull request #2334 from danielaskdd/hotfix-opena-streaming

HotFix: Restore OpenAI Streaming Response & Refactor keyword_extraction Parameter
This commit is contained in:
Daniel.y 2025-11-09 12:25:20 +08:00 committed by GitHub
commit 8859eaade7
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -138,9 +138,9 @@ async def openai_complete_if_cache(
base_url: str | None = None, base_url: str | None = None,
api_key: str | None = None, api_key: str | None = None,
token_tracker: Any | None = None, token_tracker: Any | None = None,
keyword_extraction: bool = False, # Will be removed from kwargs before passing to OpenAI
stream: bool | None = None, stream: bool | None = None,
timeout: int | None = None, timeout: int | None = None,
keyword_extraction: bool = False,
**kwargs: Any, **kwargs: Any,
) -> str: ) -> str:
"""Complete a prompt using OpenAI's API with caching support and Chain of Thought (COT) integration. """Complete a prompt using OpenAI's API with caching support and Chain of Thought (COT) integration.
@ -170,14 +170,15 @@ async def openai_complete_if_cache(
api_key: Optional OpenAI API key. If None, uses the OPENAI_API_KEY environment variable. api_key: Optional OpenAI API key. If None, uses the OPENAI_API_KEY environment variable.
token_tracker: Optional token usage tracker for monitoring API usage. token_tracker: Optional token usage tracker for monitoring API usage.
enable_cot: Whether to enable Chain of Thought (COT) processing. Default is False. enable_cot: Whether to enable Chain of Thought (COT) processing. Default is False.
stream: Whether to stream the response. Default is None, in which case the option is not forwarded to the OpenAI API.
timeout: Request timeout in seconds. Default is None.
keyword_extraction: Whether to enable keyword extraction mode. When True, triggers
special response formatting for keyword extraction. Default is False.
**kwargs: Additional keyword arguments to pass to the OpenAI API. **kwargs: Additional keyword arguments to pass to the OpenAI API.
Special kwargs: Special kwargs:
- openai_client_configs: Dict of configuration options for the AsyncOpenAI client. - openai_client_configs: Dict of configuration options for the AsyncOpenAI client.
These will be passed to the client constructor but will be overridden by These will be passed to the client constructor but will be overridden by
explicit parameters (api_key, base_url). explicit parameters (api_key, base_url).
- keyword_extraction: Will be removed from kwargs before passing to OpenAI.
- stream: Whether to stream the response. Default is False.
- timeout: Request timeout in seconds. Default is None.
Returns: Returns:
The completed text (with integrated COT content if available) or an async iterator The completed text (with integrated COT content if available) or an async iterator
@ -198,7 +199,6 @@ async def openai_complete_if_cache(
# Remove special kwargs that shouldn't be passed to OpenAI # Remove special kwargs that shouldn't be passed to OpenAI
kwargs.pop("hashing_kv", None) kwargs.pop("hashing_kv", None)
kwargs.pop("keyword_extraction", None)
# Extract client configuration options # Extract client configuration options
client_configs = kwargs.pop("openai_client_configs", {}) client_configs = kwargs.pop("openai_client_configs", {})
@ -228,6 +228,12 @@ async def openai_complete_if_cache(
messages = kwargs.pop("messages", messages) messages = kwargs.pop("messages", messages)
# Add explicit parameters back to kwargs so they're passed to OpenAI API
if stream is not None:
kwargs["stream"] = stream
if timeout is not None:
kwargs["timeout"] = timeout
try: try:
# Don't use async with context manager, use client directly # Don't use async with context manager, use client directly
if "response_format" in kwargs: if "response_format" in kwargs:
@ -516,7 +522,6 @@ async def openai_complete(
) -> Union[str, AsyncIterator[str]]: ) -> Union[str, AsyncIterator[str]]:
if history_messages is None: if history_messages is None:
history_messages = [] history_messages = []
keyword_extraction = kwargs.pop("keyword_extraction", None)
if keyword_extraction: if keyword_extraction:
kwargs["response_format"] = "json" kwargs["response_format"] = "json"
model_name = kwargs["hashing_kv"].global_config["llm_model_name"] model_name = kwargs["hashing_kv"].global_config["llm_model_name"]
@ -525,6 +530,7 @@ async def openai_complete(
prompt, prompt,
system_prompt=system_prompt, system_prompt=system_prompt,
history_messages=history_messages, history_messages=history_messages,
keyword_extraction=keyword_extraction,
**kwargs, **kwargs,
) )
@ -539,7 +545,6 @@ async def gpt_4o_complete(
) -> str: ) -> str:
if history_messages is None: if history_messages is None:
history_messages = [] history_messages = []
keyword_extraction = kwargs.pop("keyword_extraction", None)
if keyword_extraction: if keyword_extraction:
kwargs["response_format"] = GPTKeywordExtractionFormat kwargs["response_format"] = GPTKeywordExtractionFormat
return await openai_complete_if_cache( return await openai_complete_if_cache(
@ -548,6 +553,7 @@ async def gpt_4o_complete(
system_prompt=system_prompt, system_prompt=system_prompt,
history_messages=history_messages, history_messages=history_messages,
enable_cot=enable_cot, enable_cot=enable_cot,
keyword_extraction=keyword_extraction,
**kwargs, **kwargs,
) )
@ -562,7 +568,6 @@ async def gpt_4o_mini_complete(
) -> str: ) -> str:
if history_messages is None: if history_messages is None:
history_messages = [] history_messages = []
keyword_extraction = kwargs.pop("keyword_extraction", None)
if keyword_extraction: if keyword_extraction:
kwargs["response_format"] = GPTKeywordExtractionFormat kwargs["response_format"] = GPTKeywordExtractionFormat
return await openai_complete_if_cache( return await openai_complete_if_cache(
@ -571,6 +576,7 @@ async def gpt_4o_mini_complete(
system_prompt=system_prompt, system_prompt=system_prompt,
history_messages=history_messages, history_messages=history_messages,
enable_cot=enable_cot, enable_cot=enable_cot,
keyword_extraction=keyword_extraction,
**kwargs, **kwargs,
) )
@ -585,13 +591,13 @@ async def nvidia_openai_complete(
) -> str: ) -> str:
if history_messages is None: if history_messages is None:
history_messages = [] history_messages = []
kwargs.pop("keyword_extraction", None)
result = await openai_complete_if_cache( result = await openai_complete_if_cache(
"nvidia/llama-3.1-nemotron-70b-instruct", # context length 128k "nvidia/llama-3.1-nemotron-70b-instruct", # context length 128k
prompt, prompt,
system_prompt=system_prompt, system_prompt=system_prompt,
history_messages=history_messages, history_messages=history_messages,
enable_cot=enable_cot, enable_cot=enable_cot,
keyword_extraction=keyword_extraction,
base_url="https://integrate.api.nvidia.com/v1", base_url="https://integrate.api.nvidia.com/v1",
**kwargs, **kwargs,
) )