diff --git a/frontend/app/chat/_components/assistant-message.tsx b/frontend/app/chat/_components/assistant-message.tsx
index 0a046af0..ae6cffbb 100644
--- a/frontend/app/chat/_components/assistant-message.tsx
+++ b/frontend/app/chat/_components/assistant-message.tsx
@@ -3,9 +3,10 @@ import { motion } from "motion/react";
import DogIcon from "@/components/icons/dog-icon";
import { MarkdownRenderer } from "@/components/markdown-renderer";
import { cn } from "@/lib/utils";
-import type { FunctionCall } from "../_types/types";
+import type { FunctionCall, TokenUsage as TokenUsageType } from "../_types/types";
import { FunctionCalls } from "./function-calls";
import { Message } from "./message";
+import { TokenUsage } from "./token-usage";
interface AssistantMessageProps {
content: string;
@@ -21,6 +22,7 @@ interface AssistantMessageProps {
animate?: boolean;
delay?: number;
isInitialGreeting?: boolean;
+ usage?: TokenUsageType;
}
export function AssistantMessage({
@@ -37,6 +39,7 @@ export function AssistantMessage({
animate = true,
delay = 0.2,
isInitialGreeting = false,
+ usage,
}: AssistantMessageProps) {
return (
+ {usage && !isStreaming && }
diff --git a/frontend/app/chat/_components/token-usage.tsx b/frontend/app/chat/_components/token-usage.tsx
new file mode 100644
index 00000000..2fc5f03a
--- /dev/null
+++ b/frontend/app/chat/_components/token-usage.tsx
@@ -0,0 +1,27 @@
+import { Zap } from "lucide-react";
+import type { TokenUsage as TokenUsageType } from "../_types/types";
+
+interface TokenUsageProps {
+ usage: TokenUsageType;
+}
+
+export function TokenUsage({ usage }: TokenUsageProps) {
+ // Guard against partial/malformed usage data
+ if (typeof usage.input_tokens !== "number" || typeof usage.output_tokens !== "number") {
+ return null;
+ }
+
+ return (
+
+
+
+ {usage.input_tokens.toLocaleString()} in / {usage.output_tokens.toLocaleString()} out
+ {usage.input_tokens_details?.cached_tokens ? (
+
+ ({usage.input_tokens_details.cached_tokens.toLocaleString()} cached)
+
+ ) : null}
+
+
+ );
+}
diff --git a/frontend/app/chat/_types/types.ts b/frontend/app/chat/_types/types.ts
index c0d732ea..c605da7c 100644
--- a/frontend/app/chat/_types/types.ts
+++ b/frontend/app/chat/_types/types.ts
@@ -1,3 +1,15 @@
/**
 * Token accounting for a single model response. Field names are
 * snake_case because they mirror the backend's Responses-API `usage`
 * payload verbatim (see `responseData.usage` extraction in page.tsx
 * and `chunk.response.usage` in useChatStreaming.ts).
 */
export interface TokenUsage {
  // Tokens consumed by the prompt/input.
  input_tokens: number;
  // Tokens generated in the response.
  output_tokens: number;
  // Presumably input_tokens + output_tokens — TODO confirm backend semantics.
  total_tokens: number;
  // Optional breakdown of input tokens; absent when the backend omits it.
  input_tokens_details?: {
    // Input tokens served from the prompt cache, when reported.
    cached_tokens?: number;
  };
  // Optional breakdown of output tokens; absent when the backend omits it.
  output_tokens_details?: {
    // Tokens spent on reasoning, when reported.
    reasoning_tokens?: number;
  };
}
+
export interface Message {
role: "user" | "assistant";
content: string;
@@ -5,6 +17,7 @@ export interface Message {
functionCalls?: FunctionCall[];
isStreaming?: boolean;
source?: "langflow" | "chat";
+ usage?: TokenUsage;
}
export interface FunctionCall {
diff --git a/frontend/app/chat/page.tsx b/frontend/app/chat/page.tsx
index f15cf788..4d2ddfef 100644
--- a/frontend/app/chat/page.tsx
+++ b/frontend/app/chat/page.tsx
@@ -501,6 +501,17 @@ function ChatPage() {
} else {
console.log("No function calls found in message");
}
+
+ // Extract usage data from response_data
+ if (msg.response_data && typeof msg.response_data === "object") {
+ const responseData =
+ typeof msg.response_data === "string"
+ ? JSON.parse(msg.response_data)
+ : msg.response_data;
+ if (responseData.usage) {
+ message.usage = responseData.usage;
+ }
+ }
}
return message;
@@ -849,6 +860,7 @@ function ChatPage() {
role: "assistant",
content: result.response,
timestamp: new Date(),
+ usage: result.usage,
};
setMessages((prev) => [...prev, assistantMessage]);
if (result.response_id) {
@@ -1164,6 +1176,7 @@ function ChatPage() {
messages.length === 1 &&
message.content === "How can I assist?"
}
+ usage={message.usage}
/>
),
diff --git a/frontend/hooks/useChatStreaming.ts b/frontend/hooks/useChatStreaming.ts
index 89d0d810..6d19ac08 100644
--- a/frontend/hooks/useChatStreaming.ts
+++ b/frontend/hooks/useChatStreaming.ts
@@ -3,6 +3,7 @@ import type {
FunctionCall,
Message,
SelectedFilters,
+ TokenUsage,
} from "@/app/chat/_types/types";
import { useChat } from "@/contexts/chat-context";
@@ -130,6 +131,7 @@ export function useChatStreaming({
let currentContent = "";
const currentFunctionCalls: FunctionCall[] = [];
let newResponseId: string | null = null;
+ let usageData: TokenUsage | undefined;
// Initialize streaming message
if (!controller.signal.aborted && thisStreamId === streamIdRef.current) {
@@ -448,6 +450,10 @@ export function useChatStreaming({
else if (chunk.type === "response.output_text.delta") {
currentContent += chunk.delta || "";
}
+ // Handle response.completed event - capture usage
+ else if (chunk.type === "response.completed" && chunk.response?.usage) {
+ usageData = chunk.response.usage;
+ }
// Handle OpenRAG backend format
else if (chunk.output_text) {
currentContent += chunk.output_text;
@@ -567,6 +573,7 @@ export function useChatStreaming({
currentFunctionCalls.length > 0 ? currentFunctionCalls : undefined,
timestamp: new Date(),
isStreaming: false,
+ usage: usageData,
};
if (!controller.signal.aborted && thisStreamId === streamIdRef.current) {
diff --git a/src/agent.py b/src/agent.py
index 08ca99d4..76cf8b1d 100644
--- a/src/agent.py
+++ b/src/agent.py
@@ -197,6 +197,18 @@ async def async_response_stream(
sample_data=str(potential_tool_fields)[:500]
)
+ # Detect response.completed event and log usage
+ if isinstance(chunk_data, dict) and chunk_data.get("type") == "response.completed":
+ response_data = chunk_data.get("response", {})
+ usage = response_data.get("usage")
+ if usage:
+ logger.info(
+ "Stream usage data",
+ input_tokens=usage.get("input_tokens"),
+ output_tokens=usage.get("output_tokens"),
+ total_tokens=usage.get("total_tokens"),
+ )
+
# Middleware: Detect implicit tool calls and inject standardized events
# This helps Granite 3.3 8b and other models that don't emit standard markers
if isinstance(chunk_data, dict) and not detected_tool_call:
@@ -487,6 +499,7 @@ async def async_chat_stream(
full_response = ""
response_id = None
+ usage_data = None
async for chunk in async_stream(
async_client,
prompt,
@@ -506,6 +519,10 @@ async def async_chat_stream(
response_id = chunk_data["id"]
elif "response_id" in chunk_data:
response_id = chunk_data["response_id"]
+ # Capture usage from response.completed event
+ if chunk_data.get("type") == "response.completed":
+ response_obj = chunk_data.get("response", {})
+ usage_data = response_obj.get("usage")
except:
pass
yield chunk
@@ -518,6 +535,9 @@ async def async_chat_stream(
"response_id": response_id,
"timestamp": datetime.now(),
}
+ # Store usage data if available (from response.completed event)
+ if usage_data:
+ assistant_message["response_data"] = {"usage": usage_data}
conversation_state["messages"].append(assistant_message)
# Store the conversation thread with its response_id
@@ -676,6 +696,7 @@ async def async_langflow_chat_stream(
full_response = ""
response_id = None
+ usage_data = None
collected_chunks = [] # Store all chunks for function call data
async for chunk in async_stream(
@@ -700,6 +721,10 @@ async def async_langflow_chat_stream(
response_id = chunk_data["id"]
elif "response_id" in chunk_data:
response_id = chunk_data["response_id"]
+ # Capture usage from response.completed event
+ if chunk_data.get("type") == "response.completed":
+ response_obj = chunk_data.get("response", {})
+ usage_data = response_obj.get("usage")
except:
pass
yield chunk
@@ -713,6 +738,9 @@ async def async_langflow_chat_stream(
"timestamp": datetime.now(),
"chunks": collected_chunks, # Store complete chunk data for function calls
}
+ # Store usage data if available (from response.completed event)
+ if usage_data:
+ assistant_message["response_data"] = {"usage": usage_data}
conversation_state["messages"].append(assistant_message)
# Store the conversation thread with its response_id
diff --git a/src/api/v1/chat.py b/src/api/v1/chat.py
index ecadba16..9fc83d67 100644
--- a/src/api/v1/chat.py
+++ b/src/api/v1/chat.py
@@ -239,11 +239,16 @@ async def chat_get_endpoint(request: Request, chat_service, session_manager):
# Transform to public API format
messages = []
for msg in conversation.get("messages", []):
- messages.append({
+ message_data = {
"role": msg.get("role"),
"content": msg.get("content"),
"timestamp": msg.get("timestamp"),
- })
+ }
+ # Include token usage if available (from Responses API)
+ usage = msg.get("response_data", {}).get("usage") if isinstance(msg.get("response_data"), dict) else None
+ if usage:
+ message_data["usage"] = usage
+ messages.append(message_data)
response_data = {
"chat_id": conversation.get("response_id"),