Skip to content

Commit 3369552

Browse files
author
zhangjk
committed
Add support for audio and video input
1 parent 8e5611c commit 3369552

File tree

2 files changed

+63
-5
lines changed

2 files changed

+63
-5
lines changed

chat.go

Lines changed: 53 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package openai
22

33
import (
4+
"bytes"
45
"context"
56
"encoding/json"
67
"errors"
@@ -86,12 +87,34 @@ type ChatMessagePartType string
8687
// Values for ChatMessagePart.Type; each one selects which payload field of the part is used.
const (
8788
ChatMessagePartTypeText ChatMessagePartType = "text" // payload in ChatMessagePart.Text
8889
ChatMessagePartTypeImageURL ChatMessagePartType = "image_url" // payload in ChatMessagePart.ImageURL
90+
ChatMessagePartTypeAudio ChatMessagePartType = "input_audio" // payload in ChatMessagePart.Audio
91+
ChatMessagePartTypeVideo ChatMessagePartType = "video" // payload in ChatMessagePart.Video (list of image URLs)
92+
ChatMessagePartTypeVideoURL ChatMessagePartType = "video_url" // payload in ChatMessagePart.VideoURL
8993
)
9094

95+
/* reference:
96+
* https://bailian.console.aliyun.com/?spm=5176.29597918.J_SEsSjsNv72yRuRFS2VknO.2.191e7b08wdOQzD&tab=api#/api/?type=model&url=2712576
97+
* https://help.aliyun.com/zh/model-studio/qwen-omni#423736d367a7x
98+
*/
99+
// InputAudio is the payload carried by a message part whose type is "input_audio".
type InputAudio struct {
100+
Data string `json:"data"` // audio content; NOTE(review): presumably a URL or base64 data — confirm against the provider docs
101+
Format string `json:"format"` // audio format label; NOTE(review): presumably values such as "mp3" or "wav" — confirm
102+
}
103+
104+
// CacheControl is the object serialized under the "cache_control" key of a message part.
type CacheControl struct {
105+
Type string `json:"type"` // must be "ephemeral"
106+
}
107+
91108
// ChatMessagePart is one element of a multi-part message. Type selects which of the
// payload fields (Text, ImageURL, Audio, VideoURL, Video) is expected to be set.
type ChatMessagePart struct {
92-
Type ChatMessagePartType `json:"type,omitempty"`
93-
Text string `json:"text,omitempty"`
94-
ImageURL *ChatMessageImageURL `json:"image_url,omitempty"`
109+
Type ChatMessagePartType `json:"type,omitempty"` // one of the ChatMessagePartType* constants
110+
Text string `json:"text,omitempty"` // used when Type is "text"
111+
ImageURL *ChatMessageImageURL `json:"image_url,omitempty"` // used when Type is "image_url"
112+
Audio *InputAudio `json:"input_audio,omitempty"` // required when Type is "input_audio"
113+
VideoURL *ChatMessageImageURL `json:"video_url,omitempty"` // required when Type is "video_url"
114+
Video []string `json:"video,omitempty"` // required when Type is "video", array of image URLs
115+
MinPixels int `json:"min_pixels,omitempty"` // NOTE(review): presumably a lower bound on image/frame resolution — confirm against the provider docs
116+
MaxPixels int `json:"max_pixels,omitempty"` // NOTE(review): presumably an upper bound on image/frame resolution — confirm against the provider docs
117+
*CacheControl `json:"cache_control,omitempty"` // embedded, but the explicit tag name makes encoding/json emit it as a nested "cache_control" object
95118
}
96119

97120
type ChatCompletionMessage struct {
@@ -333,6 +356,33 @@ type ChatCompletionRequest struct {
333356
SafetyIdentifier string `json:"safety_identifier,omitempty"`
334357
// Embedded struct for non-OpenAI extensions
335358
ChatCompletionRequestExtensions
359+
// non-OpenAI extensions
360+
Extensions map[string]interface{} `json:"-"`
361+
}
362+
363+
// customChatCompletionRequest has the same fields as ChatCompletionRequest but does not
// carry its MarshalJSON method, so marshaling a value converted to this type does not
// recurse back into ChatCompletionRequest.MarshalJSON.
type customChatCompletionRequest ChatCompletionRequest
364+
365+
func (r *ChatCompletionRequest) MarshalJSON() ([]byte, error) {
366+
if len(r.Extensions) == 0 {
367+
return json.Marshal((*customChatCompletionRequest)(r))
368+
}
369+
buf := bytes.NewBuffer(nil)
370+
encoder := json.NewEncoder(buf)
371+
if err := encoder.Encode((*customChatCompletionRequest)(r)); err != nil {
372+
return nil, err
373+
}
374+
// remove the trailing "}\n"
375+
buf.Truncate(buf.Len() - 2)
376+
// record the current position
377+
pos := buf.Len()
378+
// append extensions
379+
if err := encoder.Encode(r.Extensions); err != nil {
380+
return nil, err
381+
}
382+
data := buf.Bytes()
383+
// change the leading '{' of extensions to ','
384+
data[pos] = ','
385+
return data, nil
336386
}
337387

338388
type StreamOptions struct {

chat_stream.go

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,13 @@ import (
55
"net/http"
66
)
77

8+
// reference: https://bailian.console.aliyun.com/?spm=5176.29597918.J_SEsSjsNv72yRuRFS2VknO.2.191e7b08wdOQzD&tab=api#/api/?type=model&url=2712576
9+
// OutputAudio is the audio payload of a streamed delta (the "audio" field of ChatCompletionStreamChoiceDelta).
type OutputAudio struct {
10+
Transcript string `json:"transcript"` // streamed text content
11+
Data string `json:"data"` // base64-encoded audio data
12+
ExpiresAt int `json:"expires_at"` // the timestamp when the request was created; NOTE(review): the field name suggests an expiry time instead — confirm against the provider docs
13+
}
14+
815
type ChatCompletionStreamChoiceDelta struct {
916
Content string `json:"content,omitempty"`
1017
Role string `json:"role,omitempty"`
@@ -16,7 +23,8 @@ type ChatCompletionStreamChoiceDelta struct {
1623
// which is not in the official documentation.
1724
// the doc from deepseek:
1825
// - https://api-docs.deepseek.com/api/create-chat-completion#responses
19-
ReasoningContent string `json:"reasoning_content,omitempty"`
26+
ReasoningContent string `json:"reasoning_content,omitempty"`
27+
Audio *OutputAudio `json:"audio,omitempty"` // Audio is used for audio responses, if supported by the model, such as "qwen-omni".
2028
}
2129

2230
type ChatCompletionStreamChoiceLogprobs struct {
@@ -95,7 +103,7 @@ func (c *Client) CreateChatCompletionStream(
95103
ctx,
96104
http.MethodPost,
97105
c.fullURL(urlSuffix, withModel(request.Model)),
98-
withBody(request),
106+
withBody(&request),
99107
)
100108
if err != nil {
101109
return nil, err

0 commit comments

Comments
 (0)