
Commit 7ed083f

feat: support gpt-audio
1 parent 5d7a276 commit 7ed083f

8 files changed: 651 additions, 105 deletions


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -7,6 +7,7 @@
 
 # Test binary, built with `go test -c`
 *.test
+test.mp3
 
 # Output of the go coverage tool, specifically when used with LiteIDE
 *.out

api_integration_test.go

Lines changed: 86 additions & 0 deletions
@@ -108,6 +108,38 @@ func TestAPI(t *testing.T) {
 		},
 	)
 	checks.NoError(t, err, "CreateChatCompletion (with functions) returned error")
+
+	response, err := c.CreateChatCompletion(
+		ctx,
+		openai.ChatCompletionRequest{
+			Model: openai.GPT4oAudioPreview,
+			Messages: []openai.ChatCompletionMessage{
+				{
+					Role:    openai.ChatMessageRoleUser,
+					Content: "hi",
+				},
+			},
+			Audio: &openai.AudioOutput{
+				Voice:  openai.AudioVoiceAlloy,
+				Format: openai.AudioFormatPCM16,
+			},
+			Modalities: []openai.Modality{openai.ModalityText, openai.ModalityAudio},
+		},
+	)
+	checks.NoError(t, err, "CreateChatCompletion (with audio) returned error")
+	if response.Choices[0].Message.Audio == nil {
+		t.Fatal("Audio response is nil")
+	}
+	if len(response.Choices[0].Message.Audio.Data) == 0 {
+		t.Fatal("Audio response data is empty")
+	}
+	if response.Choices[0].Message.Audio.Transcript == "" {
+		t.Fatal("Audio response transcript is empty")
+	}
+	if response.Usage.PromptTokens == 0 || response.Usage.CompletionTokens == 0 || response.Usage.TotalTokens == 0 {
+		t.Fatal("Usage is zero")
+	}
+	t.Logf("Usage: %+v", response.Usage)
 }
 
 func TestCompletionStream(t *testing.T) {
@@ -145,6 +177,60 @@ func TestCompletionStream(t *testing.T) {
 	}
 }
 
+func TestChatCompletionStream(t *testing.T) {
+	apiToken := os.Getenv("OPENAI_TOKEN")
+	if apiToken == "" {
+		t.Skip("Skipping testing against production OpenAI API. Set OPENAI_TOKEN environment variable to enable it.")
+	}
+
+	c := openai.NewClient(apiToken)
+	ctx := context.Background()
+
+	stream, err := c.CreateChatCompletionStream(ctx, openai.ChatCompletionRequest{
+		Model: openai.GPT4oAudioPreview,
+		Messages: []openai.ChatCompletionMessage{
+			{
+				Role:    openai.ChatMessageRoleUser,
+				Content: "hi",
+			},
+		},
+		Audio: &openai.AudioOutput{
+			Voice:  openai.AudioVoiceAlloy,
+			Format: openai.AudioFormatPCM16,
+		},
+		Modalities: []openai.Modality{openai.ModalityText, openai.ModalityAudio},
+		StreamOptions: &openai.StreamOptions{
+			IncludeUsage: true,
+		},
+	})
+	checks.NoError(t, err, "CreateChatCompletionStream returned error")
+	defer stream.Close()
+
+	var usage *openai.Usage
+	counter := 0
+	for {
+		response, err := stream.Recv()
+		if err != nil {
+			if errors.Is(err, io.EOF) {
+				break
+			}
+			t.Errorf("Stream error: %v", err)
+		} else {
+			counter++
+		}
+		if response.Usage != nil {
+			usage = response.Usage
+			t.Logf("Usage: %+v", usage)
+		}
+	}
+	if counter == 0 {
+		t.Error("Stream did not return any responses")
+	}
+	if usage == nil {
+		t.Error("Usage is nil")
+	}
+}
+
 func TestAPIError(t *testing.T) {
 	apiToken := os.Getenv("OPENAI_TOKEN")
 	if apiToken == "" {
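
The integration test above checks that the audio response is populated but never consumes it. Below is a minimal sketch, not part of this commit, of how a caller might decode the returned base64 audio and save it; it assumes the upstream go-openai import path (github.com/sashabaranov/go-openai), and the test.mp3 output path is purely illustrative, echoing the new .gitignore entry.

// Sketch (not part of this commit): decode the base64 audio returned by
// CreateChatCompletion and write it to disk. The output path "test.mp3" is
// only illustrative and mirrors the new .gitignore entry.
package main

import (
	"context"
	"encoding/base64"
	"log"
	"os"

	openai "github.com/sashabaranov/go-openai"
)

func main() {
	client := openai.NewClient(os.Getenv("OPENAI_TOKEN"))
	resp, err := client.CreateChatCompletion(context.Background(), openai.ChatCompletionRequest{
		Model:      openai.GPT4oAudioPreview,
		Modalities: []openai.Modality{openai.ModalityText, openai.ModalityAudio},
		Audio:      &openai.AudioOutput{Voice: openai.AudioVoiceAlloy, Format: openai.AudioFormatMP3},
		Messages: []openai.ChatCompletionMessage{
			{Role: openai.ChatMessageRoleUser, Content: "hi"},
		},
	})
	if err != nil {
		log.Fatal(err)
	}
	audio := resp.Choices[0].Message.Audio
	// ChatCompletionAudio.Data is base64-encoded audio in the requested format.
	raw, err := base64.StdEncoding.DecodeString(audio.Data)
	if err != nil {
		log.Fatal(err)
	}
	if err := os.WriteFile("test.mp3", raw, 0o644); err != nil {
		log.Fatal(err)
	}
	log.Printf("wrote %d bytes; transcript: %q", len(raw), audio.Transcript)
}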

chat.go

Lines changed: 106 additions & 49 deletions
@@ -81,17 +81,66 @@ type ChatMessageImageURL struct {
 	Detail ImageURLDetail `json:"detail,omitempty"`
 }
 
+type AudioVoice string
+
+const (
+	AudioVoiceAlloy   AudioVoice = "alloy"
+	AudioVoiceAsh     AudioVoice = "ash"
+	AudioVoiceBallad  AudioVoice = "ballad"
+	AudioVoiceCoral   AudioVoice = "coral"
+	AudioVoiceEcho    AudioVoice = "echo"
+	AudioVoiceFable   AudioVoice = "fable"
+	AudioVoiceNova    AudioVoice = "nova"
+	AudioVoiceOnyx    AudioVoice = "onyx"
+	AudioVoiceSage    AudioVoice = "sage"
+	AudioVoiceShimmer AudioVoice = "shimmer"
+	AudioVoiceVerse   AudioVoice = "verse"
+)
+
+type AudioFormat string
+
+const (
+	AudioFormatWAV   AudioFormat = "wav"
+	AudioFormatMP3   AudioFormat = "mp3"
+	AudioFormatFLAC  AudioFormat = "flac"
+	AudioFormatOPUS  AudioFormat = "opus"
+	AudioFormatPCM16 AudioFormat = "pcm16"
+)
+
+type ChatMessageAudio struct {
+	// Base64 encoded audio data.
+	Data string `json:"data,omitempty"`
+	// The format of the encoded audio data. Currently supports "wav" and "mp3".
+	Format AudioFormat `json:"format,omitempty"`
+}
+
+type Modality string
+
+const (
+	ModalityAudio Modality = "audio"
+	ModalityText  Modality = "text"
+)
+
+type AudioOutput struct {
+	// The voice the model uses to respond. Supported voices are alloy, ash, ballad, coral, echo, sage, shimmer, and verse.
+	Voice AudioVoice `json:"voice"`
+	// Specifies the output audio format. Must be one of wav, mp3, flac, opus, or pcm16.
+	Format AudioFormat `json:"format"`
+}
+
 type ChatMessagePartType string
 
 const (
-	ChatMessagePartTypeText     ChatMessagePartType = "text"
-	ChatMessagePartTypeImageURL ChatMessagePartType = "image_url"
+	ChatMessagePartTypeText       ChatMessagePartType = "text"
+	ChatMessagePartTypeImageURL   ChatMessagePartType = "image_url"
+	ChatMessagePartTypeInputAudio ChatMessagePartType = "input_audio"
 )
 
 type ChatMessagePart struct {
-	Type     ChatMessagePartType  `json:"type,omitempty"`
-	Text     string               `json:"text,omitempty"`
-	ImageURL *ChatMessageImageURL `json:"image_url,omitempty"`
+	Type       ChatMessagePartType  `json:"type,omitempty"`
+	Text       string               `json:"text,omitempty"`
+	ImageURL   *ChatMessageImageURL `json:"image_url,omitempty"`
+	InputAudio *ChatMessageAudio    `json:"input_audio,omitempty"`
 }
 
 type ChatCompletionMessage struct {
@@ -119,76 +168,77 @@ type ChatCompletionMessage struct {
 
 	// For Role=tool prompts this should be set to the ID given in the assistant's prior request to call a tool.
 	ToolCallID string `json:"tool_call_id,omitempty"`
+
+	// If the audio output modality is requested, this object contains data about the audio response from the model.
+	Audio *ChatCompletionAudio `json:"audio,omitempty"`
+}
+
+type chatCompletionMessageMultiContent struct {
+	Role             string               `json:"role"`
+	Content          string               `json:"-"`
+	Refusal          string               `json:"refusal,omitempty"`
+	MultiContent     []ChatMessagePart    `json:"content,omitempty"`
+	Name             string               `json:"name,omitempty"`
+	ReasoningContent string               `json:"reasoning_content,omitempty"`
+	FunctionCall     *FunctionCall        `json:"function_call,omitempty"`
+	ToolCalls        []ToolCall           `json:"tool_calls,omitempty"`
+	ToolCallID       string               `json:"tool_call_id,omitempty"`
+	Audio            *ChatCompletionAudio `json:"audio,omitempty"`
+}
+
+type chatCompletionMessageSingleContent struct {
+	Role             string               `json:"role"`
+	Content          string               `json:"content,omitempty"`
+	Refusal          string               `json:"refusal,omitempty"`
+	MultiContent     []ChatMessagePart    `json:"-"`
+	Name             string               `json:"name,omitempty"`
+	ReasoningContent string               `json:"reasoning_content,omitempty"`
+	FunctionCall     *FunctionCall        `json:"function_call,omitempty"`
+	ToolCalls        []ToolCall           `json:"tool_calls,omitempty"`
+	ToolCallID       string               `json:"tool_call_id,omitempty"`
+	Audio            *ChatCompletionAudio `json:"audio,omitempty"`
 }
 
 func (m ChatCompletionMessage) MarshalJSON() ([]byte, error) {
 	if m.Content != "" && m.MultiContent != nil {
 		return nil, ErrContentFieldsMisused
 	}
 	if len(m.MultiContent) > 0 {
-		msg := struct {
-			Role             string            `json:"role"`
-			Content          string            `json:"-"`
-			Refusal          string            `json:"refusal,omitempty"`
-			MultiContent     []ChatMessagePart `json:"content,omitempty"`
-			Name             string            `json:"name,omitempty"`
-			ReasoningContent string            `json:"reasoning_content,omitempty"`
-			FunctionCall     *FunctionCall     `json:"function_call,omitempty"`
-			ToolCalls        []ToolCall        `json:"tool_calls,omitempty"`
-			ToolCallID       string            `json:"tool_call_id,omitempty"`
-		}(m)
+		msg := chatCompletionMessageMultiContent(m)
 		return json.Marshal(msg)
 	}
 
-	msg := struct {
-		Role             string            `json:"role"`
-		Content          string            `json:"content,omitempty"`
-		Refusal          string            `json:"refusal,omitempty"`
-		MultiContent     []ChatMessagePart `json:"-"`
-		Name             string            `json:"name,omitempty"`
-		ReasoningContent string            `json:"reasoning_content,omitempty"`
-		FunctionCall     *FunctionCall     `json:"function_call,omitempty"`
-		ToolCalls        []ToolCall        `json:"tool_calls,omitempty"`
-		ToolCallID       string            `json:"tool_call_id,omitempty"`
-	}(m)
+	msg := chatCompletionMessageSingleContent(m)
 	return json.Marshal(msg)
 }
 
 func (m *ChatCompletionMessage) UnmarshalJSON(bs []byte) error {
-	msg := struct {
-		Role             string `json:"role"`
-		Content          string `json:"content"`
-		Refusal          string `json:"refusal,omitempty"`
-		MultiContent     []ChatMessagePart
-		Name             string        `json:"name,omitempty"`
-		ReasoningContent string        `json:"reasoning_content,omitempty"`
-		FunctionCall     *FunctionCall `json:"function_call,omitempty"`
-		ToolCalls        []ToolCall    `json:"tool_calls,omitempty"`
-		ToolCallID       string        `json:"tool_call_id,omitempty"`
-	}{}
+	msg := chatCompletionMessageSingleContent{}
 
 	if err := json.Unmarshal(bs, &msg); err == nil {
 		*m = ChatCompletionMessage(msg)
 		return nil
 	}
-	multiMsg := struct {
-		Role             string `json:"role"`
-		Content          string
-		Refusal          string            `json:"refusal,omitempty"`
-		MultiContent     []ChatMessagePart `json:"content"`
-		Name             string            `json:"name,omitempty"`
-		ReasoningContent string            `json:"reasoning_content,omitempty"`
-		FunctionCall     *FunctionCall     `json:"function_call,omitempty"`
-		ToolCalls        []ToolCall        `json:"tool_calls,omitempty"`
-		ToolCallID       string            `json:"tool_call_id,omitempty"`
-	}{}
+	multiMsg := chatCompletionMessageMultiContent{}
 	if err := json.Unmarshal(bs, &multiMsg); err != nil {
 		return err
 	}
 	*m = ChatCompletionMessage(multiMsg)
 	return nil
 }
 
+type ChatCompletionAudio struct {
+	// Unique identifier for this audio response.
+	ID string `json:"id"`
+	// The Unix timestamp (in seconds) for when this audio response will no longer
+	// be accessible on the server for use in multi-turn conversations.
+	ExpiresAt int64 `json:"expires_at"`
+	// Base64 encoded audio bytes generated by the model, in the format specified in the request.
+	Data string `json:"data"`
+	// Transcript of the audio generated by the model.
+	Transcript string `json:"transcript"`
+}
+
 type ToolCall struct {
 	// Index is not nil only in chat completion chunk object
 	Index *int `json:"index,omitempty"`
@@ -331,6 +381,13 @@ type ChatCompletionRequest struct {
 	// We recommend hashing their username or email address, in order to avoid sending us any identifying information.
 	// https://platform.openai.com/docs/api-reference/chat/create#chat_create-safety_identifier
 	SafetyIdentifier string `json:"safety_identifier,omitempty"`
+	// Output types that you would like the model to generate for this request.
+	// Most models are capable of generating text, which is the default: ["text"]
+	// The gpt-4o-audio-preview model can also be used to generate audio.
+	// To request that this model generate both text and audio responses, you can use: ["text", "audio"]
+	Modalities []Modality `json:"modalities,omitempty"`
+	// Parameters for audio output. Required when audio output is requested with modalities: ["audio"]
+	Audio *AudioOutput `json:"audio,omitempty"`
 	// Embedded struct for non-OpenAI extensions
 	ChatCompletionRequestExtensions
 }
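
The chat.go changes above also add an audio input path (ChatMessagePartTypeInputAudio with ChatMessageAudio) that none of the tests in this commit exercise. A rough sketch of using it follows, again assuming the github.com/sashabaranov/go-openai import path and a local question.wav file; whether gpt-4o-audio-preview accepts a text-only modalities list alongside audio input is an assumption, not something the diff states.

// Sketch (not part of this commit): send local audio as an input_audio part.
// Assumes "question.wav" exists; per the ChatMessageAudio comment above,
// "wav" and "mp3" are the currently supported input formats.
package main

import (
	"context"
	"encoding/base64"
	"fmt"
	"log"
	"os"

	openai "github.com/sashabaranov/go-openai"
)

func main() {
	wav, err := os.ReadFile("question.wav")
	if err != nil {
		log.Fatal(err)
	}

	client := openai.NewClient(os.Getenv("OPENAI_TOKEN"))
	resp, err := client.CreateChatCompletion(context.Background(), openai.ChatCompletionRequest{
		Model: openai.GPT4oAudioPreview,
		// Text-only output; assumption: audio input does not require the audio modality.
		Modalities: []openai.Modality{openai.ModalityText},
		Messages: []openai.ChatCompletionMessage{
			{
				Role: openai.ChatMessageRoleUser,
				MultiContent: []openai.ChatMessagePart{
					{Type: openai.ChatMessagePartTypeText, Text: "What is said in this recording?"},
					{
						Type: openai.ChatMessagePartTypeInputAudio,
						InputAudio: &openai.ChatMessageAudio{
							Data:   base64.StdEncoding.EncodeToString(wav),
							Format: openai.AudioFormatWAV,
						},
					},
				},
			},
		},
	})
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(resp.Choices[0].Message.Content)
}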

chat_stream.go

Lines changed: 12 additions & 5 deletions
@@ -5,12 +5,19 @@ import (
 	"net/http"
 )
 
+type ChatCompletionStreamChoiceDeltaAudio struct {
+	ID         string `json:"id,omitempty"`
+	Transcript string `json:"transcript,omitempty"`
+	Data       string `json:"data,omitempty"`
+}
+
 type ChatCompletionStreamChoiceDelta struct {
-	Content      string        `json:"content,omitempty"`
-	Role         string        `json:"role,omitempty"`
-	FunctionCall *FunctionCall `json:"function_call,omitempty"`
-	ToolCalls    []ToolCall    `json:"tool_calls,omitempty"`
-	Refusal      string        `json:"refusal,omitempty"`
+	Content      string                                `json:"content,omitempty"`
+	Role         string                                `json:"role,omitempty"`
+	FunctionCall *FunctionCall                         `json:"function_call,omitempty"`
+	ToolCalls    []ToolCall                            `json:"tool_calls,omitempty"`
+	Refusal      string                                `json:"refusal,omitempty"`
+	Audio        *ChatCompletionStreamChoiceDeltaAudio `json:"audio,omitempty"`
 
 	// This property is used for the "reasoning" feature supported by deepseek-reasoner
 	// which is not in the official documentation.
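
The streaming integration test earlier only verifies usage and chunk counts and never reads the new Audio delta. The sketch below shows one plausible way to consume it, under the assumption that each chunk's Data field is an independently decodable base64 segment and that the transcript arrives piecewise; the diff defines these fields but does not document the chunking.

// Sketch (not part of this commit): accumulate streamed audio deltas.
// Assumes each delta.Audio.Data chunk is an independent base64 segment and
// that transcripts arrive piecewise; neither is asserted by the diff.
package main

import (
	"context"
	"encoding/base64"
	"errors"
	"io"
	"log"
	"os"

	openai "github.com/sashabaranov/go-openai"
)

func main() {
	client := openai.NewClient(os.Getenv("OPENAI_TOKEN"))
	stream, err := client.CreateChatCompletionStream(context.Background(), openai.ChatCompletionRequest{
		Model:      openai.GPT4oAudioPreview,
		Modalities: []openai.Modality{openai.ModalityText, openai.ModalityAudio},
		Audio:      &openai.AudioOutput{Voice: openai.AudioVoiceAlloy, Format: openai.AudioFormatPCM16},
		Messages:   []openai.ChatCompletionMessage{{Role: openai.ChatMessageRoleUser, Content: "hi"}},
	})
	if err != nil {
		log.Fatal(err)
	}
	defer stream.Close()

	var transcript string
	var pcm []byte
	for {
		resp, err := stream.Recv()
		if errors.Is(err, io.EOF) {
			break
		}
		if err != nil {
			log.Fatal(err)
		}
		if len(resp.Choices) == 0 || resp.Choices[0].Delta.Audio == nil {
			continue
		}
		delta := resp.Choices[0].Delta.Audio
		transcript += delta.Transcript
		if delta.Data != "" {
			chunk, decErr := base64.StdEncoding.DecodeString(delta.Data)
			if decErr != nil {
				log.Fatal(decErr)
			}
			pcm = append(pcm, chunk...)
		}
	}
	log.Printf("transcript: %q (%d bytes of pcm16 audio)", transcript, len(pcm))
}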
