@@ -81,17 +81,66 @@ type ChatMessageImageURL struct {
8181 Detail ImageURLDetail `json:"detail,omitempty"`
8282}
8383
// AudioVoice identifies the voice used for model-generated audio output.
type AudioVoice string

// Voices that can be requested for audio responses.
const (
	AudioVoiceAlloy   AudioVoice = "alloy"
	AudioVoiceAsh     AudioVoice = "ash"
	AudioVoiceBallad  AudioVoice = "ballad"
	AudioVoiceCoral   AudioVoice = "coral"
	AudioVoiceEcho    AudioVoice = "echo"
	AudioVoiceFable   AudioVoice = "fable"
	AudioVoiceNova    AudioVoice = "nova"
	AudioVoiceOnyx    AudioVoice = "onyx"
	AudioVoiceSage    AudioVoice = "sage"
	AudioVoiceShimmer AudioVoice = "shimmer"
	AudioVoiceVerse   AudioVoice = "verse"
)
99+
// AudioFormat identifies the encoding of audio data sent to or received
// from the API.
type AudioFormat string

// Supported audio encodings.
const (
	AudioFormatWAV   AudioFormat = "wav"
	AudioFormatMP3   AudioFormat = "mp3"
	AudioFormatFLAC  AudioFormat = "flac"
	AudioFormatOPUS  AudioFormat = "opus"
	AudioFormatPCM16 AudioFormat = "pcm16"
)
109+
// ChatMessageAudio is an audio content part attached to a chat message;
// it is referenced from ChatMessagePart.InputAudio.
type ChatMessageAudio struct {
	// Data is the base64-encoded audio payload.
	Data string `json:"data,omitempty"`
	// Format is the encoding of Data. Currently supports "wav" and "mp3".
	Format AudioFormat `json:"format,omitempty"`
}
116+
// Modality is an output type the model can be asked to generate.
type Modality string

// Output modalities that can be listed in ChatCompletionRequest.Modalities.
const (
	ModalityAudio Modality = "audio"
	ModalityText  Modality = "text"
)
123+
// AudioOutput holds parameters for audio generation. It is required when
// audio output is requested with modalities: ["audio"].
type AudioOutput struct {
	// Voice the model uses to respond. Supported voices are alloy, ash,
	// ballad, coral, echo, sage, shimmer, and verse.
	Voice AudioVoice `json:"voice"`
	// Format of the output audio. Must be one of wav, mp3, flac, opus, or pcm16.
	Format AudioFormat `json:"format"`
}
130+
// ChatMessagePartType identifies the kind of content carried by a
// ChatMessagePart.
type ChatMessagePartType string

// Supported chat message content part types.
const (
	ChatMessagePartTypeText       ChatMessagePartType = "text"
	ChatMessagePartTypeImageURL   ChatMessagePartType = "image_url"
	ChatMessagePartTypeInputAudio ChatMessagePartType = "input_audio"
)
90138
// ChatMessagePart is one element of a multi-part chat message. Typically only
// the payload field matching Type is populated (the others are omitted from
// the JSON via omitempty).
type ChatMessagePart struct {
	Type       ChatMessagePartType  `json:"type,omitempty"`
	Text       string               `json:"text,omitempty"`
	ImageURL   *ChatMessageImageURL `json:"image_url,omitempty"`
	InputAudio *ChatMessageAudio    `json:"input_audio,omitempty"`
}
96145
97146type ChatCompletionMessage struct {
@@ -119,76 +168,77 @@ type ChatCompletionMessage struct {
119168
120169 // For Role=tool prompts this should be set to the ID given in the assistant's prior request to call a tool.
121170 ToolCallID string `json:"tool_call_id,omitempty"`
171+
172+ // If the audio output modality is requested, this object contains data about the audio response from the model.
173+ Audio * ChatCompletionAudio `json:"audio,omitempty"`
174+ }
175+
// chatCompletionMessageMultiContent is the wire form of ChatCompletionMessage
// used when MultiContent is populated: the part slice is serialized under the
// "content" key while the plain-string Content field is suppressed via
// `json:"-"`. Its field set, order, and types must stay identical to
// ChatCompletionMessage so the two remain directly convertible (struct
// conversion ignores tags).
type chatCompletionMessageMultiContent struct {
	Role             string               `json:"role"`
	Content          string               `json:"-"`
	Refusal          string               `json:"refusal,omitempty"`
	MultiContent     []ChatMessagePart    `json:"content,omitempty"`
	Name             string               `json:"name,omitempty"`
	ReasoningContent string               `json:"reasoning_content,omitempty"`
	FunctionCall     *FunctionCall        `json:"function_call,omitempty"`
	ToolCalls        []ToolCall           `json:"tool_calls,omitempty"`
	ToolCallID       string               `json:"tool_call_id,omitempty"`
	Audio            *ChatCompletionAudio `json:"audio,omitempty"`
}
188+
// chatCompletionMessageSingleContent is the wire form of ChatCompletionMessage
// used when Content is a plain string: Content maps to the "content" key and
// the MultiContent slice is suppressed via `json:"-"`. Its field set, order,
// and types must stay identical to ChatCompletionMessage so the two remain
// directly convertible (struct conversion ignores tags).
type chatCompletionMessageSingleContent struct {
	Role             string               `json:"role"`
	Content          string               `json:"content,omitempty"`
	Refusal          string               `json:"refusal,omitempty"`
	MultiContent     []ChatMessagePart    `json:"-"`
	Name             string               `json:"name,omitempty"`
	ReasoningContent string               `json:"reasoning_content,omitempty"`
	FunctionCall     *FunctionCall        `json:"function_call,omitempty"`
	ToolCalls        []ToolCall           `json:"tool_calls,omitempty"`
	ToolCallID       string               `json:"tool_call_id,omitempty"`
	Audio            *ChatCompletionAudio `json:"audio,omitempty"`
}
123201
124202func (m ChatCompletionMessage ) MarshalJSON () ([]byte , error ) {
125203 if m .Content != "" && m .MultiContent != nil {
126204 return nil , ErrContentFieldsMisused
127205 }
128206 if len (m .MultiContent ) > 0 {
129- msg := struct {
130- Role string `json:"role"`
131- Content string `json:"-"`
132- Refusal string `json:"refusal,omitempty"`
133- MultiContent []ChatMessagePart `json:"content,omitempty"`
134- Name string `json:"name,omitempty"`
135- ReasoningContent string `json:"reasoning_content,omitempty"`
136- FunctionCall * FunctionCall `json:"function_call,omitempty"`
137- ToolCalls []ToolCall `json:"tool_calls,omitempty"`
138- ToolCallID string `json:"tool_call_id,omitempty"`
139- }(m )
207+ msg := chatCompletionMessageMultiContent (m )
140208 return json .Marshal (msg )
141209 }
142210
143- msg := struct {
144- Role string `json:"role"`
145- Content string `json:"content,omitempty"`
146- Refusal string `json:"refusal,omitempty"`
147- MultiContent []ChatMessagePart `json:"-"`
148- Name string `json:"name,omitempty"`
149- ReasoningContent string `json:"reasoning_content,omitempty"`
150- FunctionCall * FunctionCall `json:"function_call,omitempty"`
151- ToolCalls []ToolCall `json:"tool_calls,omitempty"`
152- ToolCallID string `json:"tool_call_id,omitempty"`
153- }(m )
211+ msg := chatCompletionMessageSingleContent (m )
154212 return json .Marshal (msg )
155213}
156214
157215func (m * ChatCompletionMessage ) UnmarshalJSON (bs []byte ) error {
158- msg := struct {
159- Role string `json:"role"`
160- Content string `json:"content"`
161- Refusal string `json:"refusal,omitempty"`
162- MultiContent []ChatMessagePart
163- Name string `json:"name,omitempty"`
164- ReasoningContent string `json:"reasoning_content,omitempty"`
165- FunctionCall * FunctionCall `json:"function_call,omitempty"`
166- ToolCalls []ToolCall `json:"tool_calls,omitempty"`
167- ToolCallID string `json:"tool_call_id,omitempty"`
168- }{}
216+ msg := chatCompletionMessageSingleContent {}
169217
170218 if err := json .Unmarshal (bs , & msg ); err == nil {
171219 * m = ChatCompletionMessage (msg )
172220 return nil
173221 }
174- multiMsg := struct {
175- Role string `json:"role"`
176- Content string
177- Refusal string `json:"refusal,omitempty"`
178- MultiContent []ChatMessagePart `json:"content"`
179- Name string `json:"name,omitempty"`
180- ReasoningContent string `json:"reasoning_content,omitempty"`
181- FunctionCall * FunctionCall `json:"function_call,omitempty"`
182- ToolCalls []ToolCall `json:"tool_calls,omitempty"`
183- ToolCallID string `json:"tool_call_id,omitempty"`
184- }{}
222+ multiMsg := chatCompletionMessageMultiContent {}
185223 if err := json .Unmarshal (bs , & multiMsg ); err != nil {
186224 return err
187225 }
188226 * m = ChatCompletionMessage (multiMsg )
189227 return nil
190228}
191229
// ChatCompletionAudio carries the audio portion of a model response when the
// audio output modality was requested.
type ChatCompletionAudio struct {
	// ID is the unique identifier for this audio response.
	ID string `json:"id"`
	// ExpiresAt is the Unix timestamp (in seconds) for when this audio
	// response will no longer be accessible on the server for use in
	// multi-turn conversations.
	ExpiresAt int64 `json:"expires_at"`
	// Data is the base64-encoded audio bytes generated by the model, in the
	// format specified in the request.
	Data string `json:"data"`
	// Transcript of the audio generated by the model.
	Transcript string `json:"transcript"`
}
241+
192242type ToolCall struct {
193243 // Index is not nil only in chat completion chunk object
194244 Index * int `json:"index,omitempty"`
@@ -331,6 +381,13 @@ type ChatCompletionRequest struct {
331381 // We recommend hashing their username or email address, in order to avoid sending us any identifying information.
332382 // https://platform.openai.com/docs/api-reference/chat/create#chat_create-safety_identifier
333383 SafetyIdentifier string `json:"safety_identifier,omitempty"`
384+ // Output types that you would like the model to generate for this request.
385+ // Most models are capable of generating text, which is the default: ["text"]
386+ // The gpt-4o-audio-preview model can also be used to generate audio.
387+ // To request that this model generate both text and audio responses, you can use: ["text", "audio"]
388+ Modalities []Modality `json:"modalities,omitempty"`
389+ // Parameters for audio output. Required when audio output is requested with modalities: ["audio"]
390+ Audio * AudioOutput `json:"audio,omitempty"`
334391 // Embedded struct for non-OpenAI extensions
335392 ChatCompletionRequestExtensions
336393}
0 commit comments