
Commit 7ed083f

feat: support gpt-audio
1 parent 5d7a276 commit 7ed083f

8 files changed: 651 additions, 105 deletions


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -7,6 +7,7 @@
 
 # Test binary, built with `go test -c`
 *.test
+test.mp3
 
 # Output of the go coverage tool, specifically when used with LiteIDE
 *.out

api_integration_test.go

Lines changed: 86 additions & 0 deletions
@@ -108,6 +108,38 @@ func TestAPI(t *testing.T) {
 		},
 	)
 	checks.NoError(t, err, "CreateChatCompletion (with functions) returned error")
+
+	response, err := c.CreateChatCompletion(
+		ctx,
+		openai.ChatCompletionRequest{
+			Model: openai.GPT4oAudioPreview,
+			Messages: []openai.ChatCompletionMessage{
+				{
+					Role:    openai.ChatMessageRoleUser,
+					Content: "hi",
+				},
+			},
+			Audio: &openai.AudioOutput{
+				Voice:  openai.AudioVoiceAlloy,
+				Format: openai.AudioFormatPCM16,
+			},
+			Modalities: []openai.Modality{openai.ModalityText, openai.ModalityAudio},
+		},
+	)
+	checks.NoError(t, err, "CreateChatCompletion (with audio) returned error")
+	if response.Choices[0].Message.Audio == nil {
+		t.Fatal("Audio response is nil")
+	}
+	if len(response.Choices[0].Message.Audio.Data) == 0 {
+		t.Fatal("Audio response data is empty")
+	}
+	if response.Choices[0].Message.Audio.Transcript == "" {
+		t.Fatal("Audio response transcript is empty")
+	}
+	if response.Usage.PromptTokens == 0 || response.Usage.CompletionTokens == 0 || response.Usage.TotalTokens == 0 {
+		t.Fatal("Usage is zero")
+	}
+	t.Logf("Usage: %+v", response.Usage)
 }
 
 func TestCompletionStream(t *testing.T) {
@@ -145,6 +177,60 @@ func TestCompletionStream(t *testing.T) {
 	}
 }
 
+func TestChatCompletionStream(t *testing.T) {
+	apiToken := os.Getenv("OPENAI_TOKEN")
+	if apiToken == "" {
+		t.Skip("Skipping testing against production OpenAI API. Set OPENAI_TOKEN environment variable to enable it.")
+	}
+
+	c := openai.NewClient(apiToken)
+	ctx := context.Background()
+
+	stream, err := c.CreateChatCompletionStream(ctx, openai.ChatCompletionRequest{
+		Model: openai.GPT4oAudioPreview,
+		Messages: []openai.ChatCompletionMessage{
+			{
+				Role:    openai.ChatMessageRoleUser,
+				Content: "hi",
+			},
+		},
+		Audio: &openai.AudioOutput{
+			Voice:  openai.AudioVoiceAlloy,
+			Format: openai.AudioFormatPCM16,
+		},
+		Modalities: []openai.Modality{openai.ModalityText, openai.ModalityAudio},
+		StreamOptions: &openai.StreamOptions{
+			IncludeUsage: true,
+		},
+	})
+	checks.NoError(t, err, "CreateChatCompletionStream returned error")
+	defer stream.Close()
+
+	var usage *openai.Usage
+	counter := 0
+	for {
+		response, err := stream.Recv()
+		if err != nil {
+			if errors.Is(err, io.EOF) {
+				break
+			}
+			t.Errorf("Stream error: %v", err)
+		} else {
+			counter++
+		}
+		if response.Usage != nil {
+			usage = response.Usage
+			t.Logf("Usage: %+v", usage)
+		}
+	}
+	if counter == 0 {
+		t.Error("Stream did not return any responses")
+	}
+	if usage == nil {
+		t.Error("Usage is nil")
+	}
+}
+
 func TestAPIError(t *testing.T) {
 	apiToken := os.Getenv("OPENAI_TOKEN")
 	if apiToken == "" {
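
The integration test above checks that the audio response is populated but never consumes it. Below is a minimal sketch, not part of this commit, of how a caller might decode the returned base64 audio and save it; it assumes the upstream go-openai import path (github.com/sashabaranov/go-openai), and the test.mp3 output path is purely illustrative, echoing the new .gitignore entry.

// Sketch (not part of this commit): decode the base64 audio returned by
// CreateChatCompletion and write it to disk. The output path "test.mp3" is
// only illustrative and mirrors the new .gitignore entry.
package main

import (
	"context"
	"encoding/base64"
	"log"
	"os"

	openai "github.com/sashabaranov/go-openai"
)

func main() {
	client := openai.NewClient(os.Getenv("OPENAI_TOKEN"))
	resp, err := client.CreateChatCompletion(context.Background(), openai.ChatCompletionRequest{
		Model:      openai.GPT4oAudioPreview,
		Modalities: []openai.Modality{openai.ModalityText, openai.ModalityAudio},
		Audio:      &openai.AudioOutput{Voice: openai.AudioVoiceAlloy, Format: openai.AudioFormatMP3},
		Messages: []openai.ChatCompletionMessage{
			{Role: openai.ChatMessageRoleUser, Content: "hi"},
		},
	})
	if err != nil {
		log.Fatal(err)
	}
	audio := resp.Choices[0].Message.Audio
	// ChatCompletionAudio.Data is base64-encoded audio in the requested format.
	raw, err := base64.StdEncoding.DecodeString(audio.Data)
	if err != nil {
		log.Fatal(err)
	}
	if err := os.WriteFile("test.mp3", raw, 0o644); err != nil {
		log.Fatal(err)
	}
	log.Printf("wrote %d bytes; transcript: %q", len(raw), audio.Transcript)
}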

chat.go

Lines changed: 106 additions & 49 deletions
@@ -81,17 +81,66 @@ type ChatMessageImageURL struct {
 	Detail ImageURLDetail `json:"detail,omitempty"`
 }
 
+type AudioVoice string
+
+const (
+	AudioVoiceAlloy   AudioVoice = "alloy"
+	AudioVoiceAsh     AudioVoice = "ash"
+	AudioVoiceBallad  AudioVoice = "ballad"
+	AudioVoiceCoral   AudioVoice = "coral"
+	AudioVoiceEcho    AudioVoice = "echo"
+	AudioVoiceFable   AudioVoice = "fable"
+	AudioVoiceNova    AudioVoice = "nova"
+	AudioVoiceOnyx    AudioVoice = "onyx"
+	AudioVoiceSage    AudioVoice = "sage"
+	AudioVoiceShimmer AudioVoice = "shimmer"
+	AudioVoiceVerse   AudioVoice = "verse"
+)
+
+type AudioFormat string
+
+const (
+	AudioFormatWAV   AudioFormat = "wav"
+	AudioFormatMP3   AudioFormat = "mp3"
+	AudioFormatFLAC  AudioFormat = "flac"
+	AudioFormatOPUS  AudioFormat = "opus"
+	AudioFormatPCM16 AudioFormat = "pcm16"
+)
+
+type ChatMessageAudio struct {
+	// Base64 encoded audio data.
+	Data string `json:"data,omitempty"`
+	// The format of the encoded audio data. Currently supports "wav" and "mp3".
+	Format AudioFormat `json:"format,omitempty"`
+}
+
+type Modality string
+
+const (
+	ModalityAudio Modality = "audio"
+	ModalityText  Modality = "text"
+)
+
+type AudioOutput struct {
+	// The voice the model uses to respond. Supported voices are alloy, ash, ballad, coral, echo, sage, shimmer, and verse.
+	Voice AudioVoice `json:"voice"`
+	// Specifies the output audio format. Must be one of wav, mp3, flac, opus, or pcm16.
+	Format AudioFormat `json:"format"`
+}
+
 type ChatMessagePartType string
 
 const (
-	ChatMessagePartTypeText     ChatMessagePartType = "text"
-	ChatMessagePartTypeImageURL ChatMessagePartType = "image_url"
+	ChatMessagePartTypeText       ChatMessagePartType = "text"
+	ChatMessagePartTypeImageURL   ChatMessagePartType = "image_url"
+	ChatMessagePartTypeInputAudio ChatMessagePartType = "input_audio"
 )
 
 type ChatMessagePart struct {
-	Type     ChatMessagePartType  `json:"type,omitempty"`
-	Text     string               `json:"text,omitempty"`
-	ImageURL *ChatMessageImageURL `json:"image_url,omitempty"`
+	Type       ChatMessagePartType  `json:"type,omitempty"`
+	Text       string               `json:"text,omitempty"`
+	ImageURL   *ChatMessageImageURL `json:"image_url,omitempty"`
+	InputAudio *ChatMessageAudio    `json:"input_audio,omitempty"`
 }
 
 type ChatCompletionMessage struct {
@@ -119,76 +168,77 @@ type ChatCompletionMessage struct {
 
 	// For Role=tool prompts this should be set to the ID given in the assistant's prior request to call a tool.
 	ToolCallID string `json:"tool_call_id,omitempty"`
+
+	// If the audio output modality is requested, this object contains data about the audio response from the model.
+	Audio *ChatCompletionAudio `json:"audio,omitempty"`
+}
+
+type chatCompletionMessageMultiContent struct {
+	Role             string               `json:"role"`
+	Content          string               `json:"-"`
+	Refusal          string               `json:"refusal,omitempty"`
+	MultiContent     []ChatMessagePart    `json:"content,omitempty"`
+	Name             string               `json:"name,omitempty"`
+	ReasoningContent string               `json:"reasoning_content,omitempty"`
+	FunctionCall     *FunctionCall        `json:"function_call,omitempty"`
+	ToolCalls        []ToolCall           `json:"tool_calls,omitempty"`
+	ToolCallID       string               `json:"tool_call_id,omitempty"`
+	Audio            *ChatCompletionAudio `json:"audio,omitempty"`
+}
+
+type chatCompletionMessageSingleContent struct {
+	Role             string               `json:"role"`
+	Content          string               `json:"content,omitempty"`
+	Refusal          string               `json:"refusal,omitempty"`
+	MultiContent     []ChatMessagePart    `json:"-"`
+	Name             string               `json:"name,omitempty"`
+	ReasoningContent string               `json:"reasoning_content,omitempty"`
+	FunctionCall     *FunctionCall        `json:"function_call,omitempty"`
+	ToolCalls        []ToolCall           `json:"tool_calls,omitempty"`
+	ToolCallID       string               `json:"tool_call_id,omitempty"`
+	Audio            *ChatCompletionAudio `json:"audio,omitempty"`
 }
 
 func (m ChatCompletionMessage) MarshalJSON() ([]byte, error) {
 	if m.Content != "" && m.MultiContent != nil {
 		return nil, ErrContentFieldsMisused
 	}
 	if len(m.MultiContent) > 0 {
-		msg := struct {
-			Role             string            `json:"role"`
-			Content          string            `json:"-"`
-			Refusal          string            `json:"refusal,omitempty"`
-			MultiContent     []ChatMessagePart `json:"content,omitempty"`
-			Name             string            `json:"name,omitempty"`
-			ReasoningContent string            `json:"reasoning_content,omitempty"`
-			FunctionCall     *FunctionCall     `json:"function_call,omitempty"`
-			ToolCalls        []ToolCall        `json:"tool_calls,omitempty"`
-			ToolCallID       string            `json:"tool_call_id,omitempty"`
-		}(m)
+		msg := chatCompletionMessageMultiContent(m)
 		return json.Marshal(msg)
 	}
 
-	msg := struct {
-		Role             string            `json:"role"`
-		Content          string            `json:"content,omitempty"`
-		Refusal          string            `json:"refusal,omitempty"`
-		MultiContent     []ChatMessagePart `json:"-"`
-		Name             string            `json:"name,omitempty"`
-		ReasoningContent string            `json:"reasoning_content,omitempty"`
-		FunctionCall     *FunctionCall     `json:"function_call,omitempty"`
-		ToolCalls        []ToolCall        `json:"tool_calls,omitempty"`
-		ToolCallID       string            `json:"tool_call_id,omitempty"`
-	}(m)
+	msg := chatCompletionMessageSingleContent(m)
 	return json.Marshal(msg)
 }
 
 func (m *ChatCompletionMessage) UnmarshalJSON(bs []byte) error {
-	msg := struct {
-		Role             string `json:"role"`
-		Content          string `json:"content"`
-		Refusal          string `json:"refusal,omitempty"`
-		MultiContent     []ChatMessagePart
-		Name             string        `json:"name,omitempty"`
-		ReasoningContent string        `json:"reasoning_content,omitempty"`
-		FunctionCall     *FunctionCall `json:"function_call,omitempty"`
-		ToolCalls        []ToolCall    `json:"tool_calls,omitempty"`
-		ToolCallID       string        `json:"tool_call_id,omitempty"`
-	}{}
+	msg := chatCompletionMessageSingleContent{}
 
 	if err := json.Unmarshal(bs, &msg); err == nil {
 		*m = ChatCompletionMessage(msg)
 		return nil
 	}
-	multiMsg := struct {
-		Role             string `json:"role"`
-		Content          string
-		Refusal          string            `json:"refusal,omitempty"`
-		MultiContent     []ChatMessagePart `json:"content"`
-		Name             string            `json:"name,omitempty"`
-		ReasoningContent string            `json:"reasoning_content,omitempty"`
-		FunctionCall     *FunctionCall     `json:"function_call,omitempty"`
-		ToolCalls        []ToolCall        `json:"tool_calls,omitempty"`
-		ToolCallID       string            `json:"tool_call_id,omitempty"`
-	}{}
+	multiMsg := chatCompletionMessageMultiContent{}
 	if err := json.Unmarshal(bs, &multiMsg); err != nil {
 		return err
 	}
 	*m = ChatCompletionMessage(multiMsg)
 	return nil
 }
 
+type ChatCompletionAudio struct {
+	// Unique identifier for this audio response.
+	ID string `json:"id"`
+	// The Unix timestamp (in seconds) for when this audio response will no longer
+	// be accessible on the server for use in multi-turn conversations.
+	ExpiresAt int64 `json:"expires_at"`
+	// Base64 encoded audio bytes generated by the model, in the format specified in the request.
+	Data string `json:"data"`
+	// Transcript of the audio generated by the model.
+	Transcript string `json:"transcript"`
+}
+
 type ToolCall struct {
 	// Index is not nil only in chat completion chunk object
 	Index *int `json:"index,omitempty"`
@@ -331,6 +381,13 @@ type ChatCompletionRequest struct {
 	// We recommend hashing their username or email address, in order to avoid sending us any identifying information.
 	// https://platform.openai.com/docs/api-reference/chat/create#chat_create-safety_identifier
 	SafetyIdentifier string `json:"safety_identifier,omitempty"`
+	// Output types that you would like the model to generate for this request.
+	// Most models are capable of generating text, which is the default: ["text"]
+	// The gpt-4o-audio-preview model can also be used to generate audio.
+	// To request that this model generate both text and audio responses, you can use: ["text", "audio"]
+	Modalities []Modality `json:"modalities,omitempty"`
+	// Parameters for audio output. Required when audio output is requested with modalities: ["audio"]
+	Audio *AudioOutput `json:"audio,omitempty"`
 	// Embedded struct for non-OpenAI extensions
 	ChatCompletionRequestExtensions
 }
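
The chat.go changes above also add an audio input path (ChatMessagePartTypeInputAudio with ChatMessageAudio) that none of the tests in this commit exercise. A rough sketch of using it follows, again assuming the github.com/sashabaranov/go-openai import path and a local question.wav file; whether gpt-4o-audio-preview accepts a text-only modalities list alongside audio input is an assumption, not something the diff states.

// Sketch (not part of this commit): send local audio as an input_audio part.
// Assumes "question.wav" exists; per the ChatMessageAudio comment above,
// "wav" and "mp3" are the currently supported input formats.
package main

import (
	"context"
	"encoding/base64"
	"fmt"
	"log"
	"os"

	openai "github.com/sashabaranov/go-openai"
)

func main() {
	wav, err := os.ReadFile("question.wav")
	if err != nil {
		log.Fatal(err)
	}

	client := openai.NewClient(os.Getenv("OPENAI_TOKEN"))
	resp, err := client.CreateChatCompletion(context.Background(), openai.ChatCompletionRequest{
		Model: openai.GPT4oAudioPreview,
		// Text-only output; assumption: audio input does not require the audio modality.
		Modalities: []openai.Modality{openai.ModalityText},
		Messages: []openai.ChatCompletionMessage{
			{
				Role: openai.ChatMessageRoleUser,
				MultiContent: []openai.ChatMessagePart{
					{Type: openai.ChatMessagePartTypeText, Text: "What is said in this recording?"},
					{
						Type: openai.ChatMessagePartTypeInputAudio,
						InputAudio: &openai.ChatMessageAudio{
							Data:   base64.StdEncoding.EncodeToString(wav),
							Format: openai.AudioFormatWAV,
						},
					},
				},
			},
		},
	})
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(resp.Choices[0].Message.Content)
}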

chat_stream.go

Lines changed: 12 additions & 5 deletions
@@ -5,12 +5,19 @@ import (
 	"net/http"
 )
 
+type ChatCompletionStreamChoiceDeltaAudio struct {
+	ID         string `json:"id,omitempty"`
+	Transcript string `json:"transcript,omitempty"`
+	Data       string `json:"data,omitempty"`
+}
+
 type ChatCompletionStreamChoiceDelta struct {
-	Content      string        `json:"content,omitempty"`
-	Role         string        `json:"role,omitempty"`
-	FunctionCall *FunctionCall `json:"function_call,omitempty"`
-	ToolCalls    []ToolCall    `json:"tool_calls,omitempty"`
-	Refusal      string        `json:"refusal,omitempty"`
+	Content      string                                `json:"content,omitempty"`
+	Role         string                                `json:"role,omitempty"`
+	FunctionCall *FunctionCall                         `json:"function_call,omitempty"`
+	ToolCalls    []ToolCall                            `json:"tool_calls,omitempty"`
+	Refusal      string                                `json:"refusal,omitempty"`
+	Audio        *ChatCompletionStreamChoiceDeltaAudio `json:"audio,omitempty"`
 
 	// This property is used for the "reasoning" feature supported by deepseek-reasoner
 	// which is not in the official documentation.
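
The streaming integration test earlier only verifies usage and chunk counts and never reads the new Audio delta. The sketch below shows one plausible way to consume it, under the assumption that each chunk's Data field is an independently decodable base64 segment and that the transcript arrives piecewise; the diff defines these fields but does not document the chunking.

// Sketch (not part of this commit): accumulate streamed audio deltas.
// Assumes each delta.Audio.Data chunk is an independent base64 segment and
// that transcripts arrive piecewise; neither is asserted by the diff.
package main

import (
	"context"
	"encoding/base64"
	"errors"
	"io"
	"log"
	"os"

	openai "github.com/sashabaranov/go-openai"
)

func main() {
	client := openai.NewClient(os.Getenv("OPENAI_TOKEN"))
	stream, err := client.CreateChatCompletionStream(context.Background(), openai.ChatCompletionRequest{
		Model:      openai.GPT4oAudioPreview,
		Modalities: []openai.Modality{openai.ModalityText, openai.ModalityAudio},
		Audio:      &openai.AudioOutput{Voice: openai.AudioVoiceAlloy, Format: openai.AudioFormatPCM16},
		Messages:   []openai.ChatCompletionMessage{{Role: openai.ChatMessageRoleUser, Content: "hi"}},
	})
	if err != nil {
		log.Fatal(err)
	}
	defer stream.Close()

	var transcript string
	var pcm []byte
	for {
		resp, err := stream.Recv()
		if errors.Is(err, io.EOF) {
			break
		}
		if err != nil {
			log.Fatal(err)
		}
		if len(resp.Choices) == 0 || resp.Choices[0].Delta.Audio == nil {
			continue
		}
		delta := resp.Choices[0].Delta.Audio
		transcript += delta.Transcript
		if delta.Data != "" {
			chunk, decErr := base64.StdEncoding.DecodeString(delta.Data)
			if decErr != nil {
				log.Fatal(decErr)
			}
			pcm = append(pcm, chunk...)
		}
	}
	log.Printf("transcript: %q (%d bytes of pcm16 audio)", transcript, len(pcm))
}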
