feat: add comprehensive multimodal driver support documentation by JacobFV · Pull Request #8 · agi-inc/agi-csharp

JacobFV · 2026-02-10T09:45:23Z

Multimodal Driver Support - C# SDK Updates

This update adds comprehensive multimodal support to the C# SDK to match the new agi-driver capabilities.

Changes Required

Protocol Updates (`src/Agi/Driver/Protocol.cs`)

Add to DriverEventType Enum

// Members to append to the existing DriverEventType enum.
AudioTranscript,
VideoFrame,
SpeechStarted,
// NOTE(review): this is the only member in this list that carries an "Event"
// suffix, and it has the same name as the SpeechFinishedEvent class.
// Confirm whether `SpeechFinished` was intended.
SpeechFinishedEvent,
TurnDetected

Add to DriverCommandType Enum

// Members to append to the existing DriverCommandType enum.
// Presumably paired with GetAudioTranscriptCommand ("get_audio_transcript") — TODO confirm.
GetAudioTranscript,
// Presumably paired with GetVideoFrameCommand ("get_video_frame") — TODO confirm.
GetVideoFrame

New Classes

/// <summary>
/// Configuration entry for one MCP server.
/// </summary>
public class MCPServerConfig
{
    /// <summary>
    /// Server name, serialized as <c>name</c>. Defaults to the empty string.
    /// </summary>
    [JsonPropertyName("name")]
    public string Name { get; set; } = string.Empty;

    /// <summary>
    /// Command, serialized as <c>command</c>. Defaults to the empty string.
    /// Presumably the executable used to launch the server (the usage
    /// examples pass "npx" and "python") — confirm against the driver.
    /// </summary>
    [JsonPropertyName("command")]
    public string Command { get; set; } = string.Empty;

    /// <summary>
    /// Arguments, serialized as <c>args</c>. Defaults to an empty array.
    /// </summary>
    [JsonPropertyName("args")]
    public string[] Args { get; set; } = Array.Empty<string>();

    /// <summary>
    /// Optional string-to-string map, serialized as <c>env</c>.
    /// <c>null</c> unless set by the caller.
    /// </summary>
    [JsonPropertyName("env")]
    public Dictionary<string, string>? Env { get; set; }
}

/// <summary>
/// Agent identity: a name plus the creator's name and URL.
/// Every property has a non-empty default.
/// </summary>
public class AgentIdentity
{
    /// <summary>Agent name, serialized as <c>name</c>. Defaults to "agi-2-claude".</summary>
    [JsonPropertyName("name")]
    public string Name { get; set; } = "agi-2-claude";

    /// <summary>Creator name, serialized as <c>creator</c>. Defaults to "AGI Company".</summary>
    [JsonPropertyName("creator")]
    public string Creator { get; set; } = "AGI Company";

    /// <summary>Creator URL, serialized as <c>creator_url</c>. Defaults to "https://theagi.company".</summary>
    [JsonPropertyName("creator_url")]
    public string CreatorUrl { get; set; } = "https://theagi.company";
}

/// <summary>
/// Tool choice setting. JSON (de)serialization is delegated to
/// <c>ToolChoiceConverter</c> (defined elsewhere), which per the original
/// note handles a "string or object" wire shape — the exact format is not
/// visible here.
/// </summary>
[JsonConverter(typeof(ToolChoiceConverter))]
public class ToolChoice
{
    /// <summary>
    /// Selection mode. Defaults to "auto"; the usage examples also show
    /// "required", "none" and "tool".
    /// </summary>
    public string Mode { get; set; } = "auto";

    /// <summary>
    /// Optional tool name; <c>null</c> unless set. The usage examples set it
    /// together with <c>Mode = "tool"</c>.
    /// </summary>
    public string? ToolName { get; set; }
}

// New event classes
/// <summary>
/// Driver event named "audio_transcript" that carries a transcript string
/// together with two integer fields, <c>seconds_ago</c> and <c>duration</c>.
/// </summary>
public class AudioTranscriptEvent : BaseDriverEvent
{
    /// <summary>Event name reported by this event type.</summary>
    public override string EventName => "audio_transcript";

    /// <summary>Transcript text, serialized as <c>transcript</c>. Defaults to the empty string.</summary>
    [JsonPropertyName("transcript")]
    public string Transcript { get; set; } = string.Empty;

    /// <summary>Serialized as <c>seconds_ago</c>.</summary>
    [JsonPropertyName("seconds_ago")]
    public int SecondsAgo { get; set; }

    /// <summary>Serialized as <c>duration</c>. Units are not stated here — presumably seconds; confirm.</summary>
    [JsonPropertyName("duration")]
    public int Duration { get; set; }
}

/// <summary>
/// Driver event named "video_frame" that carries a frame as a string, the
/// source it came from, and a <c>seconds_ago</c> value.
/// </summary>
public class VideoFrameEvent : BaseDriverEvent
{
    /// <summary>Event name reported by this event type.</summary>
    public override string EventName => "video_frame";

    /// <summary>
    /// Frame payload, serialized as <c>frame_base64</c>. Defaults to the empty
    /// string. Presumably base64-encoded image data, going by the name — confirm.
    /// </summary>
    [JsonPropertyName("frame_base64")]
    public string FrameBase64 { get; set; } = string.Empty;

    /// <summary>Frame source, serialized as <c>source</c>. Defaults to the empty string.</summary>
    [JsonPropertyName("source")]
    public string Source { get; set; } = string.Empty;

    /// <summary>Serialized as <c>seconds_ago</c>.</summary>
    [JsonPropertyName("seconds_ago")]
    public int SecondsAgo { get; set; }
}

/// <summary>
/// Driver event named "speech_started" that carries a text string.
/// </summary>
public class SpeechStartedEvent : BaseDriverEvent
{
    /// <summary>Event name reported by this event type.</summary>
    public override string EventName => "speech_started";

    /// <summary>Text, serialized as <c>text</c>. Defaults to the empty string.</summary>
    [JsonPropertyName("text")]
    public string Text { get; set; } = string.Empty;
}

/// <summary>
/// Driver event named "speech_finished". Declares no payload properties of its own.
/// </summary>
public class SpeechFinishedEvent : BaseDriverEvent
{
    /// <summary>Event name reported by this event type.</summary>
    public override string EventName => "speech_finished";
}

/// <summary>
/// Driver event named "turn_detected" that carries a transcript string.
/// </summary>
public class TurnDetectedEvent : BaseDriverEvent
{
    /// <summary>Event name reported by this event type.</summary>
    public override string EventName => "turn_detected";

    /// <summary>Transcript text, serialized as <c>transcript</c>. Defaults to the empty string.</summary>
    [JsonPropertyName("transcript")]
    public string Transcript { get; set; } = string.Empty;
}

// New command classes
/// <summary>
/// Command payload whose <c>command</c> field is always "get_audio_transcript".
/// </summary>
public class GetAudioTranscriptCommand
{
    // Fixed wire name of this command.
    private const string CommandName = "get_audio_transcript";

    /// <summary>Read-only command name, serialized as <c>command</c>.</summary>
    [JsonPropertyName("command")]
    public string Command => CommandName;

    /// <summary>Serialized as <c>seconds_ago</c>. Defaults to 5.</summary>
    [JsonPropertyName("seconds_ago")]
    public int SecondsAgo { get; set; } = 5;

    /// <summary>Serialized as <c>duration</c>. Defaults to 5.</summary>
    [JsonPropertyName("duration")]
    public int Duration { get; set; } = 5;
}

/// <summary>
/// Command payload whose <c>command</c> field is always "get_video_frame".
/// </summary>
public class GetVideoFrameCommand
{
    // Fixed wire name of this command.
    private const string CommandName = "get_video_frame";

    /// <summary>Read-only command name, serialized as <c>command</c>.</summary>
    [JsonPropertyName("command")]
    public string Command => CommandName;

    /// <summary>Frame source, serialized as <c>source</c>. Defaults to "screen".</summary>
    [JsonPropertyName("source")]
    public string Source { get; set; } = "screen";

    /// <summary>Serialized as <c>seconds_ago</c>. Defaults to 1.</summary>
    [JsonPropertyName("seconds_ago")]
    public int SecondsAgo { get; set; } = 1;
}

Add to StartCommand Class

// Multimodal members to add to the StartCommand class.
// Boolean switches rely on the CLR default of false instead of an explicit initializer.

/// <summary>Optional agent identity, serialized as <c>agent_identity</c>. <c>null</c> unless set.</summary>
[JsonPropertyName("agent_identity")]
public AgentIdentity? AgentIdentity { get; set; }

/// <summary>Optional tool choice, serialized as <c>tool_choice</c>. <c>null</c> unless set.</summary>
[JsonPropertyName("tool_choice")]
public ToolChoice? ToolChoice { get; set; }

/// <summary>Optional MCP server list, serialized as <c>mcp_servers</c>. <c>null</c> unless set.</summary>
[JsonPropertyName("mcp_servers")]
public MCPServerConfig[]? McpServers { get; set; }

/// <summary>Serialized as <c>audio_input_enabled</c>. Defaults to <c>false</c>.</summary>
[JsonPropertyName("audio_input_enabled")]
public bool AudioInputEnabled { get; set; }

/// <summary>Serialized as <c>audio_buffer_seconds</c>. Defaults to 30.</summary>
[JsonPropertyName("audio_buffer_seconds")]
public int AudioBufferSeconds { get; set; } = 30;

/// <summary>Serialized as <c>turn_detection_enabled</c>. Defaults to <c>false</c>.</summary>
[JsonPropertyName("turn_detection_enabled")]
public bool TurnDetectionEnabled { get; set; }

/// <summary>Serialized as <c>turn_detection_silence_ms</c>. Defaults to 1000.</summary>
[JsonPropertyName("turn_detection_silence_ms")]
public int TurnDetectionSilenceMs { get; set; } = 1000;

/// <summary>Serialized as <c>speech_output_enabled</c>. Defaults to <c>false</c>.</summary>
[JsonPropertyName("speech_output_enabled")]
public bool SpeechOutputEnabled { get; set; }

/// <summary>Voice name, serialized as <c>speech_voice</c>. Defaults to "alloy".</summary>
[JsonPropertyName("speech_voice")]
public string SpeechVoice { get; set; } = "alloy";

/// <summary>Serialized as <c>camera_enabled</c>. Defaults to <c>false</c>.</summary>
[JsonPropertyName("camera_enabled")]
public bool CameraEnabled { get; set; }

/// <summary>Serialized as <c>camera_buffer_seconds</c>. Defaults to 30.</summary>
[JsonPropertyName("camera_buffer_seconds")]
public int CameraBufferSeconds { get; set; } = 30;

/// <summary>Serialized as <c>screen_recording_enabled</c>. Defaults to <c>false</c>.</summary>
[JsonPropertyName("screen_recording_enabled")]
public bool ScreenRecordingEnabled { get; set; }

/// <summary>Serialized as <c>screen_recording_buffer_seconds</c>. Defaults to 30.</summary>
[JsonPropertyName("screen_recording_buffer_seconds")]
public int ScreenRecordingBufferSeconds { get; set; } = 30;

Update AgentName Default

Change AgentName property default from "" to "agi-2-claude".

Usage Examples

Basic Multimodal Session

using Agi.Driver;

// Build the driver from explicit options.
var options = new DriverOptions
{
    Mode = "local",
    AgentName = "agi-2-claude"
};
var driver = new AgentDriver(options);

// One MCP server entry, passed to the session below.
var filesystemServer = new MCPServerConfig
{
    Name = "filesystem",
    Command = "npx",
    Args = new[] { "-y", "@modelcontextprotocol/server-filesystem", "/path/to/dir" }
};

// Session configuration with voice, video, MCP and tool choice all set.
var startCommand = new StartCommand
{
    Goal = "Help me with my computer",
    Mode = "local",
    AgentName = "agi-2-claude",

    // Voice features
    AudioInputEnabled = true,
    TurnDetectionEnabled = true,
    SpeechOutputEnabled = true,
    SpeechVoice = "alloy",

    // Video features
    CameraEnabled = true,
    ScreenRecordingEnabled = true,

    // MCP servers
    McpServers = new[] { filesystemServer },

    // Tool choice
    ToolChoice = new ToolChoice { Mode = "auto" }
};

var result = await driver.Start(startCommand);

Handling New Events

// Subscribe to driver events and dispatch on the concrete event type.
// The lambda parameter is named `evt` because `event` is a reserved C# keyword
// and cannot be used as an identifier without the `@` prefix.
driver.OnEvent += (evt) =>
{
    switch (evt)
    {
        // Audio transcript: print the transcript text.
        case AudioTranscriptEvent ate:
            Console.WriteLine($"Transcript: {ate.Transcript}");
            break;

        // Video frame: hand the frame string to SaveFrame, a helper the caller supplies.
        case VideoFrameEvent vfe:
            SaveFrame(vfe.FrameBase64);
            break;

        // Speech started: print the text carried by the event.
        case SpeechStartedEvent sse:
            Console.WriteLine($"🔊 Speaking: {sse.Text}");
            break;

        // Speech finished: the event has no payload, so no variable is bound.
        case SpeechFinishedEvent:
            Console.WriteLine("✓ Finished speaking");
            break;

        // Turn detected: print the transcript carried by the event.
        case TurnDetectedEvent tde:
            Console.WriteLine($"You said: {tde.Transcript}");
            break;
    }
};

Voice-Only Mode

// Voice-only session: audio input, turn detection and speech output; no camera or screen flags set.
var voiceCommand = new StartCommand
{
    Goal = "(voice input)",
    Mode = "local",
    AudioInputEnabled = true,
    TurnDetectionEnabled = true,
    // 1 second of silence = turn complete
    TurnDetectionSilenceMs = 1000,
    SpeechOutputEnabled = true,
    // or: echo, fable, onyx, nova, shimmer
    SpeechVoice = "alloy"
};

var result = await driver.Start(voiceCommand);

MCP Servers

// Filesystem server launched through npx.
var filesystemServer = new MCPServerConfig
{
    Name = "filesystem",
    Command = "npx",
    Args = new[] { "-y", "@modelcontextprotocol/server-filesystem", "/Users/you/Documents" }
};

// Database server launched as a Python module, with one Env entry.
var databaseServer = new MCPServerConfig
{
    Name = "database",
    Command = "python",
    Args = new[] { "-m", "my_db_server" },
    Env = new Dictionary<string, string>
    {
        ["DATABASE_URL"] = "postgresql://..."
    }
};

var mcpServers = new[] { filesystemServer, databaseServer };

// Start a session with both servers attached.
var startCommand = new StartCommand
{
    Goal = "Analyze my documents",
    Mode = "local",
    McpServers = mcpServers
};

await driver.Start(startCommand);

Tool Choice Configuration

// Each line below is an object-initializer member for the StartCommand.ToolChoice
// property; use exactly one of them inside a `new StartCommand { ... }` initializer.

// Auto (default) - "auto" is also the default value of ToolChoice.Mode
ToolChoice = new ToolChoice { Mode = "auto" }

// Required - must use at least one tool
ToolChoice = new ToolChoice { Mode = "required" }

// None - no tool use
ToolChoice = new ToolChoice { Mode = "none" }

// Specific tool - ToolName selects the tool.
// NOTE(review): the name looks like "<server>__<tool>" — confirm the expected format.
ToolChoice = new ToolChoice { Mode = "tool", ToolName = "filesystem__read_file" }

Breaking Changes

⚠️ This is a breaking change with no backwards compatibility.

StartCommand has many new fields (all have defaults)
New event types may be emitted
AgentName default should be changed to "agi-2-claude"

Testing

# Build project
dotnet build

# Run tests
dotnet test

# Try a voice session (example)
dotnet run --project examples/VoiceExample

Implementation Checklist

Related PRs

agi-api (driver): https://github.com/agi-inc/agents/pull/344
agi-python: https://github.com/agi-inc/agi-python/pull/8
agi-node: https://github.com/agi-inc/agi-node/pull/11

Note

Due to the C# SDK's strongly-typed nature, these changes require more extensive code updates compared to Python/Node. A Protocol_Multimodal.cs file has been created as a reference implementation. Integrate these changes into the existing Protocol.cs file.

release-please with release-type "simple" only bumps the manifest and CHANGELOG. The publish workflow reads the version from Agi.csproj, so we need extra-files to keep it in sync. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Add documentation and reference implementation for new agi-driver multimodal features including audio, video, MCP servers, and tool choice configuration. ## Documentation Provided ### MULTIMODAL_UPDATES.md Complete guide for implementing multimodal features in C# SDK ### Protocol_Multimodal.cs Reference implementation of new protocol types: - New event classes (AudioTranscriptEvent, VideoFrameEvent, etc.) - New command classes (GetAudioTranscriptCommand, GetVideoFrameCommand) - Helper classes (MCPServerConfig, AgentIdentity, ToolChoice) - StartCommand extensions for multimodal features ## Changes Needed ### Protocol.cs - Add new event types to DriverEventType enum - Add new command types to DriverCommandType enum - Add new event/command classes - Add multimodal fields to StartCommand - Update AgentName default to "agi-2-claude" ### Driver.cs - Update event parsing for new event types - Add convenience methods for multimodal features ## Breaking Changes This is a breaking change with no backwards compatibility. StartCommand has many new fields (all with sensible defaults). ## Related PRs - agi-api (driver): https://github.com/agi-inc/agents/pull/344 - agi-python: https://github.com/agi-inc/agi-python/pull/8 - agi-node: agi-inc/agi-node#11 Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

- Add Voice, Camera, Screen, Mcp, McpConfig properties to DriverOptions - Store multimodal options in AgentDriver constructor - Add multimodal fields to StartCommand (audio, speech, camera, screen, MCP) - Pass options from DriverOptions to StartCommand in StartAsync(): Voice → AudioInputEnabled, TurnDetectionEnabled, SpeechOutputEnabled Camera → CameraEnabled Screen → ScreenRecordingEnabled Mcp → McpServers (loaded from config file) - Implement LoadMcpConfig() for reading MCP server configurations - Add multimodal event types to DriverEventType enum - Add multimodal command types to DriverCommandType enum - Add multimodal event parsing to DriverProtocol.ParseEvent() - Remove duplicate enum declarations from Protocol_Multimodal.cs Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

NamanGarg20

lgtm

JacobFV and others added 2 commits February 9, 2026 14:06

JacobFV mentioned this pull request Feb 10, 2026

feat: add multimodal support (voice, camera, screen, MCP) agi-inc/agi-cli#36

Merged

JacobFV and others added 2 commits February 10, 2026 03:51

chore: bump version to 0.5.0 for multimodal release

ad736cb

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

NamanGarg20 approved these changes Feb 10, 2026

View reviewed changes

JacobFV merged commit 72a02ee into main Feb 10, 2026
6 of 7 checks passed

This was referenced Feb 10, 2026

chore(main): release 0.4.2 #9

Closed

chore(main): release 0.5.2 #11

Closed

chore(main): release 0.5.2 #14

Open

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

feat: add comprehensive multimodal driver support documentation #8

feat: add comprehensive multimodal driver support documentation #8
JacobFV merged 4 commits into main from
jacob/multimodal-driver-support

JacobFV commented Feb 10, 2026

Uh oh!

NamanGarg20 left a comment

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

2 participants

Conversation

JacobFV commented Feb 10, 2026

Multimodal Driver Support - C# SDK Updates

Changes Required

Protocol Updates (src/Agi/Driver/Protocol.cs)

Add to DriverEventType Enum

Add to DriverCommandType Enum

New Classes

Add to StartCommand Class

Update AgentName Default

Usage Examples

Basic Multimodal Session

Handling New Events

Voice-Only Mode

MCP Servers

Tool Choice Configuration

Breaking Changes

Testing

Implementation Checklist

Related PRs

Note

Uh oh!

NamanGarg20 left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

2 participants

Protocol Updates (`src/Agi/Driver/Protocol.cs`)