Skip to content

Commit 3e76fb9

Browse files
authored
feat: add a very basic golang vad demo (#5)
feat: add a very basic golang vad demo
1 parent 7b3eb7a commit 3e76fb9

File tree

5 files changed

+361
-0
lines changed

5 files changed

+361
-0
lines changed

examples/go-tenvad/README.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Simple VAD Demo
2+
3+
```bash
4+
cd examples/go-tenvad
5+
go run .
6+
```
7+
8+
```
9+
10+
```

examples/go-tenvad/go.mod

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
module tenvad
2+
3+
go 1.23.6
4+
5+
require (
6+
github.com/go-audio/audio v1.0.0 // indirect
7+
github.com/go-audio/riff v1.0.0 // indirect
8+
github.com/go-audio/wav v1.1.0
9+
)

examples/go-tenvad/go.sum

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
github.com/go-audio/audio v1.0.0 h1:zS9vebldgbQqktK4H0lUqWrG8P0NxCJVqcj7ZpNnwd4=
2+
github.com/go-audio/audio v1.0.0/go.mod h1:6uAu0+H2lHkwdGsAY+j2wHPNPpPoeg5AaEFh9FlA+Zs=
3+
github.com/go-audio/riff v1.0.0 h1:d8iCGbDvox9BfLagY94fBynxSPHO80LmZCaOsmKxokA=
4+
github.com/go-audio/riff v1.0.0/go.mod h1:l3cQwc85y79NQFCRB7TiPoNiaijp6q8Z0Uv38rVG498=
5+
github.com/go-audio/wav v1.1.0 h1:jQgLtbqBzY7G+BM8fXF7AHUk1uHUviWS4X39d5rsL2g=
6+
github.com/go-audio/wav v1.1.0/go.mod h1:mpe9qfwbScEbkd8uybLuIpTgHyrISw/OTuvjUW2iGtE=

examples/go-tenvad/main.go

Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
package main
2+
3+
import (
4+
"fmt"
5+
"log"
6+
"os"
7+
8+
"github.com/go-audio/wav"
9+
)
10+
11+
// loadWavSamplesWithGoAudio reads a WAV file using the go-audio library and returns its 16-bit PCM samples and sample rate.
12+
// It expects a mono, 16-bit PCM WAV file for compatibility with the VAD.
13+
func loadWavSamplesWithGoAudio(filePath string) ([]int16, int, error) {
14+
// Reminder: You'll need to run:
15+
// go get github.com/go-audio/audio
16+
// go get github.com/go-audio/wav
17+
18+
file, err := os.Open(filePath)
19+
if err != nil {
20+
return nil, 0, fmt.Errorf("could not open wav file '%s': %w", filePath, err)
21+
}
22+
defer file.Close()
23+
24+
d := wav.NewDecoder(file)
25+
if d == nil {
26+
return nil, 0, fmt.Errorf("could not create wav decoder for '%s'", filePath)
27+
}
28+
29+
d.ReadInfo()
30+
if err := d.Err(); err != nil {
31+
return nil, 0, fmt.Errorf("error reading wav info from '%s': %w", filePath, err)
32+
}
33+
34+
format := d.Format()
35+
if format == nil {
36+
return nil, 0, fmt.Errorf("could not get audio format from '%s'", filePath)
37+
}
38+
39+
if format.NumChannels != 1 {
40+
return nil, 0, fmt.Errorf("unsupported number of channels in '%s': %d. Only mono (1) is supported", filePath, format.NumChannels)
41+
}
42+
if d.BitDepth != 16 {
43+
return nil, 0, fmt.Errorf("unsupported bit depth in '%s': %d. Only 16-bit is supported", filePath, d.BitDepth)
44+
}
45+
46+
buf, err := d.FullPCMBuffer()
47+
if err != nil {
48+
return nil, 0, fmt.Errorf("could not read full PCM buffer from '%s': %w", filePath, err)
49+
}
50+
51+
// The VAD expects int16 samples. audio.IntBuffer.Data is []int.
52+
// We need to convert []int to []int16.
53+
// This conversion is appropriate because we've confirmed BitDepth == 16.
54+
intData := buf.Data
55+
pcmData := make([]int16, len(intData))
56+
for i, val := range intData {
57+
pcmData[i] = int16(val)
58+
}
59+
60+
log.Printf("Successfully loaded WAV with go-audio: %s, Sample Rate: %d Hz, Channels: %d, Bits/Sample: %d, Samples: %d",
61+
filePath, format.SampleRate, format.NumChannels, d.BitDepth, len(pcmData))
62+
63+
return pcmData, format.SampleRate, nil
64+
}
65+
66+
func main() {
67+
fmt.Println("Starting VAD demo with WAV file processing (using go-audio/wav)...")
68+
69+
wavFilePath := "../s0724-s0730.wav" // Placeholder: You need to provide a "input.wav" file in the same directory or specify a full path.
70+
71+
// VAD Parameters
72+
hopSize := 256 // Frame size in samples
73+
threshold := float32(0.5) // VAD detection threshold
74+
75+
// 1. Load audio samples from WAV file using go-audio library
76+
audioSamples, _, err := loadWavSamplesWithGoAudio(wavFilePath)
77+
if err != nil {
78+
log.Fatalf("Failed to load WAV file '%s': %v", wavFilePath, err)
79+
}
80+
if len(audioSamples) == 0 {
81+
log.Fatalf("No audio samples loaded from WAV file '%s'.", wavFilePath)
82+
}
83+
// The Printf from the previous version showing sample rate is now part of loadWavSamplesWithGoAudio log
84+
85+
// 2. Initialize VAD
86+
vadInstance, err := NewVad(hopSize, threshold) // hopSize is in samples
87+
if err != nil {
88+
log.Fatalf("Failed to create VAD instance: %v", err)
89+
}
90+
defer func() {
91+
fmt.Println("Closing VAD instance...")
92+
if err := vadInstance.Close(); err != nil {
93+
log.Printf("Error closing VAD instance: %v", err)
94+
}
95+
fmt.Println("VAD instance closed.")
96+
}()
97+
98+
fmt.Printf("VAD instance created successfully. Hop Size (Frame Size): %d samples, Threshold: %.2f\n",
99+
vadInstance.FrameSize(), threshold)
100+
101+
// 3. Process audio frames from the WAV file
102+
numFrames := len(audioSamples) / hopSize
103+
fmt.Printf("Total samples: %d, Hop size: %d, Number of full frames to process: %d\n", len(audioSamples), hopSize, numFrames)
104+
105+
for i := 0; i < numFrames; i++ {
106+
start := i * hopSize
107+
end := start + hopSize
108+
frame := audioSamples[start:end]
109+
110+
probability, isSpeech, err := vadInstance.Process(frame)
111+
if err != nil {
112+
log.Printf("Error processing frame %d: %v", i, err)
113+
continue
114+
}
115+
116+
speechFlag := 0
117+
if isSpeech {
118+
speechFlag = 1
119+
}
120+
fmt.Printf("[%d] %.6f, %d\n", i, probability, speechFlag)
121+
122+
// actualFrameDurationMs := (float64(hopSize) * 1000.0) / float64(wavSampleRate)
123+
// time.Sleep(time.Duration(actualFrameDurationMs) * time.Millisecond)
124+
}
125+
126+
remainingSamples := len(audioSamples) % hopSize
127+
if remainingSamples > 0 {
128+
fmt.Printf("Note: %d remaining samples at the end of the WAV file were not processed as they don't form a full frame of size %d.\n", remainingSamples, hopSize)
129+
}
130+
131+
fmt.Println("VAD demo with WAV file finished.")
132+
}
133+
134+
// getFrameDescription returns a short human-readable summary of a frame of
// PCM samples, giving a rough idea of whether it carries any signal.
//
// It returns "completely silent" when every sample is zero (an empty or nil
// frame is also reported as silent), and otherwise
// "potentially active, avg_abs_amp: X" where X is the mean absolute sample
// value formatted with two decimals.
func getFrameDescription(frame []int16) string {
	var sumAbs int64
	for _, s := range frame {
		// Widen before negating: negating an int16 equal to -32768 overflows
		// back to -32768, which would subtract from the sum instead of adding.
		v := int64(s)
		if v < 0 {
			v = -v
		}
		sumAbs += v
	}

	// Every term is non-negative, so a zero sum means every sample was zero.
	// This also covers the empty frame and keeps the division below safe.
	if sumAbs == 0 {
		return "completely silent"
	}

	averageAmplitude := float64(sumAbs) / float64(len(frame))
	return fmt.Sprintf("potentially active, avg_abs_amp: %.2f", averageAmplitude)
}

examples/go-tenvad/vad.go

Lines changed: 182 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
package main
2+
3+
/*
4+
#cgo CFLAGS: -I${SRCDIR}/../../include
5+
6+
// macOS (Darwin) - Universal Framework (assuming it supports both amd64 and arm64)
7+
#cgo darwin CFLAGS: -I${SRCDIR}/../../lib/macOS/ten_vad.framework/Versions/A/Headers
8+
#cgo darwin LDFLAGS: -F${SRCDIR}/../../lib/macOS -framework ten_vad -Wl,-rpath,${SRCDIR}/../../lib/macOS
9+
10+
// Linux AMD64
11+
#cgo linux,amd64 LDFLAGS: -L${SRCDIR}/../../lib/Linux/amd64 -lten_vad -Wl,-rpath,'$ORIGIN'/../../lib/Linux/amd64
12+
13+
// Linux ARM64
14+
#cgo linux,arm64 LDFLAGS: -L${SRCDIR}/../../lib/Linux/arm64 -lten_vad -Wl,-rpath,'$ORIGIN'/../../lib/Linux/arm64
15+
16+
// Windows AMD64
17+
// For Windows, the .dll needs to be in the PATH or alongside the .exe at runtime.
18+
// The .lib file is used for linking.
19+
#cgo windows,amd64 LDFLAGS: -L${SRCDIR}/../../lib/Windows/amd64 -lten_vad
20+
21+
#include "ten_vad.h"
22+
#include <stdlib.h> // Required for C.free if ever used directly for strings (not in this API but good practice)
23+
// Explicitly include headers that define C types we will use, like size_t
24+
#include <stddef.h>
25+
#include <stdint.h>
26+
*/
27+
import "C"
28+
import (
29+
"fmt"
30+
"runtime"
31+
"unsafe"
32+
)
33+
34+
// VadMode names an aggressiveness level for the VAD.
//
// NOTE(review): no function visible in this file accepts or stores a VadMode;
// confirm whether the underlying library exposes a mode setting at all.
type VadMode int

// Aggressiveness levels, numbered 0 through 3 in declaration order.
const (
	// VadModeNormal is the normal mode (0).
	VadModeNormal VadMode = iota
	// VadModeLowBitrate is optimized for low bitrate (1).
	VadModeLowBitrate
	// VadModeAggressive is the aggressive mode (2).
	VadModeAggressive
	// VadModeVeryAggressive is the most aggressive mode (3).
	VadModeVeryAggressive
)
47+
48+
// VadError is the error type used for failures reported by the TenVAD
// wrapper. It pairs a numeric code with a human-readable message.
type VadError struct {
	// Code is the numeric error code.
	Code int
	// Message is the human-readable description of the failure.
	Message string
}

// Error implements the error interface, rendering the code and message as
// "ten_vad error (code N): message".
func (e *VadError) Error() string {
	const layout = "ten_vad error (code %d): %s"
	return fmt.Sprintf(layout, e.Code, e.Message)
}
57+
58+
var (
59+
ErrVadInitFailed = &VadError{Code: -1, Message: "Initialization failed"}
60+
ErrVadInvalidSampleRate = &VadError{Code: -2, Message: "Invalid sample rate (must be 8000, 16000, 32000, or 48000 Hz)"}
61+
ErrVadInvalidFrameLength = &VadError{Code: -3, Message: "Invalid frame length (must be 10, 20, or 30 ms)"}
62+
ErrVadInvalidMode = &VadError{Code: -4, Message: "Invalid mode"}
63+
ErrVadUninitialized = &VadError{Code: -5, Message: "VAD instance is uninitialized or already closed"}
64+
ErrVadProcessError = &VadError{Code: -6, Message: "Error during processing"}
65+
ErrVadInvalidParameter = &VadError{Code: -7, Message: "Invalid parameter for set operations"}
66+
ErrVadInternalError = &VadError{Code: -100, Message: "Unknown internal error during processing"}
67+
)
68+
69+
func mapErrorCodeToError(code C.int) error {
70+
switch int(code) {
71+
case 0: // Success for some operations or non-error state for process
72+
return nil
73+
case 1: // Speech detected (not an error for process)
74+
return nil
75+
case -1:
76+
return ErrVadInitFailed
77+
case -2:
78+
return ErrVadInvalidSampleRate
79+
case -3:
80+
return ErrVadInvalidFrameLength
81+
case -4:
82+
return ErrVadInvalidMode
83+
case -5:
84+
return ErrVadUninitialized // Or a more specific error if available from C context
85+
case -6:
86+
return ErrVadProcessError
87+
case -7:
88+
return ErrVadInvalidParameter
89+
default:
90+
if code < 0 {
91+
return &VadError{Code: int(code), Message: fmt.Sprintf("Unknown C VAD error code: %d", code)}
92+
}
93+
return nil // Non-negative codes (like 0 or 1 from process) are not errors
94+
}
95+
}
96+
97+
// Vad represents a Voice Activity Detection instance.
98+
type Vad struct {
99+
instance C.ten_vad_handle_t
100+
hopSize int // Number of samples per frame, consistent with ten_vad_create hop_size
101+
}
102+
103+
// NewVad creates and initializes a new VAD instance.
104+
// hopSize: The number of samples between the start points of two consecutive analysis frames (e.g., 256).
105+
// threshold: VAD detection threshold ranging from [0.0, 1.0].
106+
func NewVad(hopSize int, threshold float32) (*Vad, error) {
107+
var inst C.ten_vad_handle_t
108+
109+
cHopSize := C.size_t(hopSize)
110+
cThreshold := C.float(threshold)
111+
112+
if !(threshold >= 0.0 && threshold <= 1.0) {
113+
return nil, ErrVadInvalidParameter // Or a more specific error for threshold
114+
}
115+
// Basic validation for hopSize, e.g., must be positive
116+
if hopSize <= 0 {
117+
return nil, ErrVadInvalidParameter // Or a specific error for hopSize
118+
}
119+
120+
ret := C.ten_vad_create(&inst, cHopSize, cThreshold)
121+
if ret != 0 || inst == nil {
122+
return nil, ErrVadInitFailed
123+
}
124+
125+
v := &Vad{
126+
instance: inst,
127+
hopSize: hopSize,
128+
}
129+
130+
runtime.SetFinalizer(v, func(vad *Vad) {
131+
if vad.instance != nil {
132+
C.ten_vad_destroy(&vad.instance)
133+
vad.instance = nil
134+
}
135+
})
136+
return v, nil
137+
}
138+
139+
// Close explicitly releases the C VAD instance and its associated resources.
140+
// It's good practice to call Close when done with the VAD instance,
141+
// rather than relying solely on the garbage collector.
142+
func (v *Vad) Close() error {
143+
if v.instance == nil {
144+
return ErrVadUninitialized
145+
}
146+
C.ten_vad_destroy(&v.instance)
147+
v.instance = nil
148+
runtime.SetFinalizer(v, nil) // Remove the finalizer
149+
return nil
150+
}
151+
152+
// Process processes a single audio frame to determine if it contains speech.
153+
// speechFrame: A slice of int16 PCM audio samples.
154+
// The length of speechFrame should be equal to the hopSize used during initialization.
155+
// Returns probability of speech, true if speech is detected, false otherwise, and an error if one occurred.
156+
func (v *Vad) Process(speechFrame []int16) (float32, bool, error) {
157+
if v.instance == nil {
158+
return 0.0, false, ErrVadUninitialized
159+
}
160+
if len(speechFrame) != v.hopSize {
161+
return 0.0, false, fmt.Errorf("ten_vad: input audio frame length %d does not match expected hop_size %d", len(speechFrame), v.hopSize)
162+
}
163+
164+
cSpeechFramePtr := (*C.short)(unsafe.Pointer(&speechFrame[0]))
165+
cAudioDataLength := C.size_t(v.hopSize) // This is the hop_size
166+
167+
var cOutProbability C.float
168+
var cOutFlag C.int
169+
170+
result := C.ten_vad_process(v.instance, cSpeechFramePtr, cAudioDataLength, &cOutProbability, &cOutFlag)
171+
172+
if result != 0 { // ten_vad_process returns 0 on success, -1 on error
173+
return 0.0, false, mapErrorCodeToError(result) // Ensure mapErrorCodeToError handles -1 appropriately for process error
174+
}
175+
176+
return float32(cOutProbability), cOutFlag == 1, nil
177+
}
178+
179+
// FrameSize returns the expected number of int16 samples per frame (i.e., hop_size).
180+
func (v *Vad) FrameSize() int {
181+
return v.hopSize
182+
}

0 commit comments

Comments
 (0)