Skip to content

Commit

Permalink
tweaking vad and minSpeechLength settings; PR comments
Browse files Browse the repository at this point in the history
  • Loading branch information
cpoile committed Mar 22, 2024
1 parent d855144 commit bd6e7dd
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 25 deletions.
39 changes: 19 additions & 20 deletions cmd/transcriber/call/live_captions.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,19 +23,19 @@ const (
removeWindowAfterSilence = 3 * time.Second

// VAD settings
vadWindowSizeInSamples = 512
vadWindowSizeInSamples = 512 // 30 ms
vadThreshold = 0.5
vadMinSilenceDurationMs = 350
vadSpeechPadMs = 200
minSpeechLengthSamples = 1000 * trackOutAudioSamplesPerMs // 1 second of speech
vadMinSilenceDurationMs = 150
vadSpeechPadMs = 60
minSpeechLengthSamples = 330 * trackOutAudioSamplesPerMs // padding (120) + 210 of detected speech
)

// captionPackage bundles a chunk of audio with a channel on which a string result
// can be sent back.
type captionPackage struct {
// pcm holds the audio as float32 PCM samples.
pcm []float32
// retCh carries a string back to the sender of the package.
// NOTE(review): presumably the transcribed caption text — confirm against the consumer.
retCh chan string
}

func (t *Transcriber) processLiveCaptionsForTrack(ctx trackContext, pktPayloads <-chan []byte, doneCh <-chan struct{}) {
func (t *Transcriber) processLiveCaptionsForTrack(ctx trackContext, pktPayloadsCh <-chan []byte) {
opusDec, err := opus.NewDecoder(trackOutAudioRate, trackAudioChannels)
if err != nil {
slog.Error("processLiveCaptionsForTrack: failed to create opus decoder for live captions",
Expand All @@ -54,8 +54,6 @@ func (t *Transcriber) processLiveCaptionsForTrack(ctx trackContext, pktPayloads
ModelPath: filepath.Join(getModelsDir(), "silero_vad.onnx"),
SampleRate: trackOutAudioRate,

// set WindowSize to 512 to get as fine-grained detection as possible (for when
// the number of samples doesn't cleanly divide into the WindowSize)
WindowSize: vadWindowSizeInSamples,
Threshold: vadThreshold,
MinSilenceDurationMs: vadMinSilenceDurationMs,
Expand All @@ -73,18 +71,16 @@ func (t *Transcriber) processLiveCaptionsForTrack(ctx trackContext, pktPayloads
slog.String("trackID", ctx.trackID))
}()

windowPressureLimitSamples := windowPressureLimitSec * 1000 * trackOutAudioSamplesPerMs
window := make([]float32, 0, windowPressureLimitSamples)
pcmBuf := make([]float32, trackOutFrameSize)

// readTrackPktPayloads drains the pktPayload channel (data from the track) and converts it to PCM.
readTrackPktPayloads := func() error {
// readTrackPktPayloads drains the pktPayloadsCh (audio data from the track) and converts it to PCM.
readTrackPktPayloads := func(window []float32) ([]float32, error) {
for {
select {
case payload, ok := <-pktPayloads:
case payload, ok := <-pktPayloadsCh:
if !ok {
// Exit on channel close
return errors.New("closed")
return nil, errors.New("closed")
}
n, err := opusDec.Decode(payload, pcmBuf)
if err != nil {
Expand All @@ -95,11 +91,13 @@ func (t *Transcriber) processLiveCaptionsForTrack(ctx trackContext, pktPayloads
window = append(window, pcmBuf[:n]...)
default:
// Done draining
return nil
return window, nil
}
}
}

windowPressureLimitSamples := windowPressureLimitSec * 1000 * trackOutAudioSamplesPerMs
window := make([]float32, 0, windowPressureLimitSamples)
prevTranscribedPos := 0
prevWindowLen := 0
var prevAudioAt time.Time
Expand All @@ -121,14 +119,14 @@ func (t *Transcriber) processLiveCaptionsForTrack(ctx trackContext, pktPayloads

for {
select {
case <-doneCh:
return
case <-ticker.C:
// empty the waiting pktPayloads
if err := readTrackPktPayloads(); err != nil {
// empty the waiting pktPayloadsCh
window, err = readTrackPktPayloads(window)
if err != nil {
// exit on close
return
}

// track how long we were waiting until consuming the next batch of audio data, as a measure
// of the pressure on the transcription process
newAudioLenMs := (len(window) - prevWindowLen) / trackOutAudioSamplesPerMs
Expand Down Expand Up @@ -189,7 +187,7 @@ func (t *Transcriber) processLiveCaptionsForTrack(ctx trackContext, pktPayloads

// Prepare the vad segments and the audio for transcription.
segments := convertToSegmentSamples(vadSegments, len(window))
removeShortSpeeches(segments)
segments = removeShortSpeeches(segments)
cleaned := cleanAudio(window, segments)

// Before sending off data to be transcribed, check if new data is silence.
Expand Down Expand Up @@ -303,12 +301,13 @@ func convertToSegmentSamples(segments []speech.Segment, audioLen int) []segmentS

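// removeShortSpeeches marks speech segments shorter than minSpeechLengthSamples as silence,
// because either they are not actual words, or the transcriber will have trouble with such a
// short amount of audio. Segments are modified in place; none are dropped from the slice.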
func removeShortSpeeches(segments []segmentSamples) {
func removeShortSpeeches(segments []segmentSamples) []segmentSamples {
for i, seg := range segments {
if !seg.Silence && (seg.End-seg.Start) < minSpeechLengthSamples {
segments[i].Silence = true
}
}
return segments
}

func cleanAudio(audio []float32, segments []segmentSamples) []float32 {
Expand Down
4 changes: 1 addition & 3 deletions cmd/transcriber/call/tracks.go
Original file line number Diff line number Diff line change
Expand Up @@ -121,13 +121,11 @@ func (t *Transcriber) processLiveTrack(track trackRemote, sessionID string, user
var pktPayloadCh chan []byte
if t.cfg.LiveCaptionsOn {
pktPayloadCh = make(chan []byte, pktPayloadChBuffer)
doneChan := make(chan struct{})
defer func() {
close(pktPayloadCh)
close(doneChan)
}()

go t.processLiveCaptionsForTrack(ctx, pktPayloadCh, doneChan)
go t.processLiveCaptionsForTrack(ctx, pktPayloadCh)
}

// Read track audio:
Expand Down
2 changes: 1 addition & 1 deletion cmd/transcriber/call/transcriber.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ func (t *Transcriber) Start(ctx context.Context) error {
return nil
})
t.client.On(client.RTCTrackEvent, t.handleTrack)
t.client.On(client.CloseEvent, func(msg any) error {
t.client.On(client.CloseEvent, func(_ any) error {
go t.done()
return nil
})
Expand Down
6 changes: 5 additions & 1 deletion cmd/transcriber/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,11 @@ func (cfg *CallTranscriberConfig) SetDefaults() {
}

if cfg.NumThreads == 0 {
cfg.NumThreads = min(NumThreadsDefault, runtime.NumCPU()/2)
if cfg.LiveCaptionsOn {
cfg.NumThreads = min(NumThreadsDefault, runtime.NumCPU()/2)
} else {
cfg.NumThreads = max(1, runtime.NumCPU()/2)
}
}

if cfg.OutputOptions.WebVTT.IsEmpty() {
Expand Down

0 comments on commit bd6e7dd

Please sign in to comment.