Skip to content

Commit

Permalink
MM-56540 - Live-captions support (#18)
Browse files Browse the repository at this point in the history
* initial live-captions support; wip

* upgrade silero-vad-go

* linting

* tests

* update rtcd

* add config options for LiveCaptions, esp. LiveCaptionsOn; tests

* fix test

* fix tests

* close captionDoneCh with other doneCh closers

* make transcription loop more async

* remove unused field

* send NewAudioLenMs (a measure of load) in the caption ws event

* be explicit that NewAudioLenMs is converted to float64 by marshaling

* improvements: don't cut off old voice before it's transcribed; better logs

* param tuning; better defaults: 2 transcribers x 2 threads

* fix tests

* return blank transcription if transcription error

* tweak debug statement

* add pressure valve to prevent death spiral on overloaded machines

* tweak pressure valve

* move structs to calls public; send release valve metrics over

* implement backpressure for the transcriber pool

* add backoff for transcriberQueueCh

* remove unnecessary else

* fix buffer size calculation

* report tickrate metric

* report initial tickrate on transcriber start

* report initial tickrate on transcriber start

* revert tickrate metric (not useful); update calls dependency

* Revert tickrate metric; wasn't useful

* update calls dependency; rename metric ws events

* lower LiveCaptionsNumTranscribersDefault to 1

* fix useless min(LiveCaptionsNumTranscribersDefault, runtime.NumCPU()/2)

* add NumThreadsDefault

* single segment, language = en

* Revert "single segment, language = en"

This reverts commit 564f3be.

* language = en

* comment out debug statements for now

* single segment

* remove backoff

* simplify mutexes and multiple windowing

* more cleanup

* recState -> jobState

* update rtcd for moving type to JobStateClient

* complicated algorithm but with clearer code; using normal vad

* improve quality of transcription with background noise

* lint

* tweaking vad and minSpeechLength settings; PR comments

* close off last segment

* fix useless for select

* add LiveCaptionsLanguage

* add LiveCaptionsLanguage debug statement

* PR comments

* no need for loop label

* upgrade tagged dependencies

* SendWs -> SendWS; go mod tidy
  • Loading branch information
cpoile authored Mar 27, 2024
1 parent 2b22927 commit d5cfa96
Show file tree
Hide file tree
Showing 11 changed files with 783 additions and 103 deletions.
20 changes: 18 additions & 2 deletions cmd/transcriber/apis/whisper.cpp/context.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package whisper

// #cgo LDFLAGS: -l:libwhisper.a -lm -lstdc++
// #cgo linux LDFLAGS: -l:libwhisper.a -lm -lstdc++
// #cgo darwin LDFLAGS: -lwhisper -lstdc++ -framework Accelerate
// #include <whisper.h>
// #include <stdlib.h>
import "C"
Expand All @@ -22,6 +23,15 @@ type Config struct {
NumThreads int
// Whether or not past transcription should be used as prompt.
NoContext bool
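// AudioContext is the value passed to whisper's audio_ctx parameter.
// 512 covers a bit more than 10s of audio; use multiples of 64.
// A value of 512 results in a speedup of about 3x, because whisper was tuned for 30s chunks.
// See: https://github.com/ggerganov/whisper.cpp/pull/141
// TODO: add tests and validation for this field.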
AudioContext int
// Whether or not to print progress to stdout (default false).
PrintProgress bool
// Language to use (defaults to autodetection).
Language string
// Whether or not to generate a single segment (default false).
SingleSegment bool
}

func (c Config) IsValid() error {
Expand Down Expand Up @@ -72,8 +82,14 @@ func NewContext(cfg Config) (*Context, error) {

c.params = C.whisper_full_default_params(C.WHISPER_SAMPLING_GREEDY)
c.params.no_context = C.bool(c.cfg.NoContext)
c.params.audio_ctx = C.int(c.cfg.AudioContext)
c.params.n_threads = C.int(c.cfg.NumThreads)
c.params.language = C.CString("auto")
if c.cfg.Language == "" {
c.cfg.Language = "auto"
}
c.params.language = C.CString(c.cfg.Language)
c.params.single_segment = C.bool(c.cfg.SingleSegment)
c.params.print_progress = C.bool(c.cfg.PrintProgress)

return &c, nil
}
Expand Down
2 changes: 1 addition & 1 deletion cmd/transcriber/call/job.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ func (t *Transcriber) postJobStatus(status public.JobStatus) error {
defer cancelCtx()
resp, err := t.apiClient.DoAPIRequestBytes(ctx, http.MethodPost, apiURL, payload, "")
if err != nil {
return fmt.Errorf("request failed%w", err)
return fmt.Errorf("request failed: %w", err)
}
defer resp.Body.Close()
cancelCtx()
Expand Down
Loading

0 comments on commit d5cfa96

Please sign in to comment.