Skip to content

Commit

Permalink
feat: Support custom Whisper API endpoints for voice transcription
Browse files Browse the repository at this point in the history
Adds configuration options to support self-hosted Whisper API endpoints:

OPENAI_API_BASE_WHISPER:
- Specifies base URL for alternate Whisper API (e.g. http://localhost:2022)
- Used to override default OpenAI endpoint

OPENAI_API_KEY_WHISPER:
- API key for alternative Whisper API endpoint
- Required when OPENAI_API_BASE_WHISPER is set: /voice reports an error if it is missing

Technical details:
- Modified voice transcription to explicitly pass api_base and api_key to litellm
  instead of relying on environment variables
- Improves configuration flexibility and allows different credentials for
  chat vs voice APIs
- Maintains backwards compatibility with existing OpenAI usage

Testing:
- Verified with both OpenAI and self-hosted Whisper endpoints
- Confirmed empty api_key behavior works as expected
  • Loading branch information
mbailey committed Dec 15, 2024
1 parent 65555b5 commit 5911c51
Show file tree
Hide file tree
Showing 8 changed files with 129 additions and 11 deletions.
8 changes: 8 additions & 0 deletions aider/args.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,10 @@ def get_parser(default_config_files, git_root):
"--openai-api-key",
help="Specify the OpenAI API key",
)
group.add_argument(
"--openai-api-key-whisper",
help="Specify the OpenAI API key for Whisper transcriptions",
)
group.add_argument(
"--anthropic-api-key",
help="Specify the Anthropic API key",
Expand All @@ -145,6 +149,10 @@ def get_parser(default_config_files, git_root):
"--openai-api-base",
help="Specify the api base url",
)
group.add_argument(
"--openai-api-base-whisper",
help="Specify the api base url for Whisper transcriptions",
)
group.add_argument(
"--openai-api-type",
help="(deprecated, use --set-env OPENAI_API_TYPE=<value>)",
Expand Down
28 changes: 24 additions & 4 deletions aider/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -1112,12 +1112,32 @@ def cmd_voice(self, args):
"Record and transcribe voice input"

if not self.voice:
if "OPENAI_API_KEY" not in os.environ:
self.io.tool_error("To use /voice you must provide an OpenAI API key.")
return
if self.args.openai_api_base_whisper:
# When using custom Whisper API base, require matching API key
if not (self.args.openai_api_key_whisper or os.getenv("OPENAI_API_KEY_WHISPER")):
self.io.tool_error(
"When using a custom Whisper API base URL (--openai-api-base-whisper), you"
" must also provide a Whisper API key (--openai-api-key-whisper or"
" OPENAI_API_KEY_WHISPER)."
)
return
else:
# When using standard OpenAI API, require OpenAI API key
if not (self.args.openai_api_key or os.getenv("OPENAI_API_KEY")):
self.io.tool_error(
"To use /voice with the standard OpenAI API, you must provide an OpenAI API"
" key (--openai-api-key or OPENAI_API_KEY)."
)
return

try:
self.voice = voice.Voice(
audio_format=self.args.voice_format, device_name=self.args.voice_input_device
audio_format=self.args.voice_format,
device_name=self.args.voice_input_device,
api_base=self.args.openai_api_base,
api_key=self.args.openai_api_key,
api_base_whisper=self.args.openai_api_base_whisper,
api_key_whisper=self.args.openai_api_key_whisper,
)
except voice.SoundDeviceError:
self.io.tool_error(
Expand Down
46 changes: 41 additions & 5 deletions aider/voice.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,25 @@ class Voice:

threshold = 0.15

def __init__(self, audio_format="wav", device_name=None):
def __init__(
self,
audio_format="wav",
device_name=None,
api_base=None,
api_key=None,
api_base_whisper=None,
api_key_whisper=None,
):
# If whisper-specific base URL is provided, use whisper-specific credentials
if api_base_whisper:
if not api_key_whisper:
raise ValueError("api_key_whisper is required when api_base_whisper is specified")
self.api_base = api_base_whisper
self.api_key = api_key_whisper
else:
# Otherwise fall back to standard OpenAI credentials
self.api_base = api_base
self.api_key = api_key
if sf is None:
raise SoundDeviceError
try:
Expand Down Expand Up @@ -150,8 +168,14 @@ def raw_record_and_transcribe(self, history, language):

with open(filename, "rb") as fh:
try:
# Use the credentials chosen in __init__ (Whisper-specific if configured, else standard OpenAI)
transcript = litellm.transcription(
model="whisper-1", file=fh, prompt=history, language=language
model="whisper-1",
file=fh,
prompt=history,
language=language,
api_base=self.api_base,
api_key=self.api_key,
)
except Exception as err:
print(f"Unable to transcribe {filename}: {err}")
Expand All @@ -166,6 +190,18 @@ def raw_record_and_transcribe(self, history, language):

if __name__ == "__main__":
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
raise ValueError("Please set the OPENAI_API_KEY environment variable.")
print(Voice().record_and_transcribe())
api_base = os.getenv("OPENAI_API_BASE")
api_key_whisper = os.getenv("OPENAI_API_KEY_WHISPER")
api_base_whisper = os.getenv("OPENAI_API_BASE_WHISPER")
if not (api_key or api_key_whisper):
raise ValueError(
"Please set either OPENAI_API_KEY or OPENAI_API_KEY_WHISPER environment variable."
)
print(
Voice(
api_base=api_base,
api_key=api_key,
api_base_whisper=api_base_whisper,
api_key_whisper=api_key_whisper,
).record_and_transcribe()
)
6 changes: 6 additions & 0 deletions aider/website/assets/sample.aider.conf.yml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,12 @@
## Specify the api base url
#openai-api-base: xxx

## Specify an alternate api base url for Whisper transcriptions (optional)
#openai-api-base-whisper: xxx

## Specify an alternate api key for Whisper transcriptions (optional)
#openai-api-key-whisper: xxx

## (deprecated, use --set-env OPENAI_API_TYPE=<value>)
#openai-api-type: xxx

Expand Down
6 changes: 6 additions & 0 deletions aider/website/assets/sample.env
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,12 @@
## Specify the api base url
#AIDER_OPENAI_API_BASE=

## Specify an alternate api base url for Whisper transcriptions (optional)
#AIDER_OPENAI_API_BASE_WHISPER=

## Specify an alternate api key for Whisper transcriptions (optional)
#AIDER_OPENAI_API_KEY_WHISPER=

## (deprecated, use --set-env OPENAI_API_TYPE=<value>)
#AIDER_OPENAI_API_TYPE=

Expand Down
25 changes: 25 additions & 0 deletions aider/website/docs/install/optional.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ for additional information.
Aider supports
[coding with your voice](https://aider.chat/docs/usage/voice.html)
using the in-chat `/voice` command.

### Audio capture setup

Aider uses the [PortAudio](http://www.portaudio.com) library to
capture audio.
Installing PortAudio is completely optional, but can usually be accomplished like this:
Expand All @@ -55,6 +58,28 @@ Installing PortAudio is completely optional, but can usually be accomplished lik
- For Linux, do `sudo apt-get install libportaudio2`
- Some Linux environments may also need `sudo apt install libasound2-plugins`

### Whisper API configuration

By default, aider uses OpenAI's Whisper API for voice transcription. You can optionally configure it to use an alternate Whisper API endpoint, including self-hosted instances:

```bash
# Optional: Use an alternate Whisper API endpoint (via env vars, config file, or command line)
export AIDER_OPENAI_API_BASE_WHISPER=http://your-whisper-api-endpoint
export AIDER_OPENAI_API_KEY_WHISPER=your-alternate-api-key
```

You can configure these settings through:
- Environment variables (shown above)
- Config file (.aider.conf.yml)
- Command line arguments (--openai-api-base-whisper, --openai-api-key-whisper)

This is useful if you want to:
- Use a different Whisper API provider
- Run Whisper locally or on your own infrastructure
- Control costs or data privacy

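If these are not set, aider will use the standard OpenAI API endpoint and credentials. If you set an alternate Whisper API base URL, you must also set the Whisper API key; otherwise `/voice` reports an error instead of recording.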

## Add aider to your editor

Other projects have integrated aider into some IDE/editors.
Expand Down
4 changes: 2 additions & 2 deletions aider/website/docs/usage/voice.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,11 @@ when you ask aider to edit your code.
Use the in-chat `/voice` command to start recording,
and press `ENTER` when you're done speaking.
Your voice coding instructions will be transcribed,
as if you had typed them into
as if you had typed them into
the aider chat session.

See the [installation instructions](https://aider.chat/docs/install/optional.html#enable-voice-coding) for
information on how to enable the `/voice` command.
information on how to enable the `/voice` command and configure alternate Whisper API endpoints.

<br/>
<div class="chat-transcript" markdown="1">
Expand Down
17 changes: 17 additions & 0 deletions tests/basic/test_voice.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,23 @@ def test_voice_init_invalid_format():
assert "Unsupported audio format" in str(exc.value)


def test_voice_init_whisper_credentials(mock_sounddevice):
    """Whisper-specific settings are stored, and a base URL without a key is rejected."""
    whisper_voice = Voice(api_key_whisper="whisper_key", api_base_whisper="whisper_url")
    assert whisper_voice.api_base == "whisper_url"
    assert whisper_voice.api_key == "whisper_key"

    # A Whisper base URL with no matching Whisper key must fail at construction time.
    with pytest.raises(ValueError) as err_info:
        Voice(api_base_whisper="whisper_url")
    assert "api_key_whisper is required" in str(err_info.value)


def test_voice_init_fallback_credentials(mock_sounddevice):
    """Without Whisper-specific settings, the standard base URL and key are stored."""
    fallback_voice = Voice(api_key="base_key", api_base="base_url")
    assert (fallback_voice.api_base, fallback_voice.api_key) == ("base_url", "base_key")


def test_callback_processing():
with patch("aider.voice.sf", MagicMock()): # Need to mock sf to avoid SoundDeviceError
voice = Voice()
Expand Down

0 comments on commit 5911c51

Please sign in to comment.