feat: Support custom Whisper API endpoints for voice transcription

Adds configuration options to support self-hosted Whisper API endpoints:

OPENAI_API_BASE_WHISPER:
- Specifies base URL for alternate Whisper API (e.g. http://localhost:2022)
- Used to override default OpenAI endpoint

OPENAI_API_KEY_WHISPER:
- API key for alternative Whisper API endpoint
- Required when OPENAI_API_BASE_WHISPER is set: /voice reports an error and Voice raises ValueError if it is missing

Technical details:
- Modified voice transcription to explicitly pass api_base and api_key to litellm instead of relying on environment variables
- Improves configuration flexibility and allows different credentials for chat vs voice APIs
- Maintains backwards compatibility with existing OpenAI usage

Testing:
- Verified with both OpenAI and self-hosted Whisper endpoints
- Confirmed the missing api_key case behaves as expected
Aider-AI · Dec 15, 2024 · 5911c51 · 5911c51
1 parent 65555b5
commit 5911c51
Show file tree

Hide file tree

Showing 8 changed files with 129 additions and 11 deletions.
diff --git a/aider/args.py b/aider/args.py
@@ -137,6 +137,10 @@ def get_parser(default_config_files, git_root):
         "--openai-api-key",
         help="Specify the OpenAI API key",
     )
+    group.add_argument(
+        "--openai-api-key-whisper",
+        help="Specify the OpenAI API key for Whisper transcriptions",
+    )
     group.add_argument(
         "--anthropic-api-key",
         help="Specify the Anthropic API key",
@@ -145,6 +149,10 @@ def get_parser(default_config_files, git_root):
         "--openai-api-base",
         help="Specify the api base url",
     )
+    group.add_argument(
+        "--openai-api-base-whisper",
+        help="Specify the api base url for Whisper transcriptions",
+    )
     group.add_argument(
         "--openai-api-type",
         help="(deprecated, use --set-env OPENAI_API_TYPE=<value>)",

diff --git a/aider/commands.py b/aider/commands.py
@@ -1112,12 +1112,32 @@ def cmd_voice(self, args):
         "Record and transcribe voice input"
 
         if not self.voice:
-            if "OPENAI_API_KEY" not in os.environ:
-                self.io.tool_error("To use /voice you must provide an OpenAI API key.")
-                return
+            if self.args.openai_api_base_whisper:
+                # When using custom Whisper API base, require matching API key
+                if not (self.args.openai_api_key_whisper or os.getenv("OPENAI_API_KEY_WHISPER")):
+                    self.io.tool_error(
+                        "When using a custom Whisper API base URL (--openai-api-base-whisper), you"
+                        " must also provide a Whisper API key (--openai-api-key-whisper or"
+                        " OPENAI_API_KEY_WHISPER)."
+                    )
+                    return
+            else:
+                # When using standard OpenAI API, require OpenAI API key
+                if not (self.args.openai_api_key or os.getenv("OPENAI_API_KEY")):
+                    self.io.tool_error(
+                        "To use /voice with the standard OpenAI API, you must provide an OpenAI API"
+                        " key (--openai-api-key or OPENAI_API_KEY)."
+                    )
+                    return
+
             try:
                 self.voice = voice.Voice(
-                    audio_format=self.args.voice_format, device_name=self.args.voice_input_device
+                    audio_format=self.args.voice_format,
+                    device_name=self.args.voice_input_device,
+                    api_base=self.args.openai_api_base,
+                    api_key=self.args.openai_api_key,
+                    api_base_whisper=self.args.openai_api_base_whisper,
+                    api_key_whisper=self.args.openai_api_key_whisper,
                 )
             except voice.SoundDeviceError:
                 self.io.tool_error(

diff --git a/aider/voice.py b/aider/voice.py
@@ -36,7 +36,25 @@ class Voice:
 
     threshold = 0.15
 
-    def __init__(self, audio_format="wav", device_name=None):
+    def __init__(
+        self,
+        audio_format="wav",
+        device_name=None,
+        api_base=None,
+        api_key=None,
+        api_base_whisper=None,
+        api_key_whisper=None,
+    ):
+        # If whisper-specific base URL is provided, use whisper-specific credentials
+        if api_base_whisper:
+            if not api_key_whisper:
+                raise ValueError("api_key_whisper is required when api_base_whisper is specified")
+            self.api_base = api_base_whisper
+            self.api_key = api_key_whisper
+        else:
+            # Otherwise fall back to standard OpenAI credentials
+            self.api_base = api_base
+            self.api_key = api_key
         if sf is None:
             raise SoundDeviceError
         try:
@@ -150,8 +168,14 @@ def raw_record_and_transcribe(self, history, language):
 
         with open(filename, "rb") as fh:
             try:
+                # Use the credentials resolved in __init__ (Whisper-specific or standard OpenAI)
                 transcript = litellm.transcription(
-                    model="whisper-1", file=fh, prompt=history, language=language
+                    model="whisper-1",
+                    file=fh,
+                    prompt=history,
+                    language=language,
+                    api_base=self.api_base,
+                    api_key=self.api_key,
                 )
             except Exception as err:
                 print(f"Unable to transcribe {filename}: {err}")
@@ -166,6 +190,18 @@ def raw_record_and_transcribe(self, history, language):
 
 if __name__ == "__main__":
     api_key = os.getenv("OPENAI_API_KEY")
-    if not api_key:
-        raise ValueError("Please set the OPENAI_API_KEY environment variable.")
-    print(Voice().record_and_transcribe())
+    api_base = os.getenv("OPENAI_API_BASE")
+    api_key_whisper = os.getenv("OPENAI_API_KEY_WHISPER")
+    api_base_whisper = os.getenv("OPENAI_API_BASE_WHISPER")
+    if not (api_key or api_key_whisper):
+        raise ValueError(
+            "Please set either OPENAI_API_KEY or OPENAI_API_KEY_WHISPER environment variable."
+        )
+    print(
+        Voice(
+            api_base=api_base,
+            api_key=api_key,
+            api_base_whisper=api_base_whisper,
+            api_key_whisper=api_key_whisper,
+        ).record_and_transcribe()
+    )
diff --git a/aider/website/assets/sample.aider.conf.yml b/aider/website/assets/sample.aider.conf.yml
@@ -65,6 +65,12 @@
 ## Specify the api base url
 #openai-api-base: xxx
 
+## Specify an alternate api base url for Whisper transcriptions (optional)
+#openai-api-base-whisper: xxx
+
+## Specify an alternate api key for Whisper transcriptions (optional)
+#openai-api-key-whisper: xxx
+
 ## (deprecated, use --set-env OPENAI_API_TYPE=<value>)
 #openai-api-type: xxx
 

diff --git a/aider/website/assets/sample.env b/aider/website/assets/sample.env
@@ -69,6 +69,12 @@
 ## Specify the api base url
 #AIDER_OPENAI_API_BASE=
 
+## Specify an alternate api base url for Whisper transcriptions (optional)
+#AIDER_OPENAI_API_BASE_WHISPER=
+
+## Specify an alternate api key for Whisper transcriptions (optional)
+#AIDER_OPENAI_API_KEY_WHISPER=
+
 ## (deprecated, use --set-env OPENAI_API_TYPE=<value>)
 #AIDER_OPENAI_API_TYPE=
 

diff --git a/aider/website/docs/install/optional.md b/aider/website/docs/install/optional.md
@@ -46,6 +46,9 @@ for additional information.
 Aider supports 
 [coding with your voice](https://aider.chat/docs/usage/voice.html)
 using the in-chat `/voice` command.
+
+### Audio capture setup
+
 Aider uses the [PortAudio](http://www.portaudio.com) library to
 capture audio.
 Installing PortAudio is completely optional, but can usually be accomplished like this:
@@ -55,6 +58,28 @@ Installing PortAudio is completely optional, but can usually be accomplished lik
 - For Linux, do `sudo apt-get install libportaudio2`
   - Some linux environments may also need `sudo apt install libasound2-plugins`
 
+### Whisper API configuration
+
+By default, aider uses OpenAI's Whisper API for voice transcription. You can optionally configure it to use an alternate Whisper API endpoint, including self-hosted instances:
+
+```bash
+# Optional: Use an alternate Whisper API endpoint (via env vars, config file, or command line)
+export AIDER_OPENAI_API_BASE_WHISPER=http://your-whisper-api-endpoint
+export AIDER_OPENAI_API_KEY_WHISPER=your-alternate-api-key
+```
+
+You can configure these settings through:
+- Environment variables (shown above)
+- Config file (.aider.conf.yml)
+- Command line arguments (--openai-api-base-whisper, --openai-api-key-whisper)
+
+This is useful if you want to:
+- Use a different Whisper API provider
+- Run Whisper locally or on your own infrastructure
+- Control costs or data privacy
+
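+If no alternate Whisper API base URL is set, aider will use the standard OpenAI API endpoint and credentials. If you do set one, you must also provide a Whisper API key; otherwise `/voice` reports an error.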
+
 ## Add aider to your editor 
 
 Other projects have integrated aider into some IDE/editors.

diff --git a/aider/website/docs/usage/voice.md b/aider/website/docs/usage/voice.md
@@ -17,11 +17,11 @@ when you ask aider to edit your code.
 Use the in-chat `/voice` command to start recording,
 and press `ENTER` when you're done speaking.
 Your voice coding instructions will be transcribed, 
-as if you had  typed them into
+as if you had typed them into
 the aider chat session.
 
 See the [installation instructions](https://aider.chat/docs/install/optional.html#enable-voice-coding) for
-information on how to enable the `/voice` command.
+information on how to enable the `/voice` command and configure alternate Whisper API endpoints.
 
 <br/>
 <div class="chat-transcript" markdown="1">

diff --git a/tests/basic/test_voice.py b/tests/basic/test_voice.py
@@ -53,6 +53,23 @@ def test_voice_init_invalid_format():
         assert "Unsupported audio format" in str(exc.value)
 
 
+def test_voice_init_whisper_credentials(mock_sounddevice):
+    voice = Voice(api_base_whisper="whisper_url", api_key_whisper="whisper_key")
+    assert voice.api_base == "whisper_url"
+    assert voice.api_key == "whisper_key"
+
+    # Test that providing api_base_whisper without api_key_whisper raises an error
+    with pytest.raises(ValueError) as exc:
+        Voice(api_base_whisper="whisper_url")
+    assert "api_key_whisper is required" in str(exc.value)
+
+
+def test_voice_init_fallback_credentials(mock_sounddevice):
+    voice = Voice(api_base="base_url", api_key="base_key")
+    assert voice.api_base == "base_url"
+    assert voice.api_key == "base_key"
+
+
 def test_callback_processing():
     with patch("aider.voice.sf", MagicMock()):  # Need to mock sf to avoid SoundDeviceError
         voice = Voice()