Allow selection of channel when loading / creating audio as mono

* Allow `remix` to choose a channel, and just copy the results from that channel if required.
* `ffmpeg` version: use the `pan` audio filter (see the [section in the manual](https://trac.ffmpeg.org/wiki/AudioChannelManipulation)).
* Propagate this parameter through all the audio loading layers.
* Add tests around all aspects of this.

This fixes CPJKU#211.
declension · Jan 16, 2019 · 0a47099 · 0a47099
1 parent 3942e9f
commit 0a47099
Show file tree

Hide file tree

Showing 4 changed files with 99 additions and 36 deletions.
diff --git a/madmom/audio/signal.py b/madmom/audio/signal.py
@@ -166,7 +166,7 @@ def normalize(signal):
     return np.asanyarray(signal / scaling, dtype=signal.dtype)
 
 
-def remix(signal, num_channels):
+def remix(signal, num_channels, channel=None):
     """
     Remix the signal to have the desired number of channels.
 
@@ -176,6 +176,8 @@ def remix(signal, num_channels):
         Signal to be remixed.
     num_channels : int
         Number of channels.
+    channel : int, optional
+        Channel to select when `num_channels` is 1, or 'None' to average all.
 
     Returns
     -------
@@ -196,16 +198,19 @@ def remix(signal, num_channels):
     convert the dtype first.
 
     """
-    # convert to the desired number of channels
     if num_channels == signal.ndim or num_channels is None:
         # return as many channels as there are.
         return signal
     elif num_channels == 1 and signal.ndim > 1:
-        # down-mix to mono
-        # Note: to prevent clipping, the signal is converted to float first
-        #       and then converted back to the original dtype
-        # TODO: add weighted mixing
-        return np.mean(signal, axis=-1).astype(signal.dtype)
+        if channel is None:
+            # down-mix to mono
+            # Note: to prevent clipping, the signal is converted to float first
+            #       and then converted back to the original dtype
+            # TODO: add weighted mixing
+            return np.mean(signal, axis=-1).astype(signal.dtype)
+        else:
+            # Copy the requested signal verbatim
+            return signal[:, channel]
     elif num_channels > 1 and signal.ndim == 1:
         # up-mix a mono signal simply by copying channels
         return np.tile(signal[:, np.newaxis], num_channels)
@@ -457,7 +462,7 @@ def load_wave_file(*args, **kwargs):
     warnings.warn('Deprecated as of version 0.16. Please use madmom.io.audio.'
                   'load_wave_file instead. Will be removed in version 0.18.')
     from ..io.audio import load_wave_file
     return load_wave_file(*args, **kwargs)
 
 
 def write_wave_file(*args, **kwargs):
@@ -488,6 +493,7 @@ def load_audio_file(*args, **kwargs):
 # signal classes
 SAMPLE_RATE = None
 NUM_CHANNELS = None
+CHANNEL = None
 START = None
 STOP = None
 NORM = False
@@ -510,6 +516,9 @@ class Signal(np.ndarray):
     num_channels : int, optional
         Reduce or expand the signal to `num_channels` channels, or 'None'
         to return the signal with its original channels.
+    channel : int, optional
+        When reducing a signal to `num_channels` of 1, use this channel,
+        or 'None' to return the average across all channels.
     start : float, optional
         Start position [seconds].
     stop : float, optional
@@ -581,14 +590,14 @@ class Signal(np.ndarray):
     # pylint: disable=attribute-defined-outside-init
 
     def __init__(self, data, sample_rate=SAMPLE_RATE,
-                 num_channels=NUM_CHANNELS, start=START, stop=STOP, norm=NORM,
-                 gain=GAIN, dtype=DTYPE, **kwargs):
+                 num_channels=NUM_CHANNELS, channel=CHANNEL, start=START,
+                 stop=STOP, norm=NORM, gain=GAIN, dtype=DTYPE, **kwargs):
         # this method is for documentation purposes only
         pass
 
     def __new__(cls, data, sample_rate=SAMPLE_RATE, num_channels=NUM_CHANNELS,
-                start=START, stop=STOP, norm=NORM, gain=GAIN, dtype=DTYPE,
-                **kwargs):
+                channel=CHANNEL, start=START, stop=STOP, norm=NORM, gain=GAIN,
+                dtype=DTYPE, **kwargs):
         from ..io.audio import load_audio_file
         # try to load an audio file if the data is not a numpy array
         if not isinstance(data, np.ndarray):
@@ -602,7 +611,7 @@ def __new__(cls, data, sample_rate=SAMPLE_RATE, num_channels=NUM_CHANNELS,
             data.sample_rate = sample_rate
         # remix to desired number of channels
         if num_channels:
-            data = remix(data, num_channels)
+            data = remix(data, num_channels, channel)
         # normalize signal if needed
         if norm:
             data = normalize(data)

diff --git a/madmom/io/audio.py b/madmom/io/audio.py
@@ -69,7 +69,8 @@ def _ffmpeg_fmt(dtype):
     return str(fmt)
 
 
-def _ffmpeg_call(infile, output, fmt='f32le', sample_rate=None, num_channels=1,
+def _ffmpeg_call(infile, output, fmt='f32le', sample_rate=None,
+                 num_channels=1, channel=None,
                  skip=None, max_len=None, cmd='ffmpeg',
                  replaygain_mode=None, replaygain_preamp=0.0):
     """
@@ -91,6 +92,8 @@ def _ffmpeg_call(infile, output, fmt='f32le', sample_rate=None, num_channels=1,
         Sample rate to re-sample the signal to (if set) [Hz].
     num_channels : int, optional
         Number of channels to reduce the signal to.
+    channel : int, optional
+        Single channel to select if `num_channels` is '1'.
     skip : float, optional
         Number of seconds to skip at beginning of file.
     max_len : float, optional
@@ -150,13 +153,16 @@ def _ffmpeg_call(infile, output, fmt='f32le', sample_rate=None, num_channels=1,
     # output options
     if num_channels:
         call.extend(["-ac", str(int(num_channels))])
+    if channel is not None and num_channels == 1:
+        call.extend(["-af", "pan=mono|c0=c%d" % int(channel)])
     if sample_rate:
         call.extend(["-ar", str(int(sample_rate))])
     call.append(output)
     return call
 
 
-def decode_to_disk(infile, fmt='f32le', sample_rate=None, num_channels=1,
+def decode_to_disk(infile, fmt='f32le', sample_rate=None,
+                   num_channels=1, channel=None,
                    skip=None, max_len=None, outfile=None, tmp_dir=None,
                    tmp_suffix=None, cmd='ffmpeg',
                    replaygain_mode=None, replaygain_preamp=0.0):
@@ -175,6 +181,8 @@ def decode_to_disk(infile, fmt='f32le', sample_rate=None, num_channels=1,
         Sample rate to re-sample the signal to (if set) [Hz].
     num_channels : int, optional
         Number of channels to reduce the signal to.
+    channel : int, optional
+        Single channel to select if `num_channels` is '1'.
     skip : float, optional
         Number of seconds to skip at beginning of file.
     max_len : float, optional
@@ -221,7 +229,8 @@ def decode_to_disk(infile, fmt='f32le', sample_rate=None, num_channels=1,
                          % outfile)
     # call ffmpeg (throws exception on error)
     try:
-        call = _ffmpeg_call(infile, outfile, fmt, sample_rate, num_channels,
+        call = _ffmpeg_call(infile, outfile, fmt, sample_rate,
+                            num_channels, channel,
                             skip, max_len, cmd,
                             replaygain_mode=replaygain_mode,
                             replaygain_preamp=replaygain_preamp)
@@ -233,7 +242,8 @@ def decode_to_disk(infile, fmt='f32le', sample_rate=None, num_channels=1,
     return outfile
 
 
-def decode_to_pipe(infile, fmt='f32le', sample_rate=None, num_channels=1,
+def decode_to_pipe(infile, fmt='f32le', sample_rate=None,
+                   num_channels=1, channel=None,
                    skip=None, max_len=None, buf_size=-1, cmd='ffmpeg',
                    replaygain_mode=None, replaygain_preamp=0.0):
     """
@@ -252,6 +262,8 @@ def decode_to_pipe(infile, fmt='f32le', sample_rate=None, num_channels=1,
         Sample rate to re-sample the signal to (if set) [Hz].
     num_channels : int, optional
         Number of channels to reduce the signal to.
+    channel : int, optional
+        Single channel to select if `num_channels` is '1'.
     skip : float, optional
         Number of seconds to skip at beginning of file.
     max_len : float, optional
@@ -289,8 +301,9 @@ def decode_to_pipe(infile, fmt='f32le', sample_rate=None, num_channels=1,
     #       reacts on that. A cleaner solution would be calling proc.terminate
     #       explicitly, but this is only available in Python 2.6+. proc.wait
     #       needs to be called in any case.
-    call = _ffmpeg_call(infile, "pipe:1", fmt, sample_rate, num_channels, skip,
-                        max_len, cmd,
+    call = _ffmpeg_call(infile, "pipe:1", fmt, sample_rate,
+                        num_channels, channel,
+                        skip, max_len, cmd,
                         replaygain_mode=replaygain_mode,
                         replaygain_preamp=replaygain_preamp)
     # redirect stdout to a pipe and buffer as requested
@@ -302,7 +315,8 @@ def decode_to_pipe(infile, fmt='f32le', sample_rate=None, num_channels=1,
     return proc.stdout, proc
 
 
-def decode_to_memory(infile, fmt='f32le', sample_rate=None, num_channels=1,
+def decode_to_memory(infile, fmt='f32le', sample_rate=None,
+                     num_channels=1, channel=None,
                      skip=None, max_len=None, cmd='ffmpeg',
                      replaygain_mode=None, replaygain_preamp=0.0):
     """
@@ -320,6 +334,8 @@ def decode_to_memory(infile, fmt='f32le', sample_rate=None, num_channels=1,
         Sample rate to re-sample the signal to (if set) [Hz].
     num_channels : int, optional
         Number of channels to reduce the signal to.
+    channel : int, optional
+        The single channel to select, if `num_channels` is '1'.
     skip : float, optional
         Number of seconds to skip at beginning of file.
     max_len : float, optional
@@ -343,8 +359,8 @@ def decode_to_memory(infile, fmt='f32le', sample_rate=None, num_channels=1,
                          "as `infile`, not %s." % infile)
     # prepare decoding to pipe
     _, proc = decode_to_pipe(infile, fmt=fmt, sample_rate=sample_rate,
-                             num_channels=num_channels, skip=skip,
-                             max_len=max_len, cmd=cmd,
+                             num_channels=num_channels, channel=channel,
+                             skip=skip, max_len=max_len, cmd=cmd,
                              replaygain_mode=replaygain_mode,
                              replaygain_preamp=replaygain_preamp)
     # decode the input to memory
@@ -402,7 +418,8 @@ def get_file_info(infile, cmd='ffprobe'):
     return info
 
 
-def load_ffmpeg_file(filename, sample_rate=None, num_channels=None,
+def load_ffmpeg_file(filename, sample_rate=None,
+                     num_channels=None, channel=None,
                      start=None, stop=None, dtype=None,
                      cmd_decode='ffmpeg', cmd_probe='ffprobe',
                      replaygain_mode=None, replaygain_preamp=0.0):
@@ -423,6 +440,8 @@ def load_ffmpeg_file(filename, sample_rate=None, num_channels=None,
     num_channels : int, optional
         Reduce or expand the signal to `num_channels` channels; 'None' returns
         the signal with its original channels.
+    channel : int, optional
+        Single channel to select if `num_channels` is '1'.
     start : float, optional
         Start position [seconds].
     stop : float, optional
@@ -463,6 +482,7 @@ def load_ffmpeg_file(filename, sample_rate=None, num_channels=None,
     signal = np.frombuffer(decode_to_memory(filename, fmt=fmt,
                                             sample_rate=sample_rate,
                                             num_channels=num_channels,
+                                            channel=channel,
                                             skip=start, max_len=max_len,
                                             cmd=cmd_decode,
                                             replaygain_mode=replaygain_mode,
@@ -483,8 +503,8 @@ def load_ffmpeg_file(filename, sample_rate=None, num_channels=None,
 
 
 # functions for loading/saving wave files
-def load_wave_file(filename, sample_rate=None, num_channels=None, start=None,
-                   stop=None, dtype=None):
+def load_wave_file(filename, sample_rate=None, num_channels=None, channel=None,
+                   start=None, stop=None, dtype=None):
     """
     Load the audio data from the given file and return it as a numpy array.
 
@@ -502,6 +522,8 @@ def load_wave_file(filename, sample_rate=None, num_channels=None, start=None,
     num_channels : int, optional
         Reduce or expand the signal to `num_channels` channels, or 'None'
         to return the signal with its original channels.
+    channel : int, optional
+        Single channel to select if `num_channels` is '1' (or 'None').
     start : float, optional
         Start position [seconds].
     stop : float, optional
@@ -545,10 +567,12 @@ def load_wave_file(filename, sample_rate=None, num_channels=None, start=None,
         stop = min(len(signal), int(stop * file_sample_rate))
     if start is not None or stop is not None:
         signal = signal[start: stop]
-    # up-/down-mix if needed
+    if channel is not None and num_channels is None:
+        # It's clear what the caller means here
+        num_channels = 1
     if num_channels is not None:
         from ..audio.signal import remix
-        signal = remix(signal, num_channels)
+        signal = remix(signal, num_channels, channel)
     # return the signal
     return signal, file_sample_rate
 
@@ -586,8 +610,9 @@ def write_wave_file(signal, filename, sample_rate=None):
 
 
 # function for automatically determining how to open audio files
-def load_audio_file(filename, sample_rate=None, num_channels=None, start=None,
-                    stop=None, dtype=None,
+def load_audio_file(filename, sample_rate=None,
+                    num_channels=None, channel=None,
+                    start=None, stop=None, dtype=None,
                     replaygain_mode=None, replaygain_preamp=0.0):
     """
     Load the audio data from the given file and return it as a numpy array.
@@ -600,9 +625,11 @@ def load_audio_file(filename, sample_rate=None, num_channels=None, start=None,
     sample_rate : int, optional
         Desired sample rate of the signal [Hz], or 'None' to return the
         signal in its original rate.
-    num_channels: int, optional
+    num_channels : int, optional
         Reduce or expand the signal to `num_channels` channels, or 'None'
         to return the signal with its original channels.
+    channel : int, optional
+        Single channel to select if `num_channels` is '1'.
     start : float, optional
         Start position [seconds].
     stop : float, optional
@@ -644,15 +671,15 @@ def load_audio_file(filename, sample_rate=None, num_channels=None, start=None,
     error = "All attempts to load audio file %r failed." % filename
     try:
         return load_wave_file(filename, sample_rate=sample_rate,
-                              num_channels=num_channels, start=start,
-                              stop=stop, dtype=dtype)
+                              num_channels=num_channels, channel=channel,
+                              start=start, stop=stop, dtype=dtype)
     except ValueError:
         pass
     # not a wave file (or other sample rate requested), try ffmpeg
     try:
         return load_ffmpeg_file(filename, sample_rate=sample_rate,
-                                num_channels=num_channels, start=start,
-                                stop=stop, dtype=dtype,
+                                num_channels=num_channels, channel=channel,
+                                start=start, stop=stop, dtype=dtype,
                                 replaygain_mode=replaygain_mode,
                                 replaygain_preamp=replaygain_preamp)
     except OSError as e:
@@ -663,8 +690,8 @@ def load_audio_file(filename, sample_rate=None, num_channels=None, start=None,
         # ffmpeg is not present, try avconv
         try:
             return load_ffmpeg_file(filename, sample_rate=sample_rate,
-                                    num_channels=num_channels, start=start,
-                                    stop=stop, dtype=dtype,
+                                    num_channels=num_channels, channel=channel,
+                                    start=start, stop=stop, dtype=dtype,
                                     cmd_decode='avconv', cmd_probe='avprobe',
                                     replaygain_mode=replaygain_mode,
                                     replaygain_preamp=replaygain_preamp)

diff --git a/tests/test_audio_signal.py b/tests/test_audio_signal.py
@@ -338,6 +338,13 @@ def test_types(self):
         self.assertTrue(result.shape == sig_2d.shape)
         self.assertTrue(result.dtype == np.int)
 
+    def test_channel_selection(self):
+        result = remix(sig_2d, 1, channel=0)
+        self.assertEqual(result.shape, sig_1d.shape)
+        self.assertTrue(np.array_equal(result, sig_1d))
+        result = remix(sig_2d, 1, channel=1)
+        self.assertTrue(np.array_equal(result, sig_2d[:, 1]))
+
     def test_values(self):
         # mono signals
         result = remix(sig_1d, 1)

diff --git a/tests/test_io_audio.py b/tests/test_io_audio.py
@@ -73,6 +73,13 @@ def test_downmix(self):
         self.assertTrue(sample_rate == 44100)
         self.assertTrue(signal.shape == (182919,))
 
+    def test_channel_choice(self):
+        signal, sample_rate = load_wave_file(stereo_sample_file, channel=0)
+        self.assertTrue(signal.shape == (182919,))
+        self.assertTrue(np.allclose(signal[:4], [33, 35, 29, 36]))
+        signal, sample_rate = load_wave_file(stereo_sample_file, channel=1)
+        self.assertTrue(np.allclose(signal[:4], [38, 36, 34, 31]))
+
     def test_upmix(self):
         signal, sample_rate = load_wave_file(sample_file, num_channels=2)
         self.assertTrue(np.allclose(signal[:5],
@@ -193,6 +200,11 @@ def test_values(self):
         self.assertTrue(sample_rate == 44100)
         self.assertTrue(signal.shape == (182919, 2))
 
+    def test_wave_channel_selection(self):
+        signal, sample_rate = load_audio_file(stereo_sample_file, channel=1)
+        self.assertTrue(signal.shape == (182919,))
+        self.assertTrue(np.allclose(signal[:4], [38, 36, 34, 31]))
+
     def test_start_stop(self):
         # test wave loader
         signal, sample_rate = load_audio_file(sample_file, start=1. / 44100,
@@ -263,6 +275,14 @@ def test_resample(self):
         # avconv results in a different length of 91450 samples
         self.assertTrue(np.allclose(len(signal), 91460, atol=10))
 
+    def test_choose_channel(self):
+        signal, sample_rate = load_audio_file(flac_file,
+                                              sample_rate=22050,
+                                              num_channels=1, channel=0)
+        # avconv results in a different length of 91450 samples
+        self.assertTrue(np.allclose(len(signal), 91460, atol=10))
+        self.assertTrue(np.allclose(signal[:5], [34, 32, 37, 35, 32], atol=1))
+
     def test_replaygain_disabled(self):
         original = load_signal(flac_file)
         signal = load_signal(rg_flac_file, replaygain_mode=None)