Skip to content

Commit

Permalink
Allow selection of channel when loading / creating audio as mono
Browse files Browse the repository at this point in the history
 * Allow `remix` to choose a channel, and just copy results from that channel if required
 * `ffmpeg` version - use `pan` audio filter (see [section in manual](https://trac.ffmpeg.org/wiki/AudioChannelManipulation)).
 * Propagate this parameter through all the audio loading layers
 * Add tests around all aspects of this

This fixes CPJKU#211.
  • Loading branch information
declension committed Jan 16, 2019
1 parent 3942e9f commit 0a47099
Show file tree
Hide file tree
Showing 4 changed files with 99 additions and 36 deletions.
35 changes: 22 additions & 13 deletions madmom/audio/signal.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ def normalize(signal):
return np.asanyarray(signal / scaling, dtype=signal.dtype)


def remix(signal, num_channels):
def remix(signal, num_channels, channel=None):
"""
Remix the signal to have the desired number of channels.
Expand All @@ -176,6 +176,8 @@ def remix(signal, num_channels):
Signal to be remixed.
num_channels : int
Number of channels.
channel : int, optional
The channel to select when `num_channels` is 1, or None to average them all.
Returns
-------
Expand All @@ -196,16 +198,19 @@ def remix(signal, num_channels):
convert the dtype first.
"""
# convert to the desired number of channels
if num_channels == signal.ndim or num_channels is None:
# return as many channels as there are.
return signal
elif num_channels == 1 and signal.ndim > 1:
# down-mix to mono
# Note: to prevent clipping, the signal is converted to float first
# and then converted back to the original dtype
# TODO: add weighted mixing
return np.mean(signal, axis=-1).astype(signal.dtype)
if channel is None:
# down-mix to mono
# Note: to prevent clipping, the signal is converted to float first
# and then converted back to the original dtype
# TODO: add weighted mixing
return np.mean(signal, axis=-1).astype(signal.dtype)
else:
# Copy the requested signal verbatim
return signal[:, channel]
elif num_channels > 1 and signal.ndim == 1:
# up-mix a mono signal simply by copying channels
return np.tile(signal[:, np.newaxis], num_channels)
Expand Down Expand Up @@ -457,7 +462,7 @@ def load_wave_file(*args, **kwargs):
warnings.warn('Deprecated as of version 0.16. Please use madmom.io.audio.'
'load_wave_file instead. Will be removed in version 0.18.')
from ..io.audio import load_wave_file
return load_wave_file(*args, **kwargs)
return load_wave_file(*args, **kwargs)


def write_wave_file(*args, **kwargs):
Expand Down Expand Up @@ -488,6 +493,7 @@ def load_audio_file(*args, **kwargs):
# signal classes
SAMPLE_RATE = None
NUM_CHANNELS = None
CHANNEL = None
START = None
STOP = None
NORM = False
Expand All @@ -510,6 +516,9 @@ class Signal(np.ndarray):
num_channels : int, optional
Reduce or expand the signal to `num_channels` channels, or 'None'
to return the signal with its original channels.
channel : int, optional
When reducing a signal to `num_channels` of 1, use this channel,
or 'None' to return the average across all channels.
start : float, optional
Start position [seconds].
stop : float, optional
Expand Down Expand Up @@ -581,14 +590,14 @@ class Signal(np.ndarray):
# pylint: disable=attribute-defined-outside-init

def __init__(self, data, sample_rate=SAMPLE_RATE,
num_channels=NUM_CHANNELS, start=START, stop=STOP, norm=NORM,
gain=GAIN, dtype=DTYPE, **kwargs):
num_channels=NUM_CHANNELS, channel=CHANNEL, start=START,
stop=STOP, norm=NORM, gain=GAIN, dtype=DTYPE, **kwargs):
# this method is for documentation purposes only
pass

def __new__(cls, data, sample_rate=SAMPLE_RATE, num_channels=NUM_CHANNELS,
start=START, stop=STOP, norm=NORM, gain=GAIN, dtype=DTYPE,
**kwargs):
channel=CHANNEL, start=START, stop=STOP, norm=NORM, gain=GAIN,
dtype=DTYPE, **kwargs):
from ..io.audio import load_audio_file
# try to load an audio file if the data is not a numpy array
if not isinstance(data, np.ndarray):
Expand All @@ -602,7 +611,7 @@ def __new__(cls, data, sample_rate=SAMPLE_RATE, num_channels=NUM_CHANNELS,
data.sample_rate = sample_rate
# remix to desired number of channels
if num_channels:
data = remix(data, num_channels)
data = remix(data, num_channels, channel)
# normalize signal if needed
if norm:
data = normalize(data)
Expand Down
73 changes: 50 additions & 23 deletions madmom/io/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,8 @@ def _ffmpeg_fmt(dtype):
return str(fmt)


def _ffmpeg_call(infile, output, fmt='f32le', sample_rate=None, num_channels=1,
def _ffmpeg_call(infile, output, fmt='f32le', sample_rate=None,
num_channels=1, channel=None,
skip=None, max_len=None, cmd='ffmpeg',
replaygain_mode=None, replaygain_preamp=0.0):
"""
Expand All @@ -91,6 +92,8 @@ def _ffmpeg_call(infile, output, fmt='f32le', sample_rate=None, num_channels=1,
Sample rate to re-sample the signal to (if set) [Hz].
num_channels : int, optional
Number of channels to reduce the signal to.
channel : int, optional
Single channel to select if `num_channels` is '1'.
skip : float, optional
Number of seconds to skip at beginning of file.
max_len : float, optional
Expand Down Expand Up @@ -150,13 +153,16 @@ def _ffmpeg_call(infile, output, fmt='f32le', sample_rate=None, num_channels=1,
# output options
if num_channels:
call.extend(["-ac", str(int(num_channels))])
if channel is not None and num_channels == 1:
call.extend(["-af", "pan=mono|c0=c%d" % int(channel)])
if sample_rate:
call.extend(["-ar", str(int(sample_rate))])
call.append(output)
return call


def decode_to_disk(infile, fmt='f32le', sample_rate=None, num_channels=1,
def decode_to_disk(infile, fmt='f32le', sample_rate=None,
num_channels=1, channel=None,
skip=None, max_len=None, outfile=None, tmp_dir=None,
tmp_suffix=None, cmd='ffmpeg',
replaygain_mode=None, replaygain_preamp=0.0):
Expand All @@ -175,6 +181,8 @@ def decode_to_disk(infile, fmt='f32le', sample_rate=None, num_channels=1,
Sample rate to re-sample the signal to (if set) [Hz].
num_channels : int, optional
Number of channels to reduce the signal to.
channel : int, optional
Single channel to select if `num_channels` is '1'.
skip : float, optional
Number of seconds to skip at beginning of file.
max_len : float, optional
Expand Down Expand Up @@ -221,7 +229,8 @@ def decode_to_disk(infile, fmt='f32le', sample_rate=None, num_channels=1,
% outfile)
# call ffmpeg (throws exception on error)
try:
call = _ffmpeg_call(infile, outfile, fmt, sample_rate, num_channels,
call = _ffmpeg_call(infile, outfile, fmt, sample_rate,
num_channels, channel,
skip, max_len, cmd,
replaygain_mode=replaygain_mode,
replaygain_preamp=replaygain_preamp)
Expand All @@ -233,7 +242,8 @@ def decode_to_disk(infile, fmt='f32le', sample_rate=None, num_channels=1,
return outfile


def decode_to_pipe(infile, fmt='f32le', sample_rate=None, num_channels=1,
def decode_to_pipe(infile, fmt='f32le', sample_rate=None,
num_channels=1, channel=None,
skip=None, max_len=None, buf_size=-1, cmd='ffmpeg',
replaygain_mode=None, replaygain_preamp=0.0):
"""
Expand All @@ -252,6 +262,8 @@ def decode_to_pipe(infile, fmt='f32le', sample_rate=None, num_channels=1,
Sample rate to re-sample the signal to (if set) [Hz].
num_channels : int, optional
Number of channels to reduce the signal to.
channel : int, optional
Single channel to select if `num_channels` is '1'.
skip : float, optional
Number of seconds to skip at beginning of file.
max_len : float, optional
Expand Down Expand Up @@ -289,8 +301,9 @@ def decode_to_pipe(infile, fmt='f32le', sample_rate=None, num_channels=1,
# reacts on that. A cleaner solution would be calling proc.terminate
# explicitly, but this is only available in Python 2.6+. proc.wait
# needs to be called in any case.
call = _ffmpeg_call(infile, "pipe:1", fmt, sample_rate, num_channels, skip,
max_len, cmd,
call = _ffmpeg_call(infile, "pipe:1", fmt, sample_rate,
num_channels, channel,
skip, max_len, cmd,
replaygain_mode=replaygain_mode,
replaygain_preamp=replaygain_preamp)
# redirect stdout to a pipe and buffer as requested
Expand All @@ -302,7 +315,8 @@ def decode_to_pipe(infile, fmt='f32le', sample_rate=None, num_channels=1,
return proc.stdout, proc


def decode_to_memory(infile, fmt='f32le', sample_rate=None, num_channels=1,
def decode_to_memory(infile, fmt='f32le', sample_rate=None,
num_channels=1, channel=None,
skip=None, max_len=None, cmd='ffmpeg',
replaygain_mode=None, replaygain_preamp=0.0):
"""
Expand All @@ -320,6 +334,8 @@ def decode_to_memory(infile, fmt='f32le', sample_rate=None, num_channels=1,
Sample rate to re-sample the signal to (if set) [Hz].
num_channels : int, optional
Number of channels to reduce the signal to.
channel : int, optional
The single channel to select, if `num_channels` is '1'.
skip : float, optional
Number of seconds to skip at beginning of file.
max_len : float, optional
Expand All @@ -343,8 +359,8 @@ def decode_to_memory(infile, fmt='f32le', sample_rate=None, num_channels=1,
"as `infile`, not %s." % infile)
# prepare decoding to pipe
_, proc = decode_to_pipe(infile, fmt=fmt, sample_rate=sample_rate,
num_channels=num_channels, skip=skip,
max_len=max_len, cmd=cmd,
num_channels=num_channels, channel=channel,
skip=skip, max_len=max_len, cmd=cmd,
replaygain_mode=replaygain_mode,
replaygain_preamp=replaygain_preamp)
# decode the input to memory
Expand Down Expand Up @@ -402,7 +418,8 @@ def get_file_info(infile, cmd='ffprobe'):
return info


def load_ffmpeg_file(filename, sample_rate=None, num_channels=None,
def load_ffmpeg_file(filename, sample_rate=None,
num_channels=None, channel=None,
start=None, stop=None, dtype=None,
cmd_decode='ffmpeg', cmd_probe='ffprobe',
replaygain_mode=None, replaygain_preamp=0.0):
Expand All @@ -423,6 +440,8 @@ def load_ffmpeg_file(filename, sample_rate=None, num_channels=None,
num_channels : int, optional
Reduce or expand the signal to `num_channels` channels; 'None' returns
the signal with its original channels.
channel : int, optional
Single channel to select if `num_channels` is '1'.
start : float, optional
Start position [seconds].
stop : float, optional
Expand Down Expand Up @@ -463,6 +482,7 @@ def load_ffmpeg_file(filename, sample_rate=None, num_channels=None,
signal = np.frombuffer(decode_to_memory(filename, fmt=fmt,
sample_rate=sample_rate,
num_channels=num_channels,
channel=channel,
skip=start, max_len=max_len,
cmd=cmd_decode,
replaygain_mode=replaygain_mode,
Expand All @@ -483,8 +503,8 @@ def load_ffmpeg_file(filename, sample_rate=None, num_channels=None,


# functions for loading/saving wave files
def load_wave_file(filename, sample_rate=None, num_channels=None, start=None,
stop=None, dtype=None):
def load_wave_file(filename, sample_rate=None, num_channels=None, channel=None,
start=None, stop=None, dtype=None):
"""
Load the audio data from the given file and return it as a numpy array.
Expand All @@ -502,6 +522,8 @@ def load_wave_file(filename, sample_rate=None, num_channels=None, start=None,
num_channels : int, optional
Reduce or expand the signal to `num_channels` channels, or 'None'
to return the signal with its original channels.
channel : int, optional
Single channel to select if `num_channels` is '1' (or 'None', which is then treated as '1').
start : float, optional
Start position [seconds].
stop : float, optional
Expand Down Expand Up @@ -545,10 +567,12 @@ def load_wave_file(filename, sample_rate=None, num_channels=None, start=None,
stop = min(len(signal), int(stop * file_sample_rate))
if start is not None or stop is not None:
signal = signal[start: stop]
# up-/down-mix if needed
if channel is not None and num_channels is None:
# It's clear what the caller means here
num_channels = 1
if num_channels is not None:
from ..audio.signal import remix
signal = remix(signal, num_channels)
signal = remix(signal, num_channels, channel)
# return the signal
return signal, file_sample_rate

Expand Down Expand Up @@ -586,8 +610,9 @@ def write_wave_file(signal, filename, sample_rate=None):


# function for automatically determining how to open audio files
def load_audio_file(filename, sample_rate=None, num_channels=None, start=None,
stop=None, dtype=None,
def load_audio_file(filename, sample_rate=None,
num_channels=None, channel=None,
start=None, stop=None, dtype=None,
replaygain_mode=None, replaygain_preamp=0.0):
"""
Load the audio data from the given file and return it as a numpy array.
Expand All @@ -600,9 +625,11 @@ def load_audio_file(filename, sample_rate=None, num_channels=None, start=None,
sample_rate : int, optional
Desired sample rate of the signal [Hz], or 'None' to return the
signal in its original rate.
num_channels: int, optional
num_channels : int, optional
Reduce or expand the signal to `num_channels` channels, or 'None'
to return the signal with its original channels.
channel : int, optional
Single channel to select if `num_channels` is '1'.
start : float, optional
Start position [seconds].
stop : float, optional
Expand Down Expand Up @@ -644,15 +671,15 @@ def load_audio_file(filename, sample_rate=None, num_channels=None, start=None,
error = "All attempts to load audio file %r failed." % filename
try:
return load_wave_file(filename, sample_rate=sample_rate,
num_channels=num_channels, start=start,
stop=stop, dtype=dtype)
num_channels=num_channels, channel=channel,
start=start, stop=stop, dtype=dtype)
except ValueError:
pass
# not a wave file (or other sample rate requested), try ffmpeg
try:
return load_ffmpeg_file(filename, sample_rate=sample_rate,
num_channels=num_channels, start=start,
stop=stop, dtype=dtype,
num_channels=num_channels, channel=channel,
start=start, stop=stop, dtype=dtype,
replaygain_mode=replaygain_mode,
replaygain_preamp=replaygain_preamp)
except OSError as e:
Expand All @@ -663,8 +690,8 @@ def load_audio_file(filename, sample_rate=None, num_channels=None, start=None,
# ffmpeg is not present, try avconv
try:
return load_ffmpeg_file(filename, sample_rate=sample_rate,
num_channels=num_channels, start=start,
stop=stop, dtype=dtype,
num_channels=num_channels, channel=channel,
start=start, stop=stop, dtype=dtype,
cmd_decode='avconv', cmd_probe='avprobe',
replaygain_mode=replaygain_mode,
replaygain_preamp=replaygain_preamp)
Expand Down
7 changes: 7 additions & 0 deletions tests/test_audio_signal.py
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,13 @@ def test_types(self):
self.assertTrue(result.shape == sig_2d.shape)
self.assertTrue(result.dtype == np.int)

def test_channel_selection(self):
    """Remixing to mono with an explicit `channel` returns that channel."""
    # channel 0 of `sig_2d` is expected to match `sig_1d` in shape and values
    result = remix(sig_2d, 1, channel=0)
    # `assertEqual` replaces the deprecated `assertEquals` alias
    self.assertEqual(result.shape, sig_1d.shape)
    self.assertTrue(np.array_equal(result, sig_1d))
    # channel 1 must be returned verbatim (no averaging across channels)
    result = remix(sig_2d, 1, channel=1)
    self.assertTrue(np.array_equal(result, sig_2d[:, 1]))

def test_values(self):
# mono signals
result = remix(sig_1d, 1)
Expand Down
20 changes: 20 additions & 0 deletions tests/test_io_audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,13 @@ def test_downmix(self):
self.assertTrue(sample_rate == 44100)
self.assertTrue(signal.shape == (182919,))

def test_channel_choice(self):
    """Loading a wave file with `channel` set yields only that channel."""
    # channel 0: one-dimensional result with the known leading samples
    first, _ = load_wave_file(stereo_sample_file, channel=0)
    self.assertTrue(first.shape == (182919,))
    first_ok = np.allclose(first[:4], [33, 35, 29, 36])
    self.assertTrue(first_ok)
    # channel 1: only the leading samples are checked
    second, _ = load_wave_file(stereo_sample_file, channel=1)
    second_ok = np.allclose(second[:4], [38, 36, 34, 31])
    self.assertTrue(second_ok)

def test_upmix(self):
signal, sample_rate = load_wave_file(sample_file, num_channels=2)
self.assertTrue(np.allclose(signal[:5],
Expand Down Expand Up @@ -193,6 +200,11 @@ def test_values(self):
self.assertTrue(sample_rate == 44100)
self.assertTrue(signal.shape == (182919, 2))

def test_wave_channel_selection(self):
    """`load_audio_file` passes `channel` through when loading a wave file."""
    loaded = load_audio_file(stereo_sample_file, channel=1)
    signal, sample_rate = loaded
    # a single channel was requested, so the result is one-dimensional
    expected_shape = (182919,)
    self.assertTrue(signal.shape == expected_shape)
    # leading samples of channel 1
    head_matches = np.allclose(signal[:4], [38, 36, 34, 31])
    self.assertTrue(head_matches)

def test_start_stop(self):
# test wave loader
signal, sample_rate = load_audio_file(sample_file, start=1. / 44100,
Expand Down Expand Up @@ -263,6 +275,14 @@ def test_resample(self):
# avconv results in a different length of 91450 samples
self.assertTrue(np.allclose(len(signal), 91460, atol=10))

def test_choose_channel(self):
    """Channel selection combined with re-sampling of a FLAC file."""
    options = dict(sample_rate=22050, num_channels=1, channel=0)
    signal, sample_rate = load_audio_file(flac_file, **options)
    # avconv results in a different length of 91450 samples
    length_ok = np.allclose(len(signal), 91460, atol=10)
    self.assertTrue(length_ok)
    # sample values are compared with a tolerance of 1
    head_ok = np.allclose(signal[:5], [34, 32, 37, 35, 32], atol=1)
    self.assertTrue(head_ok)

def test_replaygain_disabled(self):
original = load_signal(flac_file)
signal = load_signal(rg_flac_file, replaygain_mode=None)
Expand Down

0 comments on commit 0a47099

Please sign in to comment.