ABX script for CPC #13

Status: Open. Wants to merge 3 commits into base: master.

The diff below shows changes from all commits.
1 change: 0 additions & 1 deletion cpc/model.py
@@ -304,7 +304,6 @@ def __init__(self,
     def forward(self, batchData, label):
         cFeature, encodedData, label = self.cpc(batchData, label)
         cFeature = self.nullspace(cFeature)
-        encodedData = self.nullspace(encodedData)
         return cFeature, encodedData, label


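Note (editorial inference, not stated in the PR): scripts/embeddings_abx.py below registers a forward hook on model.nullspace, and with the nullspace applied to both tensors the hook would keep only the last call, i.e. the projection of encodedData. Removing the second call presumably makes the hook capture the nullspaced context features cFeature instead.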
61 changes: 54 additions & 7 deletions finetune_nullspace.sh
@@ -1,30 +1,77 @@
SAVE_DIR="/pio/scratch/1/i273233/linear_separability/cpc/gru_level2/cpc_official"
SPEAKERS="speakers_factorized"
PHONEMES="phonemes_nullspace"
SPEAKERS_NULLSPACE="speakers_nullspace"

DIM_INTER=$1
DATASET_PATH=false
TRAIN_SPLIT_FILE_PATH=false
VALIDATION_SPLIT_FILE_PATH=false
BASELINE_NO_CLUSTERING_CHECKPOINT_PATH=false
SAVE_DIR=false
DIM_INBETWEEN=false
FROM_STEP=$SPEAKERS
if [[ $# -ge 2 ]]; then
FROM_STEP=$2
PHONEME_ALIGNMENTS_FILE=false

print_usage() {
echo -e "Usage: ./finetune_nullspace.sh"
echo -e "\t-d DATASET_PATH (E.g. LIBRISPEECH_DATASET_PATH/train-clean-100)"
echo -e "\t-t TRAIN_SPLIT_FILE_PATH (E.g. LIBRISPEECH_TRAIN_CLEAN_100_TRAIN_SPLIT_FILE_PATH)"
echo -e "\t-v VALIDATION_SPLIT_FILE_PATH (E.g. LIBRISPEECH_TRAIN_CLEAN_100_TEST_SPLIT_FILE_PATH)"
echo -e "\t-c BASELINE_NO_CLUSTERING_CHECKPOINT_PATH"
echo -e "\t-o SAVE_DIR"
echo -e "\t-n DIM_INBETWEEN (Dimension of nullspace will be DIM_EMBEDDING - DIM_INBETWEEN)"
echo -e "\t-p PHONEME_ALIGNMENTS_FILE (Path to the file containing phonemes for the entire dataset)"
echo -e "OPTIONAL ARGS:"
echo -e "\t-f FROM_STEP (From which step do you want to start. Order: $SPEAKERS [default] -> $PHONEMES -> $SPEAKERS_NULLSPACE)"
}

while getopts 'd:t:v:c:o:n:f:p:' flag; do
case "${flag}" in
d) DATASET_PATH="${OPTARG}" ;;
t) TRAIN_SPLIT_FILE_PATH="${OPTARG}" ;;
v) VALIDATION_SPLIT_FILE_PATH="${OPTARG}" ;;
c) BASELINE_NO_CLUSTERING_CHECKPOINT_PATH="${OPTARG}" ;;
o) SAVE_DIR="${OPTARG}" ;;
n) DIM_INBETWEEN="${OPTARG}" ;;
f) FROM_STEP="${OPTARG}" ;;
p) PHONEME_ALIGNMENTS_FILE="${OPTARG}" ;;
*) print_usage
exit 1 ;;
esac
done

echo $DATASET_PATH $TRAIN_SPLIT_FILE_PATH $VALIDATION_SPLIT_FILE_PATH $BASELINE_NO_CLUSTERING_CHECKPOINT_PATH $SAVE_DIR $DIM_INBETWEEN $FROM_STEP $PHONEME_ALIGNMENTS_FILE

if [[ $DATASET_PATH == false || $TRAIN_SPLIT_FILE_PATH == false || $VALIDATION_SPLIT_FILE_PATH == false || $BASELINE_NO_CLUSTERING_CHECKPOINT_PATH == false || $SAVE_DIR == false || $DIM_INBETWEEN == false || $PHONEME_ALIGNMENTS_FILE == false ]]
then
echo "Either DATASET_PATH, TRAIN_SPLIT_FILE_PATH, VALIDATION_SPLIT_FILE_PATH, BASELINE_NO_CLUSTERING_CHECKPOINT_PATH, SAVE_DIR, DIM_INBETWEEN or PHONEME_ALIGNMENTS_FILE is not set."
print_usage
exit 1
fi

mkdir -p $SAVE_DIR

case $FROM_STEP in
$SPEAKERS)
echo $SPEAKERS
mkdir -p ${SAVE_DIR}_${SPEAKERS}_${DIM_INTER} && python cpc/eval/linear_separability.py $zd/LibriSpeech/train-clean-100/ $zd/LibriSpeech/labels_split/train_split_100.txt $zd/LibriSpeech/labels_split/test_split_100.txt $zd/checkpoints/CPC-big-kmeans50/cpc_ll6k/checkpoint_32.pt --pathCheckpoint ${SAVE_DIR}_${SPEAKERS}_${DIM_INTER} --mode $SPEAKERS --max_size_loaded 40000000 --n_process_loader 2 --model cpc --dim_inter $DIM_INTER --gru_level 2 | tee ${SAVE_DIR}_${SPEAKERS}_${DIM_INTER}/log.txt
mkdir -p ${SAVE_DIR}/${SPEAKERS}_${DIM_INBETWEEN}
python cpc/eval/linear_separability.py $DATASET_PATH $TRAIN_SPLIT_FILE_PATH $VALIDATION_SPLIT_FILE_PATH $BASELINE_NO_CLUSTERING_CHECKPOINT_PATH --pathCheckpoint ${SAVE_DIR}/${SPEAKERS}_${DIM_INBETWEEN} --mode $SPEAKERS --max_size_loaded 40000000 --n_process_loader 2 --model cpc --dim_inter $DIM_INBETWEEN --gru_level 2
;&
$PHONEMES)
echo $PHONEMES
mkdir -p ${SAVE_DIR}_${PHONEMES}_${DIM_INTER} && python cpc/eval/linear_separability.py $zd/LibriSpeech/train-clean-100/ $zd/LibriSpeech/labels_split/train_split_100.txt $zd/LibriSpeech/labels_split/test_split_100.txt $zd/checkpoints/CPC-big-kmeans50/cpc_ll6k/checkpoint_32.pt --pathCheckpoint ${SAVE_DIR}_${PHONEMES}_${DIM_INTER} --mode $PHONEMES --max_size_loaded 40000000 --n_process_loader 2 --model cpc --pathPhone $zd/LibriSpeech/alignments2/converted_aligned_phones.txt --path_speakers_factorized ${SAVE_DIR}_${SPEAKERS}_${DIM_INTER}/checkpoint_9.pt --dim_inter $DIM_INTER --gru_level 2 | tee ${SAVE_DIR}_${PHONEMES}_${DIM_INTER}/log.txt
mkdir -p ${SAVE_DIR}/${PHONEMES}_${DIM_INBETWEEN}
python cpc/eval/linear_separability.py $DATASET_PATH $TRAIN_SPLIT_FILE_PATH $VALIDATION_SPLIT_FILE_PATH $BASELINE_NO_CLUSTERING_CHECKPOINT_PATH --pathCheckpoint ${SAVE_DIR}/${PHONEMES}_${DIM_INBETWEEN} --mode $PHONEMES --max_size_loaded 40000000 --n_process_loader 2 --model cpc --pathPhone $PHONEME_ALIGNMENTS_FILE --path_speakers_factorized ${SAVE_DIR}/${SPEAKERS}_${DIM_INBETWEEN}/checkpoint_9.pt --dim_inter $DIM_INBETWEEN --gru_level 2
;&
$SPEAKERS_NULLSPACE)
echo $SPEAKERS_NULLSPACE
mkdir -p ${SAVE_DIR}_${SPEAKERS_NULLSPACE}_${DIM_INTER} && python cpc/eval/linear_separability.py $zd/LibriSpeech/train-clean-100/ $zd/LibriSpeech/labels_split/train_split_100.txt $zd/LibriSpeech/labels_split/test_split_100.txt $zd/checkpoints/CPC-big-kmeans50/cpc_ll6k/checkpoint_32.pt --pathCheckpoint ${SAVE_DIR}_${SPEAKERS_NULLSPACE}_${DIM_INTER} --mode $SPEAKERS_NULLSPACE --max_size_loaded 40000000 --n_process_loader 2 --model cpc --path_speakers_factorized ${SAVE_DIR}_${SPEAKERS}_${DIM_INTER}/checkpoint_9.pt --dim_inter $DIM_INTER --gru_level 2 | tee ${SAVE_DIR}_${SPEAKERS_NULLSPACE}_${DIM_INTER}/log.txt
mkdir -p ${SAVE_DIR}/${SPEAKERS_NULLSPACE}_${DIM_INBETWEEN}
python cpc/eval/linear_separability.py $DATASET_PATH $TRAIN_SPLIT_FILE_PATH $VALIDATION_SPLIT_FILE_PATH $BASELINE_NO_CLUSTERING_CHECKPOINT_PATH --pathCheckpoint ${SAVE_DIR}/${SPEAKERS_NULLSPACE}_${DIM_INBETWEEN} --mode $SPEAKERS_NULLSPACE --max_size_loaded 40000000 --n_process_loader 2 --model cpc --path_speakers_factorized ${SAVE_DIR}/${SPEAKERS}_${DIM_INBETWEEN}/checkpoint_9.pt --dim_inter $DIM_INBETWEEN --gru_level 2
;;
*)
echo "Invalid from step: ${FROM_STEP} while it should be either ${SPEAKERS}, ${PHONEMES} or ${SPEAKERS_NULLSPACE}"
;;
esac

echo "Checkpoint with nullspace is located in ${SAVE_DIR}/${PHONEMES}_${DIM_INBETWEEN}/checkpoint_9.pt"
echo "The results of all the experiments are located in ${SAVE_DIR}/DIRECTORY/checkpoint_logs.json"

exit 0
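For reference, a hypothetical invocation of the rewritten script (every path and the value 64 below are placeholders, not taken from the PR):

    ./finetune_nullspace.sh \
        -d $LIBRISPEECH/train-clean-100 \
        -t $SPLITS/train_split_100.txt \
        -v $SPLITS/test_split_100.txt \
        -c $CHECKPOINTS/checkpoint_32.pt \
        -o $OUT/nullspace \
        -n 64 \
        -p $ALIGNMENTS/converted_aligned_phones.txt

Because the first two case arms end with ;& (bash fall-through), passing -f phonemes_nullspace or -f speakers_nullspace resumes the pipeline at that step and still runs every step after it.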
56 changes: 56 additions & 0 deletions scripts/create_ls_dataset_for_abx_eval.py
@@ -0,0 +1,56 @@
import os
import sys
import shutil
import argparse
from pathlib import Path
import numpy as np
import soundfile as sf


def parse_args():
    # Run parameters
    parser = argparse.ArgumentParser()
    parser.add_argument("librispeech_path", type=str,
                        help="Path to the root directory of LibriSpeech.")
    parser.add_argument("zerospeech_dataset_path", type=str,
                        help="Path to the ZeroSpeech dataset.")
    parser.add_argument("target_path", type=str,
                        help="Path to the output directory.")
    parser.add_argument("--file_extension", type=str, default="flac",
                        help="Extension of the audio files in the dataset (default: flac).")
    return parser.parse_args()


def main():
    # Parse and print args
    args = parse_args()
    print(args)

    phonetic = "phonetic"
    datasets = ["dev-clean", "dev-other", "test-clean", "test-other"]

    for dataset in datasets:
        print("> {}".format(dataset))
        target_dirname = os.path.join(args.target_path, phonetic, dataset)
        Path(target_dirname).mkdir(parents=True, exist_ok=True)

        # Collect every audio file with the requested extension from this split
        librispeech_dirname = os.path.join(args.librispeech_path, dataset)
        files = [(filename, dirname)
                 for dirname, _, files in os.walk(librispeech_dirname, followlinks=True)
                 for filename in files if filename.endswith(args.file_extension)]
        for i, (filename, dirname) in enumerate(files):
            print("Progress {:2.1%}".format(i / len(files)), end="\r")
            # Re-encode each file as wav in the flat target directory
            input_path = os.path.join(dirname, filename)
            output_path = os.path.join(target_dirname, os.path.splitext(filename)[0] + ".wav")
            data, sample_rate = sf.read(input_path)
            sf.write(output_path, data, sample_rate)

        # The dev splits also need the ZeroSpeech .item files for ABX evaluation
        if dataset.startswith("dev"):
            source_item_path = os.path.join(args.zerospeech_dataset_path, phonetic, dataset, dataset + ".item")
            target_item_path = os.path.join(target_dirname, dataset + ".item")
            shutil.copy(source_item_path, target_item_path)


if __name__ == "__main__":
    #import ptvsd
    #ptvsd.enable_attach(('0.0.0.0', 7310))
    #print("Attach debugger now")
    #ptvsd.wait_for_attach()
    main()
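A hypothetical invocation (all three paths are placeholders); this converts the four LibriSpeech evaluation splits to wav and copies the dev .item files needed by the ABX task:

    python scripts/create_ls_dataset_for_abx_eval.py \
        $LIBRISPEECH_ROOT $ZEROSPEECH_DATASET $ABX_DATASET --file_extension flac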

133 changes: 133 additions & 0 deletions scripts/embeddings_abx.py
@@ -0,0 +1,133 @@
#!/usr/bin/env python3 -u

import logging
import os
import sys
import argparse
from itertools import chain
from pathlib import Path
import time
import copy
import numpy as np
import soundfile as sf

from cpc.feature_loader import loadModel, FeatureModule

import torch
import torch.nn as nn
import torch.nn.functional as F

logging.basicConfig(
    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=os.environ.get("LOGLEVEL", "INFO").upper(),
    stream=sys.stdout,
)
logger = logging.getLogger("zerospeech2021 abx")


def parse_args():
    # Run parameters
    parser = argparse.ArgumentParser()
    parser.add_argument("path_checkpoint", type=str,
                        help="Path to the trained CPC model checkpoint.")
    parser.add_argument("path_data", type=str,
                        help="Path to the dataset that we want to compute ABX for.")
    parser.add_argument("path_output_dir", type=str,
                        help="Path to the output directory.")
    parser.add_argument("--debug", action="store_true",
                        help="Load only a very small amount of files for "
                             "debugging purposes.")
    parser.add_argument("--cpu", action="store_true",
                        help="Run on a cpu machine.")
    parser.add_argument("--file_extension", type=str, default="wav",
                        help="Extension of the audio files in the dataset (default: wav).")
    parser.add_argument("--no_test", action="store_true",
                        help="Don't compute embeddings for test-* parts of dataset")
    parser.add_argument('--gru_level', type=int, default=-1,
                        help='Hidden level of the LSTM autoregressive model to be taken '
                             '(default: -1, last layer).')
    parser.add_argument('--nullspace', action='store_true',
                        help="Additionally load nullspace")
    return parser.parse_args()


def main():
    # Parse and print args
    args = parse_args()
    logger.info(args)

    # Load the model
    print("")
    print(f"Loading model from {args.path_checkpoint}")

    if args.gru_level is not None and args.gru_level > 0:
        updateConfig = argparse.Namespace(nLevelsGRU=args.gru_level)
    else:
        updateConfig = None

    model = loadModel([args.path_checkpoint], load_nullspace=args.nullspace, updateConfig=updateConfig)[0]

    if args.gru_level is not None and args.gru_level > 0:
        # Keep hidden units at LSTM layers on sequential batches
        if args.nullspace:
            model.cpc.gAR.keepHidden = True
        else:
            model.gAR.keepHidden = True

    device = "cuda" if torch.cuda.is_available() and not args.cpu else "cpu"

    # Register the hooks
    layer_outputs = {}
    def get_layer_output(name):
        def hook(model, input, output):
            if type(output) is tuple:
                layer_outputs[name] = output[0].detach().squeeze(1).cpu().numpy()
            elif type(output) is dict:
                layer_outputs[name] = output["x"].detach().squeeze(0).cpu().numpy()
            else:
                layer_outputs[name] = output.detach().squeeze(0).cpu().numpy()
        return hook

    layer_names = []
    layer_name = os.path.basename(os.path.dirname(args.path_checkpoint))
    layer_names.append(layer_name)
    if not args.nullspace:
        model.gAR.register_forward_hook(get_layer_output(layer_name))
    else:
        model.nullspace.register_forward_hook(get_layer_output(layer_name))

    model = model.eval().to(device)
    print("Model loaded!")
    print(model)

    # Extract values from chosen layers and save them to files
    phonetic = "phonetic"
    datasets_path = os.path.join(args.path_data, phonetic)
    datasets = os.listdir(datasets_path)
    datasets = [dataset for dataset in datasets if not args.no_test or not dataset.startswith("test")]
    print(datasets)

    with torch.no_grad():
        for dataset in datasets:
            print("> {}".format(dataset))
            dataset_path = os.path.join(datasets_path, dataset)
            files = [f for f in os.listdir(dataset_path) if f.endswith(args.file_extension)]
            for i, f in enumerate(files):
                print("Progress {:2.1%}".format(i / len(files)), end="\r")
                input_f = os.path.join(dataset_path, f)
                x, sample_rate = sf.read(input_f)
                x = torch.tensor(x).float().reshape(1, 1, -1).to(device)
                # The forward hook fills layer_outputs as a side effect
                output = model(x, None)[0]

                for layer_name, value in layer_outputs.items():
                    output_dir = os.path.join(args.path_output_dir, layer_name, phonetic, dataset)
                    Path(output_dir).mkdir(parents=True, exist_ok=True)
                    out_f = os.path.join(output_dir, os.path.splitext(f)[0] + ".txt")
                    np.savetxt(out_f, value)


if __name__ == "__main__":
    #import ptvsd
    #ptvsd.enable_attach(('0.0.0.0', 7310))
    #print("Attach debugger now")
    #ptvsd.wait_for_attach()
    main()
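A hypothetical invocation (placeholder paths), using the checkpoint that finetune_nullspace.sh reports at the end; with --nullspace the hook is registered on model.nullspace, so the saved .txt files contain nullspace-projected context features:

    python scripts/embeddings_abx.py \
        $OUT/nullspace/phonemes_nullspace_64/checkpoint_9.pt \
        $ABX_DATASET $EMBEDDINGS_OUT --gru_level 2 --nullspace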
