-
Notifications
You must be signed in to change notification settings - Fork 5
/
kamiak_train.srun.template
85 lines (74 loc) · 3.06 KB
/
kamiak_train.srun.template
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/bin/bash
# Slurm array-job template for training runs.  Tokens wrapped in double curly
# braces are placeholders that have to be filled in before this file is
# submitted (another comment in this file mentions experiments_msda.py
# creating the scripts -- presumably that is the renderer; confirm).
#
# Arguments once rendered: <suffix> [extra arguments forwarded to main.py...]
#
#SBATCH --job-name=train
# In the two log paths, %A is Slurm's job-array master job ID and %a is the
# array task index, so every array task writes its own pair of files.
#SBATCH --output=slurm_logs/train_%A_%a.out
#SBATCH --error=slurm_logs/train_%A_%a.err
#SBATCH --cpus-per-task={{cpus}}
#SBATCH --gres=gpu:{{gpus}}
#SBATCH --partition={{partitions}}
# Time limit in Slurm's days-hours:minutes:seconds form (0 days, 6 hours)
#SBATCH --time=0-06:00:00
#SBATCH --mem=10G
# One array task per index from 0 up to the rendered maximum
#SBATCH --array=0-{{max_array}}
# Source the shared config and the CPU- or GPU-specific script (a TensorFlow
# environment setup, judging by its name).
# NOTE(review): remotedir, logFolder and modelFolder are used later in this
# file but never assigned in it, so presumably one of these two files defines
# them -- confirm.
. kamiak_config.sh
. kamiak_tensorflow_{{cpu_or_gpu}}.sh
# Export the per-task CPU count reported by Slurm as OMP_NUM_THREADS
export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
# Errors
# Handler for termination signals: announce the shutdown on stdout and end
# the script with exit status 1.
handle_terminate() {
    printf '%s\n' "Exiting"
    exit 1
}
# Common failure exit used by the checks in this script: print a notice on
# stdout and end the script with exit status 1.
handle_error() {
    printf '%s\n' "Error occurred -- exiting"
    exit 1
}
# Run handle_terminate when the script receives SIGTERM or SIGINT
trap 'handle_terminate' TERM INT
# The first positional argument is the suffix (files are stored in
# kamiak-{models,logs}-suffix); everything after it stays in "$@"
suffix=$1
shift
if [[ -z $suffix ]]; then
    echo "no suffix specified"
    handle_error
fi
# Per-run settings stored as parallel arrays: array task i reads element i of
# every list.  Each double-curly-brace placeholder is replaced with the list's
# values when the script is generated (presumably by experiments_msda.py,
# which another comment in this file names -- confirm).
methods=({{methods}})
debugnums=({{debugnums}})
uids=({{uids}})
datasets=({{datasets}})
sources=({{sources}})
targets=({{targets}})
options=({{options}})
additional_suffixes=({{additional_suffixes}}) # for tuning
# Every per-run list must hold exactly one entry per method; report the first
# list (in declaration order) that does not and bail out
expected_count=${#methods[@]}
checked_names=(debugnums uids datasets sources targets options additional_suffixes)
checked_counts=(
    "${#debugnums[@]}" "${#uids[@]}" "${#datasets[@]}" "${#sources[@]}"
    "${#targets[@]}" "${#options[@]}" "${#additional_suffixes[@]}"
)
for check_idx in "${!checked_names[@]}"; do
    if [[ ${checked_counts[check_idx]} != "$expected_count" ]]; then
        echo "${checked_names[check_idx]} wrong length"
        handle_error
    fi
done
# Make sure we're using the right number -- no longer check here since we
# check when we create the scripts in experiments_msda.py, thus allowing us to
# use --array=0-5 etc. for instance
# correct_min=0
# correct_max=$(( ${#methods[@]} - 1 ))
# [[ $SLURM_ARRAY_TASK_MIN == $correct_min ]] || { echo "array min should be $correct_min"; handle_error; }
# [[ $SLURM_ARRAY_TASK_MAX == $correct_max ]] || { echo "array max should be $correct_max"; handle_error; }
# Get this run of the array's values.
# The array range actually submitted is not checked anywhere else in this
# script, so make sure this task's ID is a plain non-negative integer that
# points at an existing entry.  Without this, an empty, non-numeric or too
# large ID would select empty or unintended values and training would still
# be launched with them.
index=$SLURM_ARRAY_TASK_ID
# The regex is tested first so the arithmetic comparison only ever sees digits
if ! [[ $index =~ ^(0|[1-9][0-9]*)$ ]] || (( index >= ${#methods[@]} )); then
    echo "array task id '$index' does not match any of the ${#methods[@]} runs"
    handle_error
fi
method="${methods[$index]}"
debugnum="${debugnums[$index]}"
uid="${uids[$index]}"
dataset="${datasets[$index]}"
source="${sources[$index]}"
target="${targets[$index]}"
option="${options[$index]}"
additional_suffix="${additional_suffixes[$index]}"
# When tuning, the per-run extra suffix is tacked onto the base suffix
suffix+=$additional_suffix
# The "upper" (upper bound) method is run as method "none" with the target
# moved into the source slot and no target passed at all
case $method in
    upper)
        method="none"
        source="$target"
        target=""
        ;;
esac
# Log what this array task is about to run, one item per line.  "$*" joins
# the extra command-line arguments with spaces into the "Other args" line.
printf '%s\n' \
    "$suffix #$SLURM_ARRAY_TASK_ID" \
    "Method: $method" \
    "DebugNum: $debugnum" \
    "Other args: $option $*" \
    "UID: $uid" \
    "$dataset $source --> $target"
# Launch the training run from $remotedir (not assigned in this file --
# presumably supplied by the sourced config; confirm).  If the directory
# cannot be entered, abort rather than start main.py from whatever directory
# the job happens to be in.
cd "$remotedir" || handle_error
# Note: spaces in $option separate different args, so don't quote
# shellcheck disable=SC2086
python3 -B main.py \
    --logdir="$logFolder-$suffix" --modeldir="$modelFolder-$suffix" \
    --method="$method" --dataset="$dataset" --sources="$source" \
    --target="$target" --uid="$uid" --debugnum="$debugnum" \
    --gpumem=0 $option "$@" || handle_error