#################################################
## File: deep.ae.R
## Author: Chrysanthi Ainali
## Date: 22.03.2018
## Last updated: 14.05.2018
#################################################
### step 1: a collection of autoencoders, where each one learns the underlying manifold of a group of similar objects
## run the deep learning models in H2O; the autoencoders are trained simultaneously
## to upgrade h2o: detach and remove any old version, then reinstall from the H2O repository
# install.packages("h2o", type = "source", repos = "http://h2o-release.s3.amazonaws.com/h2o/latest_stable_R")
if ("package:h2o" %in% search()) { detach("package:h2o", unload = TRUE) }
if ("h2o" %in% rownames(installed.packages())) { remove.packages("h2o") }
###
library(h2o)
# start a local H2O cluster using all available cores
h2o.init(nthreads = -1)
h2o.no_progress()  # disable progress bars (useful when knitting Rmd)
### example dataset
# This step takes a few seconds because the data are downloaded from the internet
train_file <- "https://h2o-public-test-data.s3.amazonaws.com/bigdata/laptop/mnist/train.csv.gz"
test_file <- "https://h2o-public-test-data.s3.amazonaws.com/bigdata/laptop/mnist/test.csv.gz"
train <- h2o.importFile(train_file)
test <- h2o.importFile(test_file)
y <- "C785" #response column: digits 0-9
x <- setdiff(names(train), y) #vector of predictor column names
# Since the response is encoded as integers, we need to tell H2O that
# the response is in fact a categorical/factor column. Otherwise, it
# will train a regression model instead of multiclass classification.
train[,y] <- as.factor(train[,y])
test[,y] <- as.factor(test[,y])
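## (optional) quick sanity checks on the imported frames; a minimal sketch
## using standard H2O/R accessors (dim, h2o.levels)
dim(train)              # expect 60000 rows x 785 columns
dim(test)               # expect 10000 rows x 785 columns
h2o.levels(train[, y])  # should list the 10 digit classes 0-9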
######### parameter definitions ###############
# x: vector of predictor column names used as training features
# y: name of the response column (the dependent variable)
# training_frame: the dataset used to build the model
# validation_frame: the dataset used to evaluate model accuracy (optional)
# nfolds: number of folds for cross-validation
# distribution: the loss function; one of "bernoulli", "multinomial", "poisson",
#   "gamma", "tweedie", "laplace", "huber" or "gaussian", while "AUTO" picks a
#   suitable distribution automatically based on the response
# activation: one of "Tanh", "TanhWithDropout", "Rectifier",
#   "RectifierWithDropout", "Maxout" or "MaxoutWithDropout"
# sparse: boolean flag indicating the input data contain a high proportion of zeros
# autoencoder: if TRUE, train a deep autoencoder instead of a feed-forward network
# input_dropout_ratio: input-layer dropout ratio to improve generalization;
#   suggested values are 0.1 or 0.2
# standardize: if TRUE, the dataset is automatically standardized
# epochs: number of passes over the training dataset
# hidden: hidden layer sizes
# l1: L1 regularization to add stability and improve generalization
#   (penalizes large weights in the error function)
################################################
###### Simple model ######
# train a model with 2 hidden layers of 20 neurons each
model_fit1 <- h2o.deeplearning(x = x, y = y,
                               # model_id = "model_fit1",
                               training_frame = train,
                               validation_frame = test,
                               distribution = "multinomial",
                               activation = "RectifierWithDropout",
                               hidden = c(20, 20),
                               input_dropout_ratio = 0.2,
                               sparse = TRUE,
                               l1 = 1e-5,
                               # stopping_rounds = 0,  # disable early stopping
                               epochs = 200)
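## (optional) inspect the fitted model; a sketch using standard H2O accessors
summary(model_fit1)           # architecture, status and metrics overview
h2o.scoreHistory(model_fit1)  # per-iteration training/validation scores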
### Train a DNN with early stopping ###
# turn on early stopping and specify the stopping criteria - use 5-fold cross-validation
model_fit2 <- h2o.deeplearning(x = x,
                               y = y,
                               training_frame = train,
                               validation_frame = test,
                               model_id = "model_fit2",
                               distribution = "multinomial",
                               activation = "RectifierWithDropout",
                               input_dropout_ratio = 0.2,
                               sparse = TRUE,
                               l1 = 1e-5,
                               epochs = 50,
                               hidden = c(20, 20),
                               nfolds = 5,                             # used for early stopping
                               score_interval = 1,                     # used for early stopping
                               stopping_rounds = 5,                    # used for early stopping
                               stopping_metric = "misclassification",  # used for early stopping
                               stopping_tolerance = 1e-3,              # used for early stopping
                               seed = 1)
# Performance comparison
dl_perf1 <- h2o.performance(model = model_fit1, newdata = test)
dl_perf2 <- h2o.performance(model = model_fit2, newdata = test)
# Retrieve test set MSE
h2o.mse(dl_perf1)
h2o.mse(dl_perf2)
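# MSE is a coarse metric for a multinomial classifier; as a sketch, logloss and
# the confusion matrix (both standard H2O accessors) are also worth comparing
h2o.logloss(dl_perf1)
h2o.logloss(dl_perf2)
h2o.confusionMatrix(dl_perf2)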
# Get the CV models from the `model_fit2` object
cv_models <- sapply(model_fit2@model$cross_validation_models,
                    function(i) h2o.getModel(i$name))
# Plot the scoring history over time
plot(cv_models[[1]],
     timestep = "epochs",
     metric = "classification_error")
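# (optional) the per-fold metrics are also stored on the model itself; a sketch
# of pulling the cross-validation summary directly from the standard model slot
model_fit2@model$cross_validation_metrics_summary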
###### Hyperparameter optimization ######
## First define a grid of Deep Learning hyperparameters and specify the search_criteria
activation_opt <- c("Rectifier", "Maxout", "Tanh", "RectifierWithDropout")
#activation_opt <- c("Rectifier", "TanhWithDropout", "Tanh", "RectifierWithDropout")
l1_opt <- c(0, 0.00001, 0.0001, 0.001, 0.01)
l2_opt <- c(0, 0.00001, 0.0001, 0.001, 0.01)
hyper_params <- list(activation = activation_opt, l1 = l1_opt, l2 = l2_opt)
search_criteria <- list(strategy = "RandomDiscrete", max_runtime_secs = 600)
dl_grid <- h2o.grid("deeplearning", x = x, y = y,
                    grid_id = "dl_grid",
                    training_frame = train,
                    validation_frame = test,
                    nfolds = 3,
                    seed = 1,
                    # autoencoder = TRUE,
                    hidden = c(20, 20),
                    hyper_params = hyper_params,
                    search_criteria = search_criteria)
# evaluate performance
dl_gridperf <- h2o.getGrid(grid_id = "dl_grid",
                           sort_by = "accuracy",
                           decreasing = TRUE)
print(dl_gridperf)
# Grab the model_id for the top DL model, chosen by validation error
best_dl_model_id <- dl_gridperf@model_ids[[1]]
best_dl <- h2o.getModel(best_dl_model_id)
# evaluate the model performance
best_dl_perf <- h2o.performance(model = best_dl, newdata = test)
h2o.mse(best_dl_perf)
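# (optional) persist the winning model for later reuse; a sketch using the
# standard h2o.saveModel()/h2o.loadModel() calls (the path is a placeholder)
# best_dl_path <- h2o.saveModel(best_dl, path = "models", force = TRUE)
# best_dl <- h2o.loadModel(best_dl_path)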
#### Run a DNN with an autoencoder: apply unsupervised pre-training first, then supervised training
## Split the training data into two pieces: one that will be used for unsupervised pre-training and
## the other that will be used for supervised training
splits <- h2o.splitFrame(train, 0.5, seed = 1)
# first part of the data, without labels for unsupervised learning
train_unsupervised <- splits[[1]]
# second part of the data, with labels for supervised learning
train_supervised <- splits[[2]]
dim(train_supervised)
dim(train_unsupervised)
### train autoencoder model
hidden <- c(128, 64, 128)
ae_model <- h2o.deeplearning(x = x,
                             training_frame = train_unsupervised,
                             model_id = "mnist_autoencoder",
                             ignore_const_cols = FALSE,
                             activation = "Tanh",  # Tanh works well for autoencoders
                             hidden = hidden,
                             autoencoder = TRUE)
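## (optional) sanity-check the autoencoder before pre-training: h2o.anomaly()
## (a standard H2O call) returns the per-row reconstruction MSE; a minimal sketch
recon_error <- h2o.anomaly(ae_model, train_unsupervised)
mean(recon_error)  # average reconstruction error; lower means a better fit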
## use the pre-trained autoencoder for a supervised DNN
## (the weights from the unsupervised autoencoder model initialise the network)
fit1 <- h2o.deeplearning(x = x, y = y,
                         training_frame = train_supervised,
                         ignore_const_cols = FALSE,
                         hidden = hidden,
                         pretrained_autoencoder = "mnist_autoencoder")
perf1 <- h2o.performance(fit1, newdata = test)
h2o.mse(perf1)
## Dimension reduction
# h2o.deepfeatures() extracts the activations of a chosen hidden layer, which
# for an autoencoder gives a lower-dimensional representation of the data
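# a minimal sketch: extract the 64-unit bottleneck (layer = 2 of the
# c(128, 64, 128) autoencoder) as a reduced representation of the test set
test_features <- h2o.deepfeatures(ae_model, test, layer = 2)
dim(test_features)  # expect 10000 rows x 64 columns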
###############################
# disconnect from the h2o cluster when finished
# h2o.shutdown()
### step 2: