forked from NervanaSystems/neon
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathANNOTATED_EXAMPLE.yaml
371 lines (363 loc) · 18.2 KB
/
ANNOTATED_EXAMPLE.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
# ----------------------------------------------------------------------------
# Copyright 2014 Nervana Systems Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ----------------------------------------------------------------------------
# neon configuration files are written in YAML (a human readable markup
# language that supports key/value pairs). We actually make use of pyyaml and
# customize things slightly to support arbitrary python objects as values and
# so forth.
#
# Basic parameters are written in the format:
# key: value
# Where value can be either:
# - a scalar string, boolean, or number ex: 'hello', False, 25.6
# - a list of values ex: [1, 2, 3]
# - a dictionary of key/value pairs ex: { a: 1, b: 2, c: 3 }
# - a python object instantiation ex: !obj:transforms.Logistic {}
# note that object constructor arguments are passed between the curly braces
# as key/value pairs
#
# Parameters can be split over multiple lines, and comments can be added by
# prepending a # character.
#
# A common source of error is forgetting the ',' between parameters belonging
# to the same object
#
# To save on explicit repetition, you can reference parameters by assigning a
# variable alias, then dereference them where needed later. The syntax for
# doing so is:
# key: &var_ref value # creates a reference for key named var_ref
# new_key: *var_ref # dereferences var_ref to assign its val to new_key
# At the highest level, a neon YAML file consists of an experiment definition.
# Experiments are sub-classes of the python class
# neon.experiments.experiment.Experiment and carry out tasks like training a
# model, generating predictions, and so forth.
#
# Experiment types:
# -----------------
# FitExperiment - train a model to learn a set of parameters
# FitPredictErrorExperiment - as above, but also generate predictions and
# evaluate performance on those, potentially
# saving predictions to file.
# GradientChecker - Use finite differences to check the gradients of an
# experiment. Currently this can be carried out using
# the separately provided ``grad`` executable
# GenOutputExperiment - fit experiment where at the end we want to visualize
# the output compared to the input (useful for things
# like autoencoders or balance networks).
# WriteErrorToFile - fit experiment where at the end we save off the
# predictions to a file on disk
!obj:experiments.FitPredictErrorExperiment {
# Fit*Experiment parameters:
# --------------------------
# dataset - The input data to train/test against. types:
# CIFAR10 - 10 class object image dataset
# CIFAR100 - 100 class object image dataset
# Housing - Boston housing dataset
# Imageset - Generic macro batched image dataset support
# Iris - RA Fisher's 3 class flower dataset
# MNIST - handwritten digit image dataset
# MOBYDICK - Content of the book Moby Dick
# SPARSENET - small natural image dataset (Olshausen and Field)
# UniformRandom - synthetic random data
# ToyImages - synthetic image classification dataset
# weight_init - controls initial value setup for the model weights. types:
# UniformValGen - Uniform distribution
# AutoUniformValGen - Uniform distribution where range is set
# automatically based on the shape being
# initialized
# SparseEigenValGen - Each weight has a configurable number of
# nonzero inputs and these weights are
# uniformly randomly initialized followed
# by a scaling based on the maximum
# eigenvalues found. See Sutskever2013 for
# details.
# GaussianValGen or NormalValGen - normal distribution
# NodeNormalizedValGen - initialization is as discussed in
# Glorot2010
# lrule - the learning rule used to update weight values. types:
# gradient_descent - vanilla gradient descent (supports weight decay)
# gradient_descent_pretrain - gradient descent that supports a
# separate pretrain learning rate
# gradient_descent_momentum - gradient descent that supports momentum
# based updates
# adadelta - Adadelta based updates. See Zeiler2012
# model - the type of machine learning algorithm to employ. types:
# MLP - feed-forward multi-layer perceptron neural network
# RBM - restricted Boltzmann machine
# RNN - Recurrent neural network
# Balance - Balance network
# Autoencoder - stacked autoencoder
# DBN - deep belief network
# logging - controls the amount and format of diagnostic information. Based
# on python's logging module
#
# FitPredictErrorExperiment specific parameters:
# ----------------------------------------------
# predictions - which data types to compute and save predictions on.
# Should be a list of strings, where each string should be
# one of 'train', 'test', or 'validation'. Saved
# predictions will be written to disk in the same directory
# as the associated input dataset, and are written as python
# serialized object (*-inference.pkl) files.
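# metrics - The performance measurements to compute for each specified data
# type. Should be a dict keyed by data type ('train', 'test', ...)
# whose values are lists of Metric objects, as in the metrics
# entry further down this file. Currently implemented metrics are
# defined below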
# 'AUC' - Area under the ROC curve
# 'MisclassRate' - Proportion of instances where the output class
# predicted by the model does not match the
# expected value. Note that we can also consider
# matches as being those within the top x most
# probable by setting the error_rank parameter to
# x in the constructor. This rate produces values
# in the range [0, 1]
# 'MisclassPercentage' - Identical to MisclassRate but on the scale
# [0,100]
# 'MisclassSum' - Identical to MisclassRate but returns the integer
# number of misclassifications instead of the
# fraction.
# 'LogLossSum' - The negative log likelihood of the true labels
# given the predicted probabilities from the model.
# 'LogLossMean' - The negative log likelihood of the true labels
# given the predicted probabilities from the model.
# This value is normalized by the number of records
# examined.
# Input data for this experiment: an Iris dataset object, anchored as `ds`.
# No `*ds` alias appears elsewhere in this example.
dataset: &ds !obj:datasets.Iris {
# Iris dataset parameters
# -----------------------
# repo_path - on-disk repository location where this dataset resides
# (specify repository root containing the Iris dir, and not
# the Iris dir directly)
# sample_pct - if 100, full dataset will be used, otherwise we uniformly
# downsample records to the specified percentage
# validation_pct - creates separate validation dataset utilizing the
# specified percentage of the training set data
# (not set in this example)
repo_path: '~/data',
sample_pct: 100,
},
# Performance measurements to report. Each key names a data type (here
# `train` and `test`) and maps to a list of Metric objects to compute on it.
metrics: {
# Metric objects
# --------------
# 'AUC' - Area under the ROC curve
# 'MisclassRate' - Proportion of instances where the output class predicted
# by the model does not match the expected value. Note
# that we can also consider matches as being those within
# the top x most probable by setting the error_rank
# parameter to x in the constructor. This rate produces
# values in the range [0, 1]
# 'MisclassPercentage' - Identical to MisclassRate but on the scale [0,100]
# 'MisclassSum' - Identical to MisclassRate but returns the integer number
# of misclassifications instead of the fraction.
# 'LogLossSum' - The negative log likelihood of the true labels given the
# predicted probabilities from the model.
# 'LogLossMean' - The negative log likelihood of the true labels given the
# predicted probabilities from the model. This value is
# normalized by the number of records examined.
# Metrics computed on the training data.
train: [
!obj:metrics.AUC {},
!obj:metrics.MisclassPercentage {},
],
# Metrics computed on the test data (same pair as for training).
test: [
!obj:metrics.AUC {},
!obj:metrics.MisclassPercentage {},
],
},
# Initial weight value generator, anchored as `wt_init` so that the FCLayer
# entries in the model can reuse it through `*wt_init`.
weight_init: &wt_init !obj:params.UniformValGen {
# UniformValGen weight_init parameters
# ------------------------------------
# backend - backend to use to construct the values (not set in this example)
# low - minimal uniform random value range endpoint
# high - maximal uniform random value range endpoint
# bias_init - initial values for biases (if used)
low: -1,
high: 1,
bias_init: 0.0,
},
# Learning rule definition anchored as `gdm`; the FCLayer entries in the model
# reference it through `lrule_init: *gdm`.
# NOTE(review): the key `lrule` is defined twice in this mapping (once anchored
# as &gdm, once as &rms). YAML mappings are expected to have unique keys, so
# confirm which of the two definitions the loader actually keeps for `lrule`.
lrule: &gdm {
# gradient_descent_momentum parameters
# ------------------------------------
# type - set this to gradient_descent_momentum
# lr_params - parameters associated with this type of learning rate. types:
# learning_rate - fraction of backpropagated gradient values to
# add to this layer's weights during each update
# (should be <= 1.0)
# momentum_params - dictionary containing several parameters that
# can be used to adjust the learning rate over
# time based on the curvature of gradient
# directions. Omission of the momentum_params dict
# will use no momentum. The dict parameters are:
# type - should be one of: constant, linear_monotone, nesterov
# initial_coef - initial momentum coefficient value. 0 disables
# momentum, but a positive value gives portion of
# previous iteration velocity to apply initially
# saturated_coef - final momentum coefficient value to apply
# once we reach the saturation epoch and
# beyond. Only applies to linear_monotone
# start_epoch - specify a number indicating at which epoch we
# begin applying momentum based updates
# saturate_epoch - specify a number (>= start_epoch) at which
# we will saturate momentum based updates.
# For linear_monotone momentum, we interpolate
# between initial and saturate momentum coef
# values until we reach the saturate_epoch.
# NOTE(review): the example below sets `coef`, which is not listed among the
# momentum_params above -- presumably the coefficient used by the `constant`
# momentum type; confirm against the learning rule implementation.
type: gradient_descent_momentum,
lr_params: {
learning_rate: 0.5,
momentum_params: {
type: constant,
coef: 0,
},
},
},
# Alternative learning rule definition (rmsprop), anchored as `rms`. No `*rms`
# alias appears in this example; the layers below all reference `*gdm`.
# NOTE(review): this is the second `lrule` key in the same mapping (the first
# is anchored as &gdm). YAML mappings are expected to have unique keys, so
# confirm which of the two definitions the loader actually keeps for `lrule`.
lrule: &rms {
# rmsprop parameters
# ------------------------------------
# type - set this to rmsprop
# lr_params - parameters associated with this type of learning rate. types:
# learning_rate - Suggested value for learning rate is 0.001
# gamma - exponential decay for tracking the squared gradient over time
# default is 0.9
# epsilon - value added to denominator term for numerical stability
# default is 1e-6
# momentum_params - dictionary containing several parameters that
# can be used to adjust the learning rate over
# time based on the curvature of gradient
# directions. See the gradient_descent_momentum info
type: rmsprop,
lr_params: {
learning_rate: 0.001,
gamma: 0.9,
epsilon: 0.000001,
momentum_params: {
type: constant,
coef: 0,
},
},
},
# The model to train: an MLP made of one DataLayer, two FCLayers (hidden and
# output) and a final CostLayer.
model: !obj:models.MLP {
# MLP model parameters
# --------------------
# batch_size - number of training instances in each mini-batch. Should
# be no larger than the size of the training dataset.
# num_epochs - the number of iterations through the entire training set
# to make in training the model
# serialized_path - the full location and name to a serialized .prm file
# that we will use to cache this model. Speeds up
# subsequent use of this model for inference purposes.
# deserialize_path - the full location and name to deserialize a .prm file
# from that we will use to initialize this model.
# Defaults to serialized_path if deserialize_path
# doesn't exist (not set in this example)
# serialize_schedule - either an int specifying the epoch interval at
# which to save snapshots of the model, or a list
# of ints specifying the epochs at which to save.
# save_checkpoints - sets the number of checkpoint files to save, otherwise
# only the most recent checkpoint is retained.
# This should be set to an integer > 1. The files will
# be retained in the serialized_path directory and will
# have an extension "_cp<#>" added to the filename. The
# index number on the file name corresponds to the checkpoint
# count.
# epoch_metrics - Same as the metrics parameter for the experiment, but
# these are printed for each epoch (not set in this
# example).
# layers - list of neural network layer objects. Models will typically
# have a single DataLayer, followed by 1 or more hidden layers,
# then a final CostLayer.
num_epochs: 25,
# The `bs` anchor is defined here but no `*bs` alias appears in this example.
batch_size: &bs 30,
serialized_path: './iris-mlp-simple.prm',
# List form of serialize_schedule: the epochs at which to save.
serialize_schedule: [3, 10, 20],
# Uncomment to retain more than the most recent checkpoint (see above).
# save_checkpoints: 2,
layers: [
&datalayer !obj:layers.DataLayer {
# DataLayer parameters
# --------------------
# The data layer feeds data from the input dataset defined in the
# experiment.
# Parameters:
# name - string used to identify this layer in the logs.
# nout - number of connections output by this layer
# This layer is anchored as `datalayer` so the CostLayer below can point its
# ref_layer at it.
name: d0,
nout: 4,
},
!obj:layers.FCLayer {
# FCLayer parameters
# ------------------
# Apart from Data and Cost layers, internal layers have parameters:
# name - string used to identify this layer in the logs.
# lrule_init - How updates get applied to the weights. See lrule
# defined at the Experiment level
# nout - number of connections output by this layer
# activation - (non-linear) transformation function to apply to the
# connections in this layer. types are:
# Logistic - sigmoidal squashing transform
# RectLin - ReLU (rectified linear unit) transform
# Tanh - hyperbolic tangent squashing transform
# SoftMax - generalized logistic squashing transform
# normalized over k dimensions
# weight_init - Controls how weights get initialized. See weight_init
# defined at the Experiment level
name: h0,
nout: 2,
lrule_init: *gdm,
weight_init: *wt_init,
activation: !obj:transforms.Logistic {},
},
# Output layer: takes the same FCLayer parameters as the hidden layer above
# and reuses the same learning rule and weight initializer aliases.
# nout is 3 here -- presumably one output per Iris class; confirm.
# The `lastlayer` anchor is not dereferenced anywhere in this example.
&lastlayer !obj:layers.FCLayer {
name: output,
nout: 3,
lrule_init: *gdm,
weight_init: *wt_init,
activation: !obj:transforms.Logistic {},
},
&costlayer !obj:layers.CostLayer {
# CostLayer parameters
# --------------------
# This layer defines the cost or objective function to be optimized.
# Parameters:
# name - string used to identify this layer in the logs.
# ref_layer - the input data layer reference
# cost - the actual transform to use for the cost. Types include:
# CrossEntropy - typically used to measure the difference
# between two probability distributions.
# The default is the binomial version
# When binomial version is paired with a logistic
# activation in the previous layer,
# shortcut_deriv is True
# SumSquaredDiffs - computes the sum of squared differences.
# The `costlayer` anchor is not dereferenced anywhere in this example.
name: cost,
ref_layer: *datalayer,
cost: !obj:transforms.CrossEntropy {},
},
],
},
# Diagnostic output settings for the experiment run.
logging: {
# logging parameters:
# -------------------
# level - numeric threshold that controls the types of messages that get
# displayed. All logging types with values greater than or equal
# to this value will appear.
# types of logging are:
# 0 - NOTSET
# 10 - DEBUG
# 20 - INFO
# 30 - WARNING
# 40 - ERROR
# 50 - CRITICAL
# filename - write logging to the specified file (instead of stdout)
# (not set in this example)
# format - string giving what fields to display in each log entry. See
# python's logging record attributes for full details.
# Level 20 corresponds to INFO in the table above.
level: 20,
format: '%(asctime)-15s %(levelname)s:%(module)s - %(message)s'
},
}