From 9ecec99f3b80ea49c6dbbbe77269b88ec2beb805 Mon Sep 17 00:00:00 2001 From: Sander Dieleman Date: Thu, 8 Dec 2011 17:20:39 +0100 Subject: [PATCH] current state --- TODO | 222 ++++++++++++++++++++++++++ convolutions | 81 ++++++++++ morb/__init__.py | 0 morb/activation_functions.py | 29 ++++ morb/base.py | 292 +++++++++++++++++++++++++++++++++++ morb/monitors.py | 68 ++++++++ morb/param_updaters.py | 70 +++++++++ morb/parameters.py | 239 ++++++++++++++++++++++++++++ morb/rbms.py | 83 ++++++++++ morb/samplers.py | 77 +++++++++ morb/stats_collectors.py | 158 +++++++++++++++++++ morb/trainers.py | 52 +++++++ morb/units.py | 22 +++ test.py | 56 +++++++ test_advparams.py | 215 ++++++++++++++++++++++++++ test_crbm.py | 84 ++++++++++ test_mnist.py | 175 +++++++++++++++++++++ test_mnist_convolutional.py | 215 ++++++++++++++++++++++++++ test_mnist_labeled.py | 204 ++++++++++++++++++++++++ test_sparsity.py | 170 ++++++++++++++++++++ test_utils.py | 57 +++++++ 21 files changed, 2569 insertions(+) create mode 100644 TODO create mode 100644 convolutions create mode 100644 morb/__init__.py create mode 100644 morb/activation_functions.py create mode 100644 morb/base.py create mode 100644 morb/monitors.py create mode 100644 morb/param_updaters.py create mode 100644 morb/parameters.py create mode 100644 morb/rbms.py create mode 100644 morb/samplers.py create mode 100644 morb/stats_collectors.py create mode 100644 morb/trainers.py create mode 100644 morb/units.py create mode 100644 test.py create mode 100644 test_advparams.py create mode 100644 test_crbm.py create mode 100644 test_mnist.py create mode 100644 test_mnist_convolutional.py create mode 100644 test_mnist_labeled.py create mode 100644 test_sparsity.py create mode 100644 test_utils.py diff --git a/TODO b/TODO new file mode 100644 index 0000000..4116786 --- /dev/null +++ b/TODO @@ -0,0 +1,222 @@ +TODO: 3-way parameters en 3-way factored parameters - veranderen er nog andere dingen? + +TODO: gaussian units (sigma=1) + +TODO: truncated exponential units + +TODO: wat glue code om RBMs te stacken in DBNs. + - samplen van een DBN + - DBN training (laag-per-laag) + - DBN inference + +TODO: passen autoencoders ook binnen dit raamwerk? mss moeten bepaalde dingen dan hernoemd worden... + +TODO: github? + +TODO: optimised 1D convolution + +TODO: test 1D convolution + +TODO: dat FCRBM model voor modeling motion style eens proberen nabouwen! + ook interessant hierbij: parameter tying! kan dit in het framework gepast worden? + +TODO: gaussian units met learnt variance (gescheiden en samen) + +TODO: symmetric beta units + + + + +- idee voor later: hierbij ook een 'chunking' framework voegen dat data in grote chunks kan omzetten voor efficient trainen op de GPU? + ook snapshotting erin verwerken? Dat zou voor een vrij compleet en proper framework zorgen. + +* TODO: write some nice latex documentation that explains all the different components and how they fit together, and especially which parts of the training/sampling algorithms they implement. + + + + +geef aan de statscollector een lijst van Units objecten die de VISIBLES voorstellen, maw alle units die de data voorstellen en die dus initieel waarden zullen aannemen. + +Bepaal dan, a.h.v. de parameters-objecten, welke types units allemaal gesampled moeten worden (welke units de HIDDENS voorstellen). + +- dit hoeven niet noodzakelijk alle overblijvende units te zijn denk ik. 
+- eerst bepalen we alle Parameters objecten die de visible units beinvloeden, en dan zoeken we alle ANDERE units die ook door deze parameters beinvloed worden. +- als er bij de HIDDENS units zitten die ook bij de VISIBLES zitten, dan is er iets mis. Dan is er namelijk een afhankelijkheid tussen de visibles. exceptie gooien! + +- als er Parameters zijn die types HIDDENS verbinden, dan is er ook iets mis. Dan is er namelijk een afhankelijkheid tussen de hiddens. Exceptie gooien! (vb. aWb, bVc en cUa kan niet, we kunnen nooit b en c op basis van a samplen omdat ze niet onafhankelijk zijn van elkaar) - " + +- een geval waarbij de hiddens niet 'alle overblijvende units' zijn, is waar er 2 types visibles zijn, die niet tegelijkertijd gesampled moeten worden maar wel dezelfde HIDDENS delen. Dit kan bvb een soort van parameter sharing zijn. + + + + + + + +TRAINING + +split into multiple phases: + +- collect statistics, given input data and the model + * for CD-k, this is input visibles, sampled hiddens, and then visibles and hiddens after k steps + * for PCD, the negative term is different +- use statistics to compute the weight updates + * for all types of CD, this is just getting the update form from the Parameters object and filling in the blanks. +- update the weights according to some other hyperparameterised settings (this is where momentum, weight decay etc. are) + + +trainingalgoritme abstraheren + - gradient descent with stopping criterion + - gradient descent with fixed number of epochs + - gradient descent ad infinitum + - zijn er nog? + - dit roept dan gewoon de paramupdaters op +wat met monitoring costs? + + +So let's build a hierarchy. + +ParamUpdater: composite object (possibly consisting of multiple other ParamUpdaters that are somehow combined (typical usecase: updates are summed)) that updates parameters given some input data + +DecayParamUpdater: provides a decay term + +MomentumParamUpdater: encapsulates another ParamUpdater and applies momentum to it + +CDParamUpdater: encapsulates CD-k or PCD weight update computation + * takes the input data, computes statistics by calling a StatsCollector on it + * gets the form of the update term from the Parameters object, fills in the blanks with the stats from the StatsCollector + * returns the resulting update + +SparsityParamUpdater: encapsulates sparsity target regularisation + +SumParamUpdater: sums the updates obtained from its child paramupdaters. should check that the composing ParamUpdaters are for the same Parameters! + +ScaledParamUpdater: takes a ParamUpdater and a scale factor, and scales the updates by this factor. This will be the result of writing '0.005 * ParamUpdater()' for example. + + + +StatsCollector: object that collects the statistics for CD, given input + +CDStatsCollector(k) < StatsCollector: collects statistics for CD-k + +PCDStatsCollector < StatsCollector: collects statistics for PCD + +!! since only the negative term differs between CDStatsCollector and PCDStatsCollector, maybe some overlapping code can be factored out here. + +- learning rate: should this be a ParamUpdater, or should it be kept outside? it's nice to be able to encapsulate this... + +TO IMPLEMENT: +X base class ParamUpdater(Parameters p, [StatsCollector s]) +- subclasses: + X DecayParamUpdater(Parameters p) + * MomentumParamUpdater(ParamUpdater pu) + X CDParamUpdater(Parameters p, StatsCollector s) + * SparsityTargetParamUpdater(Parameters p, StatsCollector s, target) # this also needs stats! 
+ * VarianceTargetParamUpdater(Parameters p, target) # maybe, as an exercise, since it doesn't really work anyway + X SumParamUpdater([ParamUpdater p]) + X Maybe overload + on ParamUpdater + X also maybe overload some_float * ParamUpdater, with __rmul__ (also implement __mul__). That's a nice way to do regularisation parameters and learning rates (then they don't have to handled inside the ParamUpdater object itself) + +- base class StatsCollector +- subclasses: + X CDkStatsCollector(k) + * PCDStatsCollector + + + + +PROBLEM: multiple Parameters should be updated using the same collected stats, so the statscollector should only be run once per group of parameters. How can this be guaranteed if each ParamUpdater updates only a single Parameters object? +Does it make sense instead to let a ParamUpdater update a group of Parameters objects? + +SOLUTION: maybe split the update process in a number of phases: + +- first, all ParamUpdaters (which each update ONLY ONE Parameters object) are inspected, and the StatsCollector objects they use are extracted: ParamUpdater.get_collectors() or something like that +- then iterate: + * collect stats: run the extracted StatsCollectors (StatsCollector.collect() or something) + * run ParamUpdaters + - each ParamUpdater calls StatsCollector.get_stats() or get_current_stats() or something to that extent. + * That way, each type of stats collection happens only once and all params are updated based on the same stats. We just need to make sure that they each hold a reference to the same StatsCollector object. + + + + + + +MODULAR RBM + +v = a(f(W, h) + g(vbias)) +h = a'(f'(W, v) + g'(hbias)) + +ActivationFunction +Sampler +Units + +(unit data u + activation functions a(x) + samplers) = Units + +Parameters(list units) + has + - list: list of units that are related by the parameters + - a set of actual parameters (weights) + provides + - contribution to activation of each of the Units + - contribution to the energy function + + +RBM + has + - list: a set of different types of units + - list: a set of different types of parameters that relate the Units + provides + - sampling a type of units (computing nonlinear activation and sampling) + - computing the energy function (summing the contributions for each of the Parameters) + + +get unit values (this goes for all unit types): + - compute linear activation + x = sum_i(f(W_i, u_i)) + - apply activation function + a(x) + - sample + u ~ a(x) + +Computing a linear activation consists of summing all the contributions of the different Parameters + +a Sampler can just be the identify function (mean field, to get truncated exponential units) or a bernoulli sampler (most common), a softmax sampler, a max-pooling sampler, ... + +Base classes: ActivationFunction, Sampler, Units, Parameters + +Ideally, there would be no clear distinction between visibles and hiddens in the lowest layers of abstraction - allowing for more advanced models like the 3-way factored RBM to be implemented. The visibles-hiddens distinction is a useful abstraction for many types of RBMs however, so it should be implemented on top (maybe have an RBM class and a vhRBM subclass). + + +Units, ActivationFunction and Sampler are AWARE of the RBM they are part of - most of the logic is concentrated in the subclasses. while this does bring some dependencies that could technically be avoided, it should lead to a cleaner architecture. It's also nicer to be able to do rbm.units['v'].sample(h_data), for example. 
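+
+A rough usage sketch of the interface this implies (illustrative only - exact names and
+signatures may still change; W_shared, bh_shared and v_data are placeholders, sigmoid and
+bernoulli refer to the activation function and sampler defined further on):
+
+    rbm = RBM()
+    v = Units(rbm, sigmoid, bernoulli, name='v')         # visibles
+    h = Units(rbm, sigmoid, bernoulli, name='h')         # hiddens
+    W = ProdParameters(rbm, [v, h], W_shared, name='W')  # weights connecting v and h
+    bh = BiasParameters(rbm, h, bh_shared, name='bh')    # hidden bias
+
+    # sampling the hiddens given visible data: W and bh each contribute a term to the
+    # linear activation of h, the terms are summed, the activation function is applied
+    # and the sampler draws a sample.
+    h_sample = h.sample({v: v_data})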
+ + + + + + + + +implement a system where different types of units, weights and sampling can be combined easily. If it's performant that's a plus, but it doesn't have to be (mainly for experimentation). This will make it easier to construct heterogeneous models like the TransRBM or (different types of) the ChromaRBM. +- determine the elementary operations that can be performed on an RBM, with their inputs and outputs + * sample hiddens + * sample visibles + * compute activation + +- determine the axes of variation, i.e. what can be done differently to create a new type of model: + * the way the activations are computed + * the way samples are drawn + * the types of parameters (weights, biases, convolutional, ...) + * the learning algorithm itself (CD, persistent CD, ...) + +- it would be good if certain hyperparameters (like the learning rate for different sets of trainable parameters) + + + +elementary RBM operations: + - train using CD-n, given unlabeled visibles data + - sample visibles from hiddens + - sample hiddens from visibles + + + + diff --git a/convolutions b/convolutions new file mode 100644 index 0000000..ce70485 --- /dev/null +++ b/convolutions @@ -0,0 +1,81 @@ +W + 0.001 * (conv(v, sigmoid(conv(v, W) + bh)) - conv(sigmoid(conv + bv), sigmoid(conv + bh))) + + + +# conv input is (mb_size, input_maps, input height [numrows], input width [numcolumns]) +# conv input is (output_maps, input_maps, filter height [numrows], filter width [numcolumns]) +# conv output is (mb_size, output_maps, output height [numrows], output width [numcolumns]) + +self.W = W # (hidden_maps, visible_maps, filter_height, filter_width) +self.vu = units_list[0] # (mb_size, visible_maps, visible_height, visible_width) +self.hu = units_list[1] # (mb_size, hidden_maps, hidden_height, hidden_width) + + +hd = h0.dimshuffle(1,0,2,3) +conv(v0, hd) + +v0: (mb_size, visible_maps, visible_height, visible_width) +h0: (mb_size, hidden_maps, visible_height - filter_height + 1, visible_width - filter_width + 1) + +hd: (hidden_maps, mb_size, visible_height - filter_height + 1, visible_width - filter_width + 1) + + + + +conv INPUT 1 (inputs): +mb_size = mb_size +input_maps = visible_maps +input_height = visible_height +input_width = visible_width + +conv INPUT 2 (filters): +output_maps = hidden_maps +input_maps = mb_size +filter_height = visible_height - filter_height + 1 +filter_width = visible_width - filter_width + 1 + +conv OUTPUT (outputs): +mb_size = mb_size +output_maps = hidden_maps +output_height = filter_height +output_width = filter_width + + +DESIRED OUTPUT: (hidden_maps, visible_maps, filter_height, filter_width) + + + +------------- + +v0: (mb_size, visible_maps, visible_height, visible_width) +h0: (mb_size, hidden_maps, visible_height - filter_height + 1, visible_width - filter_width + 1) + +vd: (visible_maps, mb_size, visible_height, visible_width) +hd: (hidden_maps, mb_size, visible_height - filter_height + 1, visible_width - filter_width + 1) + + + + +conv INPUT 1 (inputs): dimshuffle(1, 0, 2, 3) +mb_size = visible_maps +input_maps = mb_size +input_height = visible_height +input_width = visible_width + +conv INPUT 2 (filters): dimshuffle(1, 0, 2, 3) +output_maps = hidden_maps +input_maps = mb_size +filter_height = visible_height - filter_height + 1 +filter_width = visible_width - filter_width + 1 + +conv OUTPUT (outputs): +mb_size = visible_maps +output_maps = hidden_maps +output_height = filter_height +output_width = filter_width + + +en dan nog de output dimshuffle(1, 0, 2, 3) en we zijn er! 
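+
+In Theano this boils down to something along these lines (the same computation is used in
+Convolutional2DParameters.gradient in morb/parameters.py; v0 and h0 have the shapes listed
+above):
+
+    from theano.tensor.nnet import conv
+    v_shuffled = v0.dimshuffle(1, 0, 2, 3)  # (visible_maps, mb_size, visible_height, visible_width)
+    h_shuffled = h0.dimshuffle(1, 0, 2, 3)  # (hidden_maps, mb_size, hidden_height, hidden_width)
+    c = conv.conv2d(v_shuffled, h_shuffled, border_mode='valid')
+    grad_W = c.dimshuffle(1, 0, 2, 3)       # gives the desired output shape below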
+ +DESIRED OUTPUT: (hidden_maps, visible_maps, filter_height, filter_width) + diff --git a/morb/__init__.py b/morb/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/morb/activation_functions.py b/morb/activation_functions.py new file mode 100644 index 0000000..6944601 --- /dev/null +++ b/morb/activation_functions.py @@ -0,0 +1,29 @@ +from morb.base import activation_function + +import theano.tensor as T + + +@activation_function +def sigmoid(x): + return T.nnet.sigmoid(x) + +@activation_function +def identity(*x): + return x + +@activation_function +def softmax(x): + # expected input dimensions: + # 0 = minibatches + # 1 = units + # 2 = states + + r = x.reshape((x.shape[0]*x.shape[1], x.shape[2])) + # r 0 = minibatches * units + # r 1 = states + + # this is the expected input for theano.nnet.softmax + s = theano.nnet.softmax(r) + + # reshape back to original shape + return s.reshape(x.shape) diff --git a/morb/base.py b/morb/base.py new file mode 100644 index 0000000..737eb17 --- /dev/null +++ b/morb/base.py @@ -0,0 +1,292 @@ +import theano +import theano.tensor as T + +### Base classes: sampling and modeling ### + +class ActivationFunction(object): + def apply(self, x): + raise NotImplementedError("ActivationFunction base class") + +class Sampler(object): + def apply(self, a): + raise NotImplementedError("Sampler base class") + +class Units(object): + # container class for a theano variable representing unit values, an ActivationFunction and a Sampler. + # a unit could just as well be represented as a tuple (u, ActivationFunction(), Sampler(), [CDSampler()]) but it's probably + # nicer to have a container class for this. + + def __init__(self, rbm, activation_function, sampler, name=None): + self.rbm = rbm + self.activation_function = activation_function + self.sampler = sampler + self.name = name + self.rbm.add_units(self) + + def linear_activation(self, vmap): + # args and kwargs represent the other Units of the RBM that these units are tied to. + terms = [param.activation_term_for(self, vmap) for param in self.rbm.params_affecting(self)] + # the linear activation is the sum of the activations for each of the parameters. + return sum(terms) + + def activation(self, vmap): + l = self.linear_activation(vmap) + return self.activation_function.apply(l) + + def sample(self, vmap, **kwargs): + a = self.activation(vmap) + return self.sampler.apply(a, **kwargs) + + def __repr__(self): + return "" % (self.name, self.sampler.name, self.activation_function.name) + + +# decorators to create sampler and activation function objects from functions +def sampler(func): + class s(Sampler): + def apply(self, a, *args, **kwargs): + return func(a, *args, **kwargs) + inst = s() + inst.name = func.__name__ + return inst + +def activation_function(func): + class f(ActivationFunction): + def apply(self, x, *args, **kwargs): + return func(x, *args, **kwargs) + inst = f() + inst.name = func.__name__ + return inst + +# function to quickly create a basic units class +def units_type(activation_function, sampler): + class U(Units): + def __init__(self, rbm, name=None): + super(U, self).__init__(rbm, activation_function, sampler, name=name) + + return U + +class Parameters(object): + def __init__(self, rbm, units_list, name=None): + self.rbm = rbm + self.units_list = units_list + self.terms = {} # terms is a dict of FUNCTIONS that take a vmap variable mapping dict. 
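+        # for example, a simple product interaction between visibles vu and hiddens hu
+        # (see ProdParameters in morb/parameters.py) registers terms roughly like:
+        #   self.terms[hu] = lambda vmap: T.dot(vmap[vu], self.W)
+        #   self.terms[vu] = lambda vmap: T.dot(vmap[hu], self.W.T)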
+ self.name = name + self.rbm.add_parameters(self) + + def activation_term_for(self, units, vmap): + return self.terms[units](vmap) + + def energy_term(self, vmap): + raise NotImplementedError("Parameters base class") + + def gradient(self, vmap): + # gives the parameter update in terms of the statistics + raise NotImplementedError("Parameters base class") + + # @property + # def variables(self): + # raise NotImplementedError("Parameters base class") + # # cannot do this because then it can't be assigned in subclasses :( + + def affects(self, units): + return (units in self.units_list) + + def affects_all(self, units_list): + return all(self.affects(units) for units in units_list) + + def __repr__(self): + units_names = ", ".join(("'%s'" % u.name) for u in self.units_list) + return "" % (self.name, units_names) + + +### Base classes: training (parameter updates) ### + + +class ParamUpdater(object): + # A ParamUpdater object updates a single Parameters object. Multiple ParamUpdaters can compute updates for a single Parameters object, which can then be aggregated by composite ParamUpdaters (like the SumParamUpdater) + def __init__(self, parameters, stats_collectors=[]): + # parameters is A SINGLE Parameters object. not a list. + self.parameters = parameters + self.stats_collectors = stats_collectors + + def get_update(self): + for s in self.stats_collectors: + if not s.collected: + raise RuntimeError("StatsCollector has not run!") + return self.calculate_update() + + def calculate_update(self): + raise NotImplementedError("ParamUpdater base class") + + def __add__(self, p2): + return SumParamUpdater([self, p2]) + + def __neg__(self): + return ScaleParamUpdater(self, -1) + + def __sub__(self, p2): + return self + (-p2) + + def __rmul__(self, a): + return ScaleParamUpdater(self, a) + # a is assumed to be a scalar! + + def __mul__(self, a): + return ScaleParamUpdater(self, a) + # a is assumed to be a scalar! + + def __div__(self, a): + return self * (1.0/a) + + def __rdiv__(self, a): + return self * (1.0/a) + + +# this extension has to be here because it's used in the base class +class ScaleParamUpdater(ParamUpdater): + def __init__(self, pu, scaling_factor): + super(ScaleParamUpdater, self).__init__(pu.parameters, pu.stats_collectors) + self.pu = pu + self.scaling_factor = scaling_factor + + def calculate_update(self): + return [self.scaling_factor * d for d in self.pu.calculate_update()] + +# this extension has to be here because it's used in the base class +class SumParamUpdater(ParamUpdater): + def __init__(self, param_updaters): + # assert that all param_updaters affect the same Parameters object, gather stats collectors + self.param_updaters = param_updaters + scs = [] + for pu in param_updaters: + if pu.parameters != param_updaters[0].parameters: + raise RuntimeError("Cannot add ParamUpdaters that affect a different Parameters object together") + scs.extend(pu.stats_collectors) + scs = set(scs) # we only need each collector once. 
+ + super(SumParamUpdater, self).__init__(param_updaters[0].parameters, scs) + + def calculate_update(self): + updaters = self.param_updaters[:] # make a copy + first_updater = updaters.pop(0) + s = first_updater.calculate_update() + for pu in updaters: # iterate over the REMAINING updaters + s = [sd + d for sd, d in zip(s, pu.calculate_update())] + return s + + + +class StatsCollector(object): + def __init__(self, rbm, input_units, latent_units, context_units=[]): + self.reset() + self.rbm = rbm + self.input_units = input_units # the units that are supplied to the statscollector as input (i.e. the visibles) + self.latent_units = latent_units + self.context_units = context_units + self.stats = {} + # IMPORTANT: the stats collector object does not ascertain that there are no dependences between the input + # units themselves or between the latent units themselves. Check this yourself if necessary! Typically + # there should not be any. + + + def reset(self): + self.collected = False + self.stats = {} + + def collect(self, vmap): + self.calculate_stats(vmap) + self.collected = True + + def calculate_stats(self): + raise NotImplementedError("StatsCollector base class") + + +class Trainer(object): + def __init__(self, rbm, umap): + self.rbm = rbm + self.umap = umap + + def get_theano_updates(self, vmap): + theano_updates = {} + # collect stats + stats_collectors = [s for pu in self.umap.values() for s in pu.stats_collectors] + for s in stats_collectors: + s.collect(vmap) + theano_updates.update(s.theano_updates) + + # calculate updates + for p, pu in self.umap.items(): + updated_variables = [v + u for v, u in zip(p.variables, pu.get_update())] + theano_updates.update(dict(zip(p.variables, updated_variables))) + + return theano_updates + + +### Base classes: RBM container class ### + +class RBM(object): + def __init__(self): + self.units_list = [] + self.params_list = [] + + def add_units(self, units): + self.units_list.append(units) + + def add_parameters(self, params): + self.params_list.append(params) + + def params_affecting(self, units): + """ + return a list of all Parameters that contribute a term to the activation of Units units. + """ + return [param for param in self.params_list if param.affects(units)] + + def params_affecting_all(self, units_list): + """ + return a list of all Parameters that contribute a term to the activations of ALL Units in the given units_list. + """ + return [param for param in self.params_list if param.affects_all(units_list)] + + def dependent_units(self, given_units_list): + """ + returns a list of all Units that are dependent on the given list of Units. + this is useful for block Gibbs sampling (where given_units_list is the set of + visible Units, and the set of hiddens is returned). + + Note that this method does not detect possible dependencies between the given Units + themselves, or the returned Units themselves! This check should be performed before + doing gibbs sampling. Alternatively, some back and forth sampling between these + dependent Units can be used (what's the correct name for this again?) + + Also, context (i.e. units that should never be sampled) has to be handled separately. + This method will include context as it only checks which units are linked to which + other units, and these links are not directional. 
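+
+        For example (hypothetical): in a plain RBM where a single weight matrix connects
+        visibles v and hiddens h, dependent_units([v]) returns set([h]).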
+ """ + # first, find all the parameters affecting the Units in the given_units_list + # then, for each of these, add all the affected units + dependent_units_list = [] + for u in given_units_list: + params_list = self.params_affecting(u) + for params in params_list: + dependent_units_list.extend(params.units_list) + + # finally, remove the given units and return the others + return set([u for u in dependent_units_list if u not in given_units_list]) + # note that there are no dependency checks here. + + def energy(self, vmap): + terms = [params.energy_term(vmap) for params in self.params_list] + # the energy is the sum of the energy terms for each of the parameters. + return sum(terms) + + def __repr__(self): + units_names = ", ".join(("'%s'" % u.name) for u in self.units_list) + params_names = ", ".join(("'%s'" % p.name) for p in self.params_list) + return "" % (self.__class__.__name__, units_names, params_names) + + +# this is a placeholder base class, doesn't do anything by itself. yet. +class Monitor(object): + def expression(self): + raise NotImplementedError("Monitor base class") diff --git a/morb/monitors.py b/morb/monitors.py new file mode 100644 index 0000000..423834c --- /dev/null +++ b/morb/monitors.py @@ -0,0 +1,68 @@ +from morb.base import Monitor + +import theano +import theano.tensor as T + + +class ReconstructionMSEMonitor(Monitor): + def __init__(self, stats_collector, u): + self.stats_collector = stats_collector + self.u = u # the Units instance for which the reconstruction error should be computed. + + def expression(self): + data = self.stats_collector.stats['data'][self.u] + reconstruction = self.stats_collector.stats['model'][self.u] + return T.mean((data - reconstruction) ** 2) + + +class ReconstructionCrossEntropyMonitor(Monitor): + def __init__(self, stats_collector, u): + self.stats_collector = stats_collector + self.u = u + + def expression(self): + data = self.stats_collector.stats['data'][self.u] + reconstruction_linear = self.stats_collector.stats['model_linear_activation'][self.u] + return T.mean(T.sum(data*T.log(T.nnet.sigmoid(reconstruction_linear)) + + (1 - data)*T.log(1 - T.nnet.sigmoid(reconstruction_linear)), axis=1)) + + # without optimisation: + # return T.mean(T.sum(data*T.log(reconstruction) + (1 - data)*T.log(reconstruction), axis=1)) + +# 'trivial' monitors +class ReconstructionMonitor(Monitor): + def __init__(self, stats_collector, u): + self. stats_collector = stats_collector + self.u = u + + def expression(self): + return self.stats_collector.stats['model'][self.u] + +class DataMonitor(Monitor): + def __init__(self, stats_collector, u): + self. stats_collector = stats_collector + self.u = u + + def expression(self): + return self.stats_collector.stats['data'][self.u] + + +class DataEnergyMonitor(Monitor): + def __init__(self, stats_collector, rbm): + self. stats_collector = stats_collector + self.rbm = rbm + + def expression(self): + return self.rbm.energy(self.stats_collector.stats['data']) + +class ReconstructionEnergyMonitor(Monitor): + def __init__(self, stats_collector, rbm): + self. stats_collector = stats_collector + self.rbm = rbm + + def expression(self): + return self.rbm.energy(self.stats_collector.stats['model']) + +# TODO: pseudo likelihood? is that feasible? 
+ + diff --git a/morb/param_updaters.py b/morb/param_updaters.py new file mode 100644 index 0000000..283eade --- /dev/null +++ b/morb/param_updaters.py @@ -0,0 +1,70 @@ +from morb.base import ParamUpdater +from morb.base import SumParamUpdater +from morb.base import ScaleParamUpdater + +### PARAM UPDATERS ### + +class DecayParamUpdater(ParamUpdater): + def calculate_update(self): + return self.parameters.variables + # weight decay: the update == the parameter values themselves + # (decay constant is taken care of by ScaleParamUpdater) + + +class MomentumParamUpdater(ParamUpdater): + def calculate_update(self): + pass # TODO LATER: momentum: this updater has to have memory, so it's a challenge. + + +class CDParamUpdater(ParamUpdater): + def __init__(self, parameters, stats_collector): + super(CDParamUpdater, self).__init__(parameters, [stats_collector]) + # this updater has only one stats collector, so make it more conveniently accessible + self.stats_collector = stats_collector + + def calculate_update(self): + stats = self.stats_collector.stats + + positive_term = self.parameters.gradient(stats['data']) + negative_term = self.parameters.gradient(stats['model']) + + return [p - n for p, n in zip(positive_term, negative_term)] + + +class SparsityParamUpdater(ParamUpdater): + def __init__(self, parameters, sparsity_targets, stats_collector): + # sparsity_targets is a dict mapping Units instances to their target activations + super(SparsityParamUpdater, self).__init__(parameters, [stats_collector]) + self.stats_collector = stats_collector + self.sparsity_targets = sparsity_targets + + def calculate_update(self): + stats = self.stats_collector.stats + + # modify vmap: subtract target values + # this follows the formulation in 'Biasing RBMs to manipulate latent selectivity and sparsity' by Goh, Thome and Cord (2010), formulas (8) and (9). + vmap = stats['data'].copy() + for u, target in self.sparsity_targets.items(): + if u in vmap: + vmap[u] -= target + + return [-p for p in self.parameters.gradient(vmap)] # minus sign is important! + + +class VarianceTargetParamUpdater(ParamUpdater): + def calculate_update(self): + pass # TODO LATER + + + + +""" +DecayParamUpdater(Parameters p) + * MomentumParamUpdater(ParamUpdater pu) + * CDParamUpdater(Parameters p, StatsCollector s) + * SparsityTargetParamUpdater(Parameters p, StatsCollector s, target) # this also needs stats! 
+ * VarianceTargetParamUpdater(Parameters p, target) # maybe, as an exercise, since it doesn't really work anyway + * SumParamUpdater([ParamUpdater p]) + * ScaleParamUpdater([ParamUpdater p], scaling_factor) +""" + diff --git a/morb/parameters.py b/morb/parameters.py new file mode 100644 index 0000000..f359ac5 --- /dev/null +++ b/morb/parameters.py @@ -0,0 +1,239 @@ +from morb.base import Parameters + +import theano.tensor as T +from theano.tensor.nnet import conv + + +class ProdParameters(Parameters): + def __init__(self, rbm, units_list, W, name=None): + super(ProdParameters, self).__init__(rbm, units_list, name=name) + assert len(units_list) == 2 + self.W = W + self.variables = [self.W] + self.vu = units_list[0] + self.hu = units_list[1] + + self.terms[self.vu] = lambda vmap: T.dot(vmap[self.hu], W.T) + self.terms[self.hu] = lambda vmap: T.dot(vmap[self.vu], W) + + def gradient(self, vmap): + return [T.dot(vmap[self.vu].T, vmap[self.hu])] + + def energy_term(self, vmap): + return - T.sum(self.terms[self.hu](vmap) * vmap[self.hu]) + # return - T.sum(T.dot(vmap[self.vu], self.W) * vmap[self.hu]) + # T.sum sums both over the minibatch dimension and the hiddens dimension. + + +class BiasParameters(Parameters): + def __init__(self, rbm, units, b, name=None): + super(BiasParameters, self).__init__(rbm, [units], name=name) + self.b = b + self.variables = [self.b] + self.u = units + + self.terms[self.u] = lambda vmap: self.b + + def gradient(self, vmap): + return [T.sum(vmap[self.u], axis=0)] # sum over axis 0 + # this requires the Units instance to be represented by a matrix variable, i.e. a minibatch + + def energy_term(self, vmap): + return - T.sum(T.dot(vmap[self.u], self.b)) # T.sum is for minibatches + # bias is NOT TRANSPOSED because it's a vector, and apparently vectors are COLUMN vectors by default. + + +class AdvancedProdParameters(Parameters): + def __init__(self, rbm, units_list, dimensions_list, W, name=None): + super(AdvancedProdParameters, self).__init__(rbm, units_list, name=name) + assert len(units_list) == 2 + self.W = W + self.variables = [self.W] + self.vu = units_list[0] + self.hu = units_list[1] + self.vd = dimensions_list[0] + self.hd = dimensions_list[1] + self.Wd = self.vd + self.hd + + # there are vd visible dimensions and hd hidden dimensions, meaning that the weight matrix has + # vd + hd = Wd dimensions. + # the hiddens and visibles have hd+1 and vd+1 dimensions respectively, because the first dimension + # is reserved for minibatches! + self.terms[self.vu] = lambda vmap: T.tensordot(vmap[self.hu], W, axes=(range(1,self.hd+1),range(self.vd, self.Wd))) + self.terms[self.hu] = lambda vmap: T.tensordot(vmap[self.vu], W, axes=(range(1,self.vd+1),range(0, self.vd))) + + def gradient(self, vmap): + return [T.tensordot(vmap[self.vu], vmap[self.hu], axes=([0],[0]))] # only sum out the minibatch dimension. + + def energy_term(self, vmap): + # v_part = T.tensordot(vmap[self.vu], self.W, axes=(range(1, self.vd+1), range(0, self.vd))) + v_part = self.terms[self.hu](vmap) + neg_energy = T.tensordot(v_part, vmap[self.hu], axes=(range(0, self.hd+1), range(0, self.hd+1))) + # in this case, we also sum over the minibatches in the 2nd step, hence the ranges are hd+1 long. + return - neg_energy # don't forget to flip the sign! 
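+
+# Construction sketch (as in test_advparams.py further on; W_init is a placeholder numpy
+# array): with one visible and one hidden dimension, AdvancedProdParameters behaves like
+# plain ProdParameters:
+#   W = AdvancedProdParameters(rbm, [v, h], [1, 1], theano.shared(W_init, name='W'), name='W')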
+ + +class AdvancedBiasParameters(Parameters): + def __init__(self, rbm, units, dimensions, b, name=None): + super(AdvancedBiasParameters, self).__init__(rbm, [units], name=name) + self.b = b + self.variables = [self.b] + self.u = units + self.ud = dimensions + + self.terms[self.u] = lambda vmap: self.b + + def gradient(self, vmap): + return [T.sum(vmap[self.u], axis=0)] # sum over minibatch axis + + def energy_term(self, vmap): + return - T.sum(T.tensordot(vmap[self.u], self.b, axes=(range(1, self.ud+1), range(0, self.ud))), axis=0) + + +class SharedBiasParameters(Parameters): + """ + like AdvancedBiasParameters, but a given number of trailing dimensions are 'shared'. + """ + def __init__(self, rbm, units, dimensions, shared_dimensions, b, name=None): + super(SharedBiasParameters, self).__init__(rbm, [units], name=name) + self.b = b + self.variables = [self.b] + self.u = units + self.ud = dimensions + self.sd = shared_dimensions + self.nd = self.ud - self.sd + + self.terms[self.u] = lambda vmap: T.shape_padright(self.b, self.sd) + + def _shared_axes(self, vmap): + d = vmap[self.u].ndim + return range(d - self.sd, d) + + def gradient(self, vmap): + axes = self._shared_axes(vmap) + return [T.sum(T.mean(vmap[self.u], axis=axes), axis=0)] + + def energy_term(self, vmap): + # b_padded = T.shape_padright(self.b, self.sd) + # return - T.sum(T.tensordot(vmap[self.u], b_padded, axes=(range(1, self.ud+1), range(0, self.ud))), axis=0) + # this does not work because tensordot cannot handle broadcastable dimensions. + # instead, the dimensions of b_padded which are broadcastable should be summed out afterwards. + # this comes down to the same thing. so: + t = T.tensordot(vmap[self.u], self.b, axes=(range(1, self.nd+1), range(0, self.nd))) + # now sum t over its trailing shared dimensions, which mimics broadcast + tensordot behaviour. 
+ axes = range(t.ndim - self.sd, t.ndim) + """ + print "DEBUG: axes = %s" % repr(axes) + print "DEBUG: dimensions = %d" % self.ud + print "DEBUG: shared dimensions = %d" % self.sd + print "DEBUG: remaining dimensions = %d" % self.nd + import pdb; pdb.set_trace() + """ + t2 = T.sum(t, axis=axes) + # finally, sum out minibatch axis + return - T.sum(t2, axis=0) + + + +class Convolutional2DParameters(Parameters): + def __init__(self, rbm, units_list, W, shape_info=None, name=None): + # use the shape_info parameter to provide a dict with keys: + # hidden_maps, visible_maps, filter_height, filter_width, visible_height, visible_width, mb_size + + super(Convolutional2DParameters, self).__init__(rbm, units_list, name=name) + assert len(units_list) == 2 + self.W = W # (hidden_maps, visible_maps, filter_height, filter_width) + self.variables = [self.W] + self.vu = units_list[0] # (mb_size, visible_maps, visible_height, visible_width) + self.hu = units_list[1] # (mb_size, hidden_maps, hidden_height, hidden_width) + self.shape_info = shape_info + + # conv input is (output_maps, input_maps, filter height [numrows], filter width [numcolumns]) + # conv input is (mb_size, input_maps, input height [numrows], input width [numcolumns]) + # conv output is (mb_size, output_maps, output height [numrows], output width [numcolumns]) + + def term_vu(vmap): + # input = hiddens, output = visibles so we need to swap dimensions + W_shuffled = self.W.dimshuffle(1, 0, 2, 3) + return conv.conv2d(vmap[self.hu], W_shuffled, border_mode='full', \ + image_shape=self.hidden_shape, filter_shape=self.filter_shape) + + def term_hu(vmap): + # input = visibles, output = hiddens, flip filters + W_flipped = self.W[:, :, ::-1, ::-1] + return conv.conv2d(vmap[self.vu], W_flipped, border_mode='valid', \ + image_shape=self.visible_shape, filter_shape=self.filter_shape) + + self.terms[self.vu] = term_vu + self.terms[self.hu] = term_hu + + @property + def filter_shape(self): + keys = ['hidden_maps', 'visible_maps', 'filter_height', 'filter_width'] + if self.shape_info is not None and all(k in self.shape_info for k in keys): + return (self.shape_info[k] for k in keys) + else: + return None + + @property + def visible_shape(self): + keys = ['mb_size', 'visible_maps', 'visible_height', 'visible_width'] + if self.shape_info is not None and all(k in self.shape_info for k in keys): + return (self.shape_info[k] for k in keys) + else: + return None + + @property + def hidden_shape(self): + keys = ['mb_size', 'hidden_maps', 'visible_height', 'visible_width'] + if self.shape_info is not None and all(k in self.shape_info for k in keys): + hidden_height = self.shape_info['visible_height'] - self.shape_info['filter_height'] + 1 + hidden_width = self.shape_info['visible_width'] - self.shape_info['filter_width'] + 1 + return (self.shape_info['mb_size'], self.shape_info['hidden_maps'], hidden_height, hidden_width) + else: + return None + + def gradient(self, vmap): + if self.visible_shape is not None: + i_shape = [self.visible_shape[k] for k in [1, 0, 2, 3]] + else: + i_shape = None + + if self.hidden_shape is not None: + f_shape = [self.hidden_shape[k] for k in [1, 0, 2, 3]] + else: + f_shape = None + + v_shuffled = vmap[self.vu].dimshuffle(1, 0, 2, 3) + h_shuffled = vmap[self.hu].dimshuffle(1, 0, 2, 3) + + c = conv.conv2d(v_shuffled, h_shuffled, border_mode='valid', image_shape=i_shape, filter_shape=f_shape) + return [c.dimshuffle(1, 0, 2, 3)] + + def energy_term(self, vmap): + return - T.sum(self.terms[self.hu](vmap) * vmap[self.hu]) + + + + +# TODO: 
1D convolution + optimisation + + + +class BetaParameters(Parameters): + def __init__(self, rbm, units_list, W1, W2, U1, U2, name=None): + super(BetaParameters, self).__init__(rbm, units_list, name=name) + assert len(units_list) == 2 + self.W1, self.W2, self.U1, self.U2 = W1, W2, U1, U2 + self.variables = [W1, W2, U1, U2] + vu, hu = units_list + + self.terms[vu] = lambda vmap: (T.dot(vmap[hu], W1.T) + T.dot(1 - vmap[hu], W2.T), T.dot(vmap[hu], U1.T) + T.dot(1 - vmap[hu], U2.T)) # dit zijn alfa en beta + self.terms[hu] = lambda vmap: T.dot(W1 - W2, T.log(vmap[vu])) + T.dot(U1 - U2, T.log(1 - vmap[vu])) # dit gaat door de sigmoid + + def gradient(self, vmap): + pass # TODO LATER: this update has 4 components, for W1, W2, U1 and U2! + + def energy_term(self, vmap): + # TODO + pass diff --git a/morb/rbms.py b/morb/rbms.py new file mode 100644 index 0000000..663846e --- /dev/null +++ b/morb/rbms.py @@ -0,0 +1,83 @@ +from morb.base import RBM +from morb import units, parameters + +import theano +import theano.tensor as T + +import numpy as np + + +### RBMS ### + +class BinaryBinaryRBM(RBM): # the basic RBM, with binary visibles and binary hiddens + def __init__(self, n_visible, n_hidden): + super(BinaryBinaryRBM, self).__init__() + # data shape + self.n_visible = n_visible + self.n_hidden = n_hidden + # units + self.v = units.BinaryUnits(self, name='v') # visibles + self.h = units.BinaryUnits(self, name='h') # hiddens + # parameters + self.W = parameters.ProdParameters(self, [self.v, self.h], theano.shared(value = self._initial_W(), name='W'), name='W') # weights + self.bv = parameters.BiasParameters(self, self.v, theano.shared(value = self._initial_bv(), name='bv'), name='bv') # visible bias + self.bh = parameters.BiasParameters(self, self.h, theano.shared(value = self._initial_bh(), name='bh'), name='bh') # hidden bias + + def _initial_W(self): + return np.asarray( np.random.uniform( + low = -4*np.sqrt(6./(self.n_hidden+self.n_visible)), + high = 4*np.sqrt(6./(self.n_hidden+self.n_visible)), + size = (self.n_visible, self.n_hidden)), + dtype = theano.config.floatX) + + def _initial_bv(self): + return np.zeros(self.n_visible, dtype = theano.config.floatX) + + def _initial_bh(self): + return np.zeros(self.n_hidden, dtype = theano.config.floatX) + + +class SigmoidBinaryRBM(BinaryBinaryRBM): + def __init__(self, n_visible, n_hidden): + super(BinaryBinaryRBM, self).__init__() # skip binarybinaryRBM constructor, since we've overridden it. 
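+        # (calling RBM.__init__ directly avoids constructing the binary visible units and
+        # parameters of BinaryBinaryRBM, which would be registered with the RBM and then
+        # immediately replaced below)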
+ # data shape + self.n_visible = n_visible + self.n_hidden = n_hidden + # units + self.v = units.SigmoidUnits(self, name='v') # visibles + self.h = units.BinaryUnits(self, name='h') # hiddens + # parameters + self.W = parameters.ProdParameters(self, [self.v, self.h], theano.shared(value = self._initial_W(), name='W'), name='W') # weights + self.bv = parameters.BiasParameters(self, self.v, theano.shared(value = self._initial_bv(), name='bv'), name='bv') # visible bias + self.bh = parameters.BiasParameters(self, self.h, theano.shared(value = self._initial_bh(), name='bh'), name='bh') # hidden bias + + + +class BinaryBinaryCRBM(BinaryBinaryRBM): + def __init__(self, n_visible, n_hidden, n_context): + super(BinaryBinaryCRBM, self).__init__(n_visible, n_hidden) + # data shape + self.n_context = n_context + # units + self.x = units.BinaryUnits(self, name='x') # context + # parameters + self.A = parameters.ProdParameters(self, [self.x, self.v], theano.shared(value = self._initial_A(), name='A'), name='A') # context-to-visible weights + self.B = parameters.ProdParameters(self, [self.x, self.h], theano.shared(value = self._initial_B(), name='B'), name='B') # context-to-hidden weights + + def _initial_A(self): + return np.zeros((self.n_context, self.n_visible), dtype = theano.config.floatX) + + def _initial_B(self): + return np.zeros((self.n_context, self.n_hidden), dtype = theano.config.floatX) + + + +class GaussianBinaryRBM(RBM): # Gaussian visible units + def __init__(self): + super(GaussianBinaryRBM, self).__init__() + # units + # TODO + # parameters + # TODO + + diff --git a/morb/samplers.py b/morb/samplers.py new file mode 100644 index 0000000..80612f3 --- /dev/null +++ b/morb/samplers.py @@ -0,0 +1,77 @@ +from morb.base import sampler + +import theano +import theano.tensor as T + +# from theano.tensor.shared_randomstreams import RandomStreams +from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams # veel sneller +import numpy as np + +numpy_rng = np.random.RandomState(123) +theano_rng = RandomStreams(numpy_rng.randint(2**30)) + + +@sampler +def bernoulli(a, **kwargs): + return theano_rng.binomial(size=a.shape, n=1, p=a, dtype=theano.config.floatX) + +@sampler +def bernoulli_mf(a, **kwargs): + # if sampling in CD, use mean field + if 'cd' in kwargs and kwargs['cd'] == True: + return a + else: + return bernoulli.apply(a) + +@sampler +def bernoulli_always_mf(a, **kwargs): + # this can be used for bernoulli visibles that are actually used to model continuous data in [0,1]. + # WARNING: NEVER USE THIS FOR HIDDENS. Use bernoulli_mf instead. + return a + +gaussian_always_mf = bernoulli_always_mf # this comes down to the same thing; +# for both the bernoulli and the gaussian distribution, the parameter is the mean, so +# working with the mean comes down to using the parameter directly instead of sampling. + + +@sampler +def multinomial(a): + # 0 = minibatches + # 1 = units + # 2 = states + r = a.reshape((a.shape[0]*a.shape[1], a.shape[2])) + + # verwachte input theano_rng.multinomial: + # 0 = units + minibatches + # 1 = states + s = self.theano_rng.multinomial(n=1, pvals=r, dtype=theano.config.floatX) + + return s.reshape(a.shape) + + +@sampler +def multinomial_with_zero(a): + # like multinomial, but include a zero energy state (so it's possible that the outcome is all zeros. 
+ r = a.reshape((a.shape[0]*a.shape[1], a.shape[2])) + r0 = T.concatenate([r, T.zeros_like(r)[:, 0:1]], axis=1) # add row of zeros for zero state + s0 = self.theano_rng.multinomial(n=1, pvals=r0, dtype=theano.config.floatX) + s = s0[:, :-1] # cut off zero state column + return s.reshape(a.shape) # reshape to original shape + + + + + + + +def tmin(a, b): + return T.switch(a < b, a, b) + +def tmax(a, b): + return T.switch(a > b, a, b) + +@sampler +def beta(a, b): # TODO LATER: MOET MET TENSOREN KUNNEN WERKEN + # best op basis van een gamma_sampler doen + # TODO LATER + pass diff --git a/morb/stats_collectors.py b/morb/stats_collectors.py new file mode 100644 index 0000000..e5770f5 --- /dev/null +++ b/morb/stats_collectors.py @@ -0,0 +1,158 @@ +from morb.base import StatsCollector + +import theano + +class CDkStatsCollector(StatsCollector): + def __init__(self, rbm, input_units, latent_units, context_units=[], k=1, mean_field_for_visibles=True, mean_field_for_stats=True): + super(CDkStatsCollector, self).__init__(rbm, input_units, latent_units, context_units) + self.k = k + self.mean_field_for_visibles = mean_field_for_visibles + # with 'mean_field_for_visibles', we can control whether hiddens are sampled based on visibles samples or visible means in the CD iterations. + # This requires that the Units instances have samplers that return means when cd=True. + # disabling this option is useful when one doesn't want to apply mean field during the gibbs sampling, + # but the statistics should be mean field nevertheless to improve convergence. + self.mean_field_for_stats = mean_field_for_stats + # 'mean_field_for_stats' controls whether the returned statistics are means or samples. You will almost + # always want to leave this enabled, as mean stats improve convergence. Note that a sampler that responds + # to cd=True is required, else this does nothing. + + def calculate_stats(self, v0_vmap): + # first we need to get the context, because we will have to merge it into the other vmaps. + context_vmap = dict((u, v0_vmap[u]) for u in self.context_units) + + h0_linear_activation_vmap = dict((h, h.linear_activation(v0_vmap)) for h in self.latent_units) + h0_activation_vmap = dict((h, h.activation(v0_vmap)) for h in self.latent_units) + h0_sample_cd_vmap = dict((h, h.sample(v0_vmap, cd=True)) for h in self.latent_units) # with mean field + h0_sample_vmap = dict((h, h.sample(v0_vmap)) for h in self.latent_units) # without mean field + + # add context + h0_linear_activation_vmap.update(context_vmap) + h0_activation_vmap.update(context_vmap) + h0_sample_cd_vmap.update(context_vmap) + h0_sample_vmap.update(context_vmap) + + exp_input = [v0_vmap[u] for u in self.input_units] + exp_context = [v0_vmap[u] for u in self.context_units] + exp_latent = [h0_sample_vmap[u] for u in self.latent_units] + + # scan requires a function that returns theano expressions, so we cannot pass vmaps in or out. annoying. + def gibbs_hvh(*args): + h0_sample_vmap = dict(zip(self.latent_units, args)) # these must be without mf! 
+ + v1_in_vmap = h0_sample_vmap.copy() + v1_in_vmap.update(context_vmap) + + v1_linear_activation_vmap = dict((v, v.linear_activation(v1_in_vmap)) for v in self.input_units) + v1_activation_vmap = dict((v, v.activation(v1_in_vmap)) for v in self.input_units) + v1_sample_cd_vmap = dict((v, v.sample(v1_in_vmap, cd=True)) for v in self.input_units) # with mf + v1_sample_vmap = dict((v, v.sample(v1_in_vmap)) for v in self.input_units) # without mf + + if self.mean_field_for_visibles: + h1_in_vmap = v1_sample_cd_vmap.copy() + h1_in_vmap.update(context_vmap) + + # use the mean field version of the visibles to sample hiddens from visibles + h1_linear_activation_vmap = dict((h, h.linear_activation(h1_in_vmap)) for h in self.latent_units) + h1_activation_vmap = dict((h, h.activation(h1_in_vmap)) for h in self.latent_units) + h1_sample_cd_vmap = dict((h, h.sample(h1_in_vmap, cd=True)) for h in self.latent_units) # with mf + h1_sample_vmap = dict((h, h.sample(h1_in_vmap)) for h in self.latent_units) # without mf + else: + h1_in_vmap = v1_sample_vmap.copy() + h1_in_vmap.update(context_vmap) + + # use the sampled visibles to sample hiddens from visibles + h1_linear_activation_vmap = dict((h, h.linear_activation(h1_in_vmap)) for h in self.latent_units) + h1_activation_vmap = dict((h, h.activation(h1_in_vmap)) for h in self.latent_units) + h1_sample_cd_vmap = dict((h, h.sample(h1_in_vmap, cd=True)) for h in self.latent_units) # with mf + h1_sample_vmap = dict((h, h.sample(h1_in_vmap)) for h in self.latent_units) # without mf + + + # get the v1 values in a fixed order + v1_linear_activation_values = [v1_linear_activation_vmap[u] for u in self.input_units] + v1_activation_values = [v1_activation_vmap[u] for u in self.input_units] + v1_sample_cd_values = [v1_sample_cd_vmap[u] for u in self.input_units] + v1_sample_values = [v1_sample_vmap[u] for u in self.input_units] + + # same for the h1 values + h1_linear_activation_values = [h1_linear_activation_vmap[u] for u in self.latent_units] + h1_activation_values = [h1_activation_vmap[u] for u in self.latent_units] + h1_sample_cd_values = [h1_sample_cd_vmap[u] for u in self.latent_units] + h1_sample_values = [h1_sample_vmap[u] for u in self.latent_units] + + return v1_linear_activation_values + v1_activation_values + v1_sample_cd_values + v1_sample_values + \ + h1_linear_activation_values + h1_activation_values + h1_sample_cd_values + h1_sample_values + + # The 'outputs_info' keyword argument of scan configures how the function outputs are mapped to the inputs. + # in this case, we want the h1_sample_vmap values to map onto the function arguments, so they become + # h0_sample_vmap values in the next iteration. To this end, we construct outputs_info as follows: + outputs_info = [None] * (len(exp_input)*4) + [None] * (len(exp_latent)*3) + list(exp_latent) + # 'None' indicates that this output is not used in the next iteration. + # We need the non-cd samples as input! so h1_sample_vmap becomes h)_sample_vmap + + exp_output_all_list, updates = theano.scan(gibbs_hvh, outputs_info = outputs_info, n_steps = self.k) + # we only need the final outcomes, not intermediary values + exp_output_list = [out[-1] for out in exp_output_all_list] + + # reconstruct vmaps from the exp_output_list. 
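+        # the flat list is ordered as returned by gibbs_hvh: four groups for the input units
+        # (linear activations, activations, cd samples, samples), followed by the same four
+        # groups for the latent units, each group in input_units / latent_units order.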
+ n_input, n_latent = len(self.input_units), len(self.latent_units) + vk_linear_activation_vmap = dict(zip(self.input_units, exp_output_list[0:n_input])) + vk_activation_vmap = dict(zip(self.input_units, exp_output_list[n_input:2*n_input])) + vk_sample_cd_vmap = dict(zip(self.input_units, exp_output_list[2*n_input:3*n_input])) + vk_sample_vmap = dict(zip(self.input_units, exp_output_list[3*n_input:4*n_input])) + hk_linear_activation_vmap = dict(zip(self.latent_units, exp_output_list[4*n_input:4*n_input+n_latent])) + hk_activation_vmap = dict(zip(self.latent_units, exp_output_list[4*n_input+n_latent:4*n_input+2*n_latent])) + hk_sample_cd_vmap = dict(zip(self.latent_units, exp_output_list[4*n_input+2*n_latent:4*n_input+3*n_latent])) + hk_sample_vmap = dict(zip(self.latent_units, exp_output_list[4*n_input+3*n_latent:4*n_input+4*n_latent])) + + # TODO: some of these are not used... maybe they'll come in handy later? If not, remove them. + + self.theano_updates = updates + + linear_activation_data_vmap = v0_vmap.copy() + linear_activation_data_vmap.update(h0_linear_activation_vmap) + linear_activation_model_vmap = vk_linear_activation_vmap.copy() + linear_activation_model_vmap.update(context_vmap) + linear_activation_model_vmap.update(hk_linear_activation_vmap) + + activation_data_vmap = v0_vmap.copy() + activation_data_vmap.update(h0_activation_vmap) + activation_model_vmap = vk_activation_vmap.copy() + activation_model_vmap.update(context_vmap) + activation_model_vmap.update(hk_activation_vmap) + + + # store the computed stats in a dictionary of vmaps. + if self.mean_field_for_stats: + stats_data_vmap = v0_vmap.copy() + stats_data_vmap.update(h0_sample_cd_vmap) + stats_model_vmap = vk_sample_cd_vmap.copy() + stats_model_vmap.update(context_vmap) + stats_model_vmap.update(hk_sample_cd_vmap) + self.stats = { + 'data': stats_data_vmap, + 'model': stats_model_vmap, + } + else: + stats_data_vmap = v0_vmap.copy() + stats_data_vmap.update(h0_sample_vmap) + stats_model_vmap = vk_sample_vmap.copy() + stats_model_vmap.update(context_vmap) + stats_model_vmap.update(hk_sample_vmap) + self.stats = { + 'data': stats_data_vmap, + 'model': stats_model_vmap, + } + + self.stats['data_linear_activation'] = linear_activation_data_vmap + self.stats['model_linear_activation'] = linear_activation_model_vmap + self.stats['data_activation'] = activation_data_vmap + self.stats['model_activation'] = activation_model_vmap + + # TODO: add activation and linear activation to the stats as well, both positive and negative + + + + +class PCDStatsCollector(StatsCollector): + def calculate_stats(self): + pass # access self.rbm #TODO later diff --git a/morb/trainers.py b/morb/trainers.py new file mode 100644 index 0000000..4a6b6e0 --- /dev/null +++ b/morb/trainers.py @@ -0,0 +1,52 @@ +from morb.base import Trainer + +import theano +import theano.tensor as T + +import numpy as np + +class MinibatchTrainer(Trainer): + # use self.rbm, self.umap, self.get_updates(vmap) + def compile_function(self, initial_vmap, monitors=[], name='func', mb_size=32, update=True, mode=None): + # setting update=False is useful when compiling a function for evaluation only, i.e. no training. + # this is interesting when one wants to evaluate training progress on a validation set, for example. 
+ if update: + updates = self.get_theano_updates(initial_vmap) + else: + updates = {} + + # initialise data sets + data_sets = {} + for u, v in initial_vmap.items(): + shape = (1,) * v.ndim + data_sets[u] = theano.shared(value = np.zeros(shape, dtype=theano.config.floatX), + name="dataset for '%s'" % u.name) + + index = T.lscalar() # index to a minibatch + + # construct givens for the compiled theano function - mapping variables to data + givens = dict((initial_vmap[u], data_sets[u][index*mb_size:(index+1)*mb_size]) for u in initial_vmap) + monitor_expressions = [m.expression() for m in monitors] + + TF = theano.function([index], monitor_expressions, + updates = updates, givens = givens, name = name, mode = mode) + + def func(dmap): + # dmap is a dict that maps unit types on their respective datasets (numeric). + units_list = dmap.keys() + data_sizes = [int(np.ceil(dmap[u].shape[0] / float(mb_size))) for u in units_list] + + if data_sizes.count(data_sizes[0]) != len(data_sizes): # check if all data sizes are equal + raise RuntimeError("The sizes of the supplied datasets for the different input units are not equal.") + + data_cast = [dmap[u].astype(theano.config.floatX) for u in units_list] + + for i, u in enumerate(units_list): + data_sets[u].set_value(data_cast[i], borrow=True) + + for batch_index in xrange(min(data_sizes)): + yield TF(batch_index) + + return func + + diff --git a/morb/units.py b/morb/units.py new file mode 100644 index 0000000..6116b5b --- /dev/null +++ b/morb/units.py @@ -0,0 +1,22 @@ +from morb.base import units_type, Units +from morb import samplers, activation_functions + +BinaryUnits = units_type(activation_functions.sigmoid, samplers.bernoulli_mf) + +SigmoidUnits = units_type(activation_functions.sigmoid, samplers.bernoulli_always_mf) + +GaussianUnits = units_type(activation_functions.identity, samplers.gaussian_always_mf) + +SoftmaxUnits = units_type(activation_functions.softmax, samplers.multinomial) + +# TODO LATER: BetaUnits + +# TODO LATER: RELUUnits + +class SymmetricBetaUnits(Units): # hoah + # Symmetric because the hiddens switch between two beta distributions, not because + # the parameters of the distribution are chosen to be equal (this is not the case). 
+ pass + +class ReLUUnits(Units): # hoah + pass diff --git a/test.py b/test.py new file mode 100644 index 0000000..a1986d5 --- /dev/null +++ b/test.py @@ -0,0 +1,56 @@ +import morb +from morb import rbms, stats_collectors, param_updaters, trainers, monitors + +import theano +import theano.tensor as T + +import numpy as np + +import matplotlib.pyplot as plt +plt.ion() + +from test_utils import generate_data, get_context + + +# DEBUGGING + +from theano import ProfileMode +# mode = theano.ProfileMode(optimizer='fast_run', linker=theano.gof.OpWiseCLinker()) +# mode = theano.compile.DebugMode(check_py_code=False, require_matching_strides=False) +mode = None + + +# generate data +data = generate_data(200) # np.random.randint(2, size=(10000, n_visible)) + +n_visible = data.shape[1] +n_hidden = 100 + + +rbm = rbms.BinaryBinaryRBM(n_visible, n_hidden) +# sc10 = stats_collectors.CDkStatsCollector(rbm, [rbm.v], k=10) # CD-10 stats collector + +# calculate CD-10 stats symbolically: +# s = sc10.calculate_stats({ rbm.v: T.vector('v') }) + +# try to calculate weight updates using CD-1 stats +sc = stats_collectors.CDkStatsCollector(rbm, input_units=[rbm.v], latent_units=[rbm.h], k=1) + +umap = {} +for params in rbm.params_list: + # pu = 0.001 * (param_updaters.CDParamUpdater(params, sc) + 0.02 * param_updaters.DecayParamUpdater(params)) + pu = 0.001 * param_updaters.CDParamUpdater(params, sc) + umap[params] = pu + + +t = trainers.MinibatchTrainer(rbm, umap) +m = monitors.ReconstructionMSEMonitor(sc, rbm.v) +train = t.compile_function({ rbm.v: T.matrix('v') }, mb_size=32, monitors=[m], name='train', mode=mode) + +epochs = 50 + +for epoch in range(epochs): + print "Epoch %d" % epoch + costs = [m for m in train({ rbm.v: data })] + print "MSE = %.4f" % np.mean(costs) + diff --git a/test_advparams.py b/test_advparams.py new file mode 100644 index 0000000..d1fa361 --- /dev/null +++ b/test_advparams.py @@ -0,0 +1,215 @@ +import morb +from morb import rbms, stats_collectors, param_updaters, trainers, monitors, units, parameters + +import theano +import theano.tensor as T + +import numpy as np + +import gzip, cPickle + +import matplotlib.pyplot as plt +plt.ion() + +from test_utils import generate_data, get_context +import time + +# DEBUGGING + +from theano import ProfileMode +# mode = theano.ProfileMode(optimizer='fast_run', linker=theano.gof.OpWiseCLinker()) +# mode = theano.compile.DebugMode(check_py_code=False, require_matching_strides=False) +mode = None + + +# load data +print ">> Loading dataset..." + +f = gzip.open('mnist.pkl.gz','rb') +train_set, valid_set, test_set = cPickle.load(f) +f.close() + +train_set_x, train_set_y = train_set +valid_set_x, valid_set_y = train_set +test_set_x, test_set_y = train_set + + +# TODO DEBUG +train_set_x = train_set_x[:10000] +valid_set_x = valid_set_x[:1000] + + +n_visible = train_set_x.shape[1] +n_hidden = 100 + + +print ">> Constructing RBM..." 
+ +class AdvRBM(morb.base.RBM): # the basic RBM, with binary visibles and binary hiddens + def __init__(self, n_visible, n_hidden): + super(AdvRBM, self).__init__() + # data shape + self.n_visible = n_visible + self.n_hidden = n_hidden + # units + self.v = units.SigmoidUnits(self, name='v') # visibles + self.h = units.BinaryUnits(self, name='h') # hiddens + # parameters + self.W = parameters.AdvancedProdParameters(self, [self.v, self.h], [1,1], theano.shared(value = self._initial_W(), name='W'), name='W') # weights + self.bv = parameters.AdvancedBiasParameters(self, self.v, 1, theano.shared(value = self._initial_bv(), name='bv'), name='bv') # visible bias + self.bh = parameters.AdvancedBiasParameters(self, self.h, 1, theano.shared(value = self._initial_bh(), name='bh'), name='bh') # hidden bias + + def _initial_W(self): + return np.asarray( np.random.uniform( + low = -4*np.sqrt(6./(self.n_hidden+self.n_visible)), + high = 4*np.sqrt(6./(self.n_hidden+self.n_visible)), + size = (self.n_visible, self.n_hidden)), + dtype = theano.config.floatX) + + def _initial_bv(self): + return np.zeros(self.n_visible, dtype = theano.config.floatX) + + def _initial_bh(self): + return np.zeros(self.n_hidden, dtype = theano.config.floatX) + + + + + + + + +rbm = AdvRBM(n_visible, n_hidden) +# rbm = rbms.SigmoidBinaryRBM(n_visible, n_hidden) + +# try to calculate weight updates using CD-1 stats +print ">> Constructing contrastive divergence updaters..." +sc = stats_collectors.CDkStatsCollector(rbm, input_units=[rbm.v], latent_units=[rbm.h], k=1) + +umap = {} +for params in rbm.params_list: + # pu = 0.001 * (param_updaters.CDParamUpdater(params, sc) + 0.02 * param_updaters.DecayParamUpdater(params)) + pu = 0.001 * param_updaters.CDParamUpdater(params, sc) + umap[params] = pu + +print ">> Compiling functions..." +t = trainers.MinibatchTrainer(rbm, umap) +m = monitors.ReconstructionMSEMonitor(sc, rbm.v) +m_data = monitors.DataMonitor(sc, rbm.v) +m_model = monitors.ReconstructionMonitor(sc, rbm.v) +e_data = monitors.DataEnergyMonitor(sc, rbm) +e_model = monitors.ReconstructionEnergyMonitor(sc, rbm) + +initial_vmap = { rbm.v: T.matrix('v') } + +# train = t.compile_function(initial_vmap, mb_size=32, monitors=[m], name='train', mode=mode) +train = t.compile_function(initial_vmap, mb_size=100, monitors=[m, e_data, e_model], name='train', mode=mode) +evaluate = t.compile_function(initial_vmap, mb_size=100, monitors=[m, m_data, m_model, e_data, e_model], name='evaluate', update=False, mode=mode) + + + + + + +def plot_data(d): + plt.figure(5) + plt.clf() + plt.imshow(d.reshape((28,28)), interpolation='gaussian') + plt.draw() + + +def sample_evolution(start, ns=100): # start = start data + sample = t.compile_function(initial_vmap, mb_size=1, monitors=[m_model], name='evaluate', update=False, mode=mode) + + data = start + plot_data(data) + + + while True: + for k in range(ns): + for x in sample({ rbm.v: data }): # draw a new sample + data = x[0] + + plot_data(data) + + + + + + + + + + +# TRAINING + +epochs = 200 +print ">> Training for %d epochs..." 
% epochs + +start_time = time.time() + +mses_train_so_far = [] +mses_valid_so_far = [] +edata_train_so_far = [] +emodel_train_so_far = [] +edata_so_far = [] +emodel_so_far = [] + +for epoch in range(epochs): + monitoring_data_train = [(cost, energy_data, energy_model) for cost, energy_data, energy_model in train({ rbm.v: train_set_x })] + mses_train, edata_train_list, emodel_train_list = zip(*monitoring_data_train) + mse_train = np.mean(mses_train) + edata_train = np.mean(edata_train_list) + emodel_train = np.mean(emodel_train_list) + + monitoring_data = [(cost, data, model, energy_data, energy_model) for cost, data, model, energy_data, energy_model in evaluate({ rbm.v: valid_set_x })] + mses_valid, vdata, vmodel, edata, emodel = zip(*monitoring_data) + mse_valid = np.mean(mses_valid) + edata_valid = np.mean(edata) + emodel_valid = np.mean(emodel) + + # plotting + mses_train_so_far.append(mse_train) + mses_valid_so_far.append(mse_valid) + edata_so_far.append(edata_valid) + emodel_so_far.append(emodel_valid) + edata_train_so_far.append(edata_train) + emodel_train_so_far.append(emodel_train) + + plt.figure(1) + plt.clf() + plt.plot(mses_train_so_far, label='train') + plt.plot(mses_valid_so_far, label='validation') + plt.title("MSE") + plt.legend() + plt.draw() + + plt.figure(4) + plt.clf() + plt.plot(edata_so_far, label='validation / data') + plt.plot(emodel_so_far, label='validation / model') + plt.plot(edata_train_so_far, label='train / data') + plt.plot(emodel_train_so_far, label='train / model') + plt.title("energy") + plt.legend() + plt.draw() + + # plot some samples + plt.figure(2) + plt.clf() + plt.imshow(vdata[0][0].reshape((28, 28))) + plt.draw() + plt.figure(3) + plt.clf() + plt.imshow(vmodel[0][0].reshape((28, 28))) + plt.draw() + + + print "Epoch %d" % epoch + print "%.2f seconds" % (time.time() - start_time) + print "training set: MSE = %.6f, data energy = %.2f, model energy = %.2f" % (mse_train, edata_train, emodel_train) + print "validation set: MSE = %.6f, data energy = %.2f, model energy = %.2f" % (mse_valid, edata_valid, emodel_valid) + + + + diff --git a/test_crbm.py b/test_crbm.py new file mode 100644 index 0000000..29a6eb2 --- /dev/null +++ b/test_crbm.py @@ -0,0 +1,84 @@ +import morb +from morb import rbms, stats_collectors, param_updaters, trainers, monitors + +import theano +import theano.tensor as T + +import numpy as np + +import matplotlib.pyplot as plt +plt.ion() + +from test_utils import generate_data, get_context + +# DEBUGGING + +from theano import ProfileMode +# mode = theano.ProfileMode(optimizer='fast_run', linker=theano.gof.OpWiseCLinker()) +# mode = theano.compile.DebugMode(check_py_code=False, require_matching_strides=False) +mode = None + + +# generate data +print ">> Generating dataset..." +data = generate_data(1000) # np.random.randint(2, size=(10000, n_visible)) +data_context = get_context(data) + +data_train = data[:-1000, :] +data_eval = data[-1000:, :] +data_context_train = data_context[:-1000, :] +data_context_eval = data_context[-1000:, :] + +n_visible = data.shape[1] +n_context = data_context.shape[1] +n_hidden = 100 + + +print ">> Constructing RBM..." +rbm = rbms.BinaryBinaryCRBM(n_visible, n_hidden, n_context) +# sc10 = stats_collectors.CDkStatsCollector(rbm, [rbm.v], k=10) # CD-10 stats collector + +# calculate CD-10 stats symbolically: +# s = sc10.calculate_stats({ rbm.v: T.vector('v') }) + +# try to calculate weight updates using CD-1 stats +print ">> Constructing contrastive divergence updaters..." 
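+# rbm.x (the conditioning history built by get_context) is passed as context_units
+# below rather than as input_units, so it is presumably clamped to the data during
+# the Gibbs steps instead of being resampled: only v and h take part in CD-1, which
+# is the usual conditional RBM training setup.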
+sc = stats_collectors.CDkStatsCollector(rbm, input_units=[rbm.v], latent_units=[rbm.h], context_units=[rbm.x], k=1) + +umap = {} +for params in rbm.params_list: + # pu = 0.001 * (param_updaters.CDParamUpdater(params, sc) + 0.02 * param_updaters.DecayParamUpdater(params)) + pu = 0.0005 * param_updaters.CDParamUpdater(params, sc) + umap[params] = pu + +print ">> Compiling functions..." +t = trainers.MinibatchTrainer(rbm, umap) +m = monitors.ReconstructionMSEMonitor(sc, rbm.v) +mce = monitors.ReconstructionCrossEntropyMonitor(sc, rbm.v) + +initial_vmap = { rbm.v: T.matrix('v'), rbm.x: T.matrix('x') } + +# train = t.compile_function(initial_vmap, mb_size=32, monitors=[m], name='train', mode=mode) +train = t.compile_function(initial_vmap, mb_size=32, monitors=[m, mce], name='train', mode=mode) +evaluate = t.compile_function(initial_vmap, mb_size=32, monitors=[m, mce], name='evaluate', mode=mode) + +epochs = 200 +print ">> Training for %d epochs..." % epochs + + +for epoch in range(epochs): + costs_train = [costs for costs in train({ rbm.v: data_train, rbm.x: data_context_train })] + costs_eval = [costs for costs in evaluate({ rbm.v: data_eval, rbm.x: data_context_eval })] + mses_train, ces_train = zip(*costs_train) + mses_eval, ces_eval = zip(*costs_eval) + + mse_train = np.mean(mses_train) + ce_train = np.mean(ces_train) + mse_eval = np.mean(mses_eval) + ce_eval = np.mean(ces_eval) + + print "Epoch %d" % epoch + print "training set: MSE = %.6f, CE = %.6f" % (mse_train, ce_train) + print "validation set: MSE = %.6f, CE = %.6f" % (mse_eval, ce_eval) + + diff --git a/test_mnist.py b/test_mnist.py new file mode 100644 index 0000000..1297347 --- /dev/null +++ b/test_mnist.py @@ -0,0 +1,175 @@ +import morb +from morb import rbms, stats_collectors, param_updaters, trainers, monitors + +import theano +import theano.tensor as T + +import numpy as np + +import gzip, cPickle + +import matplotlib.pyplot as plt +plt.ion() + +from test_utils import generate_data, get_context + +# DEBUGGING + +from theano import ProfileMode +# mode = theano.ProfileMode(optimizer='fast_run', linker=theano.gof.OpWiseCLinker()) +# mode = theano.compile.DebugMode(check_py_code=False, require_matching_strides=False) +mode = None + + +# load data +print ">> Loading dataset..." + +f = gzip.open('mnist.pkl.gz','rb') +train_set, valid_set, test_set = cPickle.load(f) +f.close() + +train_set_x, train_set_y = train_set +valid_set_x, valid_set_y = valid_set +test_set_x, test_set_y = test_set + + +# TODO DEBUG +train_set_x = train_set_x[:10000] +valid_set_x = valid_set_x[:1000] + + +n_visible = train_set_x.shape[1] +n_hidden = 100 + + +print ">> Constructing RBM..." +rbm = rbms.SigmoidBinaryRBM(n_visible, n_hidden) + +# try to calculate weight updates using CD-1 stats +print ">> Constructing contrastive divergence updaters..." +sc = stats_collectors.CDkStatsCollector(rbm, input_units=[rbm.v], latent_units=[rbm.h], k=1) + +umap = {} +for params in rbm.params_list: + # pu = 0.001 * (param_updaters.CDParamUpdater(params, sc) + 0.02 * param_updaters.DecayParamUpdater(params)) + pu = 0.001 * param_updaters.CDParamUpdater(params, sc) + umap[params] = pu + +print ">> Compiling functions..." 
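+# The 'train' and 'evaluate' functions compiled below are generators: as set up in
+# morb/trainers.py earlier in this patch, each call slices the shared dataset into
+# minibatches and yields one tuple of monitor values per minibatch. A minimal usage
+# sketch (commented out; the real loop further down does the same and averages):
+# for mse, e_d, e_m in train({ rbm.v: train_set_x }):
+#     print mse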
+t = trainers.MinibatchTrainer(rbm, umap) +m = monitors.ReconstructionMSEMonitor(sc, rbm.v) +m_data = monitors.DataMonitor(sc, rbm.v) +m_model = monitors.ReconstructionMonitor(sc, rbm.v) +e_data = monitors.DataEnergyMonitor(sc, rbm) +e_model = monitors.ReconstructionEnergyMonitor(sc, rbm) + +initial_vmap = { rbm.v: T.matrix('v') } + +# train = t.compile_function(initial_vmap, mb_size=32, monitors=[m], name='train', mode=mode) +train = t.compile_function(initial_vmap, mb_size=100, monitors=[m, e_data, e_model], name='train', mode=mode) +evaluate = t.compile_function(initial_vmap, mb_size=100, monitors=[m, m_data, m_model, e_data, e_model], name='evaluate', update=False, mode=mode) + + + + + + +def plot_data(d): + plt.figure(5) + plt.clf() + plt.imshow(d.reshape((28,28)), interpolation='gaussian') + plt.draw() + + +def sample_evolution(start, ns=100): # start = start data + sample = t.compile_function(initial_vmap, mb_size=1, monitors=[m_model], name='evaluate', update=False, mode=mode) + + data = start + plot_data(data) + + + while True: + for k in range(ns): + for x in sample({ rbm.v: data }): # draw a new sample + data = x[0] + + plot_data(data) + + + + + + + + + + +# TRAINING + +epochs = 200 +print ">> Training for %d epochs..." % epochs + +mses_train_so_far = [] +mses_valid_so_far = [] +edata_train_so_far = [] +emodel_train_so_far = [] +edata_so_far = [] +emodel_so_far = [] + +for epoch in range(epochs): + monitoring_data_train = [(cost, energy_data, energy_model) for cost, energy_data, energy_model in train({ rbm.v: train_set_x })] + mses_train, edata_train_list, emodel_train_list = zip(*monitoring_data_train) + mse_train = np.mean(mses_train) + edata_train = np.mean(edata_train_list) + emodel_train = np.mean(emodel_train_list) + + monitoring_data = [(cost, data, model, energy_data, energy_model) for cost, data, model, energy_data, energy_model in evaluate({ rbm.v: valid_set_x })] + mses_valid, vdata, vmodel, edata, emodel = zip(*monitoring_data) + mse_valid = np.mean(mses_valid) + edata_valid = np.mean(edata) + emodel_valid = np.mean(emodel) + + # plotting + mses_train_so_far.append(mse_train) + mses_valid_so_far.append(mse_valid) + edata_so_far.append(edata_valid) + emodel_so_far.append(emodel_valid) + edata_train_so_far.append(edata_train) + emodel_train_so_far.append(emodel_train) + + plt.figure(1) + plt.clf() + plt.plot(mses_train_so_far, label='train') + plt.plot(mses_valid_so_far, label='validation') + plt.title("MSE") + plt.legend() + plt.draw() + + plt.figure(4) + plt.clf() + plt.plot(edata_so_far, label='validation / data') + plt.plot(emodel_so_far, label='validation / model') + plt.plot(edata_train_so_far, label='train / data') + plt.plot(emodel_train_so_far, label='train / model') + plt.title("energy") + plt.legend() + plt.draw() + + # plot some samples + plt.figure(2) + plt.clf() + plt.imshow(vdata[0][0].reshape((28, 28))) + plt.draw() + plt.figure(3) + plt.clf() + plt.imshow(vmodel[0][0].reshape((28, 28))) + plt.draw() + + + print "Epoch %d" % epoch + print "training set: MSE = %.6f, data energy = %.2f, model energy = %.2f" % (mse_train, edata_train, emodel_train) + print "validation set: MSE = %.6f, data energy = %.2f, model energy = %.2f" % (mse_valid, edata_valid, emodel_valid) + + + + diff --git a/test_mnist_convolutional.py b/test_mnist_convolutional.py new file mode 100644 index 0000000..f6f9089 --- /dev/null +++ b/test_mnist_convolutional.py @@ -0,0 +1,215 @@ +import morb +from morb import rbms, stats_collectors, param_updaters, trainers, monitors, units, 
parameters + +import theano +import theano.tensor as T + +import numpy as np + +import gzip, cPickle + +import matplotlib.pyplot as plt +plt.ion() + +from test_utils import generate_data, get_context + +# DEBUGGING + +from theano import ProfileMode +# mode = theano.ProfileMode(optimizer='fast_run', linker=theano.gof.OpWiseCLinker()) +# mode = theano.compile.DebugMode(check_py_code=False, require_matching_strides=False) +mode = None + + +# load data +print ">> Loading dataset..." + +f = gzip.open('mnist.pkl.gz','rb') +train_set, valid_set, test_set = cPickle.load(f) +f.close() + +train_set_x, train_set_y = train_set +valid_set_x, valid_set_y = valid_set +test_set_x, test_set_y = test_set + + +# TODO DEBUG +train_set_x = train_set_x[:10000] +valid_set_x = valid_set_x[:1000] + + +# reshape data for convolutional RBM +train_set_x = train_set_x.reshape((train_set_x.shape[0], 1, 28, 28)) +valid_set_x = valid_set_x.reshape((valid_set_x.shape[0], 1, 28, 28)) +test_set_x = test_set_x.reshape((test_set_x.shape[0], 1, 28, 28)) + + + +visible_maps = 1 +hidden_maps = 100 # 50 +filter_height = 28 # 8 +filter_width = 28 # 8 +mb_size = 10 + + + +print ">> Constructing RBM..." +fan_in = visible_maps * filter_height * filter_width + +""" +initial_W = numpy.asarray( + self.numpy_rng.uniform( + low = - numpy.sqrt(3./fan_in), + high = numpy.sqrt(3./fan_in), + size = self.filter_shape + ), dtype=theano.config.floatX) +""" +numpy_rng = np.random.RandomState(123) +initial_W = np.asarray( + numpy_rng.normal( + 0, 0.5 / np.sqrt(fan_in), + size = (hidden_maps, visible_maps, filter_height, filter_width) + ), dtype=theano.config.floatX) +initial_bv = np.zeros(visible_maps, dtype = theano.config.floatX) +initial_bh = np.zeros(hidden_maps, dtype = theano.config.floatX) + + +# rbms.SigmoidBinaryRBM(n_visible, n_hidden) +rbm = morb.base.RBM() +rbm.v = units.SigmoidUnits(rbm, name='v') # visibles +rbm.h = units.BinaryUnits(rbm, name='h') # hiddens +rbm.W = parameters.Convolutional2DParameters(rbm, [rbm.v, rbm.h], theano.shared(value=initial_W, name='W'), name='W') +# one bias per map (so shared across width and height): +rbm.bv = parameters.SharedBiasParameters(rbm, rbm.v, 3, 2, theano.shared(value=initial_bv, name='bv'), name='bv') +rbm.bh = parameters.SharedBiasParameters(rbm, rbm.h, 3, 2, theano.shared(value=initial_bh, name='bh'), name='bh') + + + +# try to calculate weight updates using CD-1 stats +print ">> Constructing contrastive divergence updaters..." +sc = stats_collectors.CDkStatsCollector(rbm, input_units=[rbm.v], latent_units=[rbm.h], k=1) + +umap = {} +for params in rbm.params_list: + # pu = 0.001 * (param_updaters.CDParamUpdater(params, sc) + 0.02 * param_updaters.DecayParamUpdater(params)) + pu = 0.001 * param_updaters.CDParamUpdater(params, sc) + umap[params] = pu + +print ">> Compiling functions..." 
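+# With filter_height = filter_width = 28 on 28x28 input maps, a 'valid' convolution
+# presumably leaves each of the 100 hidden maps at size 1x1, so this configuration is
+# effectively a fully-connected RBM and mainly exercises the convolutional code path;
+# the commented-out 8x8 filter / 50 map settings above would give genuinely
+# convolutional hidden maps (21x21 each).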
+t = trainers.MinibatchTrainer(rbm, umap) +m = monitors.ReconstructionMSEMonitor(sc, rbm.v) +m_data = monitors.DataMonitor(sc, rbm.v) +m_model = monitors.ReconstructionMonitor(sc, rbm.v) +e_data = monitors.DataEnergyMonitor(sc, rbm) +e_model = monitors.ReconstructionEnergyMonitor(sc, rbm) + +initial_vmap = { rbm.v: T.tensor4('v') } + +# train = t.compile_function(initial_vmap, mb_size=32, monitors=[m], name='train', mode=mode) +train = t.compile_function(initial_vmap, mb_size=10, monitors=[m, e_data, e_model], name='train', mode=mode) +evaluate = t.compile_function(initial_vmap, mb_size=10, monitors=[m, m_data, m_model, e_data, e_model], name='evaluate', update=False, mode=mode) + + + + + + +def plot_data(d): + plt.figure(5) + plt.clf() + plt.imshow(d.reshape((28,28)), interpolation='gaussian') + plt.draw() + + +def sample_evolution(start, ns=100): # start = start data + sample = t.compile_function(initial_vmap, mb_size=1, monitors=[m_model], name='evaluate', update=False, mode=mode) + + data = start + plot_data(data) + + + while True: + for k in range(ns): + for x in sample({ rbm.v: data }): # draw a new sample + data = x[0] + + plot_data(data) + + + + + + + + + + +# TRAINING + +epochs = 200 +print ">> Training for %d epochs..." % epochs + +mses_train_so_far = [] +mses_valid_so_far = [] +edata_train_so_far = [] +emodel_train_so_far = [] +edata_so_far = [] +emodel_so_far = [] + +for epoch in range(epochs): + monitoring_data_train = [(cost, energy_data, energy_model) for cost, energy_data, energy_model in train({ rbm.v: train_set_x })] + mses_train, edata_train_list, emodel_train_list = zip(*monitoring_data_train) + mse_train = np.mean(mses_train) + edata_train = np.mean(edata_train_list) + emodel_train = np.mean(emodel_train_list) + + monitoring_data = [(cost, data, model, energy_data, energy_model) for cost, data, model, energy_data, energy_model in evaluate({ rbm.v: valid_set_x })] + mses_valid, vdata, vmodel, edata, emodel = zip(*monitoring_data) + mse_valid = np.mean(mses_valid) + edata_valid = np.mean(edata) + emodel_valid = np.mean(emodel) + + # plotting + mses_train_so_far.append(mse_train) + mses_valid_so_far.append(mse_valid) + edata_so_far.append(edata_valid) + emodel_so_far.append(emodel_valid) + edata_train_so_far.append(edata_train) + emodel_train_so_far.append(emodel_train) + + plt.figure(1) + plt.clf() + plt.plot(mses_train_so_far, label='train') + plt.plot(mses_valid_so_far, label='validation') + plt.title("MSE") + plt.legend() + plt.draw() + + plt.figure(4) + plt.clf() + plt.plot(edata_so_far, label='validation / data') + plt.plot(emodel_so_far, label='validation / model') + plt.plot(edata_train_so_far, label='train / data') + plt.plot(emodel_train_so_far, label='train / model') + plt.title("energy") + plt.legend() + plt.draw() + + # plot some samples + plt.figure(2) + plt.clf() + plt.imshow(vdata[0][0].reshape((28, 28))) + plt.draw() + plt.figure(3) + plt.clf() + plt.imshow(vmodel[0][0].reshape((28, 28))) + plt.draw() + + + print "Epoch %d" % epoch + print "training set: MSE = %.6f, data energy = %.2f, model energy = %.2f" % (mse_train, edata_train, emodel_train) + print "validation set: MSE = %.6f, data energy = %.2f, model energy = %.2f" % (mse_valid, edata_valid, emodel_valid) + + + + diff --git a/test_mnist_labeled.py b/test_mnist_labeled.py new file mode 100644 index 0000000..16db68e --- /dev/null +++ b/test_mnist_labeled.py @@ -0,0 +1,204 @@ +import morb +from morb import rbms, stats_collectors, param_updaters, trainers, monitors, units, parameters + +import theano 
+import theano.tensor as T + +import numpy as np + +import gzip, cPickle + +import matplotlib.pyplot as plt +plt.ion() + +from test_utils import generate_data, get_context, one_hot + +# DEBUGGING + +from theano import ProfileMode +# mode = theano.ProfileMode(optimizer='fast_run', linker=theano.gof.OpWiseCLinker()) +# mode = theano.compile.DebugMode(check_py_code=False, require_matching_strides=False) +mode = None + + +# load data +print ">> Loading dataset..." + +f = gzip.open('mnist.pkl.gz','rb') +train_set, valid_set, test_set = cPickle.load(f) +f.close() + +train_set_x, train_set_y = train_set +valid_set_x, valid_set_y = valid_set +test_set_x, test_set_y = test_set + +# convert labels to one hot representation +train_set_y_oh = one_hot(np.atleast_2d(train_set_y).T) +valid_set_y_oh = one_hot(np.atleast_2d(valid_set_y).T) +test_set_y_oh = one_hot(np.atleast_2d(test_set_y).T) + +# dim 0 = minibatches, dim 1 = units, dim 2 = states +train_set_y_oh = train_set_y_oh.reshape((train_set_y_oh.shape[0], 1, train_set_y_oh.shape[1])) +valid_set_y_oh = valid_set_y_oh.reshape((valid_set_y_oh.shape[0], 1, valid_set_y_oh.shape[1])) +test_set_y_oh = test_set_y_oh.reshape((test_set_y_oh.shape[0], 1, test_set_y_oh.shape[1])) + + +# make the sets a bit smaller for testing purposes +train_set_x = train_set_x[:10000] +train_set_y_oh = train_set_y_oh[:10000] +valid_set_x = valid_set_x[:1000] +valid_set_y_oh = valid_set_y_oh[:1000] + + + + +n_visible = train_set_x.shape[1] +n_hidden = 100 +n_states = train_set_y_oh.shape[2] + + +print ">> Constructing RBM..." +rbm = rbms.SigmoidBinaryRBM(n_visible, n_hidden) + +# add softmax unit for context +rbm.s = units.SoftmaxUnits(rbm, name='s') + +# link context and hiddens +initial_Ws = np.asarray( np.random.uniform( + low = -4*np.sqrt(6./(n_hidden+1+n_states)), + high = 4*np.sqrt(6./(n_hidden+1+n_states)), + size = (1, n_states, n_hidden)), + dtype = theano.config.floatX) +rbm.Ws = parameters.AdvancedProdParameters(rbm, [rbm.s, rbm.h], [2, 1], theano.shared(value = initial_Ws, name='Ws'), name='Ws') + +# try to calculate weight updates using CD-1 stats +print ">> Constructing contrastive divergence updaters..." +sc = stats_collectors.CDkStatsCollector(rbm, input_units=[rbm.v], latent_units=[rbm.h], context_units=[rbm.s], k=1) + +umap = {} +for params in rbm.params_list: + # pu = 0.001 * (param_updaters.CDParamUpdater(params, sc) + 0.02 * param_updaters.DecayParamUpdater(params)) + pu = 0.001 * param_updaters.CDParamUpdater(params, sc) + umap[params] = pu + +print ">> Compiling functions..." 
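+# Because rbm.s is registered as context_units in the stats collector above, the
+# one-hot label is presumably clamped during CD rather than sampled: the RBM over
+# (v, h) is trained conditioned on the class. This is also why sample_evolution
+# below feeds a fixed one-hot label alongside the image it is resampling.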
+t = trainers.MinibatchTrainer(rbm, umap) +m = monitors.ReconstructionMSEMonitor(sc, rbm.v) +m_data = monitors.DataMonitor(sc, rbm.v) +m_model = monitors.ReconstructionMonitor(sc, rbm.v) +e_data = monitors.DataEnergyMonitor(sc, rbm) +e_model = monitors.ReconstructionEnergyMonitor(sc, rbm) + +initial_vmap = { rbm.v: T.matrix('v'), rbm.s: T.tensor3('s') } + +# train = t.compile_function(initial_vmap, mb_size=32, monitors=[m], name='train', mode=mode) +train = t.compile_function(initial_vmap, mb_size=100, monitors=[m, e_data, e_model], name='train', mode=mode) +evaluate = t.compile_function(initial_vmap, mb_size=100, monitors=[m, m_data, m_model, e_data, e_model], name='evaluate', update=False, mode=mode) + + + + + + +def plot_data(d): + plt.figure(5) + plt.clf() + plt.imshow(d.reshape((28,28)), interpolation='gaussian') + plt.draw() + + +def sample_evolution(start, cls, ns=100): # start = start data + sample = t.compile_function(initial_vmap, mb_size=1, monitors=[m_model], name='evaluate', update=False, mode=mode) + + data = start + plot_data(data) + + label = one_hot(np.atleast_2d(cls), dim=10) + label = label.reshape((label.shape[0], 1, label.shape[1])) + + + while True: + for k in range(ns): + for x in sample({ rbm.v: data, rbm.s: label }): # draw a new sample + data = x[0] + + plot_data(data) + + + + + + + + + + +# TRAINING + +epochs = 200 +print ">> Training for %d epochs..." % epochs + +mses_train_so_far = [] +mses_valid_so_far = [] +edata_train_so_far = [] +emodel_train_so_far = [] +edata_so_far = [] +emodel_so_far = [] + +for epoch in range(epochs): + monitoring_data_train = [(cost, energy_data, energy_model) for cost, energy_data, energy_model in train({ rbm.v: train_set_x, rbm.s: train_set_y_oh })] + mses_train, edata_train_list, emodel_train_list = zip(*monitoring_data_train) + mse_train = np.mean(mses_train) + edata_train = np.mean(edata_train_list) + emodel_train = np.mean(emodel_train_list) + + monitoring_data = [(cost, data, model, energy_data, energy_model) for cost, data, model, energy_data, energy_model in evaluate({ rbm.v: valid_set_x, rbm.s: valid_set_y_oh })] + mses_valid, vdata, vmodel, edata, emodel = zip(*monitoring_data) + mse_valid = np.mean(mses_valid) + edata_valid = np.mean(edata) + emodel_valid = np.mean(emodel) + + # plotting + mses_train_so_far.append(mse_train) + mses_valid_so_far.append(mse_valid) + edata_so_far.append(edata_valid) + emodel_so_far.append(emodel_valid) + edata_train_so_far.append(edata_train) + emodel_train_so_far.append(emodel_train) + + plt.figure(1) + plt.clf() + plt.plot(mses_train_so_far, label='train') + plt.plot(mses_valid_so_far, label='validation') + plt.title("MSE") + plt.legend() + plt.draw() + + plt.figure(4) + plt.clf() + plt.plot(edata_so_far, label='validation / data') + plt.plot(emodel_so_far, label='validation / model') + plt.plot(edata_train_so_far, label='train / data') + plt.plot(emodel_train_so_far, label='train / model') + plt.title("energy") + plt.legend() + plt.draw() + + # plot some samples + plt.figure(2) + plt.clf() + plt.imshow(vdata[0][0].reshape((28, 28))) + plt.draw() + plt.figure(3) + plt.clf() + plt.imshow(vmodel[0][0].reshape((28, 28))) + plt.draw() + + + print "Epoch %d" % epoch + print "training set: MSE = %.6f, data energy = %.2f, model energy = %.2f" % (mse_train, edata_train, emodel_train) + print "validation set: MSE = %.6f, data energy = %.2f, model energy = %.2f" % (mse_valid, edata_valid, emodel_valid) + + + + diff --git a/test_sparsity.py b/test_sparsity.py new file mode 100644 index 
0000000..adb20dd --- /dev/null +++ b/test_sparsity.py @@ -0,0 +1,170 @@ +import morb +from morb import rbms, stats_collectors, param_updaters, trainers, monitors + +import theano +import theano.tensor as T + +import numpy as np + +import gzip, cPickle + +import matplotlib.pyplot as plt +plt.ion() + +from test_utils import generate_data, get_context, plot_data + +# DEBUGGING + +from theano import ProfileMode +# mode = theano.ProfileMode(optimizer='fast_run', linker=theano.gof.OpWiseCLinker()) +# mode = theano.compile.DebugMode(check_py_code=False, require_matching_strides=False) +mode = None + + +# load data +print ">> Loading dataset..." + +f = gzip.open('mnist.pkl.gz','rb') +train_set, valid_set, test_set = cPickle.load(f) +f.close() + +train_set_x, train_set_y = train_set +valid_set_x, valid_set_y = valid_set +test_set_x, test_set_y = test_set + + +# TODO DEBUG +train_set_x = train_set_x[:1000] +valid_set_x = valid_set_x[:100] + + +n_visible = train_set_x.shape[1] +n_hidden = 100 + + +print ">> Constructing RBM..." +rbm = rbms.SigmoidBinaryRBM(n_visible, n_hidden) + +# try to calculate weight updates using CD-1 stats +print ">> Constructing contrastive divergence updaters..." +sc = stats_collectors.CDkStatsCollector(rbm, input_units=[rbm.v], latent_units=[rbm.h], k=1) + +sparsity_targets = { rbm.h: 0.1 } + + +eta = 0.001 # learning rate +sparsity_cost = 0.5 + +umap = {} + +umap[rbm.W] = eta * param_updaters.CDParamUpdater(rbm.W, sc) \ + + eta * sparsity_cost * param_updaters.SparsityParamUpdater(rbm.W, sparsity_targets, sc) +umap[rbm.bh] = eta * param_updaters.CDParamUpdater(rbm.bh, sc) \ + + eta * sparsity_cost * param_updaters.SparsityParamUpdater(rbm.bh, sparsity_targets, sc) +umap[rbm.bv] = eta * param_updaters.CDParamUpdater(rbm.bv, sc) + +""" +for params in rbm.params_list: + # pu = 0.001 * (param_updaters.CDParamUpdater(params, sc) + 0.02 * param_updaters.DecayParamUpdater(params)) + pu = 0.001 * param_updaters.CDParamUpdater(params, sc) + umap[params] = pu +""" + +print ">> Compiling functions..." +t = trainers.MinibatchTrainer(rbm, umap) +m = monitors.ReconstructionMSEMonitor(sc, rbm.v) +m_model = monitors.ReconstructionMonitor(sc, rbm.h) + +initial_vmap = { rbm.v: T.matrix('v') } + +# train = t.compile_function(initial_vmap, mb_size=32, monitors=[m], name='train', mode=mode) +train = t.compile_function(initial_vmap, mb_size=100, monitors=[m, m_model], name='train', mode=mode) +evaluate = t.compile_function(initial_vmap, mb_size=100, monitors=[m, m_model], name='evaluate', update=False, mode=mode) + + + + +def sample_evolution(start, ns=100): # start = start data + sample = t.compile_function(initial_vmap, mb_size=1, monitors=[m_model], name='evaluate', update=False, mode=mode) + + data = start + plot_data(data) + + + while True: + for k in range(ns): + for x in sample({ rbm.v: data }): # draw a new sample + data = x[0] + + plot_data(data) + + + + + + + + +# TRAINING + +epochs = 200 +print ">> Training for %d epochs..." 
% epochs + +mses_train_so_far = [] +mses_valid_so_far = [] +mact_train_so_far = [] +mact_valid_so_far = [] + +for epoch in range(epochs): + monitoring_data_train = [(cost, m_model) for cost, m_model in train({ rbm.v: train_set_x })] + mses_train, m_model_train_list = zip(*monitoring_data_train) + mse_train = np.mean(mses_train) + mean_activation_train = np.mean([np.mean(m) for m in m_model_train_list]) + + monitoring_data = [(cost, m_model) for cost, m_model in evaluate({ rbm.v: valid_set_x })] + mses_valid, m_model_valid_list = zip(*monitoring_data) + mse_valid = np.mean(mses_valid) + mean_activation_valid = np.mean([np.mean(m) for m in m_model_valid_list]) + + # plotting + mses_train_so_far.append(mse_train) + mses_valid_so_far.append(mse_valid) + mact_train_so_far.append(mean_activation_train) + mact_valid_so_far.append(mean_activation_valid) + + plt.figure(1) + plt.clf() + plt.plot(mses_train_so_far, label='train') + plt.plot(mses_valid_so_far, label='validation') + plt.title("MSE") + plt.legend() + plt.draw() + + plt.figure(4) + plt.clf() + plt.plot(mact_train_so_far, label='train') + plt.plot(mact_valid_so_far, label='validation') + plt.title("Mean activation of hiddens") + plt.legend() + plt.draw() + + """ + # plot some samples + plt.figure(2) + plt.clf() + plt.imshow(vdata[0][0].reshape((28, 28))) + plt.draw() + plt.figure(3) + plt.clf() + plt.imshow(vmodel[0][0].reshape((28, 28))) + plt.draw() + """ + + + print "Epoch %d" % epoch + print "training set: MSE = %.6f, mean hidden activation = %.6f" % (mse_train, mean_activation_train) + print "validation set: MSE = %.6f, mean hidden activation = %.6f" % (mse_valid, mean_activation_valid) + + + + diff --git a/test_utils.py b/test_utils.py new file mode 100644 index 0000000..c2b2622 --- /dev/null +++ b/test_utils.py @@ -0,0 +1,57 @@ +import numpy as np +import matplotlib.pyplot as plt + + +def generate_data(N): + """Creates a noisy dataset with some simple pattern in it.""" + T = N * 38 + u = np.mat(np.zeros((T, 20))) + for i in range(1, T, 38): + if i % 76 == 1: + u[i - 1:i + 19, :] = np.eye(20) + u[i + 18:i + 38, :] = np.eye(20)[np.arange(19, -1, -1)] + u[i - 1:i + 19, :] += np.eye(20)[np.arange(19, -1, -1)] + else: + u[i - 1:i + 19, 1] = 1 + u[i + 18:i + 38, 8] = 1 + return u + +def get_context(u, N=4): + T, D = u.shape + x = np.zeros((T, D * N)) + for i in range(N - 1, T): + dat = u[i - 1, :] + for j in range(2, N + 1): + dat = np.concatenate((dat, u[i - j, :]), 1) + x[i, :] = dat + return x + + + + +def plot_data(d): + plt.figure(5) + plt.clf() + plt.imshow(d.reshape((28,28)), interpolation='gaussian') + plt.draw() + + + + +def one_hot(vec, dim=None): + """ + Convert a column vector with indices (normalised) to a one-hot representation. + Each row is a one-hot vector corresponding to an element in the original vector. + """ + length = len(vec) + + if dim is None: # default dimension is the maximal dimension needed to represent 'vec' + dim = np.max(vec) + 1 + + m = np.tile(np.arange(dim), (length, 1)) + return (vec == m) + + + + +
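+# Worked example for one_hot (illustrative only): with a column vector of class
+# indices, one_hot(np.array([[0], [2]]), dim=3) returns the boolean matrix
+# [[True, False, False], [False, False, True]]. The result stays boolean; the
+# MinibatchTrainer casts datasets to theano.config.floatX before use, so the
+# labeled MNIST test can pass it in directly.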