fix to dropout scaling
This commit is contained in:
parent 9969b47ba4
commit ca8c429973
@@ -113,18 +113,18 @@ class MLP(object):
         if p_inp < 1:
             d_inp = self.rng.binomial(1, p_inp, size=x.shape)
-            self.activations[0] = p_inp_scaler*d_inp*x
+            self.activations[0] = p_inp_scaler*d_inp*x #it's OK to scale the inputs by p_inp_scaler here
         self.activations[1] = self.layers[0].fprop(self.activations[0])
         for i in xrange(1, len(self.layers)):
             d_hid = 1
             if p_hid < 1:
                 d_hid = self.rng.binomial(1, p_hid, size=self.activations[i].shape)
-                self.activations[i] *= p_hid_scaler * d_hid
-            self.activations[i+1] = self.layers[i].fprop(self.activations[i])
+                self.activations[i] *= d_hid #but not the hidden activations, since the non-linearity grad *may* explicitly depend on them
+            self.activations[i+1] = self.layers[i].fprop(p_hid_scaler*self.activations[i])

         return self.activations[-1]

-    def bprop(self, cost_grad):
+    def bprop(self, cost_grad, dp_scheduler=None):
         """
         :param cost_grad: matrix -- grad of the cost w.r.t y
         :return: None, the deltas are kept in the model
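The forward-pass change above keeps self.activations[i] masked but unscaled: a layer's bprop may compute the non-linearity gradient directly from the stored activation (e.g. h*(1-h) for a sigmoid), so the 1/p_hid factor is applied only to the copy passed into the next layer's fprop. A minimal standalone sketch of that distinction, using plain numpy and a sigmoid assumed purely for illustration (none of these names come from the repo):

import numpy as np

rng = np.random.RandomState(0)
p_hid = 0.5                          # keep-probability for hidden units
p_hid_scaler = 1.0 / p_hid           # "inverted dropout" scale factor

h = rng.uniform(0.1, 0.9, size=(4,))           # pretend these are sigmoid outputs
d_hid = rng.binomial(1, p_hid, size=h.shape)   # binary dropout mask

h_stored = d_hid * h                 # kept for bprop: true activation, only masked
h_next_in = p_hid_scaler * h_stored  # scaled copy fed to the next layer's fprop

# the sigmoid derivative must be taken at the true activation value;
# pre-scaling the stored activation would give (h/p)*(1 - h/p) instead of h*(1-h)
sigmoid_grad = h_stored * (1.0 - h_stored)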
@@ -144,10 +144,15 @@ class MLP(object):
         self.deltas[top_layer_idx], ograds = self.layers[top_layer_idx - 1].\
             bprop_cost(self.activations[top_layer_idx], cost_grad, self.cost)

+        p_hid_scaler = 1.0
+        if dp_scheduler is not None:
+            p_inp, p_hid = dp_scheduler.get_rate()
+            p_hid_scaler /= p_hid
+
         # then back-prop through remaining layers
         for i in xrange(top_layer_idx - 1, 0, -1):
             self.deltas[i], ograds = self.layers[i - 1].\
-                bprop(self.activations[i], ograds)
+                bprop(self.activations[i], ograds*p_hid_scaler)

     def add_layer(self, layer):
         self.layers.append(layer)
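The bprop change mirrors the forward pass: the scheduler's keep-probability gives p_hid_scaler = 1/p_hid, and the gradient handed down to each layer is ograds*p_hid_scaler, because scaling a layer's input by s in fprop scales the gradient w.r.t. the unscaled activation by the same s. A standalone chain-rule check of that claim (plain numpy, hypothetical names, not the repo's layer API):

import numpy as np

rng = np.random.RandomState(1)
W = rng.randn(3, 4)
h = rng.randn(4)        # masked, unscaled activation
delta = rng.randn(3)    # gradient arriving from the layer above
s = 2.0                 # plays the role of p_hid_scaler = 1 / p_hid

# forward: y = W @ (s*h);  backward: dL/dh = s * (W.T @ delta)
grad_scaled_input = W.T @ delta       # gradient w.r.t. the scaled input (s*h)
grad_h = s * grad_scaled_input        # gradient w.r.t. h itself

# finite-difference check on one coordinate
def loss(hv):
    return float(delta @ (W @ (s * hv)))

eps = 1e-6
h_eps = h.copy()
h_eps[0] += eps
assert abs((loss(h_eps) - loss(h)) / eps - grad_h[0]) < 1e-4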
@@ -124,7 +124,7 @@ class SGDOptimiser(Optimiser):
         cost_grad = model.cost.grad(y, t)

         # do backward pass through the model
-        model.bprop(cost_grad)
+        model.bprop(cost_grad, self.dp_scheduler)

         #update the model, here we iterate over layers
         #and then over each parameter in the layer
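Finally, the optimiser just forwards its self.dp_scheduler into model.bprop. The only interface the diff relies on is get_rate() returning a (p_inp, p_hid) pair of keep-probabilities; a hypothetical constant-rate scheduler with that shape (the repo's own scheduler class is not shown here) could look like:

class ConstantDropoutScheduler(object):
    """Illustrative stand-in for whatever dp_scheduler the repo provides."""

    def __init__(self, p_inp=0.8, p_hid=0.5):
        self.p_inp = p_inp   # keep-probability for input units
        self.p_hid = p_hid   # keep-probability for hidden units

    def get_rate(self):
        # fprop/bprop expect a (p_inp, p_hid) tuple of keep-probabilities
        return self.p_inp, self.p_hid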