From ca8c429973cd89cee53b064f6c63a6b609ebe8ef Mon Sep 17 00:00:00 2001
From: pswietojanski
Date: Sun, 13 Dec 2015 19:53:10 +0000
Subject: [PATCH] Fix dropout scaling
---
 mlp/layers.py     | 15 ++++++++++-----
 mlp/optimisers.py |  2 +-
 2 files changed, 11 insertions(+), 6 deletions(-)
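Notes (ignored by git am): previously the stored hidden activations were rescaled by 1/p_hid in place, so the non-linearity gradients evaluated in bprop no longer saw the values that were actually forward-propagated. This patch keeps the stored hidden activations masked but unscaled, applies the 1/p_hid factor only to what is fed into the next layer's fprop, and compensates in bprop by scaling the incoming ograds by the same factor; the input layer can still be rescaled in place because no non-linearity gradient is taken there. A rough numpy-only sketch of the forward-pass convention (the keep probabilities p_inp/p_hid and the per-layer fprop calls mirror the code below; dropout_fprop, layer_fprops and rng are illustrative stand-ins, not names from the repo):

    import numpy

    rng = numpy.random.RandomState(42)

    def dropout_fprop(x, layer_fprops, p_inp=0.8, p_hid=0.5):
        # layer_fprops: list of per-layer fprop callables, an illustrative
        # stand-in for self.layers[i].fprop; returns the stored activations
        activations = [None] * (len(layer_fprops) + 1)
        # inputs: mask and rescale by 1/p_inp immediately -- safe, since no
        # non-linearity gradient is ever evaluated at the input layer
        d_inp = rng.binomial(1, p_inp, size=x.shape)
        activations[0] = (1.0 / p_inp) * d_inp * x
        activations[1] = layer_fprops[0](activations[0])
        for i in range(1, len(layer_fprops)):
            # hidden units: store only the masked (unscaled) activation, since
            # the layer's bprop may evaluate the non-linearity grad at it; the
            # 1/p_hid rescaling is applied only to what the next layer sees
            d_hid = rng.binomial(1, p_hid, size=activations[i].shape)
            activations[i] *= d_hid
            activations[i + 1] = layer_fprops[i]((1.0 / p_hid) * activations[i])
        return activations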
diff --git a/mlp/layers.py b/mlp/layers.py
index f87dab2..1e46058 100644
--- a/mlp/layers.py
+++ b/mlp/layers.py
@@ -113,18 +113,18 @@ class MLP(object):
         if p_inp < 1:
             d_inp = self.rng.binomial(1, p_inp, size=x.shape)
 
-        self.activations[0] = p_inp_scaler*d_inp*x
+        self.activations[0] = p_inp_scaler*d_inp*x  # it's OK to scale the inputs by p_inp_scaler here
         self.activations[1] = self.layers[0].fprop(self.activations[0])
         for i in xrange(1, len(self.layers)):
             d_hid = 1
             if p_hid < 1:
                 d_hid = self.rng.binomial(1, p_hid, size=self.activations[i].shape)
-            self.activations[i] *= p_hid_scaler * d_hid
-            self.activations[i+1] = self.layers[i].fprop(self.activations[i])
+            self.activations[i] *= d_hid  # but not the hidden activations, since the non-linearity grad *may* explicitly depend on them
+            self.activations[i+1] = self.layers[i].fprop(p_hid_scaler*self.activations[i])
 
         return self.activations[-1]
 
-    def bprop(self, cost_grad):
+    def bprop(self, cost_grad, dp_scheduler=None):
         """
         :param cost_grad: matrix -- grad of the cost w.r.t y
         :return: None, the deltas are kept in the model
@@ -144,10 +144,15 @@ class MLP(object):
         self.deltas[top_layer_idx], ograds = self.layers[top_layer_idx - 1].\
             bprop_cost(self.activations[top_layer_idx], cost_grad, self.cost)
 
+        p_hid_scaler = 1.0
+        if dp_scheduler is not None:
+            p_inp, p_hid = dp_scheduler.get_rate()
+            p_hid_scaler /= p_hid
+
         # then back-prop through remaining layers
         for i in xrange(top_layer_idx - 1, 0, -1):
             self.deltas[i], ograds = self.layers[i - 1].\
-                bprop(self.activations[i], ograds)
+                bprop(self.activations[i], ograds*p_hid_scaler)
 
     def add_layer(self, layer):
         self.layers.append(layer)
diff --git a/mlp/optimisers.py b/mlp/optimisers.py
index 5078821..3200238 100644
--- a/mlp/optimisers.py
+++ b/mlp/optimisers.py
@@ -124,7 +124,7 @@ class SGDOptimiser(Optimiser):
             cost_grad = model.cost.grad(y, t)
 
             # do backward pass through the model
-            model.bprop(cost_grad)
+            model.bprop(cost_grad, self.dp_scheduler)
 
             #update the model, here we iterate over layers
             #and then over each parameter in the layer
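For reference, the backward-pass counterpart of the scaling reduces to the sketch below (dp_scheduler.get_rate() and the (p_inp, p_hid) pair it returns are the only names taken from the code above; the helper itself is hypothetical):

    def scale_ograds(ograds, dp_scheduler=None):
        # gradients handed down to a dropped-out layer get the same 1/p_hid
        # factor that fprop applied on the way up, keeping the two passes
        # consistent; with no scheduler this is a no-op
        p_hid_scaler = 1.0
        if dp_scheduler is not None:
            p_inp, p_hid = dp_scheduler.get_rate()
            p_hid_scaler /= p_hid
        return ograds * p_hid_scaler

On the optimiser side the only change needed is to hand the dropout scheduler to the model, i.e. model.bprop(cost_grad, self.dp_scheduler), so bprop can recover p_hid without storing extra state on the model.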