diff --git a/mlp/errors.py b/mlp/errors.py
index 5ef95f7..8412d4c 100644
--- a/mlp/errors.py
+++ b/mlp/errors.py
@@ -154,7 +154,9 @@ class CrossEntropySoftmaxError(object):
         Returns:
             Scalar error function value.
         """
-        probs = np.exp(outputs)
+        # subtract max inside exponential to improve numerical stability -
+        # when we divide through by sum this term cancels
+        probs = np.exp(outputs - outputs.max(-1)[:, None])
         probs /= probs.sum(-1)[:, None]
         return -np.mean(np.sum(targets * np.log(probs), axis=1))
 
diff --git a/mlp/layers.py b/mlp/layers.py
index efb081a..9b7996f 100644
--- a/mlp/layers.py
+++ b/mlp/layers.py
@@ -341,7 +341,9 @@ class SoftmaxLayer(Layer):
         Returns:
             outputs: Array of layer outputs of shape (batch_size, output_dim).
         """
-        exp_inputs = np.exp(inputs)
+        # subtract max inside exponential to improve numerical stability -
+        # when we divide through by sum this term cancels
+        exp_inputs = np.exp(inputs - inputs.max(-1)[:, None])
         return exp_inputs / exp_inputs.sum(-1)[:, None]
 
     def bprop(self, inputs, outputs, grads_wrt_outputs):
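For context, a minimal standalone sketch of the max-subtraction trick this diff applies (the stable_softmax name and the example logits below are illustrative, not part of the patch): softmax is invariant to shifting all logits by a constant, since the exp of the shift cancels when dividing through by the sum, so subtracting the row-wise max pins the largest exponent at exp(0) = 1 and prevents np.exp from overflowing.

import numpy as np

def stable_softmax(x):
    # shifting the logits by any per-row constant m leaves softmax unchanged:
    # exp(x - m) / sum(exp(x - m)) == exp(x) / sum(exp(x))
    # using m = row max makes the largest exponent exp(0) = 1, so np.exp
    # cannot overflow to inf no matter how large the logits are
    exps = np.exp(x - x.max(-1)[:, None])
    return exps / exps.sum(-1)[:, None]

logits = np.array([[1000.0, 1000.0, 999.0]])
# the naive version, np.exp(logits) / np.exp(logits).sum(-1)[:, None],
# evaluates inf / inf = nan here; the shifted version stays finite
print(stable_softmax(logits))  # approximately [[0.4223, 0.4223, 0.1554]]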