diff --git a/mlp/errors.py b/mlp/errors.py
index 5ef95f7..8412d4c 100644
--- a/mlp/errors.py
+++ b/mlp/errors.py
@@ -154,7 +154,9 @@ class CrossEntropySoftmaxError(object):
         Returns:
             Scalar error function value.
         """
-        probs = np.exp(outputs)
+        # subtract max inside exponential to improve numerical stability -
+        # when we divide through by sum this term cancels
+        probs = np.exp(outputs - outputs.max(-1)[:, None])
         probs /= probs.sum(-1)[:, None]
         return -np.mean(np.sum(targets * np.log(probs), axis=1))
 
diff --git a/mlp/layers.py b/mlp/layers.py
index efb081a..9b7996f 100644
--- a/mlp/layers.py
+++ b/mlp/layers.py
@@ -341,7 +341,9 @@ class SoftmaxLayer(Layer):
         Returns:
             outputs: Array of layer outputs of shape (batch_size, output_dim).
         """
-        exp_inputs = np.exp(inputs)
+        # subtract max inside exponential to improve numerical stability -
+        # when we divide through by sum this term cancels
+        exp_inputs = np.exp(inputs - inputs.max(-1)[:, None])
         return exp_inputs / exp_inputs.sum(-1)[:, None]
 
     def bprop(self, inputs, outputs, grads_wrt_outputs):
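For context, a minimal standalone sketch of the max-subtraction trick this diff applies (the stable_softmax name and the example logits below are illustrative, not part of the patch): softmax is invariant to shifting all logits by a constant, since the exp of the shift cancels when dividing through by the sum, so subtracting the row-wise max pins the largest exponent at exp(0) = 1 and prevents np.exp from overflowing.

import numpy as np

def stable_softmax(x):
    # shifting the logits by any per-row constant m leaves softmax unchanged:
    # exp(x - m) / sum(exp(x - m)) == exp(x) / sum(exp(x))
    # using m = row max makes the largest exponent exp(0) = 1, so np.exp
    # cannot overflow to inf no matter how large the logits are
    exps = np.exp(x - x.max(-1)[:, None])
    return exps / exps.sum(-1)[:, None]

logits = np.array([[1000.0, 1000.0, 999.0]])
# the naive version, np.exp(logits) / np.exp(logits).sum(-1)[:, None],
# evaluates inf / inf = nan here; the shifted version stays finite
print(stable_softmax(logits))  # approximately [[0.4223, 0.4223, 0.1554]]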