diff --git a/mlp/costs.py b/mlp/errors.py
similarity index 56%
rename from mlp/costs.py
rename to mlp/errors.py
index bd103b3..5ef95f7 100644
--- a/mlp/costs.py
+++ b/mlp/errors.py
@@ -1,20 +1,22 @@
 # -*- coding: utf-8 -*-
-"""Model costs.
+"""Error functions.
 
-This module defines cost functions, with the aim of model training being to
-minimise the cost function given a set of inputs and target outputs. The cost
-functions typically measure some concept of distance between the model outputs
-and target outputs.
+This module defines error functions, with the aim of model training being to
+minimise the error function given a set of inputs and target outputs.
+
+The error functions will typically measure some concept of distance between the
+model outputs and target outputs, averaged over all data points in the data set
+or batch.
 """
 
 import numpy as np
 
 
-class MeanSquaredErrorCost(object):
-    """Mean squared error cost."""
+class SumOfSquaredDiffsError(object):
+    """Sum of squared differences (squared Euclidean distance) error."""
 
     def __call__(self, outputs, targets):
-        """Calculates cost function given a batch of outputs and targets.
+        """Calculates error function given a batch of outputs and targets.
 
         Args:
             outputs: Array of model outputs of shape (batch_size, output_dim).
             targets: Array of target outputs of shape (batch_size, output_dim).
@@ -26,148 +28,149 @@ class MeanSquaredErrorCost(object):
         return 0.5 * np.mean(np.sum((outputs - targets)**2, axis=1))
 
     def grad(self, outputs, targets):
-        """Calculates gradient of cost function with respect to outputs.
+        """Calculates gradient of error function with respect to outputs.
 
         Args:
             outputs: Array of model outputs of shape (batch_size, output_dim).
             targets: Array of target outputs of shape (batch_size, output_dim).
 
         Returns:
-            Gradient of cost function with respect to outputs.
+            Gradient of error function with respect to outputs.
         """
-        return outputs - targets
+        return (outputs - targets) / outputs.shape[0]
 
     def __repr__(self):
-        return 'MeanSquaredErrorCost'
+        return 'SumOfSquaredDiffsError'
 
 
-class BinaryCrossEntropyCost(object):
-    """Binary cross entropy cost."""
+class BinaryCrossEntropyError(object):
+    """Binary cross entropy error."""
 
     def __call__(self, outputs, targets):
-        """Calculates cost function given a batch of outputs and targets.
+        """Calculates error function given a batch of outputs and targets.
 
         Args:
             outputs: Array of model outputs of shape (batch_size, output_dim).
             targets: Array of target outputs of shape (batch_size, output_dim).
 
         Returns:
-            Scalar cost function value.
+            Scalar error function value.
         """
         return -np.mean(
             targets * np.log(outputs) + (1. - targets) * np.log(1. - outputs))
 
     def grad(self, outputs, targets):
-        """Calculates gradient of cost function with respect to outputs.
+        """Calculates gradient of error function with respect to outputs.
 
         Args:
             outputs: Array of model outputs of shape (batch_size, output_dim).
             targets: Array of target outputs of shape (batch_size, output_dim).
 
         Returns:
-            Gradient of cost function with respect to outputs.
+            Gradient of error function with respect to outputs.
         """
-        return (1. - targets) / (1. - outputs) - (targets / outputs)
+        return ((1. - targets) / (1. - outputs) -
+                (targets / outputs)) / outputs.shape[0]
 
     def __repr__(self):
-        return 'BinaryCrossEntropyCost'
+        return 'BinaryCrossEntropyError'
 
 
-class BinaryCrossEntropySigmoidCost(object):
-    """Binary cross entropy cost with logistic sigmoid applied to outputs."""
+class BinaryCrossEntropySigmoidError(object):
+    """Binary cross entropy error with logistic sigmoid applied to outputs."""
 
     def __call__(self, outputs, targets):
-        """Calculates cost function given a batch of outputs and targets.
+        """Calculates error function given a batch of outputs and targets.
 
         Args:
             outputs: Array of model outputs of shape (batch_size, output_dim).
             targets: Array of target outputs of shape (batch_size, output_dim).
 
         Returns:
-            Scalar cost function value.
+            Scalar error function value.
         """
         probs = 1. / (1. + np.exp(-outputs))
         return -np.mean(
             targets * np.log(probs) + (1. - targets) * np.log(1. - probs))
 
     def grad(self, outputs, targets):
-        """Calculates gradient of cost function with respect to outputs.
+        """Calculates gradient of error function with respect to outputs.
 
         Args:
             outputs: Array of model outputs of shape (batch_size, output_dim).
             targets: Array of target outputs of shape (batch_size, output_dim).
 
         Returns:
-            Gradient of cost function with respect to outputs.
+            Gradient of error function with respect to outputs.
         """
         probs = 1. / (1. + np.exp(-outputs))
-        return probs - targets
+        return (probs - targets) / outputs.shape[0]
 
     def __repr__(self):
-        return 'BinaryCrossEntropySigmoidCost'
+        return 'BinaryCrossEntropySigmoidError'
 
 
-class CrossEntropyCost(object):
-    """Multi-class cross entropy cost."""
+class CrossEntropyError(object):
+    """Multi-class cross entropy error."""
 
     def __call__(self, outputs, targets):
-        """Calculates cost function given a batch of outputs and targets.
+        """Calculates error function given a batch of outputs and targets.
 
         Args:
             outputs: Array of model outputs of shape (batch_size, output_dim).
             targets: Array of target outputs of shape (batch_size, output_dim).
 
         Returns:
-            Scalar cost function value.
+            Scalar error function value.
         """
         return -np.mean(np.sum(targets * np.log(outputs), axis=1))
 
     def grad(self, outputs, targets):
-        """Calculates gradient of cost function with respect to outputs.
+        """Calculates gradient of error function with respect to outputs.
 
         Args:
             outputs: Array of model outputs of shape (batch_size, output_dim).
             targets: Array of target outputs of shape (batch_size, output_dim).
 
         Returns:
-            Gradient of cost function with respect to outputs.
+            Gradient of error function with respect to outputs.
         """
-        return -targets / outputs
+        return -(targets / outputs) / outputs.shape[0]
 
     def __repr__(self):
-        return 'CrossEntropyCost'
+        return 'CrossEntropyError'
 
 
-class CrossEntropySoftmaxCost(object):
-    """Multi-class cross entropy cost with Softmax applied to outputs."""
+class CrossEntropySoftmaxError(object):
+    """Multi-class cross entropy error with Softmax applied to outputs."""
 
     def __call__(self, outputs, targets):
-        """Calculates cost function given a batch of outputs and targets.
+        """Calculates error function given a batch of outputs and targets.
 
         Args:
             outputs: Array of model outputs of shape (batch_size, output_dim).
             targets: Array of target outputs of shape (batch_size, output_dim).
 
         Returns:
-            Scalar cost function value.
+            Scalar error function value.
         """
         probs = np.exp(outputs)
         probs /= probs.sum(-1)[:, None]
         return -np.mean(np.sum(targets * np.log(probs), axis=1))
 
     def grad(self, outputs, targets):
-        """Calculates gradient of cost function with respect to outputs.
+        """Calculates gradient of error function with respect to outputs.
 
         Args:
             outputs: Array of model outputs of shape (batch_size, output_dim).
             targets: Array of target outputs of shape (batch_size, output_dim).
 
         Returns:
-            Gradient of cost function with respect to outputs.
+            Gradient of error function with respect to outputs.
         """
         probs = np.exp(outputs)
         probs /= probs.sum(-1)[:, None]
-        return probs - targets
+        return (probs - targets) / outputs.shape[0]
 
     def __repr__(self):
-        return 'CrossEntropySoftmaxCost'
+        return 'CrossEntropySoftmaxError'
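Note on the change above: besides the cost-to-error renaming, every grad method now divides by outputs.shape[0], so it returns the gradient of the batch-averaged error rather than of a per-example sum. A minimal finite-difference check of that scaling, assuming the renamed mlp.errors module is importable, could look like the following sketch; the same check applies to the other error classes provided the outputs are valid for them (e.g. probabilities for the cross entropy errors).

    import numpy as np
    from mlp.errors import SumOfSquaredDiffsError

    error = SumOfSquaredDiffsError()
    rng = np.random.RandomState(1234)
    outputs = rng.normal(size=(5, 3))
    targets = rng.normal(size=(5, 3))

    # Central finite-difference estimate of d(error)/d(outputs); it should match
    # grad, including the 1 / batch_size factor introduced in the diff above.
    eps = 1e-6
    num_grad = np.empty_like(outputs)
    for idx in np.ndindex(*outputs.shape):
        plus, minus = outputs.copy(), outputs.copy()
        plus[idx] += eps
        minus[idx] -= eps
        num_grad[idx] = (error(plus, targets) - error(minus, targets)) / (2 * eps)

    assert np.allclose(num_grad, error.grad(outputs, targets), atol=1e-6)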
+ """Calculates gradient of error function with respect to outputs. Args: outputs: Array of model outputs of shape (batch_size, output_dim). targets: Array of target outputs of shape (batch_size, output_dim). Returns: - Gradient of cost function with respect to outputs. + Gradient of error function with respect to outputs. """ probs = np.exp(outputs) probs /= probs.sum(-1)[:, None] - return probs - targets + return (probs - targets) / outputs.shape[0] def __repr__(self): - return 'CrossEntropySoftmaxCost' + return 'CrossEntropySoftmaxError' diff --git a/mlp/learning_rules.py b/mlp/learning_rules.py index 4156c23..22f2bcb 100644 --- a/mlp/learning_rules.py +++ b/mlp/learning_rules.py @@ -10,16 +10,17 @@ import numpy as np class GradientDescentLearningRule(object): """Simple (stochastic) gradient descent learning rule. - For a scalar loss function `L(p[0], p_[1] ... )` of some set of potentially - multidimensional parameters this attempts to find a local minimum of the - loss function by applying updates to each parameter of the form + For a scalar error function `E(p[0], p_[1] ... )` of some set of + potentially multidimensional parameters this attempts to find a local + minimum of the loss function by applying updates to each parameter of the + form - p[i] := p[i] - learning_rate * dL/dp[i] + p[i] := p[i] - learning_rate * dE/dp[i] With `learning_rate` a positive scaling parameter. - The loss function used in successive applications of these updates may be a - stochastic estimator of the true loss function (e.g. when the loss with + The error function used in successive applications of these updates may be + a stochastic estimator of the true error function (e.g. when the error with respect to only a subset of data-points is calculated) in which case this will correspond to a stochastic gradient descent learning rule. """ diff --git a/mlp/optimisers.py b/mlp/optimisers.py index 4ce9e4d..123c3a5 100644 --- a/mlp/optimisers.py +++ b/mlp/optimisers.py @@ -17,30 +17,30 @@ logger = logging.getLogger(__name__) class Optimiser(object): """Basic model optimiser.""" - def __init__(self, model, cost, learning_rule, train_dataset, + def __init__(self, model, error, learning_rule, train_dataset, valid_dataset=None, data_monitors=None): """Create a new optimiser instance. Args: model: The model to optimise. - cost: The scalar cost function to minimise. + error: The scalar error function to minimise. learning_rule: Gradient based learning rule to use to minimise - cost. + error. train_dataset: Data provider for training set data batches. valid_dataset: Data provider for validation set data batches. data_monitors: Dictionary of functions evaluated on targets and model outputs (averaged across both full training and validation data sets) to monitor during training in addition - to the cost. Keys should correspond to a string label for + to the error. Keys should correspond to a string label for the statistic being evaluated. """ self.model = model - self.cost = cost + self.error = error self.learning_rule = learning_rule self.learning_rule.initialise(self.model.params) self.train_dataset = train_dataset self.valid_dataset = valid_dataset - self.data_monitors = OrderedDict([('cost', cost)]) + self.data_monitors = OrderedDict([('error', error)]) if data_monitors is not None: self.data_monitors.update(data_monitors) @@ -48,13 +48,13 @@ class Optimiser(object): """Do a single training epoch. 
diff --git a/mlp/optimisers.py b/mlp/optimisers.py
index 4ce9e4d..123c3a5 100644
--- a/mlp/optimisers.py
+++ b/mlp/optimisers.py
@@ -17,30 +17,30 @@ logger = logging.getLogger(__name__)
 class Optimiser(object):
     """Basic model optimiser."""
 
-    def __init__(self, model, cost, learning_rule, train_dataset,
+    def __init__(self, model, error, learning_rule, train_dataset,
                  valid_dataset=None, data_monitors=None):
         """Create a new optimiser instance.
 
         Args:
             model: The model to optimise.
-            cost: The scalar cost function to minimise.
+            error: The scalar error function to minimise.
             learning_rule: Gradient based learning rule to use to minimise
-                cost.
+                error.
             train_dataset: Data provider for training set data batches.
             valid_dataset: Data provider for validation set data batches.
             data_monitors: Dictionary of functions evaluated on targets and
                 model outputs (averaged across both full training and
                 validation data sets) to monitor during training in addition
-                to the cost. Keys should correspond to a string label for
+                to the error. Keys should correspond to a string label for
                 the statistic being evaluated.
         """
         self.model = model
-        self.cost = cost
+        self.error = error
         self.learning_rule = learning_rule
         self.learning_rule.initialise(self.model.params)
         self.train_dataset = train_dataset
         self.valid_dataset = valid_dataset
-        self.data_monitors = OrderedDict([('cost', cost)])
+        self.data_monitors = OrderedDict([('error', error)])
         if data_monitors is not None:
             self.data_monitors.update(data_monitors)
 
@@ -48,13 +48,13 @@ class Optimiser(object):
         """Do a single training epoch.
 
         This iterates through all batches in training dataset, for each
-        calculating the gradient of the estimated loss given the batch with
+        calculating the gradient of the estimated error given the batch with
         respect to all the model parameters and then updates the model
         parameters according to the learning rule.
         """
         for inputs_batch, targets_batch in self.train_dataset:
             activations = self.model.fprop(inputs_batch)
-            grads_wrt_outputs = self.cost.grad(activations[-1], targets_batch)
+            grads_wrt_outputs = self.error.grad(activations[-1], targets_batch)
             grads_wrt_params = self.model.grads_wrt_params(
                 activations, grads_wrt_outputs)
             self.learning_rule.update_params(grads_wrt_params)
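For orientation, the epoch loop above chains error.grad into the model's backward pass. A rough standalone sketch of one such update for a single affine layer with the sum-of-squared-differences error (plain NumPy with made-up shapes, standing in for the actual mlp model, error and learning rule objects, which this diff does not show in full):

    import numpy as np

    rng = np.random.RandomState(0)
    inputs_batch = rng.normal(size=(10, 4))    # (batch_size, input_dim)
    targets_batch = rng.normal(size=(10, 2))   # (batch_size, output_dim)
    weights = rng.normal(size=(2, 4)) * 0.1
    biases = np.zeros(2)
    learning_rate = 0.01

    # Forward propagation through one affine layer (stand-in for model.fprop).
    outputs = inputs_batch.dot(weights.T) + biases

    # Gradient of the batch-averaged sum-of-squared-differences error with
    # respect to the outputs, matching SumOfSquaredDiffsError.grad above.
    grads_wrt_outputs = (outputs - targets_batch) / outputs.shape[0]

    # Back-propagate to parameter gradients (stand-in for model.grads_wrt_params)
    # and apply the update (stand-in for learning_rule.update_params).
    grads_wrt_weights = grads_wrt_outputs.T.dot(inputs_batch)
    grads_wrt_biases = grads_wrt_outputs.sum(0)
    weights -= learning_rate * grads_wrt_weights
    biases -= learning_rate * grads_wrt_biases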